Example #1
def filterArticles(articles):
    relevant_articles = {}
    correct = [0] * (len(int2tags) - 1)
    gold_num = [0] * (len(int2tags) - 1)
    filtered_correct = [0] * (len(int2tags) - 1)
    filtered_gold_num = [0] * (len(int2tags) - 1)
    helper.load_constants()
    print "Num incidents", len(incidents)
    print "Num unfiltered articles", len(articles)
    for incident_id in incidents.keys():
        incident = incidents[incident_id]
        if 'citations' not in incident:
            continue
        for citation_ind, citation in enumerate(incident['citations']):
            saveFile = "../data/raw_data/" + incident_id + "_" + str(citation_ind) + ".raw"
            print "checking for savefile", saveFile
            if saveFile not in articles:
                continue

            article = tokenizer.tokenize(articles[saveFile])
            ents = [incident[e.replace('-', '_')] for e in int2tags[1:]]

            tags, cleanArticle = getTags(article, ents)

            ## Calculate scores for filtered and unfiltered articles
            for i in range(1, len(int2tags)):
                correct[i - 1] += 1 if i in tags else 0
                gold_num[i - 1] += ents[i - 1].strip().lower() not in ["unknown"]

            if len(set(tags)) > 2:  ## This is the filtering step
                for i in range(1, len(int2tags)):
                    filtered_correct[i - 1] += 1 if i in tags else 0
                    filtered_gold_num[i - 1] += ents[i - 1].strip().lower() not in ["unknown"]
                # Store the article in a convenient format for writing to the tag file
                relevant_article = {}
                relevant_article['tokens'] = cleanArticle[:1000]
                relevant_article['tags'] = tags
                relevant_article['title'] = citation['Title']
                relevant_article['ents'] = [cleanDelimiters(e) for e in ents]
                relevant_articles[saveFile] = relevant_article
    pickle.dump(relevant_articles, open('EMA_filtered_articles.2.p', 'wb'))

    # Per-entity (score, tag) pairs: how often a gold-known entity was actually
    # found in the article text; 0 when that entity type has no gold instances.
    oracle_scores = [(correct[i] * 1. / gold_num[i], int2tags[i + 1])
                     if gold_num[i] > 0 else 0 for i in range(len(correct))]
    filtered_oracle_scores = [(filtered_correct[i] * 1. / filtered_gold_num[i], int2tags[i + 1])
                              if filtered_gold_num[i] > 0 else 0
                              for i in range(len(correct))]
    print "num articles is", len(relevant_articles)
    print "oracle scores", oracle_scores
    print "filtered_oracle_scores", filtered_oracle_scores
    return relevant_articles
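
A minimal driver sketch, not part of the original module: it assumes the ../data/raw_data/*.raw files already exist on disk and that the module-level names used above (incidents, tokenizer, int2tags, getTags, cleanDelimiters, helper) are loaded; it rebuilds the {saveFile: text} dict that filterArticles expects and runs the filter.

# Hypothetical usage -- every name here except filterArticles is an assumption.
import glob
import io

articles = {}
for path in glob.glob("../data/raw_data/*.raw"):
    with io.open(path, encoding="utf-8", errors="ignore") as f:
        articles[path] = f.read()

relevant = filterArticles(articles)
print "kept", len(relevant), "of", len(articles), "articles"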
def main(training_file, trained_model, previous_n, next_n, c, prune, test_file):
    helper.load_constants()
    train_data, identifier = load_data(training_file)

    test_data, test_ident = load_data(test_file)

    ## extract features
    tic = time.clock()
    print "get word_vocab"
    num_words, word_vocab = get_word_vocab(train_data, prune)
    print "feature extract for train"
    trainX, trainY = get_feature_matrix_n(previous_n, next_n, train_data,
                                          num_words, word_vocab,
                                          helper.other_features)
    print "feature extract for test"
    testX, testY = get_feature_matrix_n(previous_n, next_n, test_data,
                                        num_words, word_vocab,
                                        helper.other_features)
    print time.clock() - tic

    ## train LR
    print "training"
    tic = time.clock()
    clf = LogisticRegression(C=c, multi_class='multinomial', solver='lbfgs')
    clf.fit(trainX, trainY)
    print time.clock() - tic

    print "predicting"
    predictY = clf.predict(testX)
    assert len(predictY) == len(testY)

    print "evaluating"
    evaluatePredictions(predictY, testY)

    # Human-readable names for each feature column: one vocab + extra-feature
    # block per window position, the current-word vocab block, and the
    # previous-tag indicator blocks.
    feature_list = ((word_vocab.keys() + helper.other_features) * (previous_n + next_n + 1)
                    + word_vocab.keys()
                    + ['previous_one'] * len(tags)
                    + ['previous_two'] * len(tags)
                    + ['previous_three'] * len(tags))
    # getTopFeatures(clf,tags,feature_list)
    if trained_model != "":
        pickle.dump([clf, previous_n, next_n, word_vocab, helper.other_features],
                    open(trained_model, "wb"))
    return [clf, previous_n, next_n, word_vocab, helper.other_features]
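
As a standalone sanity check of the classifier configuration used above, the following sketch fits the same multinomial logistic regression with the L-BFGS solver on synthetic data (the array shapes and class count are chosen here, not taken from the source) and predicts on a few rows.

# Self-contained check with random data; only the sklearn call mirrors main().
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.rand(60, 5)        # 60 samples, 5 dense features
y = rng.randint(0, 3, 60)  # 3 tag classes
clf = LogisticRegression(C=1.0, multi_class='multinomial', solver='lbfgs')
clf.fit(X, y)
print clf.predict(X[:5])   # predicted tag ids for the first 5 rows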
Example #5
import pickle
import inflect
import train_crf as crf
from train import load_data
import helper
import re, pdb, collections
import constants

p = inflect.engine()

# the constants file does not include the 'TAG' tag
int2tags = ['TAG'] + constants.int2tags
NUM_ENTITIES = len(constants.int2tags)
tags2int = constants.tags2int
tags = range(len(int2tags))

helper.load_constants()
mode = constants.mode

CORRECT = collections.defaultdict(lambda: 0.)
GOLD = collections.defaultdict(lambda: 0.)
PRED = collections.defaultdict(lambda: 0.)


def splitBars(w):
    return [q.strip() for q in w.split('|')]


# main loop
def main(trained_model, testing_file, viterbi,
         output_tags="output.tag", output_predictions="output.pred"):
    test_data, identifier = load_data(testing_file)

    evaluate = True

    ## extract features
    if "crf" not in trained_model:
        if not isinstance(trained_model, list):
            clf, previous_n, next_n, word_vocab, other_features = pickle.load(open(trained_model, "rb"))
        else:
            clf, previous_n, next_n, word_vocab, other_features = trained_model
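
A hypothetical invocation sketch: the file paths below are placeholders, and main() expects either a path to the pickle written by the training script or the in-memory [clf, previous_n, next_n, word_vocab, other_features] list it returns.

if __name__ == "__main__":
    # "trained_model.p" and "test.tag" are illustrative paths, not from the source.
    main("trained_model.p", "test.tag", viterbi=False)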