Esempio n. 1
0
def generateKaggleSubmission(tagger, outfilename):
    """Write tagger predictions for the test and dev sets as a Kaggle CSV.

    Parameters:
    tagger -- function mapping (words, possible_tags) to a tag sequence,
              or to a tuple whose first element is the tag sequence
    outfilename -- CSV file to write, with columns 'Id' and 'Prediction'
    """
    # Candidate tag set comes from the training data.
    alltags = set()
    for _, tags in preproc.conllSeqGenerator(TRAIN_FILE):
        alltags.update(tags)

    with open(outfilename, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=['Id', 'Prediction'])
        writer.writeheader()
        # The two sections are identical except for input file and Id prefix,
        # so the loop lives in one helper instead of two copy-pasted blocks.
        _writeKaggleSection(writer, tagger, alltags, TEST_FILE, 'test')
        _writeKaggleSection(writer, tagger, alltags, DEV_FILE, 'dev')


def _writeKaggleSection(writer, tagger, alltags, infile, prefix):
    """Write one CSV row per predicted tag, with Ids '<prefix>-0', '<prefix>-1', ..."""
    i = 0
    for words, _ in preproc.conllSeqGenerator(infile):
        pred_tags = tagger(words, alltags)
        # Some taggers return (tags, score); keep only the tag sequence.
        if isinstance(pred_tags, tuple):
            pred_tags = pred_tags[0]
        for tag in pred_tags:
            writer.writerow({
                'Id': '{}-{}'.format(prefix, i),
                'Prediction': tag})
            i += 1
Esempio n. 2
0
def evalTagger(tagger, outfilename, testfile=DEV_FILE):
    """Run `tagger` over `testfile`, writing one predicted tag per line to
    `outfilename` (blank line between sentences), and return the confusion
    matrix computed by the scorer.
    """
    # Candidate tag set comes from the training data.
    alltags = set()
    for _, tags in preproc.conllSeqGenerator(TRAIN_FILE):
        alltags.update(tags)
    with open(outfilename, 'w') as outfile:
        for words, _ in preproc.conllSeqGenerator(testfile):
            pred_tags = tagger(words, alltags)
            for tag in pred_tags:
                # write() is portable across Python 2 and 3, unlike `print >>`
                outfile.write('{}\n'.format(tag))
            outfile.write('\n')  # blank line terminates each sentence
    return scorer.getConfusion(testfile, outfilename)  # run the scorer on the prediction file
Esempio n. 3
0
def evalTagger(tagger, outfilename, testfile=DEV_FILE):
    """Run `tagger` over `testfile`, writing one predicted tag per line to
    `outfilename` (blank line between sentences), and return the confusion
    matrix computed by the scorer.
    """
    # Candidate tag set comes from the training data.
    alltags = set()
    for _, tags in preproc.conllSeqGenerator(TRAIN_FILE):
        alltags.update(tags)
    with open(outfilename, 'w') as outfile:
        for words, _ in preproc.conllSeqGenerator(testfile):
            pred_tags = tagger(words, alltags)
            for tag in pred_tags:
                # write() is portable across Python 2 and 3, unlike `print >>`
                outfile.write('{}\n'.format(tag))
            outfile.write('\n')  # blank line terminates each sentence
    return scorer.getConfusion(testfile, outfilename)  # run the scorer on the prediction file
Esempio n. 4
0
def test_nb_prob_mass():
    """Check that the naive Bayes emission weights form a distribution:
    for each tag, exp of the (tag, word) weights summed over the full
    vocabulary must be 1.0 (to two decimal places).
    """
    # Build the vocabulary from the training data.
    allwords = set()
    for words, _ in preproc.conllSeqGenerator(TRAIN_FILE):
        allwords.update(words)

    for tag in alltags:
        # Weights are log-probabilities, so exponentiate before summing.
        total_prob = sum(np.exp(weights_nb[(tag, word)]) for word in allwords)
        assert_almost_equals(1.0, total_prob, places=2,
            msg="UNEQUAL Expected tag %s to have total prob of 1.0, but instead has %s" % (tag, total_prob))
Esempio n. 5
0
def evalTagger(tagger, outfilename, testfile=DEV_FILE):
    """Calculate confusion_matrix for a given tagger

    Parameters:
    tagger -- Function mapping (words, possible_tags) to an optimal
              sequence of tags for the words
    outfilename -- Filename to write tagger predictions to
    testfile -- (optional) Filename containing true labels

    Returns:
    confusion_matrix -- dict of occurences of (true_label, pred_label)
    """
    # Candidate tag set comes from the training data.
    alltags = set()
    for _, tags in preproc.conllSeqGenerator(TRAIN_FILE):
        alltags.update(tags)
    with open(outfilename, 'w') as outfile:
        for words, _ in preproc.conllSeqGenerator(testfile):
            pred_tags = tagger(words, alltags)
            for tag in pred_tags:
                # write() is portable across Python 2 and 3, unlike `print >>`
                outfile.write('{}\n'.format(tag))
            outfile.write('\n')  # blank line terminates each sentence
    return scorer.getConfusion(testfile, outfilename)  # run the scorer on the prediction file
Esempio n. 6
0
def evalTagger(tagger, outfilename, testfile=DEV_FILE):
    """Calculate confusion_matrix for a given tagger

    Parameters:
    tagger -- Function mapping (words, possible_tags) to an optimal
              sequence of tags for the words
    outfilename -- Filename to write tagger predictions to
    testfile -- (optional) Filename containing true labels

    Returns:
    confusion_matrix -- dict of occurences of (true_label, pred_label)
    """
    # Candidate tag set comes from the training data.
    alltags = set()
    for _, tags in preproc.conllSeqGenerator(TRAIN_FILE):
        alltags.update(tags)
    with open(outfilename, 'w') as outfile:
        for words, _ in preproc.conllSeqGenerator(testfile):
            pred_tags = tagger(words, alltags)
            for tag in pred_tags:
                # write() is portable across Python 2 and 3, unlike `print >>`
                outfile.write('{}\n'.format(tag))
            outfile.write('\n')  # blank line terminates each sentence
    return scorer.getConfusion(
        testfile, outfilename)  # run the scorer on the prediction file
Esempio n. 7
0
def test_nb_prob_mass():
    """Check that the naive Bayes emission weights form a distribution:
    for each tag, exp of the (tag, word) weights summed over the full
    vocabulary must be 1.0 (to two decimal places).
    """
    # Build the vocabulary from the training data.
    allwords = set()
    for words, _ in preproc.conllSeqGenerator(TRAIN_FILE):
        allwords.update(words)

    for tag in alltags:
        # Weights are log-probabilities, so exponentiate before summing.
        total_prob = sum(np.exp(weights_nb[(tag, word)]) for word in allwords)
        assert_almost_equals(
            1.0,
            total_prob,
            places=2,
            msg=
            "UNEQUAL Expected tag %s to have total prob of 1.0, but instead has %s"
            % (tag, total_prob))