def generateKaggleSubmission(tagger, outfilename):
    """Write tagger predictions for the test and dev sets in Kaggle CSV format.

    Parameters:
    tagger -- function mapping (words, possible_tags) to a tag sequence,
              or to a tuple whose first element is the tag sequence
    outfilename -- CSV file to write, with header columns Id and Prediction
    """
    # The candidate tag inventory comes from the training data.
    alltags = set()
    for words, tags in preproc.conllSeqGenerator(TRAIN_FILE):
        alltags.update(tags)
    with open(outfilename, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=['Id', 'Prediction'])
        writer.writeheader()
        # The original test-set and dev-set loops were identical except for
        # the input file and the Id prefix, so they share one helper.
        _writeKagglePredictions(writer, tagger, alltags, TEST_FILE, 'test')
        _writeKagglePredictions(writer, tagger, alltags, DEV_FILE, 'dev')

def _writeKagglePredictions(writer, tagger, alltags, infile, prefix):
    """Tag every sentence in infile and emit one '<prefix>-<i>' row per token."""
    i = 0
    for words, _ in preproc.conllSeqGenerator(infile):
        pred_tags = tagger(words, alltags)
        # Some taggers return (tags, score); keep only the tag sequence.
        if isinstance(pred_tags, tuple):
            pred_tags = pred_tags[0]
        for tag in pred_tags:
            writer.writerow({'Id': '{}-{}'.format(prefix, i), 'Prediction': tag})
            i += 1
def evalTagger(tagger, outfilename, testfile=DEV_FILE):
    """Calculate confusion_matrix for a given tagger

    Parameters:
    tagger -- Function mapping (words, possible_tags) to an optimal
              sequence of tags for the words
    outfilename -- Filename to write tagger predictions to
    testfile -- (optional) Filename containing true labels

    Returns:
    confusion_matrix -- dict of occurences of (true_label, pred_label)
    """
    # Tag inventory is built from the training data, not the eval data.
    # (The original wrapped this in enumerate() but never used the index.)
    alltags = set()
    for words, tags in preproc.conllSeqGenerator(TRAIN_FILE):
        alltags.update(tags)
    with open(outfilename, 'w') as outfile:
        for words, _ in preproc.conllSeqGenerator(testfile):
            pred_tags = tagger(words, alltags)
            for tag in pred_tags:
                print >>outfile, tag
            # Blank line marks the end of each sentence in the output.
            print >>outfile, ""
    return scorer.getConfusion(testfile, outfilename)  # run the scorer on the prediction file
def evalTagger(tagger, outfilename, testfile=DEV_FILE):
    """Write a tagger's predictions to a file and return its confusion matrix.

    Parameters:
    tagger -- function mapping (words, possible_tags) to a tag sequence
    outfilename -- file that receives one predicted tag per line
    testfile -- (optional) file holding the gold-standard labels
    """
    # Collect every tag observed in the training corpus.
    alltags = set()
    for i, (words, tags) in enumerate(preproc.conllSeqGenerator(TRAIN_FILE)):
        for t in tags:
            alltags.add(t)
    with open(outfilename, 'w') as outfile:
        for sentence, _ in preproc.conllSeqGenerator(testfile):
            for t in tagger(sentence, alltags):
                print >>outfile, t
            print >>outfile, ""
    return scorer.getConfusion(testfile, outfilename)  # run the scorer on the prediction file
def test_nb_prob_mass():
    """Check that each tag's naive-Bayes emission distribution sums to 1.

    NOTE(review): relies on module-level globals `alltags` (tag inventory)
    and `weights_nb` (log-weights keyed by (tag, word)) being defined
    elsewhere in this file — confirm they are populated before this runs.
    """
    # The original allocated `probability_masses = defaultdict(float)` and
    # never used it; the dead local has been removed.
    # Vocabulary is every word type in the training data.
    allwords = set()
    for words, _ in preproc.conllSeqGenerator(TRAIN_FILE):
        allwords.update(words)
    for tag in alltags:
        # exp() converts log-weights back to probabilities before summing.
        total_prob = sum(np.exp(weights_nb[(tag, word)]) for word in allwords)
        assert_almost_equals(1.0, total_prob, places=2, msg="UNEQUAL Expected tag %s to have total prob of 1.0, but instead has %s" % (tag, total_prob))
def evalTagger(tagger, outfilename, testfile=DEV_FILE):
    """Score a tagger against a labeled file.

    Parameters:
    tagger -- Function mapping (words, possible_tags) to an optimal
              sequence of tags for the words
    outfilename -- Filename to write tagger predictions to
    testfile -- (optional) Filename containing true labels

    Returns:
    confusion_matrix -- dict of occurences of (true_label, pred_label)
    """
    # Candidate tags are harvested from the training corpus.
    alltags = set()
    for i, (sent_words, sent_tags) in enumerate(preproc.conllSeqGenerator(TRAIN_FILE)):
        for t in sent_tags:
            alltags.add(t)
    with open(outfilename, 'w') as outfile:
        for sentence, _ in preproc.conllSeqGenerator(testfile):
            predicted = tagger(sentence, alltags)
            for t in predicted:
                print >>outfile, t
            print >>outfile, ""
    return scorer.getConfusion(testfile, outfilename)  # run the scorer on the prediction file
def evalTagger(tagger, outfilename, testfile=DEV_FILE):
    """Evaluate a tagger and report its confusion matrix.

    Parameters:
    tagger -- Function mapping (words, possible_tags) to an optimal
              sequence of tags for the words
    outfilename -- Filename to write tagger predictions to
    testfile -- (optional) Filename containing true labels

    Returns:
    confusion_matrix -- dict of occurences of (true_label, pred_label)
    """
    # Build the full tag inventory from the training data.
    tag_inventory = set()
    for idx, (train_words, train_tags) in enumerate(
            preproc.conllSeqGenerator(TRAIN_FILE)):
        for train_tag in train_tags:
            tag_inventory.add(train_tag)
    with open(outfilename, 'w') as outfile:
        for eval_words, _ in preproc.conllSeqGenerator(testfile):
            hypothesis = tagger(eval_words, tag_inventory)
            for hyp_tag in hypothesis:
                print >> outfile, hyp_tag
            print >> outfile, ""
    return scorer.getConfusion(
        testfile, outfilename)  #run the scorer on the prediction file
def test_nb_prob_mass():
    """Verify each tag's naive-Bayes emission probabilities total 1.0.

    NOTE(review): depends on module-level globals `alltags` and
    `weights_nb` ((tag, word) -> log-weight) defined elsewhere in this
    file — confirm both are populated before this test executes.
    """
    # Dead local removed: the original created a defaultdict(float) named
    # `probability_masses` and never touched it again.
    allwords = set()
    for words, _ in preproc.conllSeqGenerator(TRAIN_FILE):
        for word in words:
            allwords.add(word)
    for tag in alltags:
        # Exponentiate the stored log-weights to recover probabilities.
        total_prob = sum(np.exp(weights_nb[(tag, word)]) for word in allwords)
        assert_almost_equals(
            1.0,
            total_prob,
            places=2,
            msg=
            "UNEQUAL Expected tag %s to have total prob of 1.0, but instead has %s"
            % (tag, total_prob))