def tfRecordToCaffe(self, datasetName, outputPath, nameAsGroundTruth=False):
    """Convert the TFRecord dataset into the Caffe-style directory layout.

    Creates the dataset folder structure under outputPath/datasetName,
    decodes every image record into the Images folder, and registers each
    image plus its bounding-box annotations with the Tagger file manager.

    :param datasetName: name of the output dataset folder.
    :param outputPath: root directory the dataset is written under.
    :param nameAsGroundTruth: when True, name images by their ground-truth
        filename instead of their source id.
    """
    dataset_root = os.path.join(outputPath, datasetName)
    MkDataSetStructure(dataset_root)
    tagger = Tagger(dataset_root)
    # Invariant across records, so computed once up front.
    images_dir = os.path.join(outputPath, datasetName, "Images")
    for record in self.readTFRecord():
        image_name = record['filename'] if nameAsGroundTruth else record["sourceID"]
        tagger.AppendTrainingImg(image_name)
        self.saveFromRawImageData(record["imgEncoded"], record["height"],
                                  record["width"], images_dir,
                                  image_name + ".jpg")
        # One (box, class) tuple per annotation in the record.
        boxes = zip(record["xMins"], record["yMins"], record["xMaxs"],
                    record["yMaxs"], record["classesText"], record["classesID"])
        for x_min, y_min, x_max, y_max, class_text, class_id in boxes:
            tagger.AppendAnnotation(
                (x_min, y_min), (x_max, y_max), image_name,
                class_text.decode('utf-8') + " " + str(class_id))
def path_input(self, pth):
    """Handle request for new input file. If new tag and display else display"""
    self.clear_results()
    tagged_files = None
    if not isdir(pth):
        # Single image: try to register it; IntegrityError from the DB means
        # the path is already stored, so it is not tagged again.
        try:
            DataBase.add_image(pth)
            is_new = True
        except IntegrityError:
            is_new = False
        if is_new:
            for tag in Tagger.tag_file(pth):
                DataBase.tag_image(tag, pth=pth)
    else:
        # Directory: tag everything, then register whatever is not yet stored.
        tags, tagged_files = Tagger.tag_dir(pth)
        for idx, fname in enumerate(tagged_files):
            full = join(pth, fname)
            # Replace the bare name with the full path for display later.
            tagged_files[idx] = full
            if DataBase.exists(pth=full):
                continue
            DataBase.add_image(full)
            entry = tags[idx]
            if isinstance(entry, str):
                # Single string result.
                DataBase.tag_image(entry, pth=full)
            else:
                # Tuple/iterable of tags.
                for t in entry:
                    DataBase.tag_image(t, pth=full)
    count = 1
    # Display
    if tagged_files is None:
        self.queue_images(pth)
    else:
        count = len(tagged_files)
        # Clamp the results listbox height between 3 and 12 rows.
        self.ui.builder.get_object("ListResults").config(
            height=min(max(3, count), 12))
        self.queue_images(tagged_files)
    self.update_info("Processed " + str(count) + " images")
def cross_validate(count_words):
    '''
    Runs cross validation on the Tagger, count_words = True iff the Tagger
    counts all the words, so P(word | tag) is known, but P(tag | prev_tag) is
    still only 90% known
    '''
    global total_errs, total_matches
    sum_err = 0
    print 'Fold Err Match Frac_Match'
    # files in the corpus range from 0-100, so test and train ranges are slices of this
    # range, such that train_range and test_range together make range(0,100)
    for i in xrange(folds):
        train_range = range(0, chunk * i) + range(chunk * (i + 1), 100)
        test_range = range(chunk * i, chunk * (i + 1))
        # Reset the per-fold error counters (shared with file_validate below).
        total_errs = 0
        total_matches = 0
        tm = time.time()
        # Build counts from the training slice.
        c = TagCounter()
        c.parse_corpus_range(train_range)
        if (count_words):
            # Additionally count only the words of the test slice so that
            # P(word | tag) is known for the test vocabulary.
            c.only_words = count_words
            c.parse_corpus_range(test_range)
        t = Tagger(c)
        tm = time.time() - tm
        if timing:
            print tm,

        # file_validate is mapped across all files in test_range
        def file_validate(f):
            global total_errs, total_matches
            sentences = parse_file(f)
            for sent in sentences[:size]:
                words = []
                for word in sent[1:]:  # sent[0] will always be START
                    words.append(word.true_chars)
                tagged = t.tag_words(words)
                matches = 0
                errs = 0
                # NOTE(review): zip pairs sent[0] (START) with tagged[0];
                # this assumes tag_words returns a START-aligned list — confirm.
                for (actual_w, pred_w) in zip(sent, tagged):
                    if actual_w.tag != pred_w.tag:
                        #print actual_w, pred_w # prints the mistagged pairs
                        errs += 1
                    else:
                        matches += 1
                total_errs += errs
                total_matches += matches

        tm = time.time()
        map_files(file_validate, test_range)
        tm = time.time() - tm
        if timing:
            print tm
        # One result row per fold: fold index, errors, matches, match fraction.
        print '%3d %6d %7d %0.4f' % (i, total_errs, total_matches, percent_match(
            total_errs, total_matches))
        print ''
        sum_err += percent_match(total_errs, total_matches)
    print 'cumulative averaged error:', (sum_err * 1.0) / folds
def extract_entities(pmid):
    """
    Tag genes and species in the PubMed article identified by ``pmid`` and,
    where possible, annotate them with information from the NCBI
    gene/taxonomy database.

    :param pmid: The id of the article that should be tagged.
    :return: The genes (as Gene objects) and the organisms (as Organism
        object) found in the article.
    """
    matches = Tagger().tag([pmid])
    genes, organisms = [], []
    if matches:
        annotation = matches[0].get_annotation()
        # NOTE(review): the {None} default produces a single
        # convert_to_object(None, ...) entry when a key is absent —
        # preserved as-is; confirm this is intended rather than an empty set.
        raw_genes = annotation.get("Gene", {None})
        raw_species = annotation.get("Species", {None})
        genes = [convert_to_object(g, "Gene") for g in raw_genes]
        organisms = [convert_to_object(s, "Species") for s in raw_species]
    return genes, organisms
class TaggerHandler:
    """Drives one Tagger instance over every (train, test) file pair and
    records the resulting accuracies as tab-separated rows in a table file."""

    def __init__(self, dataDir, table):
        sys.stderr.write("TaggerHandler: Constructor\n")
        # Single Tagger reused across all runs.
        self.__TaggerInstance = Tagger()
        # Directory prefixed onto every train/test file name read in run().
        self.__dataDir = dataDir
        # Output table; closed at the end of run().
        self.__tableFile = open(table, 'w')

    def __updateTable(self, setting, accuracies):
        # One tab-separated row: the setting label followed by each accuracy.
        self.__tableFile.write(
            setting + '\t' + '\t'.join(map(str, accuracies)) + '\n')

    def __runTagger(self, trainFile, testFile):
        # Train on trainFile, evaluate on testFile, and record the accuracies
        # under a setting label derived from the train file name.
        self.__TaggerInstance.loadData(trainFile, testFile)
        self.__TaggerInstance.train()
        accuracies = self.__TaggerInstance.test()
        setting = trainFile.split("_")[0]
        self.__updateTable(setting, accuracies)

    def run(self, trainFiles, testFiles):
        """Run the tagger on the cross product of the train and test file
        lists named (one path per line) in the given list files."""
        # Context managers so the list-file handles are closed
        # (the original left them open).
        with open(trainFiles) as f:
            trainFiles = [self.__dataDir + line.strip() for line in f]
        with open(testFiles) as f:
            testFiles = [self.__dataDir + line.strip() for line in f]
        for trainFile in trainFiles:
            for testFile in testFiles:
                self.__runTagger(trainFile, testFile)
        self.__tableFile.close()
def save_results(self):
    # Tag every sentence of the test file and write the tagged output to
    # self.outfile, logging progress every 100 sentences.
    abs_filepath = os.path.abspath(self.outfile)
    # Float total so the percentage division below is non-integer division.
    total = Utils.get_count_of_sentences(self.testfile) * 1.0
    logging.info("{} sentences found".format(total))
    with open(abs_filepath, 'w') as f:
        for i, s in enumerate(Utils.get_sentence(self.testfile)):
            original_sentence = s['o']  # original raw sentence from file
            s = s['c']  # cleaned sentence
            # Strip existing tags from both forms; the cleaned text is what
            # gets tagged, the raw text is what the tags are attached to.
            untagged_sentence = self.remove_tags(s)
            untagged_original_sentence = self.remove_tags(
                original_sentence)
            tags = Tagger(self.model, untagged_sentence).tag()
            tagged_sentence = self.attach_tags(untagged_original_sentence,
                                               tags)
            Utils.write_sentence(f, tagged_sentence)
            if i % 100 == 0:
                logging.info("{}% done. Last tagged: {}".format(
                    round(i / total * 100.0, 2), s.replace("\n", " ")))
            # break
            f.write(Utils.SENTENCE_SEPARATOR)
# -*- coding: cp1254 -*- # Onur Yilmaz # Imports from Tagger import Tagger # Open the file where tagger is saved taggerFileName = 'my_tagger.yaml' myTagger = Tagger.load(taggerFileName) # Keep the original functionality intact def tag(sentence): return myTagger.tag(sentence) # End of code
# Setting up directory paths: the untagged seminar test data and the
# training corpus.
mypath = os.getcwd() + '/data/seminar_testdata/test_untagged/'
trainingPath = 'data/training'
directory = os.fsencode(mypath)

# Runs the ontology classification
print('Running Ontology Classification: \n')
ontology = Ontology()
ontology.run(mypath)

# Begins tagging
print("\nTagging progress beginning. Get a brew, it'll take a while... \n")
extractor = DataExtractor()

# Trains our model
extractor.train(trainingPath)

tagger = Tagger()

# Tags all emails in the directory given
tagger.tag_seminar(mypath, directory, extractor)

# Calculates how long the program took.
# NOTE: start_time is set earlier in this file — TODO confirm.
seconds = time.time() - start_time
m, s = divmod(seconds, 60)
print("The program has been running for {0} minutes and {1} seconds \n".format(
    round(m), round(s)))

# Evaluates results — renamed from `eval`, which shadowed the builtin.
evaluation = Evaluation()
evaluation.run()
def test_load_incorrectly(self):
    """Loading a file that is not a saved tagger must raise TypeError."""
    bad_path = os.path.join(self.tempDir, 'temporary_file.txt')
    with open(bad_path, 'w') as handle:
        handle.write("This is a line that won't be able to be read")
    with self.assertRaises(TypeError):
        Tagger.load(bad_path)
def setUp(self):
    # Path to a previously saved tagger; also read by the tests themselves.
    self.filePath = 'my_tagger.yaml'
    # Load the tagger under test from disk.
    self.tag = Tagger.load(self.filePath)
class TaggerHandler:
    # Runs a Tagger over train/test file pairs whose experiment parameters
    # are encoded in the file names, and writes one tab-separated results
    # row per run to a table file.

    def __init__(self, dataDir, table):
        sys.stderr.write("TaggerHandler: Constructor\n")
        # Single Tagger instance reused for every run.
        self.__TaggerInstance = Tagger()
        self.__dataDir = dataDir
        # buffering=1: line-buffered, so each row is flushed as written.
        self.__tableFile = open(table, 'w', 1)
        # Header row for the results table.
        self.__tableFile.write('TrainCSType\tTrainPureCSSplit\tTrainSize\tExperimentType\tTestCSType\tTestPureCSSplit\tTestSize\tTagset\tOverallAccuracy\tSameContextAccuracy\tDifferentContextAccuracy\tPrevWordDifferentAccuracy\tPrePrevWordDifferentAccuracy\tUnknowns\n')

    def __updateTable(self, setting, accuracies):
        # One tab-separated row: setting columns followed by each accuracy.
        ##print accuracies
        self.__tableFile.write(setting + '\t' + '\t'.join(map(lambda x:str(x), accuracies)) + '\n')

    def __getSetting(self, string):
        # Decode the experiment parameters encoded in the file name, e.g.
        # ...TrainCS<cstype>CS<csSplit>Pure<pureSplit>Total<size>_...
        # Returns them as tab-joined columns: cstype, pure-cs split, size.
        string = string.split("/")[-1].split("TrainCS")[1]
        cstype = string.split("CS")[0]
        csSplit = string.split("CS")[1].split("Pure")[0]
        pureSplit = string.split("Pure")[1].split("Total")[0]
        pureCSSplit = pureSplit + '-' + csSplit
        totalSize = string.split("Total")[1].split("_")[0]
        return '\t'.join([cstype, pureCSSplit, totalSize])

    def __runTagger(self, trainFile, testFile):
        # Train on trainFile, test on testFile, record one results row.
        self.__TaggerInstance.loadData(trainFile, testFile)
        self.__TaggerInstance.train()
        accuracies = self.__TaggerInstance.test()
        trainSetting = self.__getSetting(trainFile)
        testSetting = self.__getSetting(testFile)
        tagset = self.__tagset(trainFile)
        self.__updateTable(trainSetting + '\t' + testSetting + '\t' + tagset, accuracies)

    def __runTagger2(self, trainFile, testFile, expType):
        # Like __runTagger, but prefixes the data directory onto the file
        # names and records an extra ExperimentType column (expType).
        trainFile = self.__dataDir+trainFile
        testFile = self.__dataDir+testFile
        self.__TaggerInstance.loadData(trainFile, testFile)
        self.__TaggerInstance.train()
        accuracies = self.__TaggerInstance.test()
        trainSetting = self.__getSetting(trainFile)
        testSetting = self.__getSetting(testFile)
        tagset = self.__tagset(trainFile)
        self.__updateTable(trainSetting + '\t' + expType + '\t' + testSetting + '\t' + tagset, accuracies)

    def __tagset(self, string):
        # Classify the file's tagset from its extension:
        # "<name>.uni..." -> Universal, otherwise Mixed;
        # ".uniq" anywhere appends a ".uniq" suffix.
        tagset = "Mixed"
        if len(string.split(".")) > 1 and string.split(".")[1] == "uni":
            tagset = 'Universal'
        if string.find(".uniq") >= 0:
            tagset += ".uniq"
        return tagset

    def run(self, trainFiles, testFiles):
        # Run every (train, test) pair whose train tagset is "Mixed" AND
        # matches the test tagset; all other pairs are skipped by the
        # condition below.
        #trainFiles = [self.__dataDir+line.strip() for line in open(trainFiles)]
        #testFiles = [self.__dataDir+line.strip() for line in open(testFiles)]
        trainFiles = [line.strip() for line in open(trainFiles)]
        testFiles = [line.strip() for line in open(testFiles)]
        for trainFile in trainFiles:
            #if trainFile.find("Type1")>=0 or trainFile.find("Type0")>=0:
            #    continue
            for testFile in testFiles:
                # NOTE(review): `or` skips everything except Mixed pairs with
                # equal tagsets; the commented-out variant below suggests the
                # filter was deliberately tightened — confirm intent.
                if self.__tagset(trainFile) != "Mixed" or self.__tagset(trainFile) != self.__tagset(testFile):
                    #if self.__tagset(trainFile)!= self.__tagset(testFile):
                    continue
                ##if testFile.find("CS0Pure100")<0:
                ##    continue
                ##print testFile
                self.__runTagger(trainFile, testFile)
        self.__tableFile.close()

    def run2(self, trainFiles, testFiles):
        # For every tagset-matched (train, test) pair, run both the
        # experiment train file and its "_Control" counterpart.
        trainFiles = [line.strip() for line in open(trainFiles)]
        testFiles = [line.strip() for line in open(testFiles)]
        for trainFile in trainFiles:
            for testFile in testFiles:
                if self.__tagset(trainFile) != self.__tagset(testFile):
                    continue
                controlTrainFile = trainFile + "_Control"
                # Universal files keep their ".uni" extension after the
                # "_Control" suffix.
                if self.__tagset(trainFile) == "Universal":
                    controlTrainFile = trainFile.split(".uni")[0] + "_Control" + ".uni"
                ##if testFile.find("CS0Pure100")<0:
                ##    continue
                ##print testFile
                self.__runTagger2(trainFile, testFile, "Experiment")
                self.__runTagger2(controlTrainFile, testFile, "Control")
        self.__tableFile.close()
# NOTE(review): this first print is likely the body of a loop that begins
# before this chunk; `token`, `gold_tokens` and `test_tokens` are defined
# there — confirm against the full file.
print "("+token+")",

print "\n\nNUMBER OF TOKENS ANSWER,TEST"
print len(gold_tokens),len(test_tokens)

# Tokens produced only by the test tokenizer.
print "\n\nDIFFERENCE\nONLY IN TEST"
difference = set(test_tokens)-set(gold_tokens)
for token in difference:
    print token+" ",

# Tokens present only in the gold-standard answer.
print "\n\nONLY IN ANSWER"
difference = set(gold_tokens)-set(test_tokens)
for token in difference:
    print token+" ",
print "\n"

print "--TAGGER--"
# Train the tagger on the tagged training sentences.
tagger = Tagger(tagged_train_sents)

print "EVALUATE TAGGER"
print "RATE"
tagger.evaluate(tagged_gold_sents)

# Print every gold word/tag pair.
print "\n\nANSWER"
tagged_gold_tokens = sum(tagged_gold_sents,[])
for tup in tagged_gold_tokens:
    print str(tup[0])+"/"+str(tup[1]),

# Re-tag the gold words and print the predicted word/tag pairs.
print "\n\nTEST"
gold_tokens = []
for sent in tagged_gold_sents:
    for tup in sent:
        gold_tokens.append(tup[0])
tagged_test_tokens = tagger.tag(gold_tokens)
for tup in tagged_test_tokens:
    print str(tup[0])+"/"+str(tup[1]),
    # NOTE(review): tail of a function whose definition starts before this
    # chunk; orphan_count and tag_occurrences are defined there — confirm
    # the indentation against the full file.
    return float(orphan_count) / len(tag_occurrences)


def calculate_combined(tagging_user):
    """ returns the combined measure """
    # Average of the orphaniness score and the normalized conditional entropy.
    return (get_orphaniness(tagging_user) + get_cond_entropy_normalized(tagging_user)) / 2


if __name__ == '__main__':
    # Small smoke test: tag two posts for user "hugo".
    TAGGER = Tagger("hugo")
    TAG_SET = set()
    TAG_SET.add("computer")
    TAG_SET.add("reference")
    TAGGER.add_post("1", TAG_SET)
    # The same set object is cleared and reused — presumably add_post copies
    # the tags it receives; TODO confirm.
    TAG_SET.clear()
    TAG_SET.add("reference")
    TAG_SET.add("calculator")
    TAG_SET.add("rate")
    TAGGER.add_post("2", TAG_SET)
    TAG_SET.clear()
######### hmm-tagger.py #########

# Tagging controller and path utilities.
from Tagger import Tagger
import os

# Build a tagging object over the cleaned corpus files in the working
# directory: three training texts and one test text.
tagger = Tagger(os.getcwd()+'/',
                ['text_1.txt', 'text_2.txt', 'text_3.txt'],
                ['text_5.txt'])

# Perform ten-fold cross-validation.
tagger.run_test_cycles()
def test_load_nonexisting(self):
    # Loading a path that does not exist must surface FileNotFoundError.
    with self.assertRaises(FileNotFoundError):
        Tagger.load("this_file_definitely_doesnt_exist.txt")
def test_load_file(self):
    """A saved tagger file loads into a wrapper holding a BrillTagger."""
    loaded = Tagger.load(self.filePath)
    self.assertIsInstance(loaded.myTagger, BrillTagger)
def __init__(self, dataDir, table):
    # Announce construction on stderr.
    sys.stderr.write("TaggerHandler: Constructor\n")
    # Single Tagger instance reused for every train/test run.
    self.__TaggerInstance = Tagger()
    # Directory prefixed onto train/test file names elsewhere in the class.
    self.__dataDir = dataDir
    # buffering=1: line-buffered, so each results row is flushed as written.
    self.__tableFile = open(table, 'w', 1)
    # Header row of the tab-separated results table.
    self.__tableFile.write('TrainCSType\tTrainPureCSSplit\tTrainSize\tExperimentType\tTestCSType\tTestPureCSSplit\tTestSize\tTagset\tOverallAccuracy\tSameContextAccuracy\tDifferentContextAccuracy\tPrevWordDifferentAccuracy\tPrePrevWordDifferentAccuracy\tUnknowns\n')
######### hmm-tagger.py #########

from TreebankCleaner import TreebankCleaner  # cleaning class
from Tagger import Tagger                    # tagging controller
import os                                    # path info
import sys                                   # command line options

# Optionally clean the pre-downloaded treebank file first.
if '--clean' in sys.argv:
    cleaner = TreebankCleaner(os.getcwd() + '/', ['treebank3_sect2.txt'])
    cleaner.clean()

# Build a tagging object over the cleaned corpus file and perform
# ten-fold cross-validation.
tagger = Tagger(os.getcwd() + '/', ['treebank3_sect2.txt_cleaned'])
tagger.run_test_cycles()
def __init__(self, dataDir, table):
    # Announce construction on stderr.
    sys.stderr.write("TaggerHandler: Constructor\n")
    # Single Tagger instance reused for every train/test run.
    self.__TaggerInstance = Tagger()
    # Directory prefixed onto train/test file names in run().
    self.__dataDir = dataDir
    # Output table for accuracy rows; closed elsewhere in the class.
    self.__tableFile = open(table,'w')