    def tfRecordToCaffe(self,
                        datasetName,
                        outputPath,
                        nameAsGroundTruth=False):
        MkDataSetStructure(os.path.join(outputPath, datasetName))
        fileManager = Tagger(os.path.join(outputPath, datasetName))

        dataBuffer = self.readTFRecord()
        for data in dataBuffer:
            # Choose the image name from the ground-truth filename or the source id
            if nameAsGroundTruth:
                imageName = data["filename"]
            else:
                imageName = data["sourceID"]

            imageFilename = imageName + ".jpg"
            imageOutputPath = os.path.join(outputPath, datasetName, "Images")

            fileManager.AppendTrainingImg(imageName)
            self.saveFromRawImageData(data["imgEncoded"], data["height"],
                                      data["width"], imageOutputPath,
                                      imageFilename)

            for xMin, yMin, xMax, yMax, classText, classID in zip(
                    data["xMins"], data["yMins"], data["xMaxs"], data["yMaxs"],
                    data["classesText"], data["classesID"]):
                fileManager.AppendAnnotation(
                    (xMin, yMin), (xMax, yMax), imageName,
                    classText.decode('utf-8') + " " + str(classID))
Example #2
    def path_input(self, pth):
        """Handle a request for a new input file: if the path is new, tag it
        and display it; otherwise just display it."""

        self.clear_results()
        tfiles = None

        # Single image path
        if not isdir(pth):
            new = True
            # SQL raises an IntegrityError if the path is not unique
            try:
                DataBase.add_image(pth)
            except IntegrityError:
                new = False

            # Tag the new image
            if new:
                tags = Tagger.tag_file(pth)
                for tag in tags:
                    DataBase.tag_image(tag, pth=pth)
        # Directory path
        else:
            tags, tfiles = Tagger.tag_dir(pth)
            for i, f in enumerate(tfiles):
                # Full path to the image
                fpth = join(pth, f)
                tfiles[i] = fpth
                # Skip images that are already in the database
                if DataBase.exists(pth=fpth):
                    continue
                DataBase.add_image(fpth)
                # Tuple of tags
                if not isinstance(tags[i], str):
                    for t in tags[i]:
                        DataBase.tag_image(t, pth=fpth)
                # Single string tag
                else:
                    DataBase.tag_image(tags[i], pth=fpth)

        L = 1
        # Display
        if tfiles is None:
            self.queue_images(pth)
        else:
            # Size the results listbox to the number of results (3-12 rows)
            L = len(tfiles)
            nlb = max(3, L)
            nlb = min(nlb, 12)
            self.ui.builder.get_object("ListResults").config(height=nlb)

            self.queue_images(tfiles)

        self.update_info("Processed " + str(L) + " images")
Example #3
def cross_validate(count_words):
    ''' Runs cross-validation on the Tagger. count_words = True iff the Tagger
        counts all the words, so that P(word | tag) is known, but
        P(tag | prev_tag) is still only 90% known. '''

    global total_errs, total_matches
    sum_err = 0
    print 'Fold    Err    Match    Frac_Match'

    # files in the corpus range from 0-100, so test and train ranges are slices of this
    # range, such that train_range and test_range together make range(0,100)
    for i in xrange(folds):
        train_range = range(0, chunk * i) + range(chunk * (i + 1), 100)
        test_range = range(chunk * i, chunk * (i + 1))
        total_errs = 0
        total_matches = 0

        tm = time.time()
        c = TagCounter()
        c.parse_corpus_range(train_range)
        if count_words:
            c.only_words = count_words
            c.parse_corpus_range(test_range)
        t = Tagger(c)
        tm = time.time() - tm
        if timing:
            print tm,

        # file_validate is mapped across all files in test_range
        def file_validate(f):
            global total_errs, total_matches
            sentences = parse_file(f)
            for sent in sentences[:size]:
                words = []
                for word in sent[1:]:  # sent[0] will always be START
                    words.append(word.true_chars)
                tagged = t.tag_words(words)
                matches = 0
                errs = 0
                for (actual_w, pred_w) in zip(sent, tagged):
                    if actual_w.tag != pred_w.tag:
                        #print actual_w, pred_w     # prints the mistagged pairs
                        errs += 1
                    else:
                        matches += 1
                total_errs += errs
                total_matches += matches

        tm = time.time()
        map_files(file_validate, test_range)
        tm = time.time() - tm
        if timing:
            print tm
        print '%3d  %6d  %7d      %0.4f' % (i, total_errs, total_matches,
                                            percent_match(
                                                total_errs, total_matches))
        print ''

        sum_err += percent_match(total_errs, total_matches)

    print 'cumulative averaged error:', (sum_err * 1.0) / folds
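A minimal sketch of the train/test slicing used above, assuming folds = 10 so that chunk = 100 // folds = 10 (folds and chunk are module-level globals not shown in the source):

folds = 10
chunk = 100 // folds
for i in range(folds):
    train_range = list(range(0, chunk * i)) + list(range(chunk * (i + 1), 100))
    test_range = list(range(chunk * i, chunk * (i + 1)))
    # the two slices always partition the full corpus range(0, 100)
    assert sorted(train_range + test_range) == list(range(100))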
Example #4
def extract_entities(pmid):
    """
    This function tags genes and species in the PubMed article (based on the id provided).
    If possible these genes and species are further annotated using information present
    in the NCBi gene/taxonomy database. 
    :param pmid: The id of the article that should be tagged.
    :return: The genes (as Gene objects) and the organisms (as Organism object) found in the article. 
    """
    tagger = Tagger()
    tag_object = tagger.tag([pmid])
    genes = []
    organisms = []
    if tag_object:
        tag_object = tag_object[0]
        annotation = tag_object.get_annotation()
        genes = annotation.get("Gene", {None})
        organisms = annotation.get("Species", {None})
        genes = [convert_to_object(gene, "Gene") for gene in genes]
        organisms = [
            convert_to_object(organism, "Species") for organism in organisms
        ]
    return genes, organisms
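A minimal usage sketch for extract_entities (the PubMed id is a made-up placeholder, and the Gene/Organism objects are assumed to be printable):

genes, organisms = extract_entities("12345678")  # hypothetical PubMed id
for gene in genes:
    print(gene)       # Gene objects, annotated from NCBI where possible
for organism in organisms:
    print(organism)   # Organism objects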
Example #5
class TaggerHandler:
  def __init__(self, dataDir, table):
    sys.stderr.write("TaggerHandler: Constructor\n")
    self.__TaggerInstance = Tagger()
    self.__dataDir = dataDir
    self.__tableFile = open(table, 'w')
    
  def __updateTable(self, setting, accuracies):
    self.__tableFile.write(setting + '\t' + '\t'.join(map(str, accuracies)) + '\n')
  
  def __runTagger(self, trainFile, testFile):
    self.__TaggerInstance.loadData(trainFile, testFile)
    self.__TaggerInstance.train()
    accuracies = self.__TaggerInstance.test()
    setting = trainFile.split("_")[0]
    self.__updateTable(setting, accuracies)
  
  def run(self, trainFiles, testFiles):
    trainFiles = [self.__dataDir+line.strip() for line in open(trainFiles)]
    testFiles = [self.__dataDir+line.strip() for line in open(testFiles)]
    for trainFile in trainFiles:
      for testFile in testFiles:
        self.__runTagger(trainFile, testFile)
    self.__tableFile.close()
Example #6
    def save_results(self):
        abs_filepath = os.path.abspath(self.outfile)

        total = Utils.get_count_of_sentences(self.testfile) * 1.0
        logging.info("{} sentences found".format(total))

        with open(abs_filepath, 'w') as f:
            for i, s in enumerate(Utils.get_sentence(self.testfile)):
                original_sentence = s['o']  # original raw sentence from file
                s = s['c']  # cleaned sentence
                untagged_sentence = self.remove_tags(s)
                untagged_original_sentence = self.remove_tags(
                    original_sentence)
                tags = Tagger(self.model, untagged_sentence).tag()
                tagged_sentence = self.attach_tags(untagged_original_sentence,
                                                   tags)
                Utils.write_sentence(f, tagged_sentence)

                if i % 100 == 0:
                    logging.info("{}% done. Last tagged: {}".format(
                        round(i / total * 100.0, 2), s.replace("\n", " ")))
                # break

            f.write(Utils.SENTENCE_SEPARATOR)
Example #7
# -*- coding: cp1254 -*-

# Onur Yilmaz

# Imports
from Tagger import Tagger

# Load the tagger from the file where it was saved
taggerFileName = 'my_tagger.yaml'
myTagger = Tagger.load(taggerFileName)


# Keep the original functionality intact
def tag(sentence):
    return myTagger.tag(sentence)


# End of code
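A minimal usage sketch for the wrapper above (the sample sentence is an assumption; the exact return type of Tagger.tag is not shown in the source):

print(tag("The quick brown fox jumps over the lazy dog"))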
Example #9
# Setting up directory
mypath = os.getcwd() + '/data/seminar_testdata/test_untagged/'
trainingPath = 'data/training'
directory = os.fsencode(mypath)

# Runs the ontology classification
print('Running Ontology Classification: \n')
ontology = Ontology()
ontology.run(mypath)

# Begins tagging
print("\nTagging progress beginning. Get a brew, it'll take a while... \n")
extractor = DataExtractor()

# Trains our model
extractor.train(trainingPath)
tagger = Tagger()

# Tags all emails in the directory given
tagger.tag_seminar(mypath, directory, extractor)

# Calculates how long the program took
seconds = time.time() - start_time
m, s = divmod(seconds, 60)
print("The program has been running for {0} minutes and {1} seconds \n".format(
    round(m), round(s)))

# Evaluates results
evaluation = Evaluation()  # renamed from "eval" to avoid shadowing the builtin
evaluation.run()
Example #12
class TaggerHandler:
  def __init__(self, dataDir, table):
    sys.stderr.write("TaggerHandler: Constructor\n")
    self.__TaggerInstance = Tagger()
    self.__dataDir = dataDir
    self.__tableFile = open(table, 'w', 1)
    self.__tableFile.write('TrainCSType\tTrainPureCSSplit\tTrainSize\tExperimentType\tTestCSType\tTestPureCSSplit\tTestSize\tTagset\tOverallAccuracy\tSameContextAccuracy\tDifferentContextAccuracy\tPrevWordDifferentAccuracy\tPrePrevWordDifferentAccuracy\tUnknowns\n')
    
  def __updateTable(self, setting, accuracies):
    ##print accuracies
    self.__tableFile.write(setting + '\t' + '\t'.join(map(str, accuracies)) + '\n')
  
  def __getSetting(self, string):
    string = string.split("/")[-1].split("TrainCS")[1]
    cstype = string.split("CS")[0]
    csSplit = string.split("CS")[1].split("Pure")[0]
    pureSplit = string.split("Pure")[1].split("Total")[0]
    pureCSSplit = pureSplit + '-' + csSplit 
    totalSize = string.split("Total")[1].split("_")[0]
    return '\t'.join([cstype, pureCSSplit, totalSize])
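  # Worked example for __getSetting above (hypothetical filename, format
  # inferred from the splits): "dir/TrainCS1CS50Pure50Total1000_x" yields
  # cstype "1", pureCSSplit "50-50" and totalSize "1000", so the method
  # returns "1\t50-50\t1000".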
  
  def __runTagger(self, trainFile, testFile):
    self.__TaggerInstance.loadData(trainFile, testFile)
    self.__TaggerInstance.train()
    accuracies = self.__TaggerInstance.test()
    trainSetting = self.__getSetting(trainFile)
    testSetting = self.__getSetting(testFile)
    tagset = self.__tagset(trainFile)
    self.__updateTable(trainSetting + '\t' + testSetting + '\t' + tagset, accuracies)
    
  def __runTagger2(self, trainFile, testFile, expType):
    trainFile = self.__dataDir+trainFile
    testFile = self.__dataDir+testFile
    self.__TaggerInstance.loadData(trainFile, testFile)
    self.__TaggerInstance.train()
    accuracies = self.__TaggerInstance.test()
    trainSetting = self.__getSetting(trainFile)
    testSetting = self.__getSetting(testFile)
    tagset = self.__tagset(trainFile)
    self.__updateTable(trainSetting + '\t' + expType + '\t' + testSetting + '\t' + tagset, accuracies)
    
  def __tagset(self, string):
    tagset = "Mixed"
    if len(string.split(".")) > 1 and string.split(".")[1] == "uni":
      tagset = 'Universal'
    if string.find(".uniq") >= 0:
      tagset += ".uniq"
    return tagset
  
  def run(self, trainFiles, testFiles):
    #trainFiles = [self.__dataDir+line.strip() for line in open(trainFiles)]
    #testFiles = [self.__dataDir+line.strip() for line in open(testFiles)]
    trainFiles = [line.strip() for line in open(trainFiles)]
    testFiles = [line.strip() for line in open(testFiles)]
    for trainFile in trainFiles:
      #if trainFile.find("Type1")>=0 or trainFile.find("Type0")>=0:
      #  continue
      for testFile in testFiles:
        if self.__tagset(trainFile) != "Mixed" or self.__tagset(trainFile) != self.__tagset(testFile):
        #if self.__tagset(trainFile)!= self.__tagset(testFile):
          continue
        ##if testFile.find("CS0Pure100")<0:
        ##  continue
        ##print testFile
        self.__runTagger(trainFile, testFile)
    self.__tableFile.close()
    
  def run2(self, trainFiles, testFiles):
    trainFiles = [line.strip() for line in open(trainFiles)]
    testFiles = [line.strip() for line in open(testFiles)]
    for trainFile in trainFiles:
      for testFile in testFiles:
        if self.__tagset(trainFile) != self.__tagset(testFile):
          continue
        controlTrainFile = trainFile + "_Control"
        if self.__tagset(trainFile) == "Universal":
          controlTrainFile = trainFile.split(".uni")[0] + "_Control" + ".uni"
        ##if testFile.find("CS0Pure100")<0:
        ##  continue
        ##print testFile
        self.__runTagger2(trainFile, testFile, "Experiment")
        self.__runTagger2(controlTrainFile, testFile, "Control")
    self.__tableFile.close()
Example #13
    print "("+token+")",
print "\n\nNUMBER OF TOKENS ANSWER,TEST"
print len(gold_tokens),len(test_tokens)
print "\n\nDIFFERENCE\nONLY IN TEST"
difference = set(test_tokens)-set(gold_tokens)
for token in difference:
    print token+"  ",
print "\n\nONLY IN ANSWER"
difference = set(gold_tokens)-set(test_tokens)
for token in difference:
    print token+"  ",
print "\n"


print "--TAGGER--"
tagger = Tagger(tagged_train_sents)
print "EVALUATE TAGGER"
print "RATE"
tagger.evaluate(tagged_gold_sents)
print "\n\nANSWER"
tagged_gold_tokens = sum(tagged_gold_sents,[])
for tup in tagged_gold_tokens:
    print str(tup[0])+"/"+str(tup[1]),
print "\n\nTEST"
gold_tokens = []
for sent in tagged_gold_sents:
    for tup in sent:
        gold_tokens.append(tup[0])
tagged_test_tokens = tagger.tag(gold_tokens)
for tup in tagged_test_tokens:
    print str(tup[0])+"/"+str(tup[1]),
Example #14
    return float(orphan_count) / len(tag_occurrences)


def calculate_combined(tagging_user):
    """
    returns the combined measure
    """
    return (get_orphaniness(tagging_user) + get_cond_entropy_normalized(tagging_user)) / 2
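# Worked example (illustrative values, not from the source): if
# get_orphaniness(user) returned 0.2 and get_cond_entropy_normalized(user)
# returned 0.4, calculate_combined(user) would return (0.2 + 0.4) / 2 == 0.3.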

if __name__ == '__main__':
    TAGGER = Tagger("hugo")
    TAG_SET = set()
    TAG_SET.add("computer")
    TAG_SET.add("reference")
    
    TAGGER.add_post("1", TAG_SET)
    TAG_SET.clear()
    
    TAG_SET.add("reference")
    TAG_SET.add("calculator")
    TAG_SET.add("rate")
    
    TAGGER.add_post("2", TAG_SET)
    
    TAG_SET.clear()
    
Example #15
######### hmm-tagger.py #########

from Tagger import Tagger # import the tagging controller
import os # for path info


# initialize a tagging object with the cleaned corpus file(s)
t = Tagger(os.getcwd()+'/', ['text_1.txt', 'text_2.txt', 'text_3.txt'], ['text_5.txt'])

# perform ten-fold cross-validation
t.run_test_cycles()
Example #16
 def setUp(self):
     self.filePath = 'my_tagger.yaml'
     self.tag = Tagger.load(self.filePath)
Example #17
 def test_load_incorrectly(self):
     temporaryFileName = os.path.join(self.tempDir, 'temporary_file.txt')
     with open(temporaryFileName, 'w') as file:
         file.write("This is a line that won't be able to be read")
     with self.assertRaises(TypeError):
         Tagger.load(temporaryFileName)
Example #18
 def test_load_nonexisting(self):
     with self.assertRaises(FileNotFoundError):
         Tagger.load("this_file_definitely_doesnt_exist.txt")
Example #19
 def test_load_file(self):
     tagger = Tagger.load(self.filePath)
     self.assertIsInstance(tagger.myTagger, BrillTagger)
Example #20
 def __init__(self, dataDir, table):
   sys.stderr.write("TaggerHandler: Constructor\n")
   self.__TaggerInstance = Tagger()
   self.__dataDir = dataDir
   self.__tableFile = open(table, 'w', 1)
   self.__tableFile.write('TrainCSType\tTrainPureCSSplit\tTrainSize\tExperimentType\tTestCSType\tTestPureCSSplit\tTestSize\tTagset\tOverallAccuracy\tSameContextAccuracy\tDifferentContextAccuracy\tPrevWordDifferentAccuracy\tPrePrevWordDifferentAccuracy\tUnknowns\n')
Example #21
######### hmm-tagger.py #########

from TreebankCleaner import TreebankCleaner  # import cleaning class
from Tagger import Tagger  # import the tagging controller
import os  # for path info
import sys  # for command line options

if '--clean' in sys.argv:
    # initialize treebank cleaner with the current path and pre-downloaded file(s)
    t = TreebankCleaner(os.getcwd() + '/', ['treebank3_sect2.txt'])
    # do cleaning
    t.clean()

# initialize a tagging object with the cleaned corpus file(s)
t = Tagger(os.getcwd() + '/', ['treebank3_sect2.txt_cleaned'])

# perform ten-fold cross-validation
t.run_test_cycles()
Example #24
 def __init__(self, dataDir, table):
   sys.stderr.write("TaggerHandler: Constructor\n")
   self.__TaggerInstance = Tagger()
   self.__dataDir = dataDir
   self.__tableFile = open(table, 'w')