Example #1
0
def fix_modified_test_xml():
    """Rewrite the <lex> offsets of the 'Test++' files and save them as 'gold++'.

    For every document of the corpus at './data/gold', the file whose path is
    the document name with 'gold' replaced by 'Test++' is read.  Each
    (token, lex) pair produced by the document's tokenizer is matched, in
    order, with the next line that starts with optional whitespace and
    ``<lex``.  On that line:

    * ``begin='<digits>'`` is replaced with ``begin='<lex.begin>'``,
    * ``start='<digits>'`` is removed,
    * ``end='<digits>'`` is replaced with ``end='<lex.end>'``.

    All lines are then written, with carriage returns stripped and a '\\n'
    after each one, to the document name with 'gold' replaced by 'gold++'.

    Raises:
        IndexError: if the file runs out of ``<lex`` lines before the
            tokenizer runs out of tokens.
    """
    # Raw strings: '\s' and '\d' are regex escapes, not string escapes.
    lex_line = re.compile(r'\s*<lex')
    begin_attr = re.compile(r"begin='\d+?'")
    start_attr = re.compile(r"start='\d+?'")
    end_attr = re.compile(r"end='\d+?'")

    train_corpus = Corpus('./data/gold')
    for doc in train_corpus.documents():
        source_path = doc.name.replace('gold', 'Test++')
        with open(source_path, 'r') as fo:
            lines = fo.read().split('\n')

        i = 0  # index of the next candidate line; only ever moves forward
        for s in doc.tokenizer.tokenize_text().sentences:
            for _tok, lex in s.as_pairs():
                # skip ahead to the next <lex ...> line
                while i < len(lines) and not lex_line.match(lines[i]):
                    i += 1
                if i >= len(lines):
                    raise IndexError(
                        "ran out of <lex> lines in {} before all tokens "
                        "were aligned".format(source_path))
                line = begin_attr.sub("begin='{}'".format(lex.begin),
                                      lines[i])
                # NOTE(review): the original replacement was
                # "".format(lex.begin), i.e. an empty string, so the start
                # attribute is deleted.  Kept as-is; confirm that
                # "start='{}'" was not the intended replacement.
                line = start_attr.sub('', line)
                line = end_attr.sub("end='{}'".format(lex.end), line)
                lines[i] = line
                i += 1

        # NOTE(review): writing str to a 'wb' handle only works on Python 2.
        with open(doc.name.replace('gold', 'gold++'), 'wb') as fo:
            for line in lines:
                fo.write(line.replace('\r', ''))  # clean carriage returns
                fo.write('\n')
 def run_demo(self, verbose=0):
     """Load data, train a classifier, classify the test data and evaluate.

     Training extents come from the corpus at ``self.train_path``.  When
     ``self.test_path`` is set, test extents come from a HypotheticalCorpus
     at that path; otherwise the training extents are split at
     ``int(len(extents) * self.split)`` into train and test parts.

     Predictions are scored against the extents of the corpus at
     ``self.gold_path`` when it is set, and against the test extents
     themselves otherwise.  The result of ``clf.evaluate`` is not returned.

     Args:
         verbose: ``>= 1`` prints progress messages; ``>= 2`` additionally
             prints a dict mapping each label to its number of occurrences
             among the extents loaded from ``self.train_path``.
     """
     def extent_key(extent):
         # "<basename>,<begin of first lex>,<end of last lex>"
         return "{a},{b},{c}".format(a=extent.basename,
                                     b=extent.lex[0].begin,
                                     c=extent.lex[-1].end)

     # load training data
     train_corpus = Corpus(self.train_path)
     extents = list(train_corpus.extents(self.indices_function,
                                         self.extent_class))
     # load test data
     if self.test_path:
         train_data = extents
         test_corpus = HypotheticalCorpus(self.test_path)
         test_data = list(test_corpus.extents(self.indices_function,
                                              self.extent_class))
     else:
         i = int(len(extents) * self.split)
         train_data = extents[:i]
         test_data = extents[i:]
     # verbosity functionality
     # (parenthesised single-argument print behaves the same on Python 2)
     if verbose >= 1:
         print("data loaded")
     labels = [self.label_function(x) for x in extents]
     if verbose >= 2:
         fd = {}
         for label in labels:
             # start from 0 so a label seen once is reported as 1
             fd[label] = fd.get(label, 0) + 1
         print(fd)
     # train model
     clf = SKClassifier(LogisticRegression(),
                        self.label_function,
                        self.feature_functions)
     clf.add_labels(set(labels))
     clf.train(train_data)
     if verbose >= 1:
         print("model trained")
     # classify, passing one key per test extent
     pred = clf.classify(test_data,
                         keys=[extent_key(extent) for extent in test_data])
     # evaluate
     if self.gold_path:
         gold_corpus = Corpus(self.gold_path)
         gold_data = list(gold_corpus.extents(self.indices_function,
                                              self.extent_class))
     else:
         gold_data = test_data
     gold_labels = dict((extent_key(extent), self.label_function(extent))
                        for extent in gold_data)
     clf.evaluate(pred, gold_labels)
    def generate_test_train(self):
        """Build the training and test extent lists from the configured paths.

        Extents are read from the corpus at ``self.train_path``.  If
        ``self.test_path`` is set, all of them become the training data and
        the test data is read from a HypotheticalCorpus at that path.
        Otherwise the extents are cut at ``int(len(extents) * self.split)``:
        the first part is the training data, the remainder the test data.

        Returns:
            A ``(train_data, test_data)`` tuple of lists.
        """
        all_extents = list(
            Corpus(self.train_path).extents(self.indices_function,
                                            self.extent_class))

        if not self.test_path:
            # no separate test corpus: carve the test set out of the extents
            cut = int(len(all_extents) * self.split)
            return all_extents[:cut], all_extents[cut:]

        held_out_corpus = HypotheticalCorpus(self.test_path)
        held_out = list(held_out_corpus.extents(self.indices_function,
                                                self.extent_class))
        return all_extents, held_out
    def evaluate_qs_o_link(self):
        """Compare the link labels of the test corpus with those of the gold corpus.

        Link triples are read from the corpora at ``self.test_path`` and
        ``self.gold_path``.  Each triple is keyed by ``link_key`` filled with
        its basename and the 'start'/'end' values of its three tokens
        (trigger, from, to), and mapped to ``self.label_function(triple)``.
        Both label sets are registered with the classifier before evaluation.

        Returns:
            Whatever ``clf.evaluate(test_labels, gold_labels)`` returns.
        """
        def labels_by_key(triples):
            # key is built before the label is computed, as in a
            # (key, label) tuple; later duplicates overwrite earlier ones
            keyed = {}
            for triple in triples:
                key = link_key.format(doc_name=triple.basename,
                                      trigger_start=triple.token[0]['start'],
                                      trigger_end=triple.token[0]['end'],
                                      from_start=triple.token[1]['start'],
                                      from_end=triple.token[1]['end'],
                                      to_start=triple.token[2]['start'],
                                      to_end=triple.token[2]['end'])
                keyed[key] = self.label_function(triple)
            return keyed

        clf = SKClassifier(LogisticRegression(),
                           self.label_function,
                           self.feature_functions)

        # hypothesis side
        hyp_corpus = Corpus(self.test_path)
        hyp_triples = list(hyp_corpus.qs_o_link_triples(self.indices_function,
                                                        self.extent_class))
        test_labels = labels_by_key(hyp_triples)
        clf.add_labels(test_labels.values())

        # reference side
        ref_corpus = Corpus(self.gold_path)
        ref_triples = list(ref_corpus.qs_o_link_triples(self.indices_function,
                                                        self.extent_class))
        gold_labels = labels_by_key(ref_triples)
        clf.add_labels(gold_labels.values())

        return clf.evaluate(test_labels, gold_labels)
    def evaluate(self):
        """Compare the extent labels of the test corpus with those of the gold corpus.

        Extents are read from the corpora at ``self.test_path`` and
        ``self.gold_path``.  Each extent is keyed by
        "<basename>,<begin of first lex>,<end of last lex>" and mapped to
        ``self.label_function(extent)``.  A key present on only one side is
        added to the other side with the string 'False' as its label, so both
        dicts end up with the same key set before evaluation.

        Returns:
            Whatever ``clf.evaluate(test_labels, gold_labels)`` returns.
        """
        def labels_by_key(extents):
            # key is built before the label is computed, as in a
            # (key, label) tuple; later duplicates overwrite earlier ones
            keyed = {}
            for extent in extents:
                key = "{a},{b},{c}".format(a=extent.basename,
                                           b=extent.lex[0].begin,
                                           c=extent.lex[-1].end)
                keyed[key] = self.label_function(extent)
            return keyed

        clf = SKClassifier(LogisticRegression(),
                           self.label_function,
                           self.feature_functions)

        # hypothesis side
        hyp_corpus = Corpus(self.test_path)
        hyp_extents = list(hyp_corpus.extents(self.indices_function,
                                              self.extent_class))
        test_labels = labels_by_key(hyp_extents)
        clf.add_labels(test_labels.values())

        # reference side
        ref_corpus = Corpus(self.gold_path)
        ref_extents = list(ref_corpus.extents(self.indices_function,
                                              self.extent_class))
        gold_labels = labels_by_key(ref_extents)

        # give each side a 'False' entry for keys only the other side has
        for key in test_labels:
            gold_labels.setdefault(key, 'False')
        for key in gold_labels:
            test_labels.setdefault(key, 'False')

        clf.add_labels(test_labels.values())
        clf.add_labels(gold_labels.values())
        return clf.evaluate(test_labels, gold_labels)