def fix_modified_test_xml():
    """Rewrite the <lex> offsets of the 'Test++' files from the gold tokenization.

    For every document of the corpus at './data/gold', the file whose path is
    the document name with 'gold' replaced by 'Test++' is read.  Each
    (token, lex) pair produced by the document's tokenizer is matched, in
    order, with the next line that starts (after optional whitespace) with
    '<lex'.  On that line:

    * ``begin='N'`` is replaced with the lex's ``begin`` value,
    * ``start='N'`` is removed,
    * ``end='N'`` is replaced with the lex's ``end`` value.

    The result is written to the document name with 'gold' replaced by
    'gold++', with carriage returns stripped and every line terminated by
    a newline character.

    Raises:
        IndexError: if the file runs out of '<lex' lines before the tokenizer
            runs out of tokens.
    """
    # Raw-string patterns, compiled once instead of being re-parsed per token.
    lex_line = re.compile(r'\s*<lex')
    begin_attr = re.compile(r"begin='\d+?'")
    start_attr = re.compile(r"start='\d+?'")
    end_attr = re.compile(r"end='\d+?'")

    train_corpus = Corpus('./data/gold')
    for doc in train_corpus.documents():
        with open(doc.name.replace('gold', 'Test++'), 'r') as fo:
            lines = fo.read().split('\n')

        i = 0  # cursor into `lines`; only ever moves forward within a document
        for s in doc.tokenizer.tokenize_text().sentences:
            for _tok, lex in s.as_pairs():
                # skip ahead to the next <lex ...> line
                while not lex_line.match(lines[i]):
                    i += 1
                line = begin_attr.sub("begin='{}'".format(lex.begin), lines[i])
                # The original substituted "".format(lex.begin), i.e. the
                # empty string: the start attribute is dropped, not rewritten.
                line = start_attr.sub('', line)
                line = end_attr.sub("end='{}'".format(lex.end), line)
                lines[i] = line
                i += 1

        # NOTE(review): 'wb' is kept from the original, presumably so the
        # '\n' written below is not translated by the platform -- confirm.
        with open(doc.name.replace('gold', 'gold++'), 'wb') as fo:
            for line in lines:
                fo.write(line.replace('\r', ''))  # clean carriage returns
                fo.write('\n')
def run_demo(self, verbose=0):
    """Load data, train a logistic-regression classifier and evaluate it.

    Training extents are read from ``self.train_path``.  If ``self.test_path``
    is set, all of them are used for training and the test extents are read
    from ``self.test_path``; otherwise the training extents are split at
    ``int(len(extents) * self.split)`` into a training part and a test part.

    Predictions are keyed by "<basename>,<begin of first lex>,<end of last
    lex>" and compared against labels from ``self.gold_path`` if it is set,
    or against the test data's own labels otherwise.

    Args:
        verbose: ``>= 1`` prints progress messages; ``>= 2`` additionally
            prints a dict mapping each label to its number of occurrences.
    """
    def extent_key(extent):
        # identifies an extent by document and character span
        return "{a},{b},{c}".format(a=extent.basename,
                                    b=extent.lex[0].begin,
                                    c=extent.lex[-1].end)

    # load training data
    train_corpus = Corpus(self.train_path)
    extents = list(train_corpus.extents(self.indices_function,
                                        self.extent_class))
    # load test data
    if self.test_path:
        train_data = extents
        test_corpus = HypotheticalCorpus(self.test_path)
        test_data = list(test_corpus.extents(self.indices_function,
                                             self.extent_class))
    else:
        i = int(len(extents) * self.split)
        train_data = extents[:i]
        test_data = extents[i:]

    # verbosity functionality
    if verbose >= 1:
        print("data loaded")
    # labels are collected over ALL loaded training extents (including the
    # held-out part when splitting) so the classifier knows every label
    labels = [self.label_function(x) for x in extents]
    if verbose >= 2:
        fd = {}
        for l in labels:
            # start from 0: the previous default of 1 reported every
            # frequency one too high
            fd[l] = fd.get(l, 0) + 1
        print(fd)

    # train model
    clf = SKClassifier(LogisticRegression(), self.label_function,
                       self.feature_functions)
    clf.add_labels(set(labels))
    clf.train(train_data)
    if verbose >= 1:
        print("model trained")

    # classify
    pred = clf.classify(test_data,
                        keys=[extent_key(extent) for extent in test_data])

    # evaluate
    if self.gold_path:
        gold_corpus = Corpus(self.gold_path)
        gold_data = list(gold_corpus.extents(self.indices_function,
                                             self.extent_class))
    else:
        gold_data = test_data
    gold_labels = dict([(extent_key(extent), self.label_function(extent))
                        for extent in gold_data])
    clf.evaluate(pred, gold_labels)
def generate_test_train(self):
    """Build the training and test instance lists from the configured paths.

    Extents are always loaded from ``self.train_path``.  When
    ``self.test_path`` is set, all of them form the training set and the test
    set is loaded from ``self.test_path``.  Otherwise the loaded extents are
    cut at ``int(len(extents) * self.split)``: the leading part is the
    training set, the remainder the test set.

    Returns:
        A ``(train_data, test_data)`` pair of lists.
    """
    corpus = Corpus(self.train_path)
    all_extents = list(corpus.extents(self.indices_function,
                                      self.extent_class))

    # no separate test corpus: hold out the tail of the training extents
    if not self.test_path:
        cut = int(len(all_extents) * self.split)
        return all_extents[:cut], all_extents[cut:]

    # separate test corpus: train on everything that was loaded
    hyp_corpus = HypotheticalCorpus(self.test_path)
    held_out = list(hyp_corpus.extents(self.indices_function,
                                       self.extent_class))
    return all_extents, held_out
def evaluate_qs_o_link(self):
    """Compare proposed link labels with gold link labels.

    Link triples are read from the corpora at ``self.test_path`` (hypothesis)
    and ``self.gold_path`` (reference).  Each triple is keyed by formatting
    ``link_key`` with the document basename and the 'start'/'end' values of
    its three tokens (trigger, from, to), and mapped to
    ``self.label_function(triple)``.

    Returns:
        Whatever ``clf.evaluate(test_labels, gold_labels)`` returns.
    """
    def keyed_labels(corpus_path):
        # map every link triple found under corpus_path to its label
        corpus = Corpus(corpus_path)
        triples = list(corpus.qs_o_link_triples(self.indices_function,
                                                self.extent_class))
        mapping = {}
        for triple in triples:
            key = link_key.format(doc_name=triple.basename,
                                  trigger_start=triple.token[0]['start'],
                                  trigger_end=triple.token[0]['end'],
                                  from_start=triple.token[1]['start'],
                                  from_end=triple.token[1]['end'],
                                  to_start=triple.token[2]['start'],
                                  to_end=triple.token[2]['end'])
            mapping[key] = self.label_function(triple)
        return mapping

    clf = SKClassifier(LogisticRegression(), self.label_function,
                       self.feature_functions)

    # hyp
    test_labels = keyed_labels(self.test_path)
    clf.add_labels(test_labels.values())

    # ref
    gold_labels = keyed_labels(self.gold_path)
    clf.add_labels(gold_labels.values())

    return clf.evaluate(test_labels, gold_labels)
def evaluate(self):
    """Compare proposed extent labels with gold extent labels.

    Extents are read from the corpora at ``self.test_path`` (hypothesis) and
    ``self.gold_path`` (reference) and keyed by "<basename>,<begin of first
    lex>,<end of last lex>".  A key present on only one side is added to the
    other side with the label ``'False'``, so both mappings cover the same
    keys before they are evaluated.

    Returns:
        Whatever ``clf.evaluate(test_labels, gold_labels)`` returns.
    """
    def keyed_labels(corpus_path):
        # map every extent found under corpus_path to its label
        corpus = Corpus(corpus_path)
        found = list(corpus.extents(self.indices_function, self.extent_class))
        mapping = {}
        for extent in found:
            key = "{a},{b},{c}".format(a=extent.basename,
                                       b=extent.lex[0].begin,
                                       c=extent.lex[-1].end)
            mapping[key] = self.label_function(extent)
        return mapping

    clf = SKClassifier(LogisticRegression(), self.label_function,
                       self.feature_functions)

    # hyp
    test_labels = keyed_labels(self.test_path)
    clf.add_labels(test_labels.values())

    # ref
    gold_labels = keyed_labels(self.gold_path)

    # pad each side with 'False' for keys that only the other side has
    for key in list(test_labels):
        gold_labels.setdefault(key, 'False')
    for key in list(gold_labels):
        test_labels.setdefault(key, 'False')

    clf.add_labels(test_labels.values())
    clf.add_labels(gold_labels.values())
    cm = clf.evaluate(test_labels, gold_labels)
    return cm