class LearningBasedInformationExtractor(TaggerI):
    """Tagger that delegates to an ``IOBTagger`` trained by :meth:`train`.

    ``self.tagger`` is only created inside :meth:`train`, so :meth:`tag`
    must not be called before training.
    """

    def train(self, sentences, model_file='resources/extractor.model'):
        """Train a new ``IOBTagger`` on *sentences* and save it.

        Args:
            sentences: Training data, forwarded unchanged to
                ``IOBTagger.train``.
            model_file: Path handed to ``IOBTagger.save_model`` once
                training has finished.
        """
        # Feature templates are passed verbatim to IOBTagger; their syntax
        # is defined by that class.
        # NOTE(review): the names (tl1/t/tr1, cl2..cr2) suggest a window of
        # neighbouring values from columns 1 and 2 -- confirm against the
        # IOBTagger documentation.
        feature_patterns = [
            '*',
            '*:tl1=%X[-1,1]',
            '*:t=%X[0,1]',
            '*:tr1=%X[1,1]',
            '*:cp=%m[0,2,"..$"]',
            '*:c=%X[0,2]',
            '*:c0l1=%X[-1,2]/%X[0,2]',
            '*:c0r1=%X[0,2]/%X[1,2]',
            '*:cl1=%X[-1,2]',
            '*:cl2=%X[-2,2]',
            '*:cr1=%X[1,2]',
            '*:cr2=%X[2,2]',
        ]

        self.tagger = IOBTagger(patterns=feature_patterns)
        self.tagger.train(sentences)
        self.tagger.save_model(model_file)

    def tag(self, sent):
        """Return whatever the trained ``IOBTagger`` produces for *sent*."""
        return self.tagger.tag(sent)
def train(self, sentences, model_file='resources/extractor.model'):
    """Build, train and persist the underlying ``IOBTagger``.

    Args:
        sentences: Training data, forwarded unchanged to
            ``IOBTagger.train``.
        model_file: Destination passed to ``IOBTagger.save_model`` after
            training completes.
    """
    # The templates are handed to IOBTagger as-is; this method does not
    # interpret them.
    # NOTE(review): they appear to reference columns 1 and 2 of the input
    # at offsets -2..+2 -- confirm against the IOBTagger documentation.
    templates = [
        '*',
        '*:tl1=%X[-1,1]',
        '*:t=%X[0,1]',
        '*:tr1=%X[1,1]',
        '*:cp=%m[0,2,"..$"]',
        '*:c=%X[0,2]',
        '*:c0l1=%X[-1,2]/%X[0,2]',
        '*:c0r1=%X[0,2]/%X[1,2]',
        '*:cl1=%X[-1,2]',
        '*:cl2=%X[-2,2]',
        '*:cr1=%X[1,2]',
        '*:cr2=%X[2,2]',
    ]

    # Assign before training so the attribute exists even if training fails,
    # exactly as in the original statement order.
    self.tagger = IOBTagger(patterns=templates)
    self.tagger.train(sentences)
    self.tagger.save_model(model_file)
# Compare two rule-based extraction outputs (dependency-based and
# chunk-based) with the gold labels, then evaluate a stored IOBTagger model
# on the same gold data.
# NOTE(review): `gold`, `chunk_trees`, `dep_trees`, `chunk_extractor`,
# `dep_extractor`, `info2iob`, `accuracy` and `IOBTagger` are not defined in
# this fragment and must be provided by the surrounding code.

# Every gold token is unpacked as a 4-tuple; only its first and last fields
# are used here (presumably word, tag, chunk, label -- confirm).
sentences = [[word for word, _t, _c, _l in gold_sent] for gold_sent in gold]
evaluation_sents = []

# NOTE(review): the lines that used to build the trees are disabled, so
# chunk_trees / dep_trees have to come from somewhere upstream -- confirm.
#tokens = tagger.tag_sents(sentences)
#chunk_trees = list(chunker.parse_sents(tokens))
#dep_trees = parser.parse_sents(sentences)

dep_tagged_sents = []
chunk_tagged_sents = []
for number, gold_sent in enumerate(gold):
    sentence = ' '.join(sentences[number])
    chunk_tree = chunk_trees[number]
    dep_tree = dep_trees[number]

    chunk_informations = list(chunk_extractor.extract(chunk_tree))
    dep_informations = list(dep_extractor.extract(dep_tree))

    evaluation_sent = [(word, label) for word, _t, _c, label in gold_sent]
    # Iterate info2iob's result directly; the former identity comprehension
    # around it only made a throwaway copy.
    dep_tagged_sent = [
        (word, label)
        for word, _t, _c, label in info2iob(sentence, chunk_tree, dep_informations)
    ]
    chunk_tagged_sent = [
        (word, label)
        for word, _t, _c, label in info2iob(sentence, chunk_tree, chunk_informations)
    ]

    # Only sentences whose dependency-tagged length matches the gold length
    # are scored. The chunk-tagged length is not checked separately
    # (presumably identical because both use the same chunk_tree -- confirm).
    if len(evaluation_sent) == len(dep_tagged_sent):
        evaluation_sents.append(evaluation_sent)
        dep_tagged_sents.append(dep_tagged_sent)
        chunk_tagged_sents.append(chunk_tagged_sent)
    else:
        # NOTE(review): original line breaks were lost; the blank-line print
        # is assumed to belong to this branch.
        print(chunk_tagged_sent)
        print()

# Flatten in linear time. `sum(list_of_lists, [])` copies the accumulated
# list on every addition (quadratic), and the gold list was flattened twice.
gold_pairs = [pair for sent in evaluation_sents for pair in sent]
dep_pairs = [pair for sent in dep_tagged_sents for pair in sent]
chunk_pairs = [pair for sent in chunk_tagged_sents for pair in sent]

print('dependency accuracy: %f' % (accuracy(gold_pairs, dep_pairs)))
print('chunk accuracy: %f' % (accuracy(gold_pairs, chunk_pairs)))

information_tagger = IOBTagger(model='informations-all.model')
print(information_tagger.evaluate(gold))
#dep_trees = parser.parse_sents(sentences)
# NOTE(review): with the line above disabled, `dep_trees` (and likewise
# `chunk_trees`, `sentences`, `evaluation_sents`, `gold`, the two extractors,
# `info2iob` and `accuracy`) must already exist when this fragment runs --
# none of them is defined here.

# Score dependency-based and chunk-based extraction against the gold labels,
# then evaluate a stored IOBTagger model on the same gold data.
dep_tagged_sents = []
chunk_tagged_sents = []
for number, gold_sent in enumerate(gold):
    sentence = ' '.join(sentences[number])
    chunk_tree = chunk_trees[number]
    dep_tree = dep_trees[number]

    chunk_informations = list(chunk_extractor.extract(chunk_tree))
    dep_informations = list(dep_extractor.extract(dep_tree))

    # Tokens are unpacked as 4-tuples; only the first and last fields are
    # kept for scoring.
    evaluation_sent = [(first, last) for first, _t, _c, last in gold_sent]
    # The identity comprehension that used to wrap info2iob(...) was a
    # needless copy and has been dropped.
    dep_tagged_sent = [
        (first, last)
        for first, _t, _c, last in info2iob(sentence, chunk_tree, dep_informations)
    ]
    chunk_tagged_sent = [
        (first, last)
        for first, _t, _c, last in info2iob(sentence, chunk_tree, chunk_informations)
    ]

    if len(evaluation_sent) != len(dep_tagged_sent):
        # Length mismatch: leave the sentence out of scoring and print the
        # chunk-based tagging.
        # NOTE(review): original line breaks were lost; the blank-line print
        # is assumed to belong to this branch.
        print(chunk_tagged_sent)
        print()
    else:
        evaluation_sents.append(evaluation_sent)
        dep_tagged_sents.append(dep_tagged_sent)
        chunk_tagged_sents.append(chunk_tagged_sent)

# Linear-time flattening replaces `sum(list_of_lists, [])`, which re-copies
# the growing result on every step; the gold side is flattened only once.
flat_gold = [pair for sent in evaluation_sents for pair in sent]
flat_dep = [pair for sent in dep_tagged_sents for pair in sent]
flat_chunk = [pair for sent in chunk_tagged_sents for pair in sent]

print('dependency accuracy: %f' % (accuracy(flat_gold, flat_dep)))
print('chunk accuracy: %f' % (accuracy(flat_gold, flat_chunk)))

information_tagger = IOBTagger(model='informations-all.model')
print(information_tagger.evaluate(gold))