def __init__(self, chunked_sents, **kwargs):
    """Train a classifier-based chunk tagger from chunked sentences.

    chunked_sents: iterable of nltk chunk trees used as training data.
    kwargs are forwarded to ClassifierBasedTagger.
    """
    # Convert each tree to IOB triplets: [(word, pos, chunk), ...]
    chunked_sents = [tree2conlltags(sent) for sent in chunked_sents]
    # Repackage triplets as pairs for the tagger interface:
    # [((word, pos), chunk), ...]
    chunked_sents = [
        [((word, pos), chunk) for (word, pos, chunk) in sent]
        for sent in chunked_sents
    ]
    # Expose the feature detector on the instance. This line was commented
    # out in the original, but sibling implementations in this file set it;
    # restoring it is a backward-compatible consistency fix.
    self.feature_detector = features
    # Train the tagger on the converted data.
    self.tagger = ClassifierBasedTagger(
        train=chunked_sents, feature_detector=features, **kwargs
    )
def __init__(self, chunked_sents, **kwargs):
    """Build and train the underlying classifier-based tagger."""
    assert isinstance(chunked_sents, Iterable)
    # Trees -> IOB triplets -> ((word, pos), chunk) pairs, which is the
    # training shape ClassifierBasedTagger expects.
    training_data = [
        [((w, p), c) for (w, p, c) in tree2conlltags(tree)]
        for tree in chunked_sents
    ]
    self.feature_detector = features
    self.tagger = ClassifierBasedTagger(
        train=training_data, feature_detector=features, **kwargs
    )
def __init__(self, chunked_sents, **kwargs):
    """Train the chunker's tagger on tree-formatted chunked sentences."""
    # Each nltk tree becomes IOB triplets, then ((word, pos), chunk)
    # pairs so the result is compatible with the tagger interface.
    train_pairs = []
    for tree in chunked_sents:
        iob = tree2conlltags(tree)
        train_pairs.append([((w, p), c) for (w, p, c) in iob])
    self.feature_detector = features
    self.tagger = ClassifierBasedTagger(
        train=train_pairs, feature_detector=features, **kwargs
    )
def __init__(self, chunked_sents, **kwargs):
    """Fit a ClassifierBasedTagger on IOB-converted chunk trees."""
    assert isinstance(chunked_sents, Iterable)

    def to_tagged_pairs(iob_sentence):
        # ((word, pos), chunk) is the pair shape the tagger trains on.
        return [((token, tag), label) for token, tag, label in iob_sentence]

    # Trees -> IOB triplets -> tagger-compatible pairs, in one pass.
    training = [to_tagged_pairs(tree2conlltags(tree)) for tree in chunked_sents]
    self.feature_detector = features
    self.tagger = ClassifierBasedTagger(
        train=training, feature_detector=features, **kwargs
    )
class ClassifierChunkParser(ChunkParserI):
    """Chunk parser backed by an NLTK ClassifierBasedTagger."""

    def __init__(self, chunked_sents, **kwargs):
        """Train the tagger from chunked sentences (nltk trees)."""
        assert isinstance(chunked_sents, Iterable)
        # Trees -> IOB triplets [(word, pos, chunk), ...], repackaged as
        # [((word, pos), chunk), ...] for the tagger's training API.
        training = [
            [((w, p), c) for (w, p, c) in tree2conlltags(tree)]
            for tree in chunked_sents
        ]
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=training, feature_detector=features, **kwargs
        )

    def parse(self, tagged_sent):
        """Chunk a POS-tagged sentence and return an nltk.Tree."""
        tagged_chunks = self.tagger.tag(tagged_sent)
        # [((w, t), iob), ...] -> [(w, t, iob), ...] -> nltk.Tree
        triplets = [(word, tag, iob) for ((word, tag), iob) in tagged_chunks]
        return conlltags2tree(triplets)
class ClassifierChunkParser(ChunkParserI):
    """ChunkParserI implementation built on a classifier-based tagger."""

    def __init__(self, chunked_sents, **kwargs):
        """Convert chunk trees to tagger pairs and train the tagger."""
        assert isinstance(chunked_sents, Iterable)
        corpus = []
        for tree in chunked_sents:
            # tree -> [(word, pos, chunk), ...] -> [((word, pos), chunk), ...]
            iob = tree2conlltags(tree)
            corpus.append([((w, p), c) for (w, p, c) in iob])
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=corpus, feature_detector=features, **kwargs
        )

    def parse(self, tagged_sent):
        """Tag the sentence and rebuild an nltk.Tree from the IOB output."""
        tagged = self.tagger.tag(tagged_sent)
        triplets = [(w, t, c) for ((w, t), c) in tagged]
        return conlltags2tree(triplets)
class ClassifierChunkParser(ChunkParserI):
    """Classifier-based chunk parser (ChunkParserI implementation)."""

    # Constructor
    def __init__(self, chunked_sents, **kwargs):
        """Train the underlying tagger on chunked sentences (nltk trees)."""
        # Converts the sentences to IOB form: [(word, pos, chunk), ...]
        chunked_sents = [tree2conlltags(sent) for sent in chunked_sents]
        # Convert from triplets to pairs: [((word, pos), chunk), ...]
        chunked_sents = [
            [((word, pos), chunk) for (word, pos, chunk) in sent]
            for sent in chunked_sents
        ]
        # Expose the feature detector (was commented out in the original;
        # the sibling implementations in this file all set it).
        self.feature_detector = features
        # Init the tagger.
        self.tagger = ClassifierBasedTagger(
            train=chunked_sents, feature_detector=features, **kwargs
        )

    # Parses a tagged sentence and returns the chunks as an nltk.Tree.
    def parse(self, tagged_sent):
        """Parse a POS-tagged sentence into an nltk.Tree of chunks.

        BUG FIX: the original returned the raw IOB triplet list even though
        its own comment said "convert to tree format" and the
        ChunkParserI.parse contract (and the other parsers in this file)
        return an nltk.Tree; convert via conlltags2tree.
        """
        chunks = self.tagger.tag(tagged_sent)  # [((w, t), iob), ...]
        # Convert from pairs back to triplets.
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        # Convert to tree format.
        return conlltags2tree(iob_triplets)
def train_merger(self, train_file_path, test_split=0.1):
    """Train the token-merger chunker from a CoNLL-format training file.

    train_file_path: path to a UTF-8 file of blank-line-separated CoNLL
        sentence blocks.
    test_split: currently unused — the train/test split was disabled
        upstream, so all data is used for training and no evaluation runs.
    Returns the trained ClassifierBasedTagger.
    """
    print("Loading Data...")
    # BUG FIX: the original opened the file and never closed it; the
    # context manager guarantees the handle is released.
    with open(train_file_path, "r", encoding="utf-8") as train_file:
        blocks = train_file.read().split("\n\n")

    data_list = []
    for block in blocks:
        tree = nltk.chunk.util.conllstr2tree(
            block, chunk_types=("NP",), root_label="S"
        )
        if len(tree) > 0:  # skip empty blocks (e.g. trailing blank lines)
            data_list.append(tree)

    # NOTE(review): a train_test_split honouring test_split was disabled
    # upstream; everything goes into training and test_sents stays empty.
    train_sents = data_list
    test_sents = []

    print("Training the model ...")
    # Transform the trees into IOB annotated sentences [(word, pos, chunk), ...]
    chunked_sents = [tree2conlltags(sent) for sent in train_sents]

    # Transform the triplets into pairs, compatible with the tagger
    # interface: [((word, pos), chunk), ...]
    def triplets2tagged_pairs(iob_sent):
        return [((word, pos), chunk) for word, pos, chunk in iob_sent]

    chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

    self.feature_detector = self.features
    self.tagger = ClassifierBasedTagger(
        train=chunked_sents, feature_detector=self.features
    )
    token_merger_model = self.tagger

    # test_sents is always empty while the split is disabled, so this
    # evaluation branch is currently dormant.
    if len(test_sents) > 0:
        print("evaluating...")
        print(token_merger_model.evaluate(test_sents))

    return token_merger_model
class FooChunkParser(ChunkParserI):
    """Chunk parser that delegates to a classifier-based sequence tagger."""

    def __init__(self, chunked_sents, **kwargs):
        """Train the tagger on tree-formatted chunked sentences."""
        # Flatten each chunk tree to IOB triplets [(word, pos, chunk)],
        # then repackage as ((word, pos), chunk) pairs — the shape the
        # tagger interface trains on.
        training_corpus = []
        for tree in chunked_sents:
            iob = tree2conlltags(tree)
            training_corpus.append([((w, p), c) for (w, p, c) in iob])
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=training_corpus, feature_detector=features, **kwargs
        )

    def parse(self, tagged_sent):
        """Chunk a POS-tagged sentence; return the result as an nltk.Tree."""
        tagged = self.tagger.tag(tagged_sent)
        # [((w, t), iob), ...] back to triplets, then to tree format.
        triplets = [(w, t, c) for ((w, t), c) in tagged]
        return conlltags2tree(triplets)