class SequenceTagger(TaggerI):
    """Wrapper for the [Wapiti](http://wapiti.limsi.fr) sequence tagger.

    >>> tagger = SequenceTagger(patterns=['*', 'u:word-%x[0,0]'])
    >>> tagger.train([[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]])
    >>> tagger.tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']])
    [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]

    >>> tagger.save_model('resources/test.model')
    >>> SequenceTagger(model='resources/test.model').tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']])
    [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]
    """

    def __init__(self, patterns=None, **options):
        """Build the underlying ``wapiti.Model``.

        Args:
            patterns: Iterable of pattern lines; they are joined with newlines
                and passed to the model as ``patterns``. ``None`` (the
                default) means no patterns.
            **options: Forwarded unchanged to ``wapiti.Model`` (the class
                doctest uses ``model=<path>`` to load a saved model).
        """
        # Imported lazily so the module can be imported without wapiti installed.
        from wapiti import Model

        # A None sentinel replaces the former shared mutable default ``[]``.
        if patterns is None:
            patterns = ()
        self.model = Model(patterns='\n'.join(patterns), **options)

    def train(self, sentences):
        """Train the model on tagged sentences.

        Args:
            sentences: Iterable of sentences, each an iterable of tuples of
                strings such as ``(word, tag)``. Every tuple becomes one
                space-separated line and every sentence one newline-joined
                string.
        """
        self.model.train([
            '\n'.join([' '.join(word) for word in sentence])
            for sentence in sentences
        ])

    def tag(self, tokens):
        """Tag a single sentence and return its list of ``(word, tag)`` pairs."""
        return self.tag_sents([tokens])[0]

    def save_model(self, filename):
        """Save the underlying model to ``filename``."""
        self.model.save(filename)

    def tag_sents(self, sentences):
        """Tag several sentences with a single call to the model.

        Args:
            sentences: Iterable of sentences, each a sequence of word strings.

        Returns:
            A list with one list of ``(word, tag)`` tuples per input sentence.
            The original words are returned, not the underscore-substituted
            text that is sent to the model.
        """
        # Materialise first: the sentences are traversed twice below.
        sentences = list(sentences)

        # One word per line, a blank line between sentences. Spaces are
        # replaced with underscores because train() uses the space as the
        # separator between the columns of a line.
        lines = '\n\n'.join(['\n'.join(sentence) for sentence in sentences]).replace(' ', '_')

        # The model's output is decoded from UTF-8 before being split into tags.
        results = self.model.label_sequence(lines).decode('utf8')

        # Tags are consumed in order, one per word, across all sentences.
        tags = iter(results.strip().split('\n'))
        return [[(word, next(tags)) for word in sentence] for sentence in sentences]
class SequenceTagger(TaggerI):
    """Wrapper for the [Wapiti](http://wapiti.limsi.fr) sequence tagger.

    >>> tagger = SequenceTagger(patterns=['*', 'u:word-%x[0,0]'])
    >>> tagger.train([[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]])
    >>> tagger.tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']])
    [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]

    >>> tagger.save_model('resources/test.model')
    >>> SequenceTagger(model='resources/test.model').tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']])
    [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]
    """

    def __init__(self, patterns=None, **options):
        """Build the underlying ``wapiti.Model``.

        Args:
            patterns: Iterable of pattern lines; they are joined with newlines
                and passed to the model as ``patterns``. ``None`` (the
                default) means no patterns.
            **options: Forwarded unchanged to ``wapiti.Model`` (the class
                doctest uses ``model=<path>`` to load a saved model).
        """
        # Imported lazily so the module can be imported without wapiti installed.
        from wapiti import Model

        # A None sentinel replaces the former shared mutable default ``[]``.
        if patterns is None:
            patterns = ()
        self.model = Model(patterns='\n'.join(patterns), **options)

    def train(self, sentences):
        """Train the model on tagged sentences.

        Args:
            sentences: Iterable of sentences, each an iterable of tuples of
                strings such as ``(word, tag)``. Every tuple becomes one
                space-separated line and every sentence one newline-joined
                string.
        """
        self.model.train([
            '\n'.join([' '.join(word) for word in sentence])
            for sentence in sentences
        ])

    def tag(self, tokens):
        """Tag a single sentence and return its list of ``(word, tag)`` pairs.

        NOTE(review): TaggerI is presumably NLTK's tagger interface, which
        expects subclasses to provide ``tag``; without it single-sentence
        tagging is unavailable. Confirm against the imported TaggerI.
        """
        return self.tag_sents([tokens])[0]

    def save_model(self, filename):
        """Save the underlying model to ``filename``."""
        self.model.save(filename)

    def tag_sents(self, sentences):
        """Tag several sentences with a single call to the model.

        Args:
            sentences: Iterable of sentences, each a sequence of word strings.

        Returns:
            A list with one list of ``(word, tag)`` tuples per input sentence.
            The original words are returned, not the underscore-substituted
            text that is sent to the model.
        """
        # Materialise first: the sentences are traversed twice below.
        sentences = list(sentences)

        # One word per line, a blank line between sentences. Spaces are
        # replaced with underscores because train() uses the space as the
        # separator between the columns of a line.
        lines = '\n\n'.join(['\n'.join(sentence) for sentence in sentences]).replace(' ', '_')

        # The model's output is decoded from UTF-8 before being split into tags.
        results = self.model.label_sequence(lines).decode('utf8')

        # Tags are consumed in order, one per word, across all sentences.
        tags = iter(results.strip().split('\n'))
        return [[(word, next(tags)) for word in sentence] for sentence in sentences]
class WapitiPOSTagger(TaggerI):
    """Part-of-speech tagger that delegates labelling to a Wapiti ``Model``."""

    def __init__(self, *args, **kwargs):
        """Create the model from the ``model`` and ``pattern`` keyword options.

        Positional arguments are accepted but not used. Missing options fall
        back to ``resources/model.txt`` and ``resources/pattern.txt``; any
        other keyword argument is ignored.
        """
        kwargs.setdefault('model', 'resources/model.txt')
        kwargs.setdefault('pattern', 'resources/pattern.txt')
        super(WapitiPOSTagger, self).__init__()
        # Only these two options are handed to the model.
        self.model = Model(pattern=kwargs['pattern'], model=kwargs['model'])

    def _pair_with_labels(self, words):
        """Label ``words`` (one per line) and zip them with the split output."""
        labelled = self.model.label_sequence('\n'.join(words))
        # zip() stops at the shorter input, so surplus entries produced by
        # the split are dropped.
        return zip(words, labelled.split('\n'))

    def tag_sents(self, sents):
        """Lazily yield, for each sentence in ``sents``, its (word, tag) zip."""
        for sentence in sents:
            yield self._pair_with_labels(sentence)

    def tag(self, sent):
        """Return the zip of the words of ``sent`` with their predicted tags."""
        return self._pair_with_labels(sent)
class POSTagger():
    """POS tagger front end with a Stanford backend and a Wapiti backend.

    Tokens made up solely of ASCII Latin letters are always tagged ``FW``,
    regardless of what the backend predicted.
    """

    def __init__(self, stanford_postagger_model=None, wapiti_postagger_model=None, jar_tagger_path=None,
                 jdk_variable_path="C:/Program Files/Java/jdk1.8.0_121/bin/java.exe", tagging_model="wapiti"):
        """Resolve resource paths and instantiate the selected backend.

        Args:
            stanford_postagger_model: Path of the Stanford model; defaults to
                a file under ``resource/postagger`` next to this module.
            wapiti_postagger_model: Path of the Wapiti model; same default
                location.
            jar_tagger_path: Path of the Stanford tagger jar; same default
                location.
            jdk_variable_path: Value stored in the ``JAVAHOME`` environment
                variable when the Stanford backend is used.
            tagging_model: ``"wapiti"`` or ``"stanford"``. Ignored on
                Windows, where ``"stanford"`` is always selected.
        """
        import platform

        # Windows always gets the Stanford backend, whatever was requested.
        self.tagging_model = "stanford" if platform.system() == "Windows" else tagging_model

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"

        # Each resource falls back to a bundled default when not supplied.
        self.stanford_postagger_model = (
            self.dir_path + "resource/postagger/NC_model"
            if stanford_postagger_model is None
            else stanford_postagger_model
        )
        self.jar_tagger_path = (
            self.dir_path + 'resource/postagger/stanford-postagger.jar'
            if jar_tagger_path is None
            else jar_tagger_path
        )
        self.wapiti_postagger_model = (
            self.dir_path + "resource/postagger/UPC_full_model_wapiti"
            if wapiti_postagger_model is None
            else wapiti_postagger_model
        )

        # No tagger attribute is created for any other tagging_model value.
        if self.tagging_model == "stanford":
            os.environ['JAVAHOME'] = jdk_variable_path
            self.tagger = StanfordPOSTagger(
                model_filename=self.stanford_postagger_model,
                path_to_jar=self.jar_tagger_path,
                encoding='utf-8',
                java_options='-mx5000m',
            )
        elif self.tagging_model == "wapiti":
            from wapiti import Model
            self.tagger = Model(model=self.wapiti_postagger_model)

    def is_all_latin(self, word):
        """Return True when ``word`` holds nothing but ASCII Latin letters.

        The empty string also yields True, since nothing is left after the
        letters are removed.
        """
        return len(re.sub('[a-zA-Z]*', '', word)) == 0

    def parse(self, token_list):
        """Tag ``token_list`` and return a list of ``(token, tag)`` tuples.

        An unrecognised ``tagging_model`` produces an empty list.
        """
        tagged = []

        if self.tagging_model == "stanford":
            for element in self.tagger.tag(token_list):
                # Each element is joined with '_' and trimmed, then split at
                # its last '/' into token text and tag.
                # NOTE(review): presumably elements look like ('', 'word/TAG');
                # confirm against the Stanford tagger's output.
                joined = '_'.join(element).strip("_")
                head, _, label = joined.rpartition('/')
                token = head.strip('/')
                tagged.append((token, "FW" if self.is_all_latin(token) else label))

        elif self.tagging_model == "wapiti":
            # One token per line in; one label per line out, UTF-8 encoded.
            raw = self.tagger.label_sequence("\n".join(token_list)).decode('utf-8')
            labels = raw.strip().split('\n')
            for idx, token in enumerate(token_list):
                tagged.append((token, u"FW" if self.is_all_latin(token) else labels[idx]))

        return tagged