def lemmatize(file, output_file):
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # Write one "lemma___tag" pair per token, space-separated,
                # one sentence per output line.
                out.write(" ".join(
                    str(x.lemma).strip() + '___' + str(x.tag).strip()
                    for x in lemmas))
                out.write('\n')
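# Hedged usage sketch, not part of the original code: the file names below are
# placeholders, and the model referenced above is assumed to be present in
# `dir_cur` next to the script.
if __name__ == '__main__':
    lemmatize('corpus.txt.gz', 'corpus.lemmas.gz')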
def __init__(self):
    self.morphodita_model = os.path.join(dir_cur,
                                         'czech-morfflex-131112.tagger-fast')
    self.tagger = Tagger.load(self.morphodita_model)
    self.forms = Forms()
    self.lemmas = TaggedLemmas()
    self.tokens = TokenRanges()
    self.tokenizer = self.tagger.newTokenizer()
def __init__(self, tagger_model):
    if not os.path.isfile(tagger_model):
        raise IOError('File %s does not exist' % tagger_model)
    self._tagger = Tagger.load(tagger_model)
    self._tokenizer = self._tagger.newTokenizer()
    self._forms_buf = Forms()
    self._tokens_buf = TokenRanges()
    self._tags_buf = TaggedLemmas()
def __init__(self, derinet_file_name, morfflex_file_name, morpho_file_name):
    logger.info("Loading derivations.")
    derinet_db = DeriNetDatabase(derinet_file_name)
    logger.info("Derivations loaded.")

    if morfflex_file_name is not None:
        logger.info("Loading inflections.")
        db = MorfFlexDatabase(morfflex_file_name, derinet_db)
        logger.info("Inflections loaded.")
    else:
        logger.info("Not loading inflections.")
        db = derinet_db

    logger.info("Detecting stem bounds.")
    for node in db.iter():
        node.detect_stems()
    logger.info("Stem bounds detected.")

    logger.info("Propagating morph bounds.")
    for root in db.iter_trees():
        root.propagate_morph_bounds()
    logger.info("Morph bounds propagated.")

    lemmas = []
    tagger = None
    if morpho_file_name is not None:
        logger.info("Loading morphology")
        if morphodita_available:
            tagger = Tagger.load(morpho_file_name)
        else:
            logger.error("You need to install the MorphoDiTa Python bindings!")
        if not tagger:
            logger.critical("Cannot load morphological dictionary from file '%s'.",
                            morpho_file_name)
            sys.exit(1)
        lemmas = TaggedLemmas()
        logger.info("Morphology loaded.")
    else:
        logger.info("No morphological dictionary specified. "
                    "Inflectional morphology will not be available.")
        tagger = None

    self.db = db
    self.tagger = tagger
    self.lemmas = lemmas
def __init__(self, model_file):
    """
    Instantiates Morphodita from a provided model file.

    :param model_file: Path to the model file.
    :type model_file: str
    """
    from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges

    self.tagger = Tagger.load(model_file)
    self.forms = Forms()
    self.lemmas = TaggedLemmas()
    self.tokens = TokenRanges()
    self.tokenizer = self.tagger.newTokenizer()
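# Hedged usage sketch: assumes the enclosing class is named `Morphodita` (as
# its docstring suggests) and that a MorphoDiTa tagger model exists at the
# placeholder path below.
morpho = Morphodita('czech-morfflex-pdt-161115.tagger')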
def __init__(self, tagger_model, abst_slots):
    self._tagger = Tagger.load(tagger_model)
    self._analyzer = self._tagger.getMorpho()
    self._tokenizer = self._tagger.newTokenizer()
    self._abst_slots = set(abst_slots.split(','))
    self._forms_buf = Forms()
    self._tokens_buf = TokenRanges()
    self._analyses_buf = Analyses()
    self._indices_buf = Indices()
    self._sf_dict = {}
    self._sf_max_len = 0
def load_tagger(self, path: str):
    """
    Load the MorphoDiTa tagger from the given path.

    :param path: path to the tagger model file
    :return: None
    """
    self.tagger = Tagger.load(path)
    if self.tagger is None:
        raise Exception("[morpho_tagger] Wrong path in tagger")
    # create tokenizer
    self.tokenizer = self.tagger.newTokenizer()
    if self.tokenizer is None:
        raise Exception("[morpho_tagger] tokenizer not created")
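# Hedged usage sketch: `MorphoTaggerWrapper` is a hypothetical name for the
# class that owns `load_tagger`, and the model path is a placeholder.
wrapper = MorphoTaggerWrapper()
wrapper.load_tagger('czech-morfflex-pdt-161115.tagger')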
def __init__(self, tagger_model, abst_slots):
    self._tagger = Tagger.load(tagger_model)
    self._analyzer = self._tagger.getMorpho()
    self._tokenizer = self._tagger.newTokenizer()
    self._abst_slots = set(abst_slots.split(','))
    self._forms_buf = Forms()
    self._tokens_buf = TokenRanges()
    self._analyses_buf = Analyses()
    self._indices_buf = Indices()
    self._sf_dict = {}
    self._rev_sf_dict = {}
    self._sf_max_len = 0
def lemmatize_and_replace_entities(file, output_file):
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur, 'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []
                open_entities_type = []
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    token = tokens[i]
                    word = line[token.start:token.start + token.length]
                    # Open all entities that start at token i; remember the
                    # index of their last token and their type.
                    while e < len(sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(sorted_entities[e].start
                                             + sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if len(open_entities) == 0:
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        # Replace tokens inside named entities with a placeholder
                        # carrying the entity type(s).
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    # Close entities that end at token i.
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
                out.write('\n')
def __init__(self, tagger_model):
    self.__tagger = Tagger.load(tagger_model)
    self.__tokenizer = self.__tagger.newTokenizer()
    self.__forms_buf = Forms()
    self.__tokens_buf = TokenRanges()
    self.__lemmas_buf = TaggedLemmas()
def load(self):
    self.tagger = Tagger.load("app/LM/czech-morfflex-pdt-161115.tagger")
    self.tokenizer = self.tagger.newTokenizer()
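# Hedged sketch, not from the original source: one typical way the loaded
# tagger and tokenizer could be driven with the standard MorphoDiTa API.
# The method name `tag_text` and its local buffer variables are assumptions.
def tag_text(self, text):
    from ufal.morphodita import Forms, TaggedLemmas, TokenRanges
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tagged = []
    self.tokenizer.setText(text)
    while self.tokenizer.nextSentence(forms, tokens):
        self.tagger.tag(forms, lemmas)
        # Collect (lemma, tag) pairs for each sentence.
        tagged.append([(l.lemma, l.tag) for l in lemmas])
    return tagged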