def lemmatize(file, output_file):
    """Lemmatize a (possibly gzipped) text file with MorphoDiTa and write
    space-separated "lemma___tag" tokens, one sentence per line."""
    # assumes: from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                out.write(' '.join(
                    '%s___%s' % (l.lemma.strip(), l.tag.strip()) for l in lemmas))
                out.write('\n')
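# A minimal usage sketch (an assumption, not part of the original script): lemmatize()
# expects the MorphoDiTa tagger model to sit next to this module (dir_cur) and relies
# on the project's open_gz() helper for transparent gzip I/O. With hypothetical file
# names:
#
#     lemmatize('reviews.txt.gz', 'reviews.lemmatized.txt.gz')
#
# the output file then contains space-separated "lemma___tag" tokens, one sentence per line.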
def __init__(self):
    self.morphodita_model = os.path.join(dir_cur, 'czech-morfflex-131112.tagger-fast')
    self.tagger = Tagger.load(self.morphodita_model)
    self.forms = Forms()
    self.lemmas = TaggedLemmas()
    self.tokens = TokenRanges()
    self.tokenizer = self.tagger.newTokenizer()
def __init__(self, tagger_model):
    if not os.path.isfile(tagger_model):
        raise IOError('File %s does not exist' % tagger_model)
    self._tagger = Tagger.load(tagger_model)
    self._tokenizer = self._tagger.newTokenizer()
    self._forms_buf = Forms()
    self._tokens_buf = TokenRanges()
    self._tags_buf = TaggedLemmas()
def __init__(self, tagger_model, abst_slots):
    self._tagger = Tagger.load(tagger_model)
    self._analyzer = self._tagger.getMorpho()
    self._tokenizer = self._tagger.newTokenizer()
    self._abst_slots = set(abst_slots.split(','))
    self._forms_buf = Forms()
    self._tokens_buf = TokenRanges()
    self._analyses_buf = Analyses()
    self._indices_buf = Indices()
    self._sf_dict = {}
    self._sf_max_len = 0
def __init__(self, model_file):
    """
    Instantiates Morphodita from a provided model file.

    :param model_file: Path to the model file.
    :type model_file: str
    """
    from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
    self.tagger = Tagger.load(model_file)
    self.forms = Forms()
    self.lemmas = TaggedLemmas()
    self.tokens = TokenRanges()
    self.tokenizer = self.tagger.newTokenizer()
def create_lemmas(self, text):
    """Tokenize and tag ``text``, returning a list of Lemma(lemma, tag, form) objects."""
    _forms = Forms()
    _lemmas = TaggedLemmas()
    _tokens = TokenRanges()
    self.tokenizer.setText(text)
    lemmas = []
    while self.tokenizer.nextSentence(_forms, _tokens):
        self.tagger.tag(_forms, _lemmas)
        for i in range(len(_lemmas)):
            lemma = _lemmas[i]
            form = _forms[i]
            lemmas.append(Lemma(lemma.lemma, lemma.tag, form))
    return lemmas
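# Usage sketch for the wrapper holding the two methods above. The class name
# "Morphodita" follows the docstring, and the Lemma attribute names follow its
# constructor arguments; the class name, attribute names and the model path are
# all assumptions here.
def _demo_create_lemmas(model_file='czech-morfflex-pdt-161115.tagger'):
    morpho = Morphodita(model_file)
    for lemma in morpho.create_lemmas('Ahoj, jak se máš?'):
        print(lemma.lemma, lemma.tag)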
def lemmatize_and_replace_entities(file, output_file):
    """Lemmatize a (possibly gzipped) text file and replace recognized named
    entities with "@!ENT!<types>" placeholders."""
    # assumes: from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
    #          from ufal.nametag import Ner, NamedEntities
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur, 'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []        # end token indices of currently open entities (stack)
                open_entities_type = []   # their entity types
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    # open every entity starting at token i
                    while e < len(sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(
                            sorted_entities[e].start + sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if not open_entities:
                        out.write(lemma.lemma + ' ')
                    else:
                        out.write('@!ENT!%s ' % '!'.join(open_entities_type))
                    # close entities ending at token i
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
            out.write('\n')
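# sort_entities() is a project helper that is not shown above. A minimal sketch of
# the ordering the stack-based loop appears to assume: entities sorted by their
# starting token, with longer (enclosing) entities before shorter nested ones, so
# that inner entities are opened later and closed earlier. This is an assumption,
# not the original implementation.
def sort_entities(entities):
    # NameTag entities carry token-based .start and .length attributes
    return sorted(entities, key=lambda ent: (ent.start, -ent.length))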
def __init__(self, tagger_model):
    self.__tagger = Tagger.load(tagger_model)
    self.__tokenizer = self.__tagger.newTokenizer()
    self.__forms_buf = Forms()
    self.__tokens_buf = TokenRanges()
    self.__lemmas_buf = TaggedLemmas()
def pos_tagging(self, text: str, stem=False, preprocess=True):
    """
    Perform POS tagging of the given text.

    :param text: input text
    :param stem: use the stem of each word instead of just the lemma
    :param preprocess: apply preprocessing (stop-word removal, lowercasing, emoji replacement)
    :return: list of sentences, each a list of tagged words: List[List[WordPos]]
    """
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    forms = Forms()
    sentences = []
    vanilla_text = text

    # remove diacritics
    text = unidecode(text)

    if preprocess:
        # remove stop words
        text = " ".join([w if w not in self.preprocesor.stop_words else ""
                         for w in text.split()])
        # lowercase the whole text
        text = text.lower()
        # replace smileys
        text = self.preprocesor.replace_emoji(text)
        vanilla_text = text

    # POS tagging
    self.tokenizer.setText(text)
    while self.tokenizer.nextSentence(forms, tokens):
        sentence = []
        self.tagger.tag(forms, lemmas)
        for i in range(len(lemmas)):
            lemma = lemmas[i].lemma
            tag = lemmas[i].tag
            token = tokens[i]
            token_text = vanilla_text[token.start:token.start + token.length]
            # remove diacritics
            lemma = unidecode(lemma)
            # flag for English loanwords (left unstemmed)
            eng_word = False
            # punctuation (tag 'Z') ends the current sentence; '-' is not treated
            # as a boundary token
            if tag[0] == "Z" and lemma != "-":
                if not preprocess:
                    sentence.append(WordPos(lemma, tag, token_text))
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            # do not stem English words (marked "angl." in the lemma comment)
            if lemma.find("angl") != -1:
                eng_word = True
            # strip additional information from the lemma (comments, homonym numbers)
            lemma = lemma.split("_")[0]
            lemma = re.sub(r'-\d*$', '', lemma)
            # stem
            if stem and not eng_word:
                lemma = cz_stem(lemma)
            if lemma and (not preprocess or len(lemma) > 2):
                sentence.append(WordPos(lemma, tag, token_text))
        if sentence:
            sentences.append(sentence)
    return sentences
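# Usage sketch for pos_tagging(); the surrounding class is not shown in the snippet,
# so the wrapper name, its constructor and the WordPos attribute names used below are
# assumptions.
#
#     tagger = MorphoTagger('czech-morfflex-pdt-161115.tagger')
#     for sentence in tagger.pos_tagging('Tenhle film se mi moc líbil!', stem=False):
#         for word in sentence:
#             print(word.lemma, word.tag)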