Example #1
def lemmatize(file, output_file):
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # write each token of the sentence as "lemma___tag"
                out.write(" ".join(
                    str(entry.lemma).strip() + '___' + str(entry.tag).strip()
                    for entry in lemmas))
            out.write('\n')
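A minimal usage sketch for the function above; `open_gz`, `dir_cur`, and the bundled tagger model come from the surrounding module and are assumed here, and the file names below are placeholders.

# Hypothetical call: writes one line of "lemma___tag" tokens per input line.
lemmatize('input.txt.gz', 'output.lemmatized.txt.gz')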
Example #2
    def __init__(self):
        self.morphodita_model = os.path.join(
            dir_cur, 'czech-morfflex-131112.tagger-fast')
        self.tagger = Tagger.load(self.morphodita_model)
        self.forms = Forms()
        self.lemmas = TaggedLemmas()
        self.tokens = TokenRanges()
        self.tokenizer = self.tagger.newTokenizer()
Example #3
    def __init__(self, tagger_model):
        if not os.path.isfile(tagger_model):
            raise IOError('File %s does not exist' % tagger_model)
        self._tagger = Tagger.load(tagger_model)
        self._tokenizer = self._tagger.newTokenizer()
        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._tags_buf = TaggedLemmas()
Example #4
    def __init__(self, tagger_model, abst_slots):
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        self._abst_slots = set(abst_slots.split(','))

        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()

        self._sf_dict = {}
        self._sf_max_len = 0
Example #5
    def __init__(self, model_file):
        """
        Instantiates Morphodita from a provided model file.

        :param model_file: Path to the model file.
        :type model_file: str
        """
        from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
        self.tagger = Tagger.load(model_file)
        self.forms = Forms()
        self.lemmas = TaggedLemmas()
        self.tokens = TokenRanges()
        self.tokenizer = self.tagger.newTokenizer()
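A short usage sketch, assuming the enclosing class is named Morphodita (as the docstring suggests) and that a compatible MorphoDiTa tagger model sits at the placeholder path below; the loop mirrors the tokenize-then-tag pattern of the other examples.

# Hypothetical usage; the model path is a placeholder.
morph = Morphodita('czech-morfflex-pdt-161115.tagger')
morph.tokenizer.setText('Ahoj světe!')
while morph.tokenizer.nextSentence(morph.forms, morph.tokens):
    morph.tagger.tag(morph.forms, morph.lemmas)
    for entry in morph.lemmas:
        print(entry.lemma, entry.tag)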
Example #6
    def create_lemmas(self, text):
        _forms = Forms()
        _lemmas = TaggedLemmas()
        _tokens = TokenRanges()
        self.tokenizer.setText(text)
        lemmas = []
        while self.tokenizer.nextSentence(_forms, _tokens):
            self.tagger.tag(_forms, _lemmas)
            for i in range(len(_lemmas)):
                lemma = _lemmas[i]
                token = _tokens[i]
                form = _forms[i]
                lemmas.append(Lemma(lemma.lemma, lemma.tag, form))
        return lemmas
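A hedged sketch of calling `create_lemmas` on an already constructed wrapper (e.g. one initialised as in Example #2); the `Lemma` container and its attribute names are not part of this excerpt, so the print below keeps to the object itself.

# Hypothetical call on an existing wrapper instance.
for lem in wrapper.create_lemmas('Praha je hlavní město České republiky.'):
    print(lem)  # a Lemma built from (lemma, tag, surface form)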
Example #7
def lemmatize_and_replace_entities(file, output_file):
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur,
                                    'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []
                open_entities_type = []
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    token = tokens[i]
                    word = line[token.start:token.start + token.length]
                    while e < len(
                            sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(sorted_entities[e].start +
                                             sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if len(open_entities) == 0:
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
            out.write('\n')
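As with Example #1, a minimal hedged call sketch; the NameTag and MorphoDiTa models, `open_gz`, and the `sort_entities` helper come from the surrounding module, and the file names are placeholders. Each output token is either a lemma or an `@!ENT!<types>` placeholder for tokens covered by recognised named entities.

# Hypothetical call: lemmatizes the corpus and masks named-entity tokens.
lemmatize_and_replace_entities('input.txt.gz', 'output.ents.txt.gz')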
Example #8
    def __init__(self, tagger_model):
        self.__tagger = Tagger.load(tagger_model)
        self.__tokenizer = self.__tagger.newTokenizer()
        self.__forms_buf = Forms()
        self.__tokens_buf = TokenRanges()
        self.__lemmas_buf = TaggedLemmas()
    def pos_tagging(self, text: str, stem=False, preprocess=True):
        """
        Perform pos tagging of given text
        :param text: input text
        :param stem: use stem of word or just lemma
        :param preprocess: use preprocess
        :return: list of list of tagged words: List[List[WordPos]]
        """
        lemmas = TaggedLemmas()
        tokens = TokenRanges()
        forms = Forms()
        sentences = []

        vanilla_text = text
        # remove diacritic
        text = unidecode(text)
        if preprocess:
            # remove stop words
            text = " ".join([
                w if w not in self.preprocesor.stop_words else ""
                for w in text.split()
            ])
            # lower all text
            text = text.lower()
            # replace smileys
            text = self.preprocesor.replace_emoji(text)
            vanilla_text = text

        # POS tagging
        self.tokenizer.setText(text)
        while self.tokenizer.nextSentence(forms, tokens):
            sentence = []
            self.tagger.tag(forms, lemmas)
            for i in range(len(lemmas)):
                lemma = lemmas[i].lemma
                tag = lemmas[i].tag
                token = tokens[i]
                token_text = vanilla_text[token.start:token.start +
                                          token.length]
                # remove diacritic
                lemma = unidecode(lemma)
                # eng flag
                eng_word = False

                # sentence boundary punctuation (tag class "Z"); '-' is not
                # treated as a boundary
                if tag[0] == "Z" and lemma != "-":
                    if not preprocess:
                        sentence.append(WordPos(lemma, tag, token_text))
                    if sentence:
                        sentences.append(sentence)
                    sentence = []
                    continue
                # don't stem English words
                if lemma.find("angl") != -1:
                    eng_word = True

                # strip lemma annotations: technical suffixes after '_' and trailing homonym numbers
                lemma = lemma.split("_")[0]
                lemma = re.sub(r'-\d*$', '', lemma)

                # Stem
                if stem and not eng_word:
                    lemma = cz_stem(lemma)
                if (lemma and not preprocess) or len(lemma) > 2:
                    sentence.append(WordPos(lemma, tag, token_text))
            if sentence:
                sentences.append(sentence)

        return sentences
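A brief usage sketch for `pos_tagging`, assuming `wrapper` is an instance of the enclosing class whose constructor sets up `self.tagger`, `self.tokenizer`, and `self.preprocesor`; `WordPos` is not shown in the excerpt, so its attributes are not relied on here.

# Hypothetical call: one list of WordPos objects per detected sentence.
for sentence in wrapper.pos_tagging('Ten film byl skvělý, ale dlouhý.', stem=False):
    for word_pos in sentence:
        print(word_pos)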