Example #1
    def _get_surface_form_taggedlemmas(self, forms_in):
        """Given a tokens deque, return the form & list of tagged lemmas (analyses)
        for the proper name in the list of forms at the current position, if applicable.
        If there is no proper name at the beginning of the tokens deque, return (None, None).

        @param forms_in: a deque of form tokens
        @return: (form, tagged lemmas list) or (None, None)
        """
        for test_len in range(min(self._sf_max_len, len(forms_in)), 0, -1):
            # test the string, handle number placeholders
            full_substr = [form for form in islice(forms_in, 0, test_len)]
            test_substr = tuple(['_' if re.match(r'^[0-9]+$', form) else form.lower()
                                 for form in full_substr])
            if test_substr in self._sf_dict:
                tls = TaggedLemmas()
                nums = [num for num in full_substr if re.match(r'^[0-9]+$', num)]
                for lemma, tag in self._sf_dict[test_substr]:
                    tls.push_back(TaggedLemma())
                    for num in nums:  # replace number placeholders by actual values
                        lemma = re.sub(r'_', num, lemma, count=1)
                    tls[-1].lemma = lemma
                    tls[-1].tag = tag
                for _ in range(len(test_substr)):  # move on in the sentence
                    forms_in.popleft()
                return " ".join(full_substr), tls
        return None, None
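The method above relies on two attributes that are not shown: self._sf_dict, which presumably maps tuples of lowercased surface tokens (numeric tokens normalized to '_') to lists of (lemma, tag) pairs, and self._sf_max_len, the length of the longest such key. A hypothetical entry, purely for illustration:

# Hypothetical data, not taken from the source project; real entries are loaded elsewhere.
_sf_dict = {
    ('hotel', '_', 'stars'): [('hotel _ stars', 'NNIS1-----A----')],  # made-up lemma/tag pair
}
_sf_max_len = max(len(key) for key in _sf_dict)  # longest key length to try when matching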
Example #2
    def _get_surface_form_taggedlemmas(self, forms_in):
        """Given a tokens deque, return the form & list of tagged lemmas (analyses)
        for the proper name in the list of forms at the current position, if applicable.
        If there is no proper name at the beginning of the tokens deque, return (None, None).

        @param forms_in: a deque of form tokens
        @return: (form, tagged lemmas list) or (None, None)
        """
        for test_len in xrange(min(self._sf_max_len, len(forms_in)), 0, -1):
            # test the string, handle number placeholders
            full_substr = [form for form in islice(forms_in, 0, test_len)]
            test_substr = tuple(['_' if re.match(r'^[0-9]+$', form) else form.lower()
                                 for form in full_substr])
            if test_substr in self._sf_dict:
                tls = TaggedLemmas()
                nums = [num for num in full_substr if re.match(r'^[0-9]+$', num)]
                for lemma, tag in self._sf_dict[test_substr]:
                    tls.push_back(TaggedLemma())
                    for num in nums:  # replace number placeholders by actual values
                        lemma = re.sub(r'_', num, lemma, count=1)
                    tls[-1].lemma = lemma
                    tls[-1].tag = tag
                for _ in xrange(len(test_substr)):  # move on in the sentence
                    forms_in.popleft()
                return " ".join(full_substr), tls
        return None, None
Example #3
def lemmatize(file, output_file):
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # write each token of the sentence as "lemma___tag", space-separated
                out.write(" ".join(
                    str(l.lemma).strip() + '___' + str(l.tag).strip()
                    for l in lemmas))
            out.write('\n')
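The snippet assumes an open_gz helper and a dir_cur constant defined elsewhere in its source project. A minimal sketch of such a helper, assuming UTF-8 text files that may or may not be gzip-compressed:

import gzip
import os

dir_cur = os.path.dirname(os.path.abspath(__file__))  # assumption: models sit next to this script

def open_gz(path, mode='r'):
    # open a plain or gzip-compressed file in text mode
    if path.endswith('.gz'):
        return gzip.open(path, mode + 't', encoding='utf-8')
    return open(path, mode, encoding='utf-8')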
Example #4
    def analyze(self, sent):
        """Perform morphological analysis on the given sentence, preferring analyses from the
        list of surface forms. Return a list of tuples (form, lemma, tag)."""
        self._tokenizer.setText(sent)
        analyzed = []
        while self._tokenizer.nextSentence(self._forms_buf, self._tokens_buf):

            forms_in = deque(self._forms_buf)
            self._forms_buf.resize(0)
            self._analyses_buf.resize(0)  # reset previous analyses

            while forms_in:
                form, analyses = self._get_surface_form_taggedlemmas(forms_in)
                if form:
                    # our custom analysis
                    self._analyses_buf.push_back(analyses)
                else:
                    # Morphodita analysis
                    form = forms_in.popleft()
                    analyses = TaggedLemmas()
                    self._analyzer.analyze(form, 1, analyses)
                    for i in range(len(analyses)):  # shorten lemmas (must access the vector directly)
                        analyses[i].lemma = self._analyzer.rawLemma(analyses[i].lemma)
                    self._analyses_buf.push_back(analyses)

                self._forms_buf.push_back(form)

            # tag according to the given analysis
            self._tagger.tagAnalyzed(self._forms_buf, self._analyses_buf, self._indices_buf)
            analyzed.extend([(f, a[idx].lemma, a[idx].tag)
                             for (f, a, idx)
                             in zip(self._forms_buf, self._analyses_buf, self._indices_buf)])
        return analyzed
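Example #4 shows only the analyze method; the buffers and tools it uses would be set up in a constructor roughly like the sketch below. The class name and model path are illustrative, and the sketch assumes the ufal.morphodita vector wrappers Analyses and Indices (the surface-form dictionary from Example #1 is omitted):

from ufal.morphodita import Tagger, Forms, TokenRanges, Analyses, Indices

class MorphoAnalyzer(object):  # illustrative name
    def __init__(self, tagger_model):
        self._tagger = Tagger.load(tagger_model)   # e.g. a Czech MorfFlex .tagger file
        assert self._tagger is not None
        self._analyzer = self._tagger.getMorpho()  # morphological dictionary for analyze()/rawLemma()
        self._tokenizer = self._tagger.newTokenizer()
        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()            # one TaggedLemmas per form
        self._indices_buf = Indices()              # index of the chosen analysis for each form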
Example #5
 def __init__(self):
     self.morphodita_model = os.path.join(
         dir_cur, 'czech-morfflex-131112.tagger-fast')
     self.tagger = Tagger.load(self.morphodita_model)
     self.forms = Forms()
     self.lemmas = TaggedLemmas()
     self.tokens = TokenRanges()
     self.tokenizer = self.tagger.newTokenizer()
Example #6
 def __init__(self, tagger_model):
     if not os.path.isfile(tagger_model):
         raise IOError('File %s does not exist' % tagger_model)
     self._tagger = Tagger.load(tagger_model)
     self._tokenizer = self._tagger.newTokenizer()
     self._forms_buf = Forms()
     self._tokens_buf = TokenRanges()
     self._tags_buf = TaggedLemmas()
Example #7
 def lemmatize(self, token):
     from ufal.morphodita import TaggedLemmas
     lemmas = TaggedLemmas()  # container for the result
     result = morpho.analyze(token, morpho.GUESSER, lemmas)  # result is int
     if result != 0:
         # sometimes uppercasing the first character helps
         result = morpho.analyze(token.title(), morpho.GUESSER, lemmas)
     return morpho.rawLemma(lemmas[0].lemma).lower()
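morpho is not defined in this snippet; it is presumably a ufal.morphodita.Morpho instance (the morphological dictionary) created elsewhere, along these lines (the dictionary file name is only an example):

from ufal.morphodita import Morpho

# Illustrative: load the morphological dictionary used by lemmatize() above.
morpho = Morpho.load('czech-morfflex.dict')  # file name is an assumption
assert morpho is not None  # Morpho.load returns None on failure
# (alternatively, an already loaded tagger exposes it via tagger.getMorpho())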
Example #8
 def analyze_form(self, form, guesser=True):
     """Return all lemma-tag analyses (a list of FormInfo tuples) of a given form."""
     use_guesser = 1 if guesser else 0
     tagged_lemmas = TaggedLemmas()
     used_guesser = self.tool.analyze(form, use_guesser, tagged_lemmas)
     result = []
     for tl in tagged_lemmas:
         result.append(FormInfo(form, tl.lemma, tl.tag, used_guesser))
     return result
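FormInfo is not defined in the snippet; judging from the call above it is a small record holding the form, its lemma and tag, and whether the guesser was used. A plausible sketch, not the source definition:

from collections import namedtuple

# Assumed shape only; field names are guesses based on the call site above.
FormInfo = namedtuple('FormInfo', ['form', 'lemma', 'tag', 'guessed'])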
Example #9
    def __init__(self, derinet_file_name, morfflex_file_name,
                 morpho_file_name):
        logger.info("Loading derivations.")
        derinet_db = DeriNetDatabase(derinet_file_name)
        logger.info("Derivations loaded.")

        if morfflex_file_name is not None:
            logger.info("Loading inflections.")
            db = MorfFlexDatabase(morfflex_file_name, derinet_db)
            logger.info("Inflections loaded.")
        else:
            logger.info("Not loading inflections.")
            db = derinet_db

        logger.info("Detecting stem bounds.")

        for node in db.iter():
            node.detect_stems()

        logger.info("Stem bounds detected.")
        logger.info("Propagating morph bounds.")

        for root in db.iter_trees():
            root.propagate_morph_bounds()

        logger.info("Morph bounds propagated.")

        lemmas = []
        tagger = None
        if morpho_file_name is not None:
            logger.info("Loading morphology")
            if morphodita_available:
                tagger = Tagger.load(morpho_file_name)
            else:
                logger.error(
                    "You need to install the MorphoDiTa Python bindings!")

            if not tagger:
                logger.critical(
                    "Cannot load morphological dictionary from file '%s'.",
                    morpho_file_name)
                sys.exit(1)

            lemmas = TaggedLemmas()
            logger.info("Morphology loaded.")
        else:
            logger.info(
                "No morphological dictionary specified. Inflectional morphology will not be available."
            )
            tagger = None

        self.db = db
        self.tagger = tagger
        self.lemmas = lemmas
Example #10
    def __init__(self, model_file):
        """
        Instantiates Morphodita from a provided model file.

        :param model_file: Path to the model file.
        :type model_file: str
        """
        from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
        self.tagger = Tagger.load(model_file)
        self.forms = Forms()
        self.lemmas = TaggedLemmas()
        self.tokens = TokenRanges()
        self.tokenizer = self.tagger.newTokenizer()
Example #11
 def create_lemmas(self, text):
     _forms = Forms()
     _lemmas = TaggedLemmas()
     _tokens = TokenRanges()
     self.tokenizer.setText(text)
     lemmas = []
     while self.tokenizer.nextSentence(_forms, _tokens):
         self.tagger.tag(_forms, _lemmas)
         for i in range(len(_lemmas)):
             lemma = _lemmas[i]
             token = _tokens[i]
             form = _forms[i]
             lemmas.append(Lemma(lemma.lemma, lemma.tag, form))
     return lemmas
Example #12
def lemmatize_and_replace_entities(file, output_file):
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur,
                                    'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []
                open_entities_type = []
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    token = tokens[i]
                    word = line[token.start:token.start + token.length]
                    while e < len(
                            sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(sorted_entities[e].start +
                                             sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if len(open_entities) == 0:
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
            out.write('\n')
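sort_entities is not shown here. The nesting logic above (opening an entity when its start token is reached and closing it after its last token) relies on entities being ordered by start offset, with longer entities first at the same start. A sketch under that assumption:

def sort_entities(entities):
    # order by start token; at the same start, longer (outer) entities come first,
    # so they are opened before any entities nested inside them
    return sorted(entities, key=lambda e: (e.start, -e.length))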
Example #13
 def __init__(self, tagger_model):
     self.__tagger = Tagger.load(tagger_model)
     self.__tokenizer = self.__tagger.newTokenizer()
     self.__forms_buf = Forms()
     self.__tokens_buf = TokenRanges()
     self.__lemmas_buf = TaggedLemmas()
    def pos_tagging(self, text: str, stem=False, preprocess=True):
        """
        Perform pos tagging of given text
        :param text: input text
        :param stem: use stem of word or just lemma
        :param preprocess: use preprocess
        :return: list of list of tagged words: List[List[WordPos]]
        """
        lemmas = TaggedLemmas()
        tokens = TokenRanges()
        forms = Forms()
        sentences = []

        vanilla_text = text
        # remove diacritics
        text = unidecode(text)
        if preprocess:
            # remove stop words
            text = " ".join([
                w if w not in self.preprocesor.stop_words else ""
                for w in text.split()
            ])
            # lower all text
            text = text.lower()
            # replace smileys
            text = self.preprocesor.replace_emoji(text)
            vanilla_text = text

        # POS tagging
        self.tokenizer.setText(text)
        while self.tokenizer.nextSentence(forms, tokens):
            sentence = []
            self.tagger.tag(forms, lemmas)
            for i in range(len(lemmas)):
                lemma = lemmas[i].lemma
                tag = lemmas[i].tag
                token = tokens[i]
                token_text = vanilla_text[token.start:token.start +
                                          token.length]
                # remove diacritics from the lemma
                lemma = unidecode(lemma)
                # eng flag
                eng_word = False

                # punctuation (tag position 1 == 'Z') ends the current sentence chunk;
                # a lone '-' is not treated as a boundary
                if tag[0] == "Z" and lemma != "-":
                    if not preprocess:
                        sentence.append(WordPos(lemma, tag, token_text))
                    if sentence:
                        sentences.append(sentence)
                    sentence = []
                    continue
                # don't stem English loanwords (their lemma comment contains "angl")
                if lemma.find("angl") != -1:
                    eng_word = True

                # strip additional lemma information (technical suffixes after '_' and trailing homonym numbers)
                lemma = lemma.split("_")[0]
                lemma = re.sub(r'-\d*$', '', lemma)

                # Stem
                if stem and not eng_word:
                    lemma = cz_stem(lemma)
                if (lemma and not preprocess) or len(lemma) > 2:
                    sentence.append(WordPos(lemma, tag, token_text))
            if sentence:
                sentences.append(sentence)

        return sentences