def lemmatize_file(filename):
    """Write a lemmatized copy of ``filename`` and return the new file's path.

    Every line of the input is tokenized with Voikko.  A token of type ``1``
    that Voikko can analyze is replaced by the ``BASEFORM`` of its first
    analysis; every other token is copied unchanged.  The result is written
    to ``filename + '_lemmatized'``.

    :param filename: Path of the text file to lemmatize.
    :return: Path of the lemmatized output file.
    """
    print('lemmatizing ' + filename)

    lemmatized_filename = filename + '_lemmatized'
    v = Voikko("fi")
    try:
        # Both files are managed by ``with`` so they are closed even when
        # tokenizing or analyzing raises.  The input is opened first so that
        # no empty output file is left behind when the input does not exist.
        with open(filename, 'r') as f, \
                open(lemmatized_filename, 'w') as lemmatized_file:
            for sentence in f:
                words_baseform = []
                for word in v.tokens(sentence):
                    baseform = None
                    # NOTE(review): 1 presumably is libvoikko's word token
                    # type (Token.WORD) - confirm against the library.
                    if word.tokenType == 1:
                        word_analyzed = v.analyze(word.tokenText)
                        if word_analyzed:
                            baseform = word_analyzed[0].get('BASEFORM')
                    # Fall back to the surface form when there is no analysis
                    # or the analysis carries no base form.
                    words_baseform.append(
                        word.tokenText if baseform is None else baseform)

                lemmatized_file.write(''.join(words_baseform))
    finally:
        # Always release the native Voikko handle.
        v.terminate()

    return lemmatized_filename
Esempio n. 2
0
class VoikkoTokenizer():

    """
    Voikko Tokenizer
    ~~~~~~~~~~~~~~~~

    Tokenize text into lower-cased base forms using :class:`Voikko`.

    Getting Voikko to work on Windows
    =================================
    - Download voikko DLL into application directory from:
      https://www.puimula.org/htp/testing/voikko-sdk/win-crossbuild/
    - Download and extract dictionary files into `instance/voikko` directory:
      https://www.puimula.org/htp/testing/voikko-snapshot-v5/

      Select one that contains morphological data.

    """

    # Word classes that are never accepted as the stem of a word.
    # See: https://github.com/voikko/voikko-sklearn/blob/master/voikko_sklearn.py
    FINNISH_STOPWORD_CLASSES = (
        "huudahdussana", "seikkasana", "lukusana",
        "asemosana", "sidesana", "suhdesana",
    )

    def __init__(self, lang="fi"):
        """ Create the tokenizer and its :class:`Voikko` instance.

        :param lang:  Language code passed to :class:`Voikko`.
        """
        # Memo of already stemmed words: surface form -> base form.
        self.stem_map = {}
        # Remembered so that pickling does not have to ask Voikko for it.
        self.voikko_lang = lang
        self.voikko = Voikko(lang, path=self._dictionary_path())
        # NOTE: ``\w{1,}`` matches one or more word characters, despite what
        # the inline pattern comment says.
        self.regex_words = re.compile(r"""
            (\w+-(?:\w+)+  # Get wordcharacters conjucated by dash (-)
            |\w{1,}        # OR all word characters len() > 1
            )|(?::[\w]*)   # ignore word characters after colon
        """, re.VERBOSE | re.MULTILINE)
        # A paragraph is discarded when the number of words Voikko could not
        # analyze exceeds this fraction of the produced tokens.
        self.err_treshold = 0.5

    @staticmethod
    def _dictionary_path():
        """ Return the `instance/voikko` directory as :type:`str`, or ``None`` if it does not exist. """
        dict_path = instance_path() / "voikko"
        return str(dict_path) if dict_path.exists() else None

    def tokenize(self, text: str) -> List[str]:
        """ Return list of words.

        The text is split into lines and every line is tokenized with
        :meth:`tokenize_paragraph`; the results are concatenated in order.
        """
        paragraphs = text.splitlines()
        # Materialize the result: the annotation promises a list, and a bare
        # ``chain`` object could only be iterated once by the caller.
        return list(chain.from_iterable(map(self.tokenize_paragraph, paragraphs)))

    def tokenize_paragraph(self, sentence, use_suggestions=True):
        """ Tokenize words using :class:`~Voikko`

        ..todo:
            - Detect abbreviations from CAPITAL letters.

        :param sentence:  Text to tokenize; it is lower-cased before matching.
        :param use_suggestions:  Should stemming use spell checking.
        :return:  List of stemmed words, or an empty list when too many words
                  were not recognised by Voikko.
        """

        # Number of words Voikko could not analyze in this paragraph.
        err_count = 0

        def _stem(word: str) -> List[str]:
            """ Return :type:`list` of stemmed words.

            If the word is not found in the voikko dataset, the first spelling
            suggestion (when enabled) is tokenized instead.
            """
            nonlocal err_count

            # Check for previous stemming result
            stemmed_word = self.stem_map.get(word)
            if stemmed_word is not None:
                return [stemmed_word]

            analysis = self.analyze(word)

            if not analysis:
                # If analyze didn't produce results, try spell checking
                err_count += 1
                analysis = []

                if use_suggestions:
                    # Get first suggestion.
                    suggestions = self.voikko.suggest(word)
                    suggested = suggestions[0] if suggestions else None
                    logger.debug("Voikko did not found word %r; suggested spelling: %r", word, suggested)

                    if suggested is not None:
                        # return tokenized suggestion - It can be two or more words.
                        # Suggestions are disabled for the nested call so a
                        # suggestion is never itself spell-corrected.
                        return self.tokenize_paragraph(suggested, use_suggestions=False)

            # Prefer nimisana over others (stable sort keeps the rest in order).
            analysis = sorted(analysis, key=lambda x: 0 if x.get('CLASS') == "nimisana" else 1)

            for candidate in analysis:
                # Find first suitable interpretation of word.
                if candidate.get("CLASS") in self.FINNISH_STOPWORD_CLASSES:
                    continue
                baseform = candidate.get('BASEFORM')
                if baseform is None:
                    # An analysis without a base form cannot be used as stem.
                    continue
                baseform = baseform.lower()
                self.stem_map[word] = baseform
                return [baseform]

            # Fall back to given word.
            self.stem_map[word] = word.lower()
            return [word.lower()]

        # Create list of words from string, separating from non-word characters.
        # ``findall`` yields "" for the non-capturing ``:suffix`` alternative.
        words = [x for x in self.regex_words.findall(sentence.lower()) if x != ""]

        stems = [x for x in chain.from_iterable(map(_stem, words)) if x]
        if len(stems) * self.err_treshold < err_count:
            # Too many spelling errors. Presume incorrect language, and disregard paragraph.
            logger.debug("Too many spelling errors: %d out of %d", err_count, len(stems))
            return []

        return stems

    @cached(LFUCache(maxsize=512))
    def analyze(self, word: str) -> List[Dict]:
        """ Analyze word, returning morphological data.

            Uses :class:`LFUCache` - least frequently used - cache.  The cache
            is created once when the class is defined, so it is shared by all
            instances.
         """
        return self.voikko.analyze(word)

    def __getstate__(self):
        """ Return pickleable attributes.

        :class:`Voikko` can't be serialized, so remove it.  The language is
        kept under ``voikko_lang`` so that :meth:`__setstate__` can rebuild it.
        """

        state = self.__dict__.copy()
        # Instances created through ``__init__`` always carry the language;
        # the default only covers objects built without it.
        state.setdefault('voikko_lang', "fi")
        state.pop('voikko', None)
        return state

    def __setstate__(self, state):
        """ Restore attributes and recreate the :class:`Voikko` instance.

        The dictionary path is resolved the same way as in ``__init__`` so an
        unpickled tokenizer finds dictionaries in `instance/voikko` too.
        """
        state = dict(state)
        state.setdefault('voikko_lang', "fi")
        state['voikko'] = Voikko(state['voikko_lang'], path=self._dictionary_path())
        self.__dict__.update(state)
Esempio n. 3
0
					bf = word[:word.index(":")]
					cl = "lukusana" if re.fullmatch(r'\d+', bf) else "nimisana"
					output += [AltWords(word, [Word(word, bf, case, number, cl)])]
					cont = True
		if cont:
			continue
		for case in ORDINAL_CASE_REGEXES:
			if re.fullmatch(ORDINAL_CASE_REGEXES[case], word):
				bf = word[:word.index(":")]
				cl = "lukusana" if re.fullmatch(r'\d+', bf) else "nimisana"
				output += [AltWords(word, [Word(word, bf, case, number, cl, ordinal_like=True)])]
				cont = True
		if cont:
			continue
		
		analysis_list = voikko.analyze(word)
		prefix = ""
		if len(analysis_list) == 0 and "-" in word:
			i = word.rindex("-")+1
			analysis_list = voikko.analyze(word[i:])
			prefix = word[:i].lower()
		alternatives = []
		for analysis in analysis_list:
			bf = prefix+analysis["BASEFORM"]
			cl = analysis["CLASS"]
			if bf in ORDINALS+CARDINALS or re.fullmatch(r'\d+', bf):
				cl = "lukusana"
			elif "PARTICIPLE" in analysis and analysis["PARTICIPLE"] == "agent":
				cl = "laatusana"
			number = analysis.get("NUMBER", "")
			person = analysis.get("PERSON", "")