import os
import pickle as pkl

from voikko.libvoikko import Voikko


def read_data(file_path):
    '''Read data into a list of words, caching the words in a file
    if the word file does not already exist.'''

    if os.path.exists(file_path + '_words'):
        print('reading from word file...')
        with open(file_path + '_words', 'r') as f:
            words = f.read().split('\n')
            return words

    print('reading from data file...')
    v = Voikko("fi")

    with open(file_path) as f:
        # Keep word (tokenType 1) and punctuation (tokenType 2) tokens.
        words = [
            word.tokenText.lower() for word in v.tokens(f.read())
            if word.tokenType == 1 or word.tokenType == 2
        ]
        v.terminate()

        with open(file_path + '_words', 'w') as word_file:
            word_file.write('\n'.join(words))

        return words
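
A minimal sketch of the token filter used above, assuming libvoikko and the Finnish dictionary are installed locally; the sample string and printed values are illustrative:

v_demo = Voikko("fi")
for t in v_demo.tokens("Hei, maailma!"):
    # tokenType: 1 = word, 2 = punctuation, 3 = whitespace
    print(t.tokenType, repr(t.tokenText))
v_demo.terminate()
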
def sentence_to_index(index_file, file_path, dictionary):
    '''Read sentences from a file and replace each word with its
    index in the dictionary (0 for out-of-vocabulary words).'''

    print("converting sentences to indices...")
    v = Voikko("fi")

    with open(file_path) as f:
        index_sentences = []
        for sentence in f:
            words = [
                word.tokenText.lower() for word in v.tokens(sentence)
                if word.tokenType == 1 or word.tokenType == 2
            ]
            index_words = [dictionary.get(word, 0) for word in words]
            index_sentences.append(index_words)
        v.terminate()

    # Save the sentence indices into index_file.
    with open(index_file, 'wb') as index_f:
        pkl.dump(index_sentences, index_f, -1)

    return index_sentences
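
A hedged usage sketch: `dictionary` is assumed to map words to positive integer indices, with 0 reserved for out-of-vocabulary words (as the fallback above implies); the file names are illustrative:

words = read_data('data.txt')
vocab = {w: i for i, w in enumerate(sorted(set(words)), start=1)}  # 0 = unknown
sentences = sentence_to_index('data.idx', 'data.txt', vocab)
print(sentences[0])
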
Example 3
    def __init__(self, lang="fi"):

        # Voikko dictionary path.
        dict_path = instance_path() / "voikko"
        path = str(dict_path) if dict_path.exists() else None

        self.stem_map = {}
        self.voikko = Voikko(lang, path=path)
        self.regex_words = re.compile(r"""
            (\w+-(?:\w+)+  # word characters joined by a dash (-)
            |\w+           # OR one or more word characters
            )|(?::[\w]*)   # ignore word characters after a colon
        """, re.VERBOSE | re.MULTILINE)
        self.err_threshold = 0.5
def lemmatize_file(filename):
    print('lemmatizing ' + filename)

    v = Voikko("fi")
    lemmatized_filename = filename + '_lemmatized'

    with open(filename, 'r') as f, \
            open(lemmatized_filename, 'w') as lemmatized_file:
        for sentence in f:
            sent_toks = v.tokens(sentence)

            words_baseform = []
            for word in sent_toks:
                # Lemmatize word tokens (tokenType 1); keep punctuation and
                # whitespace tokens as-is so the original spacing survives.
                if word.tokenType == 1:
                    word_analyzed = v.analyze(word.tokenText)
                    if len(word_analyzed) > 0:
                        words_baseform.append(word_analyzed[0].get('BASEFORM'))
                    else:
                        words_baseform.append(word.tokenText)
                else:
                    words_baseform.append(word.tokenText)

            sent_baseform = ''.join(words_baseform)
            lemmatized_file.write(sent_baseform)

    v.terminate()
    return lemmatized_filename
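
A minimal sketch of the `analyze` call the loop above relies on; the exact analyses returned depend on the installed dictionary, so the output comment is illustrative:

v_demo = Voikko("fi")
analyses = v_demo.analyze("kissoja")        # an inflected form of "kissa" (cat)
if analyses:
    # Each analysis behaves like a dict; BASEFORM holds the lemma.
    print(analyses[0].get('BASEFORM'))      # expected: kissa
v_demo.terminate()
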
Example 5
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, vectorizer)
    pyLDAvis.save_html(LDAvis_prepared,
                       str(_instance_path() / "pyldavis.html"))

    joblib.dump(lda, LDA_FILE)
    joblib.dump(vectorizer, WORDS_FILE)

number_words = 15

# Voikko word classes suitable as topic labels: nimisana (noun), nimi (name).
suitable_topic_classes = ["nimisana", "nimi"]

topic_labels = {}
# For each topic, its word indices sorted by descending weight.
topics = np.array([x.argsort()[::-1] for x in lda.components_])

v = Voikko("fi")


def _is_suitable_label(word) -> bool:
    r = v.analyze(word) or []
    for w in r:
        if w.get("CLASS") in suitable_topic_classes:
            return True
        else:
            logger.debug("%s CLASS is %s", word, w.get("CLASS"))
    return False


i = 0
while len(topic_labels) < number_topics:
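
A short sketch of what `_is_suitable_label` accepts, assuming the Finnish dictionary is available; the expected results in the comments are illustrative:

print(_is_suitable_label("talo"))     # True: CLASS is nimisana (noun)
print(_is_suitable_label("juosta"))   # False: CLASS is teonsana (verb)
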
Example 6
class VoikkoTokenizer:
    """
    Voikko tokenizer
    ~~~~~~~~~~~~~~~~

    Tokenize text into stemmed words.

    Getting Voikko to work on Windows
    =================================
    - Download the Voikko DLL into the application directory from:
      https://www.puimula.org/htp/testing/voikko-sdk/win-crossbuild/
    - Download and extract the dictionary files into the `instance/voikko`
      directory:
      https://www.puimula.org/htp/testing/voikko-snapshot-v5/

      Select the one containing morphological data.
    """

    def __init__(self, lang="fi"):

        # Voikko dictionary path.
        dict_path = instance_path() / "voikko"
        path = str(dict_path) if dict_path.exists() else None

        self.stem_map = {}
        self.voikko = Voikko(lang, path=path)
        self.regex_words = re.compile(r"""
            (\w+-(?:\w+)+  # word characters joined by a dash (-)
            |\w+           # OR one or more word characters
            )|(?::[\w]*)   # ignore word characters after a colon
        """, re.VERBOSE | re.MULTILINE)
        self.err_threshold = 0.5

    def tokenize(self, text: str) -> List[str]:
        """ Return a list of words. """
        # Split into paragraphs and tokenize each one.
        paragraphs = text.splitlines()
        tokens = chain(*map(self.tokenize_paragraph, paragraphs))

        return list(tokens)

    def tokenize_paragraph(self, sentence, use_suggestions=True):
        """ Tokenize words using :class:`~Voikko`.

        .. todo::
            - Detect abbreviations from CAPITAL letters.

        :param use_suggestions: Whether stemming should use spell checking.
        """

        # Spell check mistake counters
        err_count = 0

        def _stem(word: str) -> List[str]:
            """ Return a list of stemmed words.

            If the word is not found in the Voikko dataset, the first
            spelling suggestion is looked up instead.
            """
            nonlocal err_count

            # See: https://github.com/voikko/voikko-sklearn/blob/master/voikko_sklearn.py
            FINNISH_STOPWORD_CLASSES = ["huudahdussana", "seikkasana", "lukusana", "asemosana", "sidesana", "suhdesana"]

            # Check for previous stemming result
            stemmed_word = self.stem_map.get(word, None)
            if stemmed_word is not None:
                return [stemmed_word]

            analysis = self.analyze(word)

            if not analysis:
                # If analysis produced no results, try spell checking.
                err_count += 1
                analysis = []

                if use_suggestions:
                    # Get first suggestion.
                    suggested, *xs = self.voikko.suggest(word) or [None]
                    logger.debug(f"Voikko did not found word {word!r}; suggested spelling: {suggested!r}")

                    if suggested is not None:
                        # return tokenized suggestion - It can be two or more words.
                        return self.tokenize_paragraph(suggested, use_suggestions=False)

            # Prefer nimisana (nouns) over other word classes.
            analysis = sorted(analysis, key=lambda x: -1 if x.get('CLASS') == "nimisana" else 0)

            for _word in analysis:
                # Find first suitable iteration of word.
                _class = _word.get("CLASS", None)
                if _class not in FINNISH_STOPWORD_CLASSES:
                    baseform = _word.get('BASEFORM').lower()
                    self.stem_map[word] = baseform
                    return [baseform]

            # Fall back to given word.
            self.stem_map[word] = word.lower()
            return [word.lower()]

        # Create list of words from string, separating from non-word characters.
        r = [x for x in re.findall(self.regex_words, sentence.lower()) if x != ""]

        r = [x for x in chain(*map(_stem, r)) if x]
        if len(r) * self.err_threshold < err_count:
            # Too many spelling errors. Presume incorrect language, and disregard paragraph.
            logger.debug("Too many spelling errors: %d out of %d", err_count, len(r))
            return []

        return r

    @cached(LFUCache(maxsize=512))
    def analyze(self, word: str) -> List[Dict]:
        """ Analyze word, returning morhpological data.

            Uses :class:`LFUCache` - least frequently used - cache.
         """
        return self.voikko.analyze(word)

    def __getstate__(self):
        """ Return pickleable attributes.

        :class:`Voikko` can't be serialized, so remove it.
        """

        state = self.__dict__.copy()
        state['voikko_lang'] = self.voikko.listDicts()[0].language
        del state['voikko']
        return state

    def __setstate__(self, state):
        state['voikko'] = Voikko(state['voikko_lang'])
        del state['voikko_lang']
        self.__dict__.update(state)
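
A hedged usage sketch, assuming the module-level names the class relies on (re, chain, List, Dict, cached, LFUCache, Voikko, instance_path, logger) are imported as in the original module; the printed result is illustrative:

tokenizer = VoikkoTokenizer("fi")
print(tokenizer.tokenize("Kissat juoksivat pihalla."))
# With the Finnish dictionary installed, roughly: ['kissa', 'juosta', 'piha']
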
Example 7
    def __setstate__(self, state):
        state['voikko'] = Voikko(state['voikko_lang'])
        del state['voikko_lang']
        self.__dict__.update(state)
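
A brief sketch of the pickling round-trip this enables together with the matching `__getstate__` in Example 6; the Voikko handle is rebuilt on load rather than serialized:

import pickle

tokenizer = VoikkoTokenizer("fi")
blob = pickle.dumps(tokenizer)    # __getstate__ drops the unpicklable Voikko handle
restored = pickle.loads(blob)     # __setstate__ re-creates it from the saved language
print(restored.tokenize("taloja"))
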
Example 8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import html, re
from voikko.libvoikko import Voikko, Token
from fatal_error import syntaxError
from inflect import *

LANGUAGE = "fi-x-morpho"
ENCODING = "UTF-8"

voikko = Voikko(LANGUAGE)

def lexCode(code):
	output = []
	# Split on whitespace, punctuation, string literals, line comments and
	# parenthesized groups; the capturing group keeps the separators.
	for word in re.split(r'(\s|\.|,|;|\[|\]|"[^"]*"|#[^\n]*\n|\([^()]*\))', code):
		if word == "":
			continue
		if re.fullmatch(r'\s|\.|,|;|\[|\]|"[^"]*"|#[^\n]*\n|\([^()]*\)', word):
			output += [Punctuation(word)]
			continue
		
		cont = False
		for number in CASE_REGEXES:
			for case in CASE_REGEXES[number]:
				if re.fullmatch(CASE_REGEXES[number][case], word):
					bf = word[:word.index(":")]