import os
import pickle as pkl

from libvoikko import Voikko, Token


def lemmatize_file(filename):
    print('lemmatizing ' + filename)

    v = Voikko("fi")
    lemmatized_filename = filename + '_lemmatized'

    with open(filename, 'r') as f, \
            open(lemmatized_filename, 'w') as lemmatized_file:
        for sentence in f:
            sent_toks = v.tokens(sentence)

            words_baseform = []
            for word in sent_toks:
                if word.tokenType == Token.WORD:
                    # Voikko may return several analyses; take the first one.
                    word_analyzed = v.analyze(word.tokenText)
                    if len(word_analyzed) > 0:
                        words_baseform.append(word_analyzed[0].get('BASEFORM'))
                    else:
                        # Unknown word: keep it as-is.
                        words_baseform.append(word.tokenText)
                else:
                    # Punctuation and whitespace tokens are kept verbatim,
                    # so joining with '' preserves the original spacing.
                    words_baseform.append(word.tokenText)

            sent_baseform = ''.join(words_baseform)
            lemmatized_file.write(sent_baseform)

    v.terminate()
    return lemmatized_filename
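

# Example usage (a sketch; 'corpus.txt' is a hypothetical path to a
# one-sentence-per-line Finnish text file, and libvoikko needs the Finnish
# dictionary installed):
#
#     lemmatized_path = lemmatize_file('corpus.txt')
#     # -> writes 'corpus.txt_lemmatized' and returns its path
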
def read_data(file_path):
    '''Read the data file into a list of words, caching the words to
    '<file_path>_words' if that word file does not already exist'''

    words_path = file_path + '_words'
    if os.path.exists(words_path):
        print('reading from word file...')
        with open(words_path, 'r') as f:
            return f.read().split('\n')

    print('reading from data file...')
    v = Voikko("fi")

    with open(file_path) as f:
        # Keep word and punctuation tokens, lowercased.
        words = [
            word.tokenText.lower() for word in v.tokens(f.read())
            if word.tokenType in (Token.WORD, Token.PUNCTUATION)
        ]
    v.terminate()

    # Cache the words so later runs can skip tokenization.
    with open(words_path, 'w') as word_file:
        word_file.write('\n'.join(words))

    return words
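

# Example usage (a sketch; the path is hypothetical):
#
#     words = read_data('corpus.txt')
#     # first run tokenizes 'corpus.txt' and caches 'corpus.txt_words';
#     # later runs read the cached word list directly
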
def sentence_to_index(index_file, file_path, dictionary):
    '''Read sentences from a file and convert each one to the list of its
    words' indices in the dictionary (0 marks out-of-vocabulary words)'''

    print("converting sentences to indices...")
    v = Voikko("fi")

    index_sentences = []
    with open(file_path) as f:
        for sentence in f:
            # Keep word and punctuation tokens, lowercased.
            words = [
                word.tokenText.lower() for word in v.tokens(sentence)
                if word.tokenType in (Token.WORD, Token.PUNCTUATION)
            ]

            # Out-of-vocabulary words map to index 0.
            index_words = [dictionary.get(word, 0) for word in words]
            index_sentences.append(index_words)
    v.terminate()

    # Save the sentence indices into index_file with the highest
    # available pickle protocol.
    with open(index_file, 'wb') as index_f:
        pkl.dump(index_sentences, index_f, -1)

    return index_sentences
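

# Example usage (a sketch; the word-to-index dictionary would come from the
# caller, e.g. a vocabulary built from the output of read_data; all names
# and values here are hypothetical):
#
#     dictionary = {'kissa': 1, 'istuu': 2, '.': 3}
#     sentences = sentence_to_index('corpus_indices.pkl', 'corpus.txt',
#                                   dictionary)
#     # -> pickles and returns, e.g., [[1, 2, 3], ...]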