Code example #1
def voikko_simplify(text, language):
    """Reduces the given text to its base forms using libvoikko.

    This allows the search engine to ignore spelling variation and
    match more things at the cost of exactness.

    """
    if language not in voikot:
        try:
            voikot[language] = libvoikko.Voikko(language)
        except (libvoikko.VoikkoException, OSError):
            # either voikko isn't installed or voikko doesn't have the
            # language we request. never try this language again
            voikot[language] = None
            return text

    v = voikot[language]

    if v is None:
        return text

    new_words = []
    for word in text.split():
        analysis = v.analyze(word)
        if analysis:
            analysis = analysis[0]
            word_class = analysis.get("CLASS", "nimisana")
            if word_class not in ACCEPTABLE_WORD_CLASSES:
                # filter uninteresting word types outright
                continue
            if "BASEFORM" in analysis:
                new_words.append(analysis["BASEFORM"])

    return u" ".join(new_words)
Code example #2
def analyzeWords(words):
    v = libvoikko.Voikko(u"fi")
    errors = []
    for word in words:
        if not v.spell(word["value"]):
            word["suggestions"] = v.suggest(word["value"])
            errors.append(word)
    v.terminate()
    return errors
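A small usage sketch for the function above; the word dictionaries follow the shape the function expects and the sample data is made up:

words = [{"value": u"talo"}, {"value": u"taloi"}]
for misspelled in analyzeWords(words):
    print(misspelled["value"], "->", misspelled["suggestions"])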
Code example #3
def init_libvoikko_error() -> str:
    if IMPORT_LIBVOIKKO_SUCCESSFUL:
        try:
            voikko = libvoikko.Voikko('fi')
            if voikko:
                return ''
            return 'Initialization of Voikko failed: object empty'
        except libvoikko.VoikkoException as error:
            return str(error)
    return 'import libvoikko failed.'
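The helper above is typically used as a start-up check. A hedged usage sketch, assuming the surrounding module sets IMPORT_LIBVOIKKO_SUCCESSFUL when import libvoikko succeeds:

error_message = init_libvoikko_error()
if error_message:
    print("Voikko support disabled:", error_message)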
Code example #4
 def setUp(self):
     info = MorphologyInfo()
     info.variant = VARIANT_NAME
     info.morphology = u"null"
     info.speller = u"AllOk"
     info.suggestion = u"null"
     self.dataDir = TestDataDir()
     self.dataDir.createMorphology(VARIANT_NAME, info)
     self.voikko = libvoikko.Voikko("fi-x-" + VARIANT_NAME,
                                    path=self.dataDir.getDirectory())
Code example #5
def get_baseword_frequencies(topic):
    voikko = libvoikko.Voikko('fi')
    orig_words = topic.split()
    baseform_words = []
    for orig_word in orig_words:
        word = orig_word.strip('-')
        if word:
            baseform_words += get_baseform_word(voikko, word)

    baseform_words = remove_common_words(baseform_words)
    baseform_words = remove_numbers(baseform_words)
    return count_frequencies(baseform_words)
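The helpers get_baseform_word, remove_common_words, remove_numbers, and count_frequencies are not shown above. Purely as an assumption, get_baseform_word could be built on Voikko.analyze() roughly like this:

def get_baseform_word(voikko, word):
    # Return the base form for a single word as a list, falling back to
    # the surface form when Voikko cannot analyze it.
    analysis = voikko.analyze(word)
    if analysis and "BASEFORM" in analysis[0]:
        return [analysis[0]["BASEFORM"]]
    return [word]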
Code example #6
def voikko_lemmatizer(words):
    lemmatizer = libvoikko.Voikko("fi")
    root_words = []

    for word in words:
        v_dict = lemmatizer.analyze(word.strip())
        try:
            root_words.append(v_dict[0]["BASEFORM"])
        except (IndexError, KeyError):
            # analyze() returned no readings or the reading lacks a BASEFORM
            pass

    return root_words
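A usage sketch for the lemmatizer above; the exact output depends on the installed Voikko dictionary:

print(voikko_lemmatizer(["kissoja", "juoksevat", "taloissa"]))
# roughly ['kissa', 'juosta', 'talo']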
Code example #7
def stemming_message_voikko(message):
    import libvoikko
    voikko = libvoikko.Voikko('fi')

    stemmed_message = []
    baseform_to_original = {}  # maps each base form back to its original surface form
    for token in voikko.tokens(message):
        if token.tokenType == token.WORD:
            res = voikko.analyze(token.tokenText)
            if len(res) > 0:
                stemmed_message += [res[0]['BASEFORM']]
                baseform_to_original[res[0]['BASEFORM']] = token.tokenText
            else:
                stemmed_message += [token.tokenText]
    return ' '.join(stemmed_message), baseform_to_original
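A short usage sketch; the second return value lets the caller map base forms back to the original tokens (outputs are illustrative):

stemmed, mapping = stemming_message_voikko("Kissat juoksivat pihalla")
print(stemmed)  # e.g. "kissa juosta piha"
print(mapping)  # e.g. {"kissa": "Kissat", "juosta": "juoksivat", "piha": "pihalla"}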
Code example #8
def run_spell_checker():
    db = MysqlDB()

    garbwords = db.get_all_words()
    if not garbwords:
        print('\tError: No garbage words found in database')
        return False

    # initialize voikko
    voikko = libvoikko.Voikko(u'fi')

    # remove word from database if it passes spell checker
    for json in garbwords:
        word = str(json.get('word'))
        #        print(word)
        if voikko.spell(word):
            print(f'Removing {word} because it passed the spell checker')
            db.words_remove_garbword(word)
            db.commit()
Code example #9
File: NorthSamiTest.py Project: xyuebai/corevoikko
 def setUp(self):
     self.voikko = libvoikko.Voikko("se")
Code example #10
 def __init__(self):
     self.v = libvoikko.Voikko('fi')
Code example #11
SWE_STOPS.update(['swe', 'svenska', 'dag', 'buu', 'klubben', 'fråga', 'veckans', 'jag'])

EN_STOPS = set(stopwords.words('english'))
EN_STOPS.update(['of', 'and', 'you', 'for', 'what', 'have', 'can'])
DEL_FROM_ENSTOPS = ['on', 'as', 'a', 'd', 'm', 'o', 's', 't', 'me', 'no', 'y']
for word in DEL_FROM_ENSTOPS:
    EN_STOPS.remove(word)

OTHER_STOPS = set([
    'mailto', 'subject', 'from', 'to', 'vs', 'message', 'original', 'date',
    're', 'terv', 'sent', 'kello', 'fin', 'swe', 'uutisikkuna'
])

FINAL_STOPS = FIN_STOPS | OTHER_STOPS

voikko = libvoikko.Voikko('fi')
voikko.setIgnoreDot(True)


def _fin_lemmatize_word(string):
    voikkofied = voikko.analyze(string)
    if len(voikkofied) > 0 and voikkofied[0].get('BASEFORM') is not None:
        return voikkofied[0]['BASEFORM']
    else:
        return string


def _finnish_detector(text):
    token_set = set(text.split())
    n_fi = len(token_set.intersection(FIN_STOPS))
    n_swe = len(token_set.intersection(SWE_STOPS))
Code example #12
    def pre_processing(self):

        initial_df = self.dataframe
        # print(initial_df)
        # print("789456123")
        # initial_df = str(initial_df)
        initial_df['Index'] = np.arange(1, len(initial_df) + 1)
        initial_df = initial_df[['Index', 'documents']]
        initial_df['documents'] = initial_df['documents'].astype(str)
        new_df = pd.DataFrame(initial_df, index=initial_df.Index).stack()
        # new_df = pd.DataFrame(initial_df.documents.str.split('[.?!,]').tolist(), index=initial_df.Index).stack()
        new_df = new_df.reset_index([0, 'Index'])
        new_df.columns = ['Index', 'documents']
        new_df['documents'] = new_df['documents'].str.replace('[œ,Œ]', '-')
        new_df['documents'] = new_df['documents'].str.replace('ƒ⁄fifi⁄', '')
        new_df['documents'] = new_df['documents'].str.replace('*', '')
        new_df['documents'] = new_df['documents'].str.lstrip()

        # # Remove empty row
        new_df['documents'].replace('', np.nan, inplace=True)
        new_df.dropna(subset=['documents'], inplace=True)
        # new_df.to_excel('checking.xlsx')
        # Capitalize the first letter
        new_df['documents'] = new_df['documents'].map(
            lambda x: x[0].upper() + x[1:])
        # new_df.to_excel('checking_upper.xlsx')
        # Converting into lower case
        # new_df['documents1'] = new_df.documents.map(lambda x: x.lower())
        new_df['documents1'] = new_df['documents'].str.replace(
            '[-,:,/,(,),",;,>,<,?,_,\n,❤,\t,??,ӻ,كw,큞,ԃ,ˮ,ĭ,fifi,fl,•,*,.,!]', '')
        # new_df['documents1'] = new_df['documents1'].str.replace('[^\w]', '')
        # new_df['documents1'] = new_df['documents1'].str.replace('[^\s]', ' ')
        new_df['documents1'] = new_df['documents1'].str.lstrip()
        # remove empty strings
        new_df['new_col'] = new_df['documents1'].astype(str).str[0]
        # new_df['documents1'] = new_df['documents1'].str.replace('[^\w]', '')
        # new_df['documents1'] = new_df['documents1'].str.replace('[^\s]', ' ')
        nan_value = float("NaN")
        # Convert empty strings to NaN so they can be dropped
        new_df.replace("", nan_value, inplace=True)
        new_df.dropna(subset=["new_col"], inplace=True)
        new_df.drop('new_col', inplace=True, axis=1)
        # Convert the articles into tokens

        new_df['document_tokens'] = new_df.documents.map(
            lambda x: RegexpTokenizer(r'\w+').tokenize(x))

        # Apply Lemmatization (Voikko)
        # os.add_dll_directory(r'C:\Voikko')
        C = libvoikko.Voikko(u"fi")

        # C.setLibrarySearchPath("C:\Voikko")

        # Apply lemmatizations to the words
        def lemmatize_text(text):
            bf_list = []
            for w in text:
                voikko_dict = C.analyze(w)
                if voikko_dict:
                    bf_word = voikko_dict[0]['BASEFORM']
                else:
                    bf_word = w
                bf_list.append(bf_word)
            return bf_list

        new_df['lemmatized'] = new_df.document_tokens.apply(lemmatize_text)
        # new_df['documents'] = new_df['documents'].map(lambda x: [t for t in x if t not in self.stop_words])
        # stop_en = stopwords.words('finnish')
        new_df['article'] = new_df.lemmatized.map(
            lambda x: [t for t in x if t not in self.stop_words])
        # make sure the datatype of column 'article_removed_stop_words' is string
        new_df['article'] = new_df['article'].astype(str)
        new_df['article'] = new_df['article'].apply(eval).apply(' '.join)
        new_df['Index'] = np.arange(1, len(new_df['article']) + 1)
        new_df.to_excel('../static/assets/text_preprocessing.xlsx')
        return new_df
Code example #13
def normalise(filename, lemmatize=True):
    """
    Normalise a corpus from /data/corpora/
    """
    import libvoikko
    #Define a Voikko class for Finnish
    analyzer = libvoikko.Voikko(u"fi")

    #Open the text file
    print("Reading the input text file...")
    with open(os.path.join('data', 'corpora', filename), 'r',
              encoding='utf-8') as f:
        text = f.read()

    #Print text
    #print("TEXT BEFORE NORMALISATION")
    #print(text)

    #Remove numbers
    #text = ''.join(c for c in text if not c.isdigit())

    #Tokenize & remove punctuation and special characters
    #print("Tokenizing & removing punctuation and special characters...")
    #tokenizer = RegexpTokenizer(r'\w+','.')
    #text = tokenizer.tokenize(text)

    #Tokenize
    print("Tokenizing...")
    text = word_tokenize(text)

    #Join dots with ordinal numbers
    print("Merging ordinal numbers and dots...")
    for idx, word in enumerate(text):
        if (idx + 2 < len(text) and word.isdigit() and text[idx + 1] == '.'
                and text[idx + 2][0].islower()):
            text[idx:idx + 2] = [''.join(text[idx:idx + 2])]

    #Lemmatize tokens if lemmatize=True
    text_length = len(text)
    pbar = tqdm(total=text_length,
                ascii=True,
                desc='Lemmatizing...',
                position=0,
                unit='keys',
                unit_scale=True)
    for idx, word in enumerate(text):

        #Lemmatize the word. analyze() function returns
        #various info for the word
        if lemmatize:

            #Check if word is found from dictionary
            analyzed = analyzer.analyze(word)
            if analyzed:

                #Check if word starts with lowercase
                if word[0].islower():

                    #Check if there are more than 1 possible lemmas in the vocabulary
                    if len(analyzed) > 1:
                        #Exclude classes paikannimi, sukunimi, etunimi, nimi
                        analyzed_mod = [
                            element for element in analyzed
                            if 'paikannimi' not in element.values()
                            and 'sukunimi' not in element.values()
                            and 'etunimi' not in element.values()
                            and 'nimi' not in element.values()
                        ]

                        #Avoid an error if it turns out to be empty list after
                        #excluding these classes
                        if len(analyzed_mod) > 0:
                            text[idx] = analyzed_mod[0]['BASEFORM'].lower()
                        else:
                            text[idx] = analyzed[0]['BASEFORM'].lower()

                    #Pick the lowercased lemma directly if there is only one lemma
                    #for the query word
                    else:
                        text[idx] = analyzed[0]['BASEFORM'].lower()

                #The word is capitalized => proper noun or/and the first word of a
                #sentence. Pick the lemma from the vocabulary.
                else:
                    text[idx] = analyzed[0]['BASEFORM']

        #If lemmatization is not needed, check only the capitalized words
        #and lowercase, if needed
        else:
            if word[0].isupper():
                analyzed = analyzer.analyze(word)

                #Lowercase the word if its lemma is
                #lowercased (if the lemma is not a proper noun)
                if analyzed and analyzed[0]['BASEFORM'][0].islower():
                    text[idx] = text[idx].lower()

        pbar.update(1)

    #Print normalized text
    #print("TEXT AFTER NORMALISATION")
    #print(' '.join(text))

    #Write tokenized text to a text file and save it in /data/corpora/
    if lemmatize:
        filename_normalized = filename[:-4] + '_normalized.txt'
    else:
        filename_normalized = filename[:-4] + '_normalized_NON-lemmatized.txt'
    print("\nWriting the normalized text to a txt file...")
    with open(filename_normalized, 'w', encoding='utf-8') as f:

        #Write the whole text in one line
        #f.write(' '.join(text))

        #Write one sentence per line
        for sentence in ' '.join(text).split(' .'):
            #Write only if sentence consists of more than one word
            if len(word_tokenize(sentence)) > 1:
                f.write(sentence)
                f.write(' .\n')
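Typical invocations of the function above; the corpus file name is illustrative and must exist under data/corpora/:

normalise("yle_corpus.txt")                   # writes yle_corpus_normalized.txt
normalise("yle_corpus.txt", lemmatize=False)  # writes yle_corpus_normalized_NON-lemmatized.txt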
Code example #14
# This is an example application

import libvoikko
import numpy as np
import pandas as pd
import string
import time
import xml.etree.ElementTree as ET
from collections import defaultdict

STOP_WORDS = set([word.strip() for word in open("stop_words.txt")])
SISALTO_FIELD_PREFIX = "sisalto_level_"
voikko = libvoikko.Voikko("fi")


def parse_text_from_xml(XML):
    LEVEL = "*/"
    root = ET.fromstring(XML)
    sisalto_dict = defaultdict(str)

    # Define the absolute XML paths we are interested in
    paths = {
        "tyyppi":
        "{http://www.eduskunta.fi/skeemat/siirto/2011/09/07}SiirtoMetatieto/{http://www.eduskunta.fi/skeemat/julkaisusiirtokooste/2011/12/20}JulkaisuMetatieto/*/{http://www.vn.fi/skeemat/metatietoelementit/2010/04/27}AsiakirjatyyppiNimi",
        "nimike":
        "{http://www.eduskunta.fi/skeemat/siirto/2011/09/07}SiirtoMetatieto/{http://www.eduskunta.fi/skeemat/julkaisusiirtokooste/2011/12/20}JulkaisuMetatieto/*/{http://www.vn.fi/skeemat/metatietokooste/2010/04/27}Nimeke/{http://www.vn.fi/skeemat/metatietoelementit/2010/04/27}NimekeTeksti",
    }

    # Get the values for absolutely defined paths
    for item, value in paths.items():
        field = root.find(value)
Code example #15
    elif re.compile('-[a-zäöåA-ZÄÖÅ]').match(word):
        return 0
    elif re.compile('\'[a-zäöåA-ZÄÖÅ]').match(word):
        return 0
    elif re.compile('[a-zäåöA-ZÄÖÅ]+-Aalto').match(word):
        return 0
    elif re.compile('[a-zäåöA-ZÄÖÅ]+-Ilta').match(word):
        return 0
    elif re.compile('[a-zäåöA-ZÄÖÅ]+-Arvonen').match(word):
        return 0
    else:
        return 1


# Define a Voikko class for Finnish
v = libvoikko.Voikko(u"fi")

words = dict()
words["asemosana"] = []
words["etunimi"] = []
words["huudahdussana"] = []
words["kieltosana"] = []
words["laatusana"] = []
words["lukusana"] = []
words["lyhenne"] = []
words["nimi"] = []
words["nimisana"] = []
words["nimisana_laatusana"] = []
words["paikannimi"] = []
words["seikkasana"] = []
words["sidesana"] = []
Code example #16
 def setUp(self):
     self.voikko = libvoikko.Voikko("fi-x-apertium")
Code example #17
File: schema.py Project: kausaltech/kausal-watch
from actions.models import (Plan, Action, ActionSchedule, ActionStatus,
                            Category, CategoryType, ActionTask, ActionImpact,
                            ActionResponsibleParty, ActionContactPerson,
                            ActionStatusUpdate, ImpactGroup, ImpactGroupAction,
                            MonitoringQualityPoint, Scenario)
from indicators.models import (Indicator, RelatedIndicator, ActionIndicator,
                               IndicatorGraph, IndicatorLevel, IndicatorValue,
                               IndicatorGoal, Unit, Quantity)
from content.models import (StaticPage, BlogPost, Question, SiteGeneralContent)
from people.models import Person
from django_orghierarchy.models import Organization, OrganizationClass

LOCAL_TZ = pytz.timezone('Europe/Helsinki')

try:
    voikko_fi = libvoikko.Voikko(language='fi')
    voikko_fi.setNoUglyHyphenation(True)
    voikko_fi.setMinHyphenatedWordLength(16)
except OSError:
    voikko_fi = None

_hyphenation_cache = {}


def hyphenate(s):
    if voikko_fi is None:
        return s

    tokens = voikko_fi.tokens(s)
    out = ''
    for t in tokens:
Code example #18
    def load_dictionary(self):
        '''Load a hunspell dictionary and instantiate an
        enchant.Dict() or a hunspell.Hunspell() object.

        '''
        if DEBUG_LEVEL > 0:
            LOGGER.debug('load_dictionary() ...\n')
        (self.dic_path, self.encoding,
         self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
        if self.words:
            # List of languages where accent insensitive matching makes sense:
            accent_languages = (
                'af',
                'ast',
                'az',
                'be',
                'bg',
                'br',
                'bs',
                'ca',
                'cs',
                'csb',
                'cv',
                'cy',
                'da',
                'de',
                'dsb',
                'el',
                'en',
                'es',
                'eu',
                'fi',
                'fo',
                'fr',
                'fur',
                'fy',
                'ga',
                'gd',
                'gl',
                'grc',
                'gv',
                'haw',
                'hr',
                'hsb',
                'ht',
                'hu',
                'ia',
                'is',
                'it',
                'kk',
                'ku',
                'ky',
                'lb',
                'ln',
                'lv',
                'mg',
                'mi',
                'mk',
                'mn',
                'mos',
                'mt',
                'nb',
                'nds',
                'nl',
                'nn',
                'nr',
                'nso',
                'ny',
                'oc',
                'pl',
                'plt',
                'pt',
                'qu',
                'quh',
                'ru',
                'sc',
                'se',
                'sh',
                'shs',
                'sk',
                'sl',
                'smj',
                'sq',
                'sr',
                'ss',
                'st',
                'sv',
                'tet',
                'tk',
                'tn',
                'ts',
                'uk',
                'uz',
                've',
                'vi',
                'wa',
                'xh',
            )
            if self.name.split('_')[0] in accent_languages:
                self.word_pairs = [(x, itb_util.remove_accents(x))
                                   for x in self.words]
            for word in self.words:
                if len(word) > self.max_word_len:
                    self.max_word_len = len(word)
            if DEBUG_LEVEL > 1:
                LOGGER.debug('max_word_len = %s\n', self.max_word_len)
            if self.name.split('_')[0] == 'fi':
                self.enchant_dict = None
                self.pyhunspell_object = None
                if IMPORT_LIBVOIKKO_SUCCESSFUL:
                    self.voikko = libvoikko.Voikko('fi')
                return
            if IMPORT_ENCHANT_SUCCESSFUL:
                try:
                    self.enchant_dict = enchant.Dict(self.name)
                except enchant.errors.DictNotFoundError:
                    LOGGER.exception('Error initializing enchant for %s',
                                     self.name)
                    self.enchant_dict = None
                except Exception:
                    LOGGER.exception(
                        'Unknown error initializing enchant for %s', self.name)
                    self.enchant_dict = None
            elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
                aff_path = self.dic_path.replace('.dic', '.aff')
                try:
                    self.pyhunspell_object = hunspell.HunSpell(
                        self.dic_path, aff_path)
                except hunspell.HunSpellError:
                    LOGGER.debug('Error initializing hunspell for %s',
                                 self.name)
                    self.pyhunspell_object = None
                except Exception:
                    LOGGER.debug('Unknown error initializing hunspell for %s',
                                 self.name)
                    self.pyhunspell_object = None
Code example #19
 def setUp(self):
     self.voikko = libvoikko.Voikko("mhr")