def voikko_simplify(text, language):
    """Reduces the given text to its base forms using libvoikko.

    This allows the search engine to ignore spelling variation and match
    more things at the cost of exactness.
    """
    if language not in voikot:
        try:
            voikot[language] = libvoikko.Voikko(language)
        except (libvoikko.VoikkoException, OSError):
            # either voikko isn't installed or voikko doesn't have the
            # language we request. never try this language again
            voikot[language] = None
            return text
    v = voikot[language]
    if v is None:
        return text
    new_words = []
    for word in text.split():
        analysis = v.analyze(word)
        if analysis:
            analysis = analysis[0]
            word_class = analysis.get("CLASS", "nimisana")
            if word_class not in ACCEPTABLE_WORD_CLASSES:
                # filter uninteresting word types outright
                continue
            if "BASEFORM" in analysis:
                new_words.append(analysis["BASEFORM"])
    return u" ".join(new_words)
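
# A minimal usage sketch for voikko_simplify above. The module-level names
# `voikot` and `ACCEPTABLE_WORD_CLASSES` are referenced by the function but
# not defined in the snippet; the definitions below are illustrative
# assumptions, not taken from the original module.
import libvoikko

voikot = {}  # per-language cache of Voikko objects (None marks a failed language)
ACCEPTABLE_WORD_CLASSES = {"nimisana", "laatusana", "teonsana"}  # e.g. nouns, adjectives, verbs

if __name__ == "__main__":
    # With a Finnish Voikko dictionary installed this prints the base forms,
    # e.g. "kissa juosta" for the input below; otherwise the text is returned unchanged.
    print(voikko_simplify("kissat juoksivat", "fi"))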
def analyzeWords(words):
    v = libvoikko.Voikko(u"fi")
    errors = []
    for word in words:
        if not v.spell(word["value"]):
            word["suggestions"] = v.suggest(word["value"])
            errors.append(word)
    v.terminate()
    return errors
def init_libvoikko_error() -> str:
    if IMPORT_LIBVOIKKO_SUCCESSFUL:
        try:
            voikko = libvoikko.Voikko('fi')
            if voikko:
                return ''
            return 'Initialization of Voikko failed: object empty'
        except (libvoikko.VoikkoException, ) as error:
            return str(error)
    return 'import libvoikko failed.'
def setUp(self):
    info = MorphologyInfo()
    info.variant = VARIANT_NAME
    info.morphology = u"null"
    info.speller = u"AllOk"
    info.suggestion = u"null"
    self.dataDir = TestDataDir()
    self.dataDir.createMorphology(VARIANT_NAME, info)
    self.voikko = libvoikko.Voikko("fi-x-" + VARIANT_NAME,
                                   path=self.dataDir.getDirectory())
def get_baseword_frequencies(topic):
    voikko = libvoikko.Voikko('fi')
    orig_words = topic.split()
    baseform_words = []
    for orig_word in orig_words:
        word = orig_word.strip('-')
        if word:
            baseform_words += get_baseform_word(voikko, word)
    baseform_words = remove_common_words(baseform_words)
    baseform_words = remove_numbers(baseform_words)
    return count_frequencies(baseform_words)
def voikko_lemmatizer(words):
    lemmatizer = libvoikko.Voikko("fi")
    root_words = []
    for word in words:
        v_dict = lemmatizer.analyze(word.strip())
        try:
            root_words.append(v_dict[0]["BASEFORM"])
        except (IndexError, KeyError):
            # no analysis or no base form available; skip the word
            pass
    return root_words
def stemming_message_voikko(message):
    import libvoikko
    voikko = libvoikko.Voikko('fi')
    stemmed_message = []
    # map each base form back to the original surface form
    baseform_to_original = {}
    for token in voikko.tokens(message):
        if token.tokenType == token.WORD:
            res = voikko.analyze(token.tokenText)
            if len(res) > 0:
                stemmed_message += [res[0]['BASEFORM']]
                baseform_to_original[res[0]['BASEFORM']] = token.tokenText
            else:
                stemmed_message += [token.tokenText]
    return ' '.join(stemmed_message), baseform_to_original
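
# A minimal usage sketch for stemming_message_voikko above; the example
# sentence is arbitrary and the exact base forms depend on the installed
# Voikko dictionary.
if __name__ == "__main__":
    stemmed, mapping = stemming_message_voikko("Kissat juoksivat pihalla")
    print(stemmed)  # e.g. "kissa juosta piha"
    print(mapping)  # e.g. {'kissa': 'Kissat', 'juosta': 'juoksivat', 'piha': 'pihalla'}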
def run_spell_checker():
    db = MysqlDB()
    garbwords = db.get_all_words()
    if not garbwords:
        print('\tError: No garbage words found in database')
        return False
    # initialize voikko
    voikko = libvoikko.Voikko(u'fi')
    # remove word from database if it passes spell checker
    for json in garbwords:
        word = str(json.get('word'))
        # print(word)
        if voikko.spell(word):
            print(f'Removing {word} because it passed the spell checker')
            db.words_remove_garbword(word)
    db.commit()
def setUp(self):
    self.voikko = libvoikko.Voikko("se")
def __init__(self):
    self.v = libvoikko.Voikko('fi')
SWE_STOPS.update(['swe', 'svenska', 'dag', 'buu', 'klubben', 'fråga', 'veckans', 'jag'])

EN_STOPS = set(stopwords.words('english'))
EN_STOPS.update(['of', 'and', 'you', 'for', 'what', 'have', 'can'])

DEL_FROM_ENSTOPS = ['on', 'as', 'a', 'd', 'm', 'o', 's', 't', 'me', 'no', 'y']
for word in DEL_FROM_ENSTOPS:
    EN_STOPS.remove(word)

OTHER_STOPS = set([
    'mailto', 'subject', 'from', 'to', 'vs', 'message', 'original', 'date',
    're', 'terv', 'sent', 'kello', 'fin', 'swe', 'uutisikkuna'
])

FINAL_STOPS = FIN_STOPS | OTHER_STOPS

voikko = libvoikko.Voikko('fi')
voikko.setIgnoreDot(True)


def _fin_lemmatize_word(string):
    voikkofied = voikko.analyze(string)
    if len(voikkofied) > 0 and voikkofied[0].get('BASEFORM') is not None:
        return voikkofied[0]['BASEFORM']
    else:
        return string


def _finnish_detector(text):
    token_set = set(text.split())
    n_fi = len(token_set.intersection(FIN_STOPS))
    n_swe = len(token_set.intersection(SWE_STOPS))
def pre_processing(self):
    initial_df = self.dataframe
    # print(initial_df)
    # initial_df = str(initial_df)
    initial_df['Index'] = np.arange(1, len(initial_df) + 1)
    initial_df = initial_df[['Index', 'documents']]
    initial_df['documents'] = initial_df['documents'].astype(str)
    new_df = pd.DataFrame(initial_df, index=initial_df.Index).stack()
    # new_df = pd.DataFrame(initial_df.documents.str.split('[.?!,]').tolist(), index=initial_df.Index).stack()
    new_df = new_df.reset_index([0, 'Index'])
    new_df.columns = ['Index', 'documents']
    new_df['documents'] = new_df['documents'].str.replace('[œ,Œ]', '-')
    new_df['documents'] = new_df['documents'].str.replace('ƒ⁄fifi⁄', '')
    new_df['documents'] = new_df['documents'].str.replace('*', '')
    new_df['documents'] = new_df['documents'].str.lstrip()

    # Remove empty rows
    new_df['documents'].replace('', np.nan, inplace=True)
    new_df.dropna(subset=['documents'], inplace=True)
    # new_df.to_excel('checking.xlsx')

    # Capitalize the first letter
    new_df['documents'] = new_df['documents'].map(lambda x: x[0].upper() + x[1:])
    # new_df.to_excel('checking_upper.xlsx')

    # Remove punctuation and special characters
    # new_df['documents1'] = new_df.documents.map(lambda x: x.lower())
    new_df['documents1'] = new_df['documents'].str.replace(
        '[-,:,/,(,),",;,>,<,?,_,\n,❤,\t,??,ӻ,كw,큞,ԃ,ˮ,ĭ,fifi,fl,•,*,.,!]', '')
    # new_df['documents1'] = new_df['documents1'].str.replace('[^\w]', '')
    # new_df['documents1'] = new_df['documents1'].str.replace('[^\s]', ' ')
    new_df['documents1'] = new_df['documents1'].str.lstrip()

    # Remove empty strings: convert them to NaN and drop those rows
    new_df['new_col'] = new_df['documents1'].astype(str).str[0]
    nan_value = float("NaN")
    new_df.replace("", nan_value, inplace=True)
    new_df.dropna(subset=["new_col"], inplace=True)
    new_df.drop('new_col', inplace=True, axis=1)

    # Convert articles into tokens
    new_df['docuemnt_tokens'] = new_df.documents.map(
        lambda x: RegexpTokenizer(r'\w+').tokenize(x))

    # Apply lemmatization (Voikko)
    # os.add_dll_directory(r'C:\Voikko')
    C = libvoikko.Voikko(u"fi")
    # C.setLibrarySearchPath("C:\Voikko")

    # Apply lemmatization to the words
    def lemmatize_text(text):
        bf_list = []
        for w in text:
            voikko_dict = C.analyze(w)
            if voikko_dict:
                bf_word = voikko_dict[0]['BASEFORM']
            else:
                bf_word = w
            bf_list.append(bf_word)
        return bf_list

    new_df['lemmatized'] = new_df.docuemnt_tokens.apply(lemmatize_text)

    # Remove stop words
    # stop_en = stopwords.words('finnish')
    new_df['article'] = new_df.lemmatized.map(
        lambda x: [t for t in x if t not in self.stop_words])

    # Make sure the datatype of column 'article' is string
    new_df['article'] = new_df['article'].astype(str)
    new_df['article'] = new_df['article'].apply(eval).apply(' '.join)
    new_df['Index'] = np.arange(1, len(new_df['article']) + 1)
    new_df.to_excel('../static/assets/text_preprocessing.xlsx')
    return new_df
def normalise(filename, lemmatize=True):
    """ Normalise a corpus from /data/corpora/ """
    import libvoikko

    # Define a Voikko class for Finnish
    analyzer = libvoikko.Voikko(u"fi")

    # Open the text file
    print("Reading the input text file...")
    with open(os.path.join('data', 'corpora', filename), 'r', encoding='utf-8') as f:
        text = f.read()

    # Print text
    # print("TEXT BEFORE NORMALISATION")
    # print(text)

    # Remove numbers
    # text = ''.join(c for c in text if not c.isdigit())

    # Tokenize & remove punctuation and special characters
    # print("Tokenizing & removing punctuation and special characters...")
    # tokenizer = RegexpTokenizer(r'\w+','.')
    # text = tokenizer.tokenize(text)

    # Tokenize
    print("Tokenizing...")
    text = word_tokenize(text)

    # Join dots with ordinal numbers
    print("Merging ordinal numbers and dots...")
    for idx, word in enumerate(text):
        if word.isdigit() and text[idx + 1] == '.' and text[idx + 2][0].islower():
            text[idx:idx + 2] = [''.join(text[idx:idx + 2])]

    # Lemmatize tokens if lemmatize=True
    text_length = len(text)
    pbar = tqdm(total=text_length, ascii=True, desc='Lemmatizing...',
                position=0, unit='keys', unit_scale=True)
    for idx, word in enumerate(text):
        # Lemmatize the word. analyze() returns various info for the word
        if lemmatize:
            # Check if the word is found in the dictionary
            analyzed = analyzer.analyze(word)
            if analyzed:
                # Check if the word starts with lowercase
                if word[0].islower():
                    # Check if there is more than one possible lemma in the vocabulary
                    if len(analyzed) > 1:
                        # Exclude the classes paikannimi, sukunimi, etunimi, nimi
                        analyzed_mod = [
                            element for element in analyzed
                            if 'paikannimi' not in element.values()
                            and 'sukunimi' not in element.values()
                            and 'etunimi' not in element.values()
                            and 'nimi' not in element.values()
                        ]
                        # Avoid an error if the list turns out to be empty after
                        # excluding these classes
                        if len(analyzed_mod) > 0:
                            text[idx] = analyzed_mod[0]['BASEFORM'].lower()
                        else:
                            text[idx] = analyzed[0]['BASEFORM'].lower()
                    # Pick the lowercased lemma directly if there is only one
                    # lemma for the query word
                    else:
                        text[idx] = analyzed[0]['BASEFORM'].lower()
                # The word is capitalized => proper noun and/or the first word
                # of a sentence. Pick the lemma from the vocabulary.
                else:
                    text[idx] = analyzed[0]['BASEFORM']
        # If lemmatization is not needed, check only the capitalized words
        # and lowercase them if needed
        else:
            if word[0].isupper():
                analyzed = analyzer.analyze(word)
                # Lowercase the word if its lemma is lowercased
                # (i.e. the lemma is not a proper noun)
                if analyzed and analyzed[0]['BASEFORM'][0].islower():
                    text[idx] = text[idx].lower()
        pbar.update(1)

    # Print normalized text
    # print("TEXT AFTER NORMALISATION")
    # print(' '.join(text))

    # Write the tokenized text to a text file and save it in /data/corpora/
    if lemmatize:
        filename_normalized = filename[:-4] + '_normalized.txt'
    else:
        filename_normalized = filename[:-4] + '_normalized_NON-lemmatized.txt'
    print("\nWriting the normalized text to a txt file...")
    with open(filename_normalized, 'w', encoding='utf-8') as f:
        # Write the whole text in one line
        # f.write(' '.join(text))
        # Write one sentence per line
        for sentence in ' '.join(text).split(' .'):
            # Write only if the sentence consists of more than one word
            if len(word_tokenize(sentence)) > 1:
                f.write(sentence)
                f.write(' .\n')
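
# Hypothetical invocation of normalise above: the corpus file name is an
# assumption and must exist under data/corpora/ for the call to succeed.
if __name__ == "__main__":
    normalise('finnish_corpus.txt', lemmatize=True)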
# This is an example application
import libvoikko
import numpy as np
import pandas as pd
import string
import time
import xml.etree.ElementTree as ET
from collections import defaultdict

STOP_WORDS = set([word.strip() for word in open("stop_words.txt")])
SISALTO_FIELD_PREFIX = "sisalto_level_"
voikko = libvoikko.Voikko("fi")


def parse_text_from_xml(XML):
    LEVEL = "*/"
    root = ET.fromstring(XML)
    sisalto_dict = defaultdict(str)

    # Define the absolute paths we are interested in
    paths = {
        "tyyppi": "{http://www.eduskunta.fi/skeemat/siirto/2011/09/07}SiirtoMetatieto/{http://www.eduskunta.fi/skeemat/julkaisusiirtokooste/2011/12/20}JulkaisuMetatieto/*/{http://www.vn.fi/skeemat/metatietoelementit/2010/04/27}AsiakirjatyyppiNimi",
        "nimike": "{http://www.eduskunta.fi/skeemat/siirto/2011/09/07}SiirtoMetatieto/{http://www.eduskunta.fi/skeemat/julkaisusiirtokooste/2011/12/20}JulkaisuMetatieto/*/{http://www.vn.fi/skeemat/metatietokooste/2010/04/27}Nimeke/{http://www.vn.fi/skeemat/metatietoelementit/2010/04/27}NimekeTeksti",
    }

    # Get the values for the absolutely defined paths
    for item, value in paths.items():
        field = root.find(value)
elif re.compile('-[a-zäöåA-ZÄÖÅ]').match(word):
    return 0
elif re.compile('\'[a-zäöåA-ZÄÖÅ]').match(word):
    return 0
elif re.compile('[a-zäåöA-ZÄÖÅ]+-Aalto').match(word):
    return 0
elif re.compile('[a-zäåöA-ZÄÖÅ]+-Ilta').match(word):
    return 0
elif re.compile('[a-zäåöA-ZÄÖÅ]+-Arvonen').match(word):
    return 0
else:
    return 1


# Define a Voikko class for Finnish
v = libvoikko.Voikko(u"fi")

words = dict()
words["asemosana"] = []
words["etunimi"] = []
words["huudahdussana"] = []
words["kieltosana"] = []
words["laatusana"] = []
words["lukusana"] = []
words["lyhenne"] = []
words["nimi"] = []
words["nimisana"] = []
words["nimisana_laatusana"] = []
words["paikannimi"] = []
words["seikkasana"] = []
words["sidesana"] = []
def setUp(self):
    self.voikko = libvoikko.Voikko("fi-x-apertium")
from actions.models import (Plan, Action, ActionSchedule, ActionStatus, Category, CategoryType,
                            ActionTask, ActionImpact, ActionResponsibleParty, ActionContactPerson,
                            ActionStatusUpdate, ImpactGroup, ImpactGroupAction,
                            MonitoringQualityPoint, Scenario)
from indicators.models import (Indicator, RelatedIndicator, ActionIndicator, IndicatorGraph,
                               IndicatorLevel, IndicatorValue, IndicatorGoal, Unit, Quantity)
from content.models import (StaticPage, BlogPost, Question, SiteGeneralContent)
from people.models import Person
from django_orghierarchy.models import Organization, OrganizationClass

LOCAL_TZ = pytz.timezone('Europe/Helsinki')

try:
    voikko_fi = libvoikko.Voikko(language='fi')
    voikko_fi.setNoUglyHyphenation(True)
    voikko_fi.setMinHyphenatedWordLength(16)
except OSError:
    voikko_fi = None

_hyphenation_cache = {}


def hyphenate(s):
    if voikko_fi is None:
        return s
    tokens = voikko_fi.tokens(s)
    out = ''
    for t in tokens:
def load_dictionary(self):
    '''Load a hunspell dictionary and instantiate an enchant.Dict()
    or a hunspell.Hunspell() object.
    '''
    if DEBUG_LEVEL > 0:
        LOGGER.debug('load_dictionary() ...\n')
    (self.dic_path,
     self.encoding,
     self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
    if self.words:
        # List of languages where accent insensitive matching makes sense:
        accent_languages = (
            'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
            'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fi',
            'fo', 'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw',
            'hr', 'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky',
            'lb', 'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb',
            'nds', 'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt',
            'qu', 'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj',
            'sq', 'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk',
            'uz', 've', 'vi', 'wa', 'xh',
        )
        if self.name.split('_')[0] in accent_languages:
            self.word_pairs = [
                (x, itb_util.remove_accents(x)) for x in self.words
            ]
        for word in self.words:
            if len(word) > self.max_word_len:
                self.max_word_len = len(word)
        if DEBUG_LEVEL > 1:
            LOGGER.debug('max_word_len = %s\n', self.max_word_len)
    if self.name.split('_')[0] == 'fi':
        self.enchant_dict = None
        self.pyhunspell_object = None
        if IMPORT_LIBVOIKKO_SUCCESSFUL:
            self.voikko = libvoikko.Voikko('fi')
        return
    if IMPORT_ENCHANT_SUCCESSFUL:
        try:
            self.enchant_dict = enchant.Dict(self.name)
        except enchant.errors.DictNotFoundError:
            LOGGER.exception('Error initializing enchant for %s', self.name)
            self.enchant_dict = None
        except Exception:
            LOGGER.exception('Unknown error initializing enchant for %s', self.name)
            self.enchant_dict = None
    elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
        aff_path = self.dic_path.replace('.dic', '.aff')
        try:
            self.pyhunspell_object = hunspell.HunSpell(self.dic_path, aff_path)
        except hunspell.HunSpellError:
            LOGGER.debug('Error initializing hunspell for %s', self.name)
            self.pyhunspell_object = None
        except Exception:
            LOGGER.debug('Unknown error initializing hunspell for %s', self.name)
            self.pyhunspell_object = None
def setUp(self):
    self.voikko = libvoikko.Voikko("mhr")