def __init__(self, args, lang):
    self.others = Strategy(args.others)  # valid values: 0, 1, 2 (done)
    self.emoji = Strategy(args.emoji)  # 0, 1 = emoji, 2 = (emoji), 3, 4, 5 = (translation)
    self.emoticon = Strategy(args.emoticon)  # 0, 1 = emoticon, 2 = (emoticon), 3, 4, 5 = (translation)
    self.url = Strategy(args.url)  # 0, 1, 2, 3
    self.hashtag = Strategy(args.hashtag)  # 0, 1 = #hashtag, 2, 3 = (#hashtag), 4, 5
    self.punctuation = Strategy(args.punctuation)  # valid values: 0, 3
    self.mention = Strategy(args.mention)  # 0, 1, 2, 3
    self.lower = args.lower  # True or False
    self.lang = lang  # EN or IT
    self.ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
    if self.lang == 'IT':
        self.lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
    else:
        self.lm = None
    self.text_processor = TextPreProcessor(
        remove=[
            'email',  # raw or normalize
            'percent',  # raw or normalize; EN: percentage, IT: percentuale
            'money',  # raw or normalize; EN: money, IT: soldi; check whether it also catches currencies
            'phone',  # raw or normalize; EN: phone, IT: telefono
            'time',  # raw or normalize; EN: time, IT: ore
            'date',  # raw or normalize; EN: date, IT: data
            'number'  # raw or normalize; EN: number, IT: numero
        ],
        annotate={},
        fix_html=True,
        unpack_hashtags=False,
        tokenizer=SocialTokenizer(lowercase=self.lower).tokenize,
        dicts=[emoticons])
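
A minimal usage sketch for the configuration above; the class name Preprocessor and the argparse values are assumptions, shown only to make the snippet self-contained:

from argparse import Namespace

# Hypothetical strategy values; Preprocessor stands in for the class that owns __init__.
args = Namespace(others=0, emoji=1, emoticon=1, url=0, hashtag=1,
                 punctuation=0, mention=0, lower=True)
preproc = Preprocessor(args, lang='IT')
# ekphrasis' TextPreProcessor returns a token list with the configured entities removed.
print(preproc.text_processor.pre_process_doc("Scrivimi a mario@example.com alle 10:30"))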
def fit(self, X, y=None):
    dtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Keep only correctly spelled tokens, then count word frequencies.
    df = pd.DataFrame({'words': [self.__keep_correctly_spelled(token, self.spell_) for token in X]})
    word_count = dict(Counter(" ".join(df['words']).split(" ")))
    word_count_df = pd.DataFrame.from_dict(word_count, orient='index').reset_index()
    word_count_df.columns = ['words', 'n_appearances']

    # Only keep actual words: at least 3 characters long, or a known stopword.
    word_count_df['wordlength'] = word_count_df['words'].str.len()
    word_count_df = word_count_df[(word_count_df['wordlength'] >= 3) |
                                  (word_count_df['words'].isin(self.stopwords_list_))]

    word_count_df = word_count_df.sort_values('n_appearances', ascending=False).reset_index(drop=True)
    word_count_df['words'] = word_count_df['words'].str.lower()

    # wordninja expects a gzipped text file with one word per line, ordered by frequency.
    lang_filepath = path.join(config_test['lang_path'], f'my_lang_{dtime}.txt.gz')
    word_count_df['words'].to_csv(lang_filepath,
                                  index=False,
                                  header=False,
                                  compression='gzip',
                                  encoding='utf-8')
    self.language_model_ = wordninja.LanguageModel(lang_filepath)

    return self
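
Once fitted, the learned language model can segment run-together strings; a hedged sketch, where SpellSegmenter is an assumed name for the class that owns fit:

seg = SpellSegmenter().fit(["the quick brown fox", "the lazy dog"])  # hypothetical class name
print(seg.language_model_.split('browndog'))  # output depends on the fitted word list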
Example #3
def slice_word(word):
    # Load the language model on every call; cache it if called repeatedly.
    wm = wordninja.LanguageModel('words.txt.gz')
    name_list = wm.split(word)

    # Drop single-character fragments left over from the split.
    y = [s for s in name_list if len(s) != 1]
    print(word, '-->', y)

    return y
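
For example (the result depends entirely on the frequency list in words.txt.gz):

parts = slice_word('icebergwater')  # might print: icebergwater --> ['iceberg', 'water']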
def __init__(self, path_dict_zip=path_dict_zip__):
    self.name_class = 'TOKENIZER'
    self.status_update = False
    if path_dict_zip is not None and self.__check_valid_path(path_dict_zip):
        self.path_dict_zip = path_dict_zip
        logging.debug("The module loaded the input dictionary successfully")
    else:
        self.path_dict_zip = path_dict_zip_default__  # note
        logging.debug(
            "No valid path_dict_zip was given, so the module will use the default splitter"
        )

    # print(os.path.join(os.path.abspath(__file__), self.path_dict_zip))
    self.spliter = wordninja.LanguageModel(self.path_dict_zip)
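
A hedged usage sketch, assuming this __init__ belongs to a class named Tokenizer:

tok = Tokenizer()  # uses path_dict_zip__ when valid, otherwise the default splitter
print(tok.spliter.split('helloworld'))  # e.g. ['hello', 'world'] with an English word list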
Example #5
File: test.py Project: yf1291/nlp3
def test_custom_model(self):
    lm = wordninja.LanguageModel('test_lang.txt.gz')
    self.assertEqual(list(lm.split('derek')), ['der', 'ek'])
def update_spliter(self, path_dict_zip):
    # Store the new dictionary path before rebuilding the splitter.
    self.path_dict_zip = path_dict_zip
    self.spliter = wordninja.LanguageModel(self.path_dict_zip)
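
wordninja model files are plain gzipped word lists, one word per line, most frequent first, so a custom dictionary for update_spliter (or the test_lang.txt.gz used in the test above) can be produced as follows; the file name and words are illustrative:

import gzip

words_by_frequency = ['der', 'ek']  # most frequent first
with gzip.open('test_lang.txt.gz', 'wt', encoding='utf-8') as f:
    f.write('\n'.join(words_by_frequency))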
Example #7
import re
# Matches a capital letter that follows a non-uppercase word character, or a
# capital that follows any non-space and precedes a lowercase letter; used to
# insert a space before capitals in run-together CamelCase text.
pattern_space_before_capital = re.compile(r'((?<=[^\W[A-Z])[A-Z]|(?<=\S)[A-Z](?=[a-z]))')

from importlib import resources


############################
#       Segmenter
###########################
import wordninja
with resources.path("src.resources", "italian_words.txt.gz") as italian_words_gz:
    segmenter = wordninja.LanguageModel(italian_words_gz)
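
Once loaded, the segmenter splits concatenated Italian text, for example:

# The exact split depends on the frequencies in italian_words.txt.gz.
print(segmenter.split('ciaocomestai'))  # e.g. ['ciao', 'come', 'stai']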

############################
#   unique_italian_words
###########################
with resources.path("src.resources", "parole_uniche.txt") as unique_italian_words_path:
    with open(unique_italian_words_path, 'r', encoding='utf8') as f:
        unique_italian_words = {word.rstrip().lower() for word in f if word.rstrip()}

# from ekphrasis.classes.preprocessor import TextPreProcessor
# from ekphrasis.classes.tokenizer import SocialTokenizer

from src.data.preprocessing.dicts.emoticons import emoticons
from src.data.preprocessing.dicts.wrong_word import wrong_word
from src.data.preprocessing.dicts.abbreviations import abbr_word, acronyms
# import spacy_udpipe
# spacy_udpipe.download("it-postwita")
# nlp = spacy_udpipe.load("it-postwita")

# social_tokenizer = lambda text : [  token.text for token in nlp(text)]
import pandas as pd
import argparse
import emoji
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import re
import os
import json
import wordninja
import numpy as np

# comment from Rosario

ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
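
In this pipeline the model is typically used to segment hashtag bodies into words; the hashtag below is illustrative:

# e.g. 'buongiornoatutti' -> ['buongiorno', 'a', 'tutti'], depending on the word list
print(lm.split('buongiornoatutti'))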


def add_pred_pos(row, model, task):
    # The same first prediction is returned for every task.
    predictions, raw_outputs = model.predict([row.text_preprocessed])
    return predictions[0]
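
The helper is applied row-wise with pandas; a hedged sketch that assumes model follows the simpletransformers predict() interface and the frame has a text_preprocessed column:

df['pred_opos'] = df.apply(add_pred_pos, axis=1, args=(model, 'opos'))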


def trainer(train_df, OUTPUT_DIR, preproc, args):
    script_dir = os.path.dirname(__file__)
    abs_file_path = os.path.join(script_dir, args.modelConf)
    with open(abs_file_path) as f:
        model_param = json.load(f)
Example #9
import pandas as pd
import wordninja as wn
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words = set(nltk.corpus.words.words())
stop = stopwords.words('english')

wn.DEFAULT_LANGUAGE_MODEL = wn.LanguageModel('covid_words.txt.gz')
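
Because wordninja's module-level split delegates to DEFAULT_LANGUAGE_MODEL, reassigning it makes every later wn.split call use the custom vocabulary:

print(wn.split('covidvaccine'))  # e.g. ['covid', 'vaccine'], depending on covid_words.txt.gz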


## Deletes strings that aren't in English (including strings that aren't words at all)
def drop_nonword(word):
    # Keep tokens that are known English words, plus non-alphabetic tokens.
    word = " ".join(w for w in nltk.wordpunct_tokenize(word)
                    if w.lower() in words or not w.isalpha())
    return word


## Splits domains, removes stopwords, lemmatizes, and drops non-words
def cloud_prep(df):
    df = df.astype(str)
    df['Match'] = df['Match'].apply(wn.split)
    df['Match'] = df['Match'].apply(
        lambda x: [item for item in x if item not in stop])
    # explode returns a new frame with one row per list element.
    df = df.explode('Match', ignore_index=True)
    df = df.astype(str)
    df['Match'] = df['Match'].apply(lemmatizer.lemmatize)
    df['Match'] = df['Match'].apply(drop_nonword)
    return df
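
A small end-to-end sketch; the input domain strings are illustrative:

sample = pd.DataFrame({'Match': ['covidtrackerdata', 'vaccinenewstoday']})
print(cloud_prep(sample)['Match'].tolist())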