def __init__(self, args, lang):
    self.others = Strategy(args.others)  # valid values: 0, 1, 2 (done)
    self.emoji = Strategy(args.emoji)  # 0, 1 = emoji, 2 = (emoji), 3, 4, 5 = (translation)
    self.emoticon = Strategy(args.emoticon)  # 0, 1 = emoticon, 2 = (emoticon), 3, 4, 5 = (translation)
    self.url = Strategy(args.url)  # 0, 1, 2, 3
    self.hashtag = Strategy(args.hashtag)  # 0, 1 = #hashtag, 2, 3 = (#hashtag), 4, 5
    self.punctuation = Strategy(args.punctuation)  # valid values: 0, 3
    self.mention = Strategy(args.mention)  # 0, 1, 2, 3
    self.lower = args.lower  # True or False
    self.lang = lang  # EN or IT
    self.ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
    if self.lang == 'IT':
        self.lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
    else:
        self.lm = None
    self.text_processor = TextPreProcessor(
        remove=[
            'email',  # raw or normalize
            'percent',  # raw or normalize; EN: percentage, IT: percentuale
            'money',  # raw or normalize; EN: money, IT: soldi; check whether it also catches currencies
            'phone',  # raw or normalize; EN: phone, IT: telefono
            'time',  # raw or normalize; EN: time, IT: ore
            'date',  # raw or normalize; EN: date, IT: data
            'number'  # raw or normalize; EN: number, IT: numero
        ],
        annotate={},
        fix_html=True,
        unpack_hashtags=False,
        tokenizer=SocialTokenizer(lowercase=self.lower).tokenize,
        dicts=[emoticons])
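
A minimal usage sketch for the configuration above; the class name Preprocessor and the argparse values are assumptions, shown only to make the snippet self-contained:

from argparse import Namespace

# Hypothetical strategy values; Preprocessor stands in for the class that owns __init__.
args = Namespace(others=0, emoji=1, emoticon=1, url=0, hashtag=1,
                 punctuation=0, mention=0, lower=True)
preproc = Preprocessor(args, lang='IT')
# ekphrasis' TextPreProcessor returns a token list with the configured entities removed.
print(preproc.text_processor.pre_process_doc("Scrivimi a mario@example.com alle 10:30"))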
def fit(self, X, y=None):
    dtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Keep only correctly spelled tokens, then count word frequencies.
    df = pd.DataFrame({'words': [self.__keep_correctly_spelled(token, self.spell_) for token in X]})
    word_count = dict(Counter(" ".join(df['words']).split(" ")))
    word_count_df = pd.DataFrame.from_dict(word_count, orient='index').reset_index()
    word_count_df.columns = ['words', 'n_appearances']

    # Only keep actual words: at least 3 characters long, or a known stopword.
    word_count_df['wordlength'] = word_count_df['words'].str.len()
    word_count_df = word_count_df[(word_count_df['wordlength'] >= 3) |
                                  (word_count_df['words'].isin(self.stopwords_list_))]

    word_count_df = word_count_df.sort_values('n_appearances', ascending=False).reset_index(drop=True)
    word_count_df['words'] = word_count_df['words'].str.lower()

    # wordninja expects a gzipped text file with one word per line, ordered by frequency.
    lang_filepath = path.join(config_test['lang_path'], f'my_lang_{dtime}.txt.gz')
    word_count_df['words'].to_csv(lang_filepath,
                                  index=False,
                                  header=False,
                                  compression='gzip',
                                  encoding='utf-8')
    self.language_model_ = wordninja.LanguageModel(lang_filepath)

    return self
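
Once fitted, the learned language model can segment run-together strings; a hedged sketch, where SpellSegmenter is an assumed name for the class that owns fit:

seg = SpellSegmenter().fit(["the quick brown fox", "the lazy dog"])  # hypothetical class name
print(seg.language_model_.split('browndog'))  # output depends on the fitted word list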
Example #3
def slice_word(word):
    # Load the language model on every call; cache it if called repeatedly.
    wm = wordninja.LanguageModel('words.txt.gz')
    name_list = wm.split(word)

    # Drop single-character fragments left over from the split.
    y = [s for s in name_list if len(s) != 1]
    print(word, '-->', y)

    return y
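
For example (the result depends entirely on the frequency list in words.txt.gz):

parts = slice_word('icebergwater')  # might print: icebergwater --> ['iceberg', 'water']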
def __init__(self, path_dict_zip=path_dict_zip__):
    self.name_class = 'TOKENIZER'
    self.status_update = False
    if path_dict_zip is not None and self.__check_valid_path(path_dict_zip):
        self.path_dict_zip = path_dict_zip
        logging.debug("The module loaded the input dictionary successfully")
    else:
        self.path_dict_zip = path_dict_zip_default__  # note
        logging.debug(
            "No valid path_dict_zip was given, so the module will use the default splitter"
        )

    # print(os.path.join(os.path.abspath(__file__), self.path_dict_zip))
    self.spliter = wordninja.LanguageModel(self.path_dict_zip)
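
A hedged usage sketch, assuming this __init__ belongs to a class named Tokenizer:

tok = Tokenizer()  # uses path_dict_zip__ when valid, otherwise the default splitter
print(tok.spliter.split('helloworld'))  # e.g. ['hello', 'world'] with an English word list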
Example #5
File: test.py Project: yf1291/nlp3
def test_custom_model(self):
    lm = wordninja.LanguageModel('test_lang.txt.gz')
    self.assertEqual(list(lm.split('derek')), ['der', 'ek'])
def update_spliter(self, path_dict_zip):
    # Store the new dictionary path before rebuilding the splitter.
    self.path_dict_zip = path_dict_zip
    self.spliter = wordninja.LanguageModel(self.path_dict_zip)
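
wordninja model files are plain gzipped word lists, one word per line, most frequent first, so a custom dictionary for update_spliter (or the test_lang.txt.gz used in the test above) can be produced as follows; the file name and words are illustrative:

import gzip

words_by_frequency = ['der', 'ek']  # most frequent first
with gzip.open('test_lang.txt.gz', 'wt', encoding='utf-8') as f:
    f.write('\n'.join(words_by_frequency))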
Example #7
import re
# Matches a capital letter that follows a non-uppercase word character, or a
# capital that follows any non-space and precedes a lowercase letter; used to
# insert a space before capitals in run-together CamelCase text.
pattern_space_before_capital = re.compile(r'((?<=[^\W[A-Z])[A-Z]|(?<=\S)[A-Z](?=[a-z]))')

from importlib import resources


############################
#       Segmenter
###########################
import wordninja
with resources.path("src.resources", "italian_words.txt.gz") as italian_words_gz:
    segmenter = wordninja.LanguageModel(italian_words_gz)
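
Once loaded, the segmenter splits concatenated Italian text, for example:

# The exact split depends on the frequencies in italian_words.txt.gz.
print(segmenter.split('ciaocomestai'))  # e.g. ['ciao', 'come', 'stai']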

############################
#   unique_italian_words
###########################
with resources.path("src.resources", "parole_uniche.txt") as unique_italian_words_path:
    with open(unique_italian_words_path, 'r', encoding='utf8') as f:
        unique_italian_words = {word.rstrip().lower() for word in f if word.rstrip()}

# from ekphrasis.classes.preprocessor import TextPreProcessor
# from ekphrasis.classes.tokenizer import SocialTokenizer

from src.data.preprocessing.dicts.emoticons import emoticons
from src.data.preprocessing.dicts.wrong_word import wrong_word
from src.data.preprocessing.dicts.abbreviations import abbr_word, acronyms
# import spacy_udpipe
# spacy_udpipe.download("it-postwita")
# nlp = spacy_udpipe.load("it-postwita")

# social_tokenizer = lambda text : [  token.text for token in nlp(text)]
import pandas as pd
import argparse
import emoji
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import re
import os
import json
import wordninja
import numpy as np

# comment from Rosario

ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
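
In this pipeline the model is typically used to segment hashtag bodies into words; the hashtag below is illustrative:

# e.g. 'buongiornoatutti' -> ['buongiorno', 'a', 'tutti'], depending on the word list
print(lm.split('buongiornoatutti'))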


def add_pred_pos(row, model, task):
    # The same first prediction is returned for every task.
    predictions, raw_outputs = model.predict([row.text_preprocessed])
    return predictions[0]
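
The helper is applied row-wise with pandas; a hedged sketch that assumes model follows the simpletransformers predict() interface and the frame has a text_preprocessed column:

df['pred_opos'] = df.apply(add_pred_pos, axis=1, args=(model, 'opos'))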


def trainer(train_df, OUTPUT_DIR, preproc, args):
    script_dir = os.path.dirname(__file__)
    abs_file_path = os.path.join(script_dir, args.modelConf)
    with open(abs_file_path) as f:
        model_param = json.load(f)
Example #9
import pandas as pd
import wordninja as wn
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words = set(nltk.corpus.words.words())
stop = stopwords.words('english')

wn.DEFAULT_LANGUAGE_MODEL = wn.LanguageModel('covid_words.txt.gz')
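
Because wordninja's module-level split delegates to DEFAULT_LANGUAGE_MODEL, reassigning it makes every later wn.split call use the custom vocabulary:

print(wn.split('covidvaccine'))  # e.g. ['covid', 'vaccine'], depending on covid_words.txt.gz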


## Deletes strings that aren't in English (including strings that aren't words at all)
def drop_nonword(word):
    # Keep tokens that are known English words, plus non-alphabetic tokens.
    word = " ".join(w for w in nltk.wordpunct_tokenize(word)
                    if w.lower() in words or not w.isalpha())
    return word


## Splits domains, removes stopwords, lemmatizes, and drops non-words
def cloud_prep(df):
    df = df.astype(str)
    df['Match'] = df['Match'].apply(wn.split)
    df['Match'] = df['Match'].apply(
        lambda x: [item for item in x if item not in stop])
    # explode returns a new frame with one row per list element.
    df = df.explode('Match', ignore_index=True)
    df = df.astype(str)
    df['Match'] = df['Match'].apply(lemmatizer.lemmatize)
    df['Match'] = df['Match'].apply(drop_nonword)
    return df
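
A small end-to-end sketch; the input domain strings are illustrative:

sample = pd.DataFrame({'Match': ['covidtrackerdata', 'vaccinenewstoday']})
print(cloud_prep(sample)['Match'].tolist())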