def spell_correcter(tokenized_tweets):
    from ekphrasis.classes.spellcorrect import SpellCorrector
    spell_corrector = SpellCorrector(corpus="english")
    return tokenized_tweets.apply(
        lambda tweet: [spell_corrector.correct(word) for word in tweet.split(" ")])
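# Illustrative sketch (not part of the original snippet): calling the helper above on a
# small pandas Series of whitespace-joined tweets, which is what the .apply() call implies.
# The example tweets are hypothetical; ekphrasis downloads its "english" statistics on
# first use.
import pandas as pd

tweets = pd.Series(["thaaanks for the folow", "see you tomorow"])
corrected = spell_correcter(tweets)
print(corrected.tolist())  # each tweet becomes a list of spell-corrected tokens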
import csv

import nltk
from ekphrasis.classes.spellcorrect import SpellCorrector
from nltk.corpus import words

# CHANGE PATH FOR SERVER
local = "/home/ivan/Documents/git_repos/Sentiment-Analysis-on-Twitter/data/slang.csv"
djurdja = '/home/ikrizanic/pycharm/zavrsni/data/slang.csv'

with open(local, mode='r') as infile:
    reader = csv.reader(infile, delimiter=';')
    slang_dict = dict(reader)

sp = SpellCorrector(corpus="english")
nltk.download("words")
words = set(words.words())
punctuations = '''!()-[]{};:'",<>./?@#$%^&*_~'''


def replace_slang(raw, tokenized):
    tokens = []
    for token in tokenized:
        if token not in words:
            for key, value in slang_dict.items():
                if str(key).lower() == str(token).lower():
                    token = value.split(" ")
            if type(token) is list:
                tokens.extend(token)
            else:
                tokens.append(token)
        else:
            tokens.append(token)
    return tokens  # the original snippet ends without a return; returning the collected tokens is assumed
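# Illustrative sketch (not part of the original snippet): how replace_slang might be
# called. The tokens and the slang.csv contents are hypothetical, and the hard-coded
# path above must exist for the module to load; tokens found in slang_dict are expanded
# (e.g. "idk" -> "i do not know"), dictionary words pass through unchanged.
raw = "idk what happened"
tokenized = ["idk", "what", "happened"]
print(replace_slang(raw, tokenized))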
def test_spell_correct():
    from ekphrasis.classes.spellcorrect import SpellCorrector
    sp = SpellCorrector(corpus="english")
    print(sp.correct("Thaaaanks"))
        'url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number',
    ],
    annotate={
        'hashtag', 'allcaps', 'elongated', 'repeated',
        'emphasis', 'censored',
    },
    fix_html=True,
    segmenter='twitter',
    corrector='twitter',
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=False,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons],
)

sp = SpellCorrector(corpus='english')
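# Illustrative sketch (not part of the original snippet): the last line above builds a
# SpellCorrector backed by the "english" word statistics, while other snippets in this
# section use the "twitter" statistics. A quick comparison of the two, assuming the
# corpus statistics are available (ekphrasis fetches them on first use):
from ekphrasis.classes.spellcorrect import SpellCorrector

sp_english = SpellCorrector(corpus="english")
sp_twitter = SpellCorrector(corpus="twitter")
for word in ["folow", "thx", "grl"]:
    print(word, "->", sp_english.correct(word), "|", sp_twitter.correct(word))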
class TextPreProcessor:
    def __init__(self, **kwargs):
        """
        Kwargs:
            omit (list): choose which tokens you want to omit from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                Important Notes:
                    1 - put url at front, if you plan to use it.
                        Messes with the regexes!
                    2 - if you use hashtag then unpack_hashtags will
                        automatically be set to False

            normalize (list): choose which tokens you want to normalize
                in the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                for example: [email protected] will be transformed to <email>
                Important Notes:
                    1 - put url at front, if you plan to use it.
                        Messes with the regexes!
                    2 - if you use hashtag then unpack_hashtags will
                        automatically be set to False

            unpack_contractions (bool): replace *English* contractions in
                ``text`` str with their unshortened forms
                for example: can't -> can not, wouldn't -> would not, and so on...

            unpack_hashtags (bool): split a hashtag into its constituent words.
                for example: #ilikedogs -> i like dogs

            annotate (list): add special tags to special tokens.
                possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
                for example: [email protected] -> [email protected] <email>

            tokenizer (callable): callable function that accepts a string and
                returns a list of strings. if no tokenizer is provided then
                the text will be tokenized on whitespace

            segmenter (str): define the statistics of what corpus you would
                like to use [english, twitter]

            corrector (str): define the statistics of what corpus you would
                like to use [english, twitter]

            all_caps_tag (str): how to wrap the capitalized words
                values [single, wrap, every]
                Note: applicable only when `allcaps` is included in annotate[]
                    - single: add a tag after the last capitalized word
                    - wrap: wrap all words with opening and closing tags
                    - every: add a tag after each word

            spell_correct_elong (bool): choose if you want to perform spell
                correction after the normalization of elongated words.
                * significantly affects performance (speed)

            spell_correction (bool): choose if you want to perform spell
                correction on the text
                * significantly affects performance (speed)

            fix_text (bool): choose if you want to fix bad unicode terms and
                html entities.
        """
        self.omit = kwargs.get("omit", {})
        self.backoff = kwargs.get("normalize", {})
        self.include_tags = kwargs.get("annotate", {})
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.dicts = kwargs.get("dicts", None)
        self.spell_correction = kwargs.get("spell_correction", False)
        self.spell_correct_elong = kwargs.get("spell_correct_elong", False)
        self.fix_text = kwargs.get("fix_bad_unicode", False)
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
        self.mode = kwargs.get("mode", "normal")

        if self.unpack_hashtags:
            self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        if self.mode != "fast":
            self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

        self.regexes = ExManager().get_compiled()
        if 'hashtag' in self.omit or 'hashtag' in self.backoff:
            print("You can't omit/backoff and unpack hashtags!\n "
                  "unpack_hashtags will be set to False")
            self.unpack_hashtags = False

    def __copy__(self):
        return self

    def __deepcopy__(self, memo):
        return self

    @staticmethod
    def add_special_tag(m, tag, mode="single"):
        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag) for t in tokens])
            return " " + processed + " "

    @lru_cache(maxsize=4096)
    def handle_hashtag_match(self, m):
        """
        Break a string into its constituent words (using the Viterbi algorithm)
        """
        text = m.group()[1:]

        # todo: simplify routine
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
            # print(m.group(), " - ", expanded)
            # with open("analysis/segmenter_" +
            #           self.segmenter_corpus + ".txt", "a") as f:
            #     f.write(m.group() + "\t" + expanded + "\n")
        else:
            # split words following CamelCase convention
            expanded = self.regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")
            # print(m.group(), " - ", expanded)

        if "hashtag" in self.include_tags:
            expanded = self.add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    def handle_elongated_match(self, m):
        text = m.group()

        # normalize to at most 2 repeating chars
        text = self.regexes["normalize_elong"].sub(r'\1\1', text)

        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized

        # try to spell correct the word
        if self.spell_correct_elong:
            text = self.spell_corrector.correct_word(text, assume_wrong=True,
                                                     fast=True)
            # with open("analysis/spell_corrector_" +
            #           self.corrector_corpus + ".txt", "a") as f:
            #     f.write(m.group() + " - " + text + "\n")
            # print(m.group(), "-", text)

        if "elongated" in self.include_tags:
            text = self.add_special_tag(text, "elongated")

        return text

    @lru_cache(maxsize=4096)
    def handle_repeated_puncts(self, m):
        """
        Return the sorted set so that random combinations of punctuation marks
        are mapped to the same token.
        "!??!?!!", "?!!!!?!", "!!?", "!?!?" --> "?!"
        "!...", "...?!" --> ".!"

        :param m:
        :return:
        """
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))

        if "repeated" in self.include_tags:
            text = self.add_special_tag(text, "repeated")

        return text

    @lru_cache(maxsize=4096)
    def handle_generic_match(self, m, tag, mode="every"):
        """
        Args:
            m ():
            tag ():
            mode ():

        Returns:
        """
        text = m.group()
        text = self.add_special_tag(text, tag, mode=mode)

        return text

    @lru_cache(maxsize=4096)
    def handle_emphasis_match(self, m):
        """
        :param m:
        :return:
        """
        text = m.group().replace("*", "")
        if "emphasis" in self.include_tags:
            text = self.add_special_tag(text, "emphasis")

        return text

    @staticmethod
    def dict_replace(wordlist, _dict):
        return [_dict[w] if w in _dict else w for w in wordlist]

    @staticmethod
    def remove_hashtag_allcaps(wordlist):
        in_hashtag = False
        _words = []
        for word in wordlist:
            if word == "<hashtag>":
                in_hashtag = True
            elif word == "</hashtag>":
                in_hashtag = False
            elif word in {"<allcaps>", "</allcaps>"} and in_hashtag:
                continue
            _words.append(word)

        return _words

    @lru_cache(maxsize=4096)
    def handle_general_word_segment_and_spelling(self, m):
        """
        :param m:
        :return:
        """
        text = m.group()
        text = self.segmenter.segment(text)
        return text

    def pre_process_doc(self, doc):

        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces

        # ###########################
        # # fix bad unicode
        # ###########################
        # if self.fix_bad_unicode:
        #     doc = textacy.preprocess.fix_bad_unicode(doc)
        #
        # ###########################
        # # fix html leftovers
        # ###########################
        # doc = html.unescape(doc)

        ###########################
        # fix text
        ###########################
        if self.fix_text:
            doc = ftfy.fix_text(doc)

        ###########################
        # BACKOFF & OMIT
        ###########################
        for item in self.backoff:
            # better add an extra space after the match.
            # Just to be safe. extra spaces will be normalized later anyway
            doc = self.regexes[item].sub(
                lambda m: " " + "<" + item + ">" + " ", doc)
        for item in self.omit:
            doc = doc.replace("<" + item + ">", '')

        ###########################
        # segment other words, not hashtags
        ###########################
        # doc = self.regexes['not_hashtag'].sub(
        #     lambda w: self.handle_general_word_segment_and_spelling(w), doc)
        # for word in doc.split(" "):
        #     if(not word.startswith('#')):
        #         word = self.segmenter.segment(word)
        #         new_doc.append(word)
        # doc = " ".join(new_doc)

        ###########################
        # unpack hashtags
        ###########################
        if self.unpack_hashtags:
            doc = self.regexes["hashtag"].sub(
                lambda w: self.handle_hashtag_match(w), doc)

        ###########################
        # handle special cases
        ###########################
        if self.mode != "fast":
            if "allcaps" in self.include_tags:
                doc = self.regexes["allcaps"].sub(
                    lambda w: self.handle_generic_match(
                        w, "allcaps", mode=self.all_caps_tag), doc)
            if "elongated" in self.include_tags:
                doc = self.regexes["elongated"].sub(
                    lambda w: self.handle_elongated_match(w), doc)
            if "repeated" in self.include_tags:
                doc = self.regexes["repeat_puncts"].sub(
                    lambda w: self.handle_repeated_puncts(w), doc)
            if "emphasis" in self.include_tags:
                doc = self.regexes["emphasis"].sub(
                    lambda w: self.handle_emphasis_match(w), doc)
            if "censored" in self.include_tags:
                doc = self.regexes["censored"].sub(
                    lambda w: self.handle_generic_match(w, "censored"), doc)

        ###########################
        # unpack contractions: i'm -> i am, can't -> can not...
        ###########################
        # remove textacy dependency
        if self.unpack_contractions:
            doc = unpack_contractions(doc)

        # omit allcaps if inside hashtags
        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces
        # doc = re.sub(r'<hashtag><allcaps>', '<hashtag>', doc)  # remove repeating spaces
        # doc = doc.replace('<hashtag> <allcaps>', '<hashtag>')
        # doc = doc.replace('</allcaps> </hashtag>', '</hashtag>')

        ###########################
        # Tokenize
        ###########################
        doc = self.remove_hashtag_allcaps(doc.split())
        doc = " ".join(doc)  # normalize whitespace
        if self.tokenizer:
            doc = self.tokenizer(doc)

            # Replace tokens with special dictionaries (slang, emoticons ...)
            # todo: add spell check before!
            if self.dicts:
                for d in self.dicts:
                    doc = self.dict_replace(doc, d)

        return doc

    def pre_process_docs(self, docs, lazy=True):
        from tqdm import tqdm
        for d in tqdm(docs, desc="PreProcessing..."):
            yield self.pre_process_doc(d)
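# Illustrative sketch (not part of the original class): a minimal end-to-end run of the
# TextPreProcessor above, assuming ekphrasis and its corpus statistics are available.
# The configuration is a demonstration choice, not the one used by any of the projects
# excerpted in this section.
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

demo_processor = TextPreProcessor(
    normalize=['url', 'user', 'number'],
    annotate={'hashtag', 'allcaps', 'elongated', 'repeated'},
    unpack_hashtags=True,
    unpack_contractions=True,
    segmenter='twitter',
    corrector='twitter',
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons],
)

docs = [
    "CANT WAIT for the new season!!! :D http://example.com",
    "@user this is sooooo coool #MachineLearning",
]
# pre_process_docs is a lazy generator wrapped in a tqdm progress bar
for tokens in demo_processor.pre_process_docs(docs):
    print(tokens)  # normalized tokens with annotation tags such as <elongated>, <hashtag>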
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.preprocessor import TextPreProcessor

sp = SpellCorrector(corpus="twitter")

EMOTICONS_TOKEN = {
    ':*': '<kiss>',
    ':-*': '<kiss>',
    ':x': '<kiss>',
    ':-)': '<happy>',
    ':-))': '<happy>',
    ':-)))': '<happy>',
    ':-))))': '<happy>',
    ':-)))))': '<happy>',
    ':-))))))': '<happy>',
    ':)': '<happy>',
    ':))': '<happy>',
    ':)))': '<happy>',
    ':))))': '<happy>',
    ':)))))': '<happy>',
    ':))))))': '<happy>',
    ':)))))))': '<happy>',
    ':o)': '<happy>',
    ':]': '<happy>',
    ':3': '<happy>',
    ':c)': '<happy>',
    ':>': '<happy>',
    '=]': '<happy>',
    '8)': '<happy>',
    '=)': '<happy>',
import os
import pickle

from data_util.my_stopwords import *
from data_util.extract_key import extract_PF

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
# from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.segmenter import Segmenter

# segmenter using the word statistics from english Wikipedia
seg_eng = Segmenter(corpus="twitter")  # english or twitter

from ekphrasis.classes.spellcorrect import SpellCorrector
sp = SpellCorrector(corpus="english")  # english or twitter

alphbet_stopword = ['', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o',
                    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '#']

# word-segmentation / stopword dictionary
from nltk.corpus import stopwords as nltk_stopwords
nltk_stopwords = set(nltk_stopwords.words("english"))

stpwords_list3 = [f.replace("\n", "") for f in
                  open("data_util/stopwords.txt", "r", encoding="utf-8").readlines()]
stpwords_list3.remove("not")

stopwords = list(html_escape_table + stpwords_list2) + \
            list(list(nltk_stopwords) + list(stpwords_list1) + list(stpwords_list3))
stopwords = stopwords + ["."] + alphbet_stopword
# stopwords = list(html_escape_table)  # + list(stpwords_list1) + list(stpwords_list3)
print("Word-segmentation dictionary loaded")

# Total Opinion
opinion_lexicon = {}
import numpy as np
from twokenize import *
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.classes.spellcorrect import SpellCorrector
import re
from langdetect import detect
from tqdm import tqdm
import nltk
from cleantext import clean
import spacy
from spacy.lang.en import English

social_tokenizer = SocialTokenizer(lowercase=False).tokenize
spell_corrector = SpellCorrector(corpus="english")


def extract_url(row, min_len_url=10):
    if len(row['rt_urls_list']) > min_len_url:
        tweet_url = row['rt_urls_list'].split(',')[1].split('\'')[-2]
    else:
        tweet_url = 'None'
    return tweet_url


class SentClean:
    prep_default = {'spell': False,
                    'remove_sequences': False,
                    'lowercase': False,
                    'punctuations': [],
def preprocess_corpus(corpus, stemming=False, all_smilies=False, pos_smilies=False,
                      neg_smilies=False, other_smilies=False, hugs_and_kisses=False,
                      hearts=False, hashtag=False, hashtag_mention=False, numbers=False,
                      number_mention=False,
                      exclamation=False,  # NOTE: not tested yet, possibly remove it
                      set_to_not=False, segmentation_hash=False, spelling=False,
                      elongation=False, remove_signs=False):
    """
    Function used to apply preprocessing.

    Input:
        corpus: a corpus in the same format as the output of create_corpus.
        stemming: if true, words are stemmed with a Porter stemmer. Default False.
        all_smilies: if true, same effect as if pos_smilies, neg_smilies, and
            other_smilies were true. Default False.
        pos_smilies: if true, positive smilies such as : ), ; ), ( ;, :p, ;p, : p
            are replaced by "smile". Default False.
        neg_smilies: if true, negative smilies such as : (, ) : are replaced by
            "sad". Default False.
        other_smilies: if true, smilies such as ^_^ are replaced by a describing
            word. Default False.
        hugs_and_kisses: if true, words such as xxx, xoxo etc. are replaced by
            "kiss" or "hug" and "kiss". Default False.
        hearts: if true, "<3" is replaced by "heart". Default False.
        hashtag: if true, hashtags are removed from the beginning of words, so
            #apple becomes apple. Default False.
        hashtag_mention: if true, and if hashtag is true, the word "hashtag" is
            added at the end of a tweet that used to contain one or more words
            beginning with a hashtag. Default False.
        numbers: if true, words that are purely numbers are removed. Default False.
        number_mention: if true, and if numbers is true, the word "number" is
            added at the end of a tweet that used to contain one or more words
            that were purely numbers. Default False.
        exclamation: if true, the word "exclamation" is added at the end of a
            tweet that contains one or more "!". Default False.
        set_to_not: if true, all words ending with "n't" are replaced by "not".
            Default False.
        segmentation_hash: if true, words starting with # that do not appear in
            the English dictionary are split into segments, e.g. '#iammoving'
            becomes 'i am moving'. Default False.
        spelling: if true, all words that are not part of the English dictionary
            are set to the most likely word within two alterations. Default False.
        elongation: if true, the length of all letter sequences in words that are
            not part of the English dictionary is set to at most 2. The word
            'elongation' is inserted before words altered this way. Default False.
        remove_signs: if true, signs such as ",", ".", ":", ";", "-" are removed.
            Default False.

    Output:
        new_corpus: a new corpus, in the same format as the input corpus.
    """
    start = time.time()

    # initialising the new corpus:
    new_corpus = []

    # Want to split the tweets using this tokenizer:
    tknzr = TweetTokenizer(reduce_len=True)

    if stemming:
        ps = PorterStemmer()
    if segmentation_hash or spelling or elongation:
        d = enchant.Dict("en_US")
    if segmentation_hash:
        # seg = Segmenter(corpus="english")
        seg = Segmenter(corpus="twitter")
    if spelling:
        sp = SpellCorrector(corpus="english")

    elapsed = time.time()
    print("Time in min before starting first for loop:", (elapsed - start) / 60)

    # Want to go through each line (tweet) in the corpus
    for k, line in enumerate(corpus):
        if hashtag_mention:
            there_is_hashtag = False
        if number_mention:
            there_is_number = False
        if exclamation:
            there_is_exclamation = False

        # Splitting the tweet using the chosen tokenizer.
        words = tknzr.tokenize(line)

        # Initializing for cleaned_tweet:
        cleaned_tweet = []

        for i, word in enumerate(words):
            # Indicating that the word has not been treated yet
            word_not_treated = True
            end_ = len(words) - 1

            if (pos_smilies or all_smilies) and word_not_treated:
                if (i > 0 and (word == 'd' and (words[i-1] == ':' or words[i-1] == ';'))) or word == ':d' or word == ';d':
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif (i > 0 and (word == 'p' and (words[i-1] == ':' or words[i-1] == ';'))) or word == ':p' or word == ';p':
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif i > 0 and word == 'd' and (words[i-1] == ':' or words[i-1] == ';' or words[i-1] == 'x'):
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif i > 0 and words[i-1] == '(' and (word == ':' or word == ';'):
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif i > 0 and word == ')' and (words[i-1] == ':' or words[i-1] == ';'):
                    cleaned_tweet.append('smile')
                    word_not_treated = False

            if (neg_smilies or all_smilies) and word_not_treated:
                if i > 0 and words[i-1] == ')' and (word == ':' or word == ';'):
                    cleaned_tweet.append('sad')
                    word_not_treated = False
                elif i > 0 and word == '(' and (words[i-1] == ':' or words[i-1] == ';'):
                    cleaned_tweet.append('sad')
                    word_not_treated = False

            if (other_smilies or all_smilies) and word_not_treated:
                if i > 0 and i < end_ and word == '_' and words[i-1] == '^' and words[i+1] == '^':
                    cleaned_tweet.append('eyesmiley')
                    word_not_treated = False
                elif i > 0 and word == 'o' and words[i-1] == ':':
                    cleaned_tweet.append('openmouthface')
                    word_not_treated = False
                elif i > 0 and word == '/' and words[i-1] == ':':
                    cleaned_tweet.append('slashsmiely')
                    word_not_treated = False
                elif i > 0 and word == '*' and (words[i-1] == ':' or words[i-1] == ';'):
                    cleaned_tweet.append('kiss')
                    word_not_treated = False

            if hugs_and_kisses and word_not_treated:
                # want to find hearts, hugs, kisses, etc:
                if word == "xoxo" or word == "xo" or word == "xoxoxo" or word == "xxoo":
                    cleaned_tweet.append('hug')
                    cleaned_tweet.append('kiss')
                    word_not_treated = False
                elif word == 'xx' or word == 'xxx' or word == 'xxxx':
                    cleaned_tweet.append('kiss')
                    word_not_treated = False

            if hearts and word_not_treated:
                if word == "<3":
                    cleaned_tweet.append('heart')
                    word_not_treated = False

            if hashtag and word_not_treated:
                if word[0] == '#':
                    there_is_hashtag = True
                    if len(word) > 1 and segmentation_hash and not d.check(word[1:]):
                        cleaned_tweet.append(seg.segment(word[1:]))
                    else:
                        cleaned_tweet.append(word[1:])
                    word_not_treated = False

            if numbers and word_not_treated:
                if word.isdigit():
                    there_is_number = True
                    word_not_treated = False

            if exclamation and word_not_treated:
                if word == '!':
                    there_is_exclamation = True
                    cleaned_tweet.append(word)
                    word_not_treated = False

            if set_to_not and word_not_treated:
                if word[-3:] == 'n\'t':
                    cleaned_tweet.append('not')
                    word_not_treated = False

            if word_not_treated:
                if (not remove_signs) or (remove_signs and (
                        word != '^' and word != ',' and word != '.'
                        and word != ':' and word != '-' and word != '´' and word != ';'
                        and word != ')' and word != '(' and word != '*')):
                    if (not word[0].isdigit()) and elongation and not d.check(word) and len(word) > 2:
                        new = []
                        new.append(word[0])
                        for i, letter in enumerate(word):
                            if i > 0 and i < len(word) - 1:
                                if not (letter == word[i-1] == word[i+1]):
                                    new.append(letter)
                        new.append(word[-1])
                        new_word = ''.join(new)
                        if new_word != word:
                            cleaned_tweet.append('elongation')
                            word = new_word
                    if spelling and not d.check(word) and len(word) > 2:
                        word = sp.correct(word)
                    if stemming:
                        word = ps.stem(word)
                    cleaned_tweet.append(word)

        if hashtag_mention and there_is_hashtag:
            cleaned_tweet.append('hashtag')
        if number_mention and there_is_number:
            cleaned_tweet.append('number')
        if exclamation and there_is_exclamation:
            cleaned_tweet.append('exclamation')

        new_words = ' '.join(cleaned_tweet)
        new_words = new_words.encode('utf-8')
        new_corpus.append(new_words)

        if np.mod(k, 25000) == 1:
            elapsed = time.time()
            print("Time in min after", k, "tweets:", (elapsed - start) / 60)

    elapsed = time.time()
    print("Time in min total:", (elapsed - start) / 60)

    return new_corpus
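# Illustrative sketch (not part of the original function): calling preprocess_corpus on a
# tiny in-memory corpus. The example tweets are hypothetical, and the module-level imports
# the function relies on (time, numpy as np, nltk's TweetTokenizer) are assumed to be in
# place. Only the smiley/hashtag/exclamation flags are enabled, so the heavier
# enchant/ekphrasis paths stay disabled.
toy_corpus = ["I love this : )", "#winter is coming !"]
cleaned = preprocess_corpus(toy_corpus,
                            all_smilies=True,
                            hashtag=True,
                            hashtag_mention=True,
                            exclamation=True)
print(cleaned)  # roughly: [b'I love this : smile', b'winter is coming ! hashtag exclamation']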
      - if the most toxic word is an auxiliary verb, then discard the sentence.
    CASS_fn: the file name of a pickle that stores the output
        Correction_All_Sentences_Scores
    output:
    Correction_All_Sentences_Scores: a list [0..3] of lists of
        (original_sentence, original_score, revised_sentence, revised_toxic_score,
         correct_word, new_word_list, correction_word_list, correction_score,
         corrected_sentence),
        where correction_word_list is a list of (wrong_word, suggested_word).
    note: The input sentences are pre-processed, such that punctuation marks are
        either non-existent or separated from words.
'''
ASSF_fn = 'input/All_Sentences_Scores_Filtered.pickle'
CASS_fn = "output/Correction_All_Sentences_Scores.pickle"
sp = SpellCorrector(corpus="english")
ekphrasis_word_correction_func = lambda w: sp.correct(w)
Correction_All_Sentences_Scores = eval_spelling_correction_perspective(
    ASSF_fn, CASS_fn, word_correction_func=ekphrasis_word_correction_func)

'''
2018.5.20
Plot correction effects
'''
CASS_fn = "output/Correction_All_Sentences_Scores.pickle"
plot_correction_effects(CASS_fn)

'''
Calculate
1. accuracy
2. score distribution
        'date', 'number'
    ],
    annotate={
        "hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'
    },
    fix_html=True,              # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,       # perform word segmentation on hashtags
    unpack_contractions=True,   # Unpack contractions (can't -> can not)
    spell_correct_elong=True,   # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

seg_tw = Segmenter(corpus="twitter")
sp = SpellCorrector(corpus="twitter")

f1 = open('tokenized_tweets_golbeck.txt', 'w')
c = 1
for line in data:
    a = line.strip().split('\t')
    if len(a) >= 3:
        b = a[2]
        c = a[1]
        b = b.split()
        for i in range(len(b)):
            if b[i].startswith('http'):
                b[i] = '<url>'
        b = ' '.join(b)
        a = text_processor.pre_process_doc(b)
        for i in range(len(a)):
            if a[i].isalpha():
def create_models(headlines):
    headline = headlines['headline']
    label = headlines['label']
    headlines.loc[headlines['label'] == -1, 'label'] = 0
    arr_Accu = []

    # Random-state selection starts here ****************************
    # for i in range(1, 20):
    #     headline_train, headline_test, label_train, label_test = train_test_split(headline, label, test_size=0.01, random_state=i)
    #
    #     # vect = CountVectorizer(max_features=100000, binary=True)
    #     vect = TfidfVectorizer(max_features=100000, strip_accents='unicode', analyzer='word', stop_words='english', token_pattern=r'\w{1,}', ngram_range=(1, 3))
    #     headline_train_vector = vect.fit_transform(headline_train)
    #     headline_test_vector = vect.transform(headline_test)
    #
    #     # Note: dataset balancing was attempted, but the accuracy in the tests below did not improve
    #     # balancing = SMOTE()
    #     # headline_train_balanced, label_train_balanced = balancing.fit_sample(headline_train_vector, label_train)
    #     # oversampled_headlines, counts = np.unique(label_train_balanced, return_counts=True)
    #     # print(list(zip(oversampled_headlines, counts)))
    #     print("pre-Dummy")
    #     dummy = DummyClassifier()
    #     print("post-Dummy")
    #     dummy.fit(headline_train_vector, label_train)
    #     prediction = dummy.predict(headline_test_vector)
    #     accuracy = metrics.accuracy_score(label_test, prediction)
    #     print("Dummy Classifier: ")
    #     print(accuracy)
    #     arr_Accu.append(accuracy)
    # print(max(arr_Accu))
    # max_random_state = arr_Accu.index(max(arr_Accu)) + 1
    # print(max_random_state)
    # for j in range(1, 20):
    #     print("Random State : ", j, " Accuracy : ", arr_Accu[j-1])
    # Random-state selection ends here ********************************

    # Trial with k-fold to find the value of K that maximizes accuracy
    # Note: the accuracy here is worse than before
    # arr_Accu = []
    # for i in range(3, 15):
    #     vect = CountVectorizer(stop_words='english', analyzer="word", min_df=2, max_df=0.8)
    #     headline_train_vector = vect.fit_transform(headline)
    #
    #     dummy = DummyClassifier()
    #     accuracy = cross_val_score(dummy, headline_train_vector, label, cv=i, scoring='accuracy')
    #
    #     arr_Accu.append(np.mean(accuracy))
    #
    # print(arr_Accu)
    # for j in range(3, 15):
    #     print("K-Fold : ", j, " Accuracy : ", arr_Accu[j - 3])

    # Model building starts here, using the best random state
    # print("random state chosen: ")
    # print(max_random_state)
    # headline_train, headline_test, label_train, label_test = train_test_split(headline, label, test_size=0.20, random_state=max_random_state)

    x = headlines['headline']
    y = headlines['label']
    print("Headlines", x.shape)
    print("Labels", y.shape)

    neg = sum(headlines.label == 0)
    pos = sum(headlines.label == 1)
    print("Neg", neg)
    print("Pos", pos)
    diff = abs(pos - neg)
    print("Class difference: ", diff)

    df_filter = headlines[headlines.label == 0]
    run_stats = pd.DataFrame()
    print(headlines.head())

    from ekphrasis.classes.spellcorrect import SpellCorrector

    # Experimental pre-processing of the tweets
    @lru_cache(maxsize=50000)
    def tokenization(text):
        text = re.split('\W+', text)
        return text

    headlines['headline'] = headlines['headline'].apply(
        lambda x: tokenization(x.lower()))
    print(headlines.head())

    stopword = nltk.corpus.stopwords.words('english')

    # @lru_cache(maxsize=50000)
    def remove_stopwords(text):
        return [word for word in text if word not in stopword]

    headlines['headline'] = headlines['headline'].apply(remove_stopwords)
    print(headlines.head())

    # stemmer = nltk.PorterStemmer()
    # def stemming(text):
    #     text = [stemmer.stem(word) for word in text]
    #     return text
    #
    # headlines['headline'] = headlines['headline'].apply(lambda x: stemming(x))
    # print(headlines.head(10))

    # Spell correction; it may not be used because it requires excessive RAM and time ***
    sp = SpellCorrector(corpus="english")

    def spell_corrector(text):
        print("**Text before correction: ", text)
        text = [sp.correct(word) for word in text]
        print(">>Text after correction:", text)
        return text

    # print("Spelling Correction")
    # headlines['headline'] = headlines['headline'].apply(lambda x: spell_corrector(x))
    # headlines['headline'] = headlines['headline'].apply(spell_corrector)

    lm = nltk.WordNetLemmatizer()

    def lemmatizer(text):
        return [lm.lemmatize(word) for word in text]

    print("Lemmatizer")
    headlines['headline'] = headlines['headline'].apply(lemmatizer)
    print(headlines.head(10))

    headlines['headline'] = headlines['headline'].str.join(" ")
    print(headlines.head())

    headline_train, headline_test, label_train, label_test = train_test_split(
        headline, label, test_size=.02)
    x_validation, x_test, y_validation, y_test = train_test_split(
        headline_test, label_test, test_size=.5)
    print(headline_train.shape)
    print(headline_test.shape)

    # vect = TfidfVectorizer(max_features=100000, strip_accents='unicode', analyzer='word', stop_words='english', token_pattern=r'\w{1,}', ngram_range=(1, 3))
    vect = TfidfVectorizer(ngram_range=(1, 3))

    # Grid search for the best result, VERY TIME-CONSUMING
    # lr = LogisticRegression()
    # text_clf = Pipeline([
    #     ('vect', CountVectorizer()),
    #     ('tfidf', TfidfTransformer()),
    #     ('clf', LogisticRegression())])
    # params = {
    #     'clf__penalty': ['l1', 'l2'],  # l1 is Lasso, l2 is Ridge
    #     'clf__solver': ['liblinear'],
    #     'clf__C': np.linspace(0.00002, 1, 10)
    # }
    # lr_gs = GridSearchCV(text_clf, params, cv=5, iid=False).fit(headline_train[:200000], label_train[:200000])
    # print("Best Params", lr_gs.best_params_)
    # print("Best Score", lr_gs.best_score_)
    # Grid search ends here

    log_regression = LogisticRegression(C=1.0, class_weight="balanced",
                                        solver="liblinear", multi_class="ovr",
                                        verbose=100, random_state=42)
    linear_SVC = LinearSVC(C=0.1, verbose=100, random_state=42)
    passive_aggressive = PassiveAggressiveClassifier()
    multinomial_bayes = MultinomialNB(alpha=10)
    complementNB = ComplementNB()
    ridge_clas = RidgeClassifier(solver='lsqr', random_state=42)
    naive_bayes = BernoulliNB()
    random_forest = RandomForestClassifier(max_depth=30, n_estimators=4000,
                                           verbose=100, n_jobs=2)
    svm = SVC(gamma=0.5, C=100, kernel="linear", verbose=100)

    # Here I try GridSearch for better parameter tuning
    parameters = {
        'classifier__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0],
        'classifier__max_iter': [1000],
        'classifier__solver': ['lsqr'],
        'classifier__random_state': [42]
    }

    # GRID SEARCH STARTS HERE
    # pipe = Pipeline([
    #     ('vectorizer', TfidfVectorizer(max_features=100000)),
    #     ('classifier', ridge_clas)
    # ])
    # grid = GridSearchCV(pipe, n_jobs=2, cv=5, verbose=3, param_grid=parameters)
    #
    # start_time = time.time()
    # grid.fit(headline_train, label_train)
    # end_time = time.time()
    # print('Total fit time: {}'.format(end_time - start_time))
    #
    # prediction = grid.predict(label_test)
    # print("Prediction Finished")
    # res = pd.DataFrame({'Prediction ': prediction})
    # print(res)
    # GRID SEARCH ENDS HERE

    # algorithms = [log_regression, complementNB, linear_SVC, passive_aggressive, multinomial_bayes, naive_bayes, ridge_clas]
    # algo_names = ["Logistic Regression", "Complement Naive Bayes", "Linear SVC", "Passive Aggressive", "Multinomial Bayes", "Naive Bayes", "Ridge Classifier"]
    # algo_name_pair = zip(algorithms, algo_names)
    algorithms = [ridge_clas]
    algo_names = ["Ridge Classifier"]
    algo_name_pair = zip(algorithms, algo_names)

    results = dict()
    for algo, name in algo_name_pair:
        ug_pipeline = Pipeline([('vectorizer', vect),
                                ('classifier', algo)])
        print("Classifier : ", algo)
        results[name] = train_test_and_evaluate(ug_pipeline, headline_train,
                                                label_train, x_validation,
                                                y_validation)

    dframe = pd.DataFrame.from_dict(results, orient="index").reset_index()
    dframe.columns = ["classifier", "prediction"]
    dframe.sort_values(by=["prediction"], ascending=False)
    print(results)

    sns.barplot(x='classifier', y='prediction', data=dframe)
    plt.title("TFidf Vectorizer, n-gram=3")
    fig = plt.gcf()
    fig.set_size_inches(20, 10)
    plt.show()

    # the training headlines are fit_transformed for fitting
    # the test headlines are transformed for testing

    # Multinomial Bayes
    # mbayes = MultinomialNB()
    # start_time = time.time()
    # mbayes.fit(headline_train_vector, label_train)
    # runtime = time.time() - start_time
    #
    # print(mbayes.score(headline_train_vector, label_train))
    #
    # # actual testing with the test set we split off
    # prediction = mbayes.predict(headline_test_vector)
    #
    # print(prediction)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('MBayes Accuracy : ', accuracy)
    # run_stats = run_stats.append({'Classifier': 'Multinomial Naive Bayes', 'Accuracy': accuracy, 'Runtime': runtime}, ignore_index=True)
    # results["bayes_accuracy"] = prediction

    # start_time = time.time()
    # log_regression = LogisticRegression()
    # log_regression.fit(headline_train_vector, label_train)
    # prediction = log_regression.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # runtime = time.time() - start_time
    # print('LogisticRegression Accuracy : ', accuracy)
    # print('Runtime : ', runtime)
    # results["Logistic_regression"] = accuracy
    # Last run: 0.77838

    # decision_tree = DecisionTreeClassifier(criterion='entropy')
    # decision_tree.fit(headline_train_vector, label_train)
    # prediction = decision_tree.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('DecisionTree Accuracy : ', accuracy)
    #
    #
    # random_forest = RandomForestClassifier(criterion='entropy')
    # random_forest.fit(headline_train_vector, label_train)
    # prediction = random_forest.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('RandomForestClassifier Accuracy : ', accuracy)
    # Last run: it did NOT finish, it took too long so I stopped it
    #
    # adaboost = AdaBoostClassifier()
    # adaboost.fit(headline_train_vector, label_train)
    # prediction = adaboost.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('Adaboost Accuracy : ', accuracy)
    # Last accuracy: 0.66687
    #
    # bernoulli_bayes = BernoulliNB()
    # start_time = time.time()
    # bernoulli_bayes.fit(headline_train_vector, label_train)
    # runtime = time.time() - start_time
    # prediction = bernoulli_bayes.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('BernoulliNB Accuracy : ', accuracy)
    # run_stats = run_stats.append({'Classifier': 'Bernoulli', 'Accuracy': accuracy, 'Runtime': runtime}, ignore_index=True)

    # linear_SVC = LinearSVC()
    # start_time = time.time()
    # linear_SVC.fit(headline_train_vector, label_train)
    # runtime = time.time() - start_time
    # prediction = linear_SVC.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('Linear_SVC Accuracy : ', accuracy)
    # print("Runtime : ", runtime)
    # run_stats = run_stats.append({'Classifier': 'Linear SVC', 'Accuracy': accuracy, 'Runtime': runtime}, ignore_index=True)
    # Last accuracy: 0.7761956

    # passive_aggressive = PassiveAggressiveClassifier()
    # passive_aggressive.fit(headline_train_vector, label_train)
    # prediction = passive_aggressive.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('PassiveAggressiveClassifier Accuracy : ', accuracy)

    pprint(run_stats)
    return results
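# Illustrative sketch (not part of the original script): the DataFrame schema that
# create_models expects -- a 'headline' text column and a 'label' column in {-1, 1}
# (remapped to {0, 1} inside the function). The rows below are hypothetical; a realistic
# run needs enough rows for the 98/2 train/test split and the 50/50 validation split,
# plus the module-level imports and the train_test_and_evaluate helper the script uses.
import pandas as pd

demo_headlines = pd.DataFrame({
    'headline': ["stocks rally after earnings beat",
                 "company misses targets, shares slide"],
    'label': [1, -1],
})
# results = create_models(demo_headlines)  # call shown schematically; use a full dataset in practice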
class TextPreProcessor:
    """
    Kwargs:
        normalize (list)
            possible values: ['url', 'email', 'percent', 'money', 'phone',
                'user', 'time', 'date']

        annotate (list)
            possible values: ['hashtag', 'allcaps', 'elongated', 'repeated',
                'emphasis', 'censored']

        unpack_hashtags (bool)

        unpack_contractions (bool)

        segmenter (str): define the statistics of what corpus you would
            like to use [english, twitter]

        corrector (str): define the statistics of what corpus you would
            like to use [english, twitter]

        tokenizer (callable): callable function that accepts a string and
            returns a list of strings. if no tokenizer is provided then
            the text will be tokenized on whitespace

        simplify_emoticons (bool)

        dictionaries (list)
    """

    def __init__(self, **kwargs):
        self.tokens_to_normalize = kwargs.get("normalize", [])
        self.annotate = kwargs.get("annotate", [])
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.simplify_emoticons = kwargs.get("simplify_emoticons", False)
        self.dictionaries = kwargs.get("dictionaries", [])
        self.stats = {}
        self.preprocessed_texts = -1

    def pre_process(self, text: str, with_stats=False):
        self._increment_counter()
        text = self._remove_repeating_spaces(text)
        text = self._normalize(text)
        text = self._unpack_hashtags(text)
        text = self._annotate(text)
        text = self._unpack_contractions(text)
        text = self._remove_repeating_spaces(text)
        tokens = self._tokenize(text)
        tokens = self._simplify_emoticons(tokens)
        tokens = self._replace_using_dictionaries(tokens)
        if with_stats:
            return tokens, self._pre_processed_text_stats()
        else:
            return tokens

    def _pre_processed_text_stats(self):
        return self.stats[self.preprocessed_texts]

    def _increment_counter(self):
        self.preprocessed_texts += 1
        self.stats[self.preprocessed_texts] = {}

    def _normalize(self, text):
        for item in self.tokens_to_normalize:
            text = self._change_using_regexp(item, lambda m: f' <{item}> ',
                                             text, 'normalize')
        return text

    def _unpack_hashtags(self, text):
        if self.unpack_hashtags:
            return self._change_using_regexp("hashtag",
                                             lambda w: self._handle_hashtag_match(w),
                                             text, "unpack")
        return text

    def _annotate(self, text):
        text = self._annotate_allcaps(text)
        text = self._annotate_elongated(text)
        text = self._annotate_repeated(text)
        text = self._annotate_emphasis(text)
        text = self._annotate_censored(text)
        return text

    def _annotate_allcaps(self, text):
        if "allcaps" in self.annotate:
            return self._change_using_regexp("allcaps",
                                             lambda w: self._handle_generic_match(w, "allcaps", mode='wrap'),
                                             text, "annotate")
        return text

    def _annotate_elongated(self, text):
        if "elongated" in self.annotate:
            return self._change_using_regexp("elongated",
                                             lambda w: self._handle_elongated_match(w),
                                             text, "annotate")
        return text

    def _annotate_repeated(self, text):
        if "repeated" in self.annotate:
            return self._change_using_regexp("repeat_puncts",
                                             lambda w: self._handle_repeated_puncts(w),
                                             text, "annotate")
        return text

    def _annotate_emphasis(self, text):
        if "emphasis" in self.annotate:
            return self._change_using_regexp("emphasis",
                                             lambda w: self._handle_emphasis_match(w),
                                             text, "annotate")
        return text

    def _annotate_censored(self, text):
        if "censored" in self.annotate:
            return self._change_using_regexp("censored",
                                             lambda w: self._handle_generic_match(w, "censored"),
                                             text, "annotate")
        return text

    def _change_using_regexp(self, regexp_name, func, text, stats_name_prefix):
        changing_result = regexes[regexp_name].subn(func, text)
        self._update_stats(f'{stats_name_prefix}_{regexp_name}', changing_result[1])
        return changing_result[0]

    def _unpack_contractions(self, text):
        if self.unpack_contractions:
            text = self._unpack_selected_contrations(
                r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|"
                r"[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n'?t",
                r"\1\2 not", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll",
                r"\1\2 will", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Tt]hey|[Ww]hat|[Ww]ho|[Yy]ou)ll",
                r"\1\2 will", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re",
                r"\1\2 are", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Tt]hey|[Ww]hat|[Yy]ou)re",
                r"\1\2 are", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Hh]e|[Ss]he)'s",
                r"\1\2 is", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)"
                r"'?ve",
                r"\1\2 have", text)
            text = self._unpack_selected_contrations(r"(\b)([Cc]a)n't", r"\1\2n not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ii])'m", r"\1\2 am", text)
            text = self._unpack_selected_contrations(r"(\b)([Ll]et)'?s", r"\1\2 us", text)
            text = self._unpack_selected_contrations(r"(\b)([Ww])on'?t", r"\1\2ill not", text)
            text = self._unpack_selected_contrations(r"(\b)([Ss])han'?t", r"\1\2hall not", text)
            text = self._unpack_selected_contrations(r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)
        return text

    def _unpack_selected_contrations(self, regexp, replacement, text):
        unpacking_result = re.subn(regexp, replacement, text)
        self._update_stats("unpack_contrations", unpacking_result[1])
        return unpacking_result[0]

    def _tokenize(self, text):
        if self.tokenizer:
            return self.tokenizer(text)
        else:
            return text.split(' ')

    def _simplify_emoticons(self, tokens):
        if self.simplify_emoticons:
            result = []
            for token in tokens:
                if token in emoticons:
                    new_emoticon = emoticons[token]
                    if new_emoticon != token:
                        self._update_stats('emoticon_simplification', 1)
                    result.append(new_emoticon)
                else:
                    result.append(token)
            return result
        else:
            return tokens

    def _replace_using_dictionaries(self, tokens):
        if len(self.dictionaries) > 0:
            for dictionary in self.dictionaries:
                for idx, token in enumerate(tokens):
                    if token in dictionary:
                        value = dictionary[token]
                        if '<entity>' not in value:
                            tokens[idx] = value
                            self._update_stats('dictionary_replacement', 1)
            return ' '.join(tokens).split(' ')
        else:
            return tokens

    @lru_cache(maxsize=65536)
    def _handle_hashtag_match(self, m):
        text = m.group()[1:]
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
        else:
            expanded = regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")

        if "hashtag" in self.annotate:
            expanded = self._add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    @lru_cache(maxsize=65536)
    def _handle_generic_match(self, m, tag, mode="every"):
        text = m.group()
        if tag == 'allcaps':
            # workaround for allcaps contractions like YOU'RE  TODO: refactor
            text = text.lower()
        text = self._add_special_tag(text, tag, mode=mode)
        return text

    def _handle_elongated_match(self, m):
        text = m.group()
        text = regexes["normalize_elong"].sub(r'\1\1', text)
        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized
        text = self._add_special_tag(text, "elongated")
        return text

    @lru_cache(maxsize=65536)
    def _handle_repeated_puncts(self, m):
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))
        text = self._add_special_tag(text, "repeated")
        return text

    @lru_cache(maxsize=65536)
    def _handle_emphasis_match(self, m):
        text = m.group().replace("*", "")
        text = self._add_special_tag(text, "emphasis")
        return text

    def _update_stats(self, key, value):
        if value > 0:
            stats_for_text = self.stats[self.preprocessed_texts]
            if key not in stats_for_text:
                stats_for_text[key] = 0
            stats_for_text[key] += value

    @staticmethod
    def _remove_repeating_spaces(text):
        return re.sub(r' +', ' ', text).strip()

    @staticmethod
    def _add_special_tag(m, tag, mode="single"):
        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag) for t in tokens])
            return " " + processed + " "
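# Illustrative sketch (not part of the original class): driving the customised
# TextPreProcessor above. The class relies on module-level regexes/emoticons and the
# ekphrasis imports of its own project, so this configuration is an assumption for
# demonstration only.
custom_processor = TextPreProcessor(
    normalize=['url', 'user'],
    annotate=['hashtag', 'allcaps', 'elongated'],
    unpack_hashtags=True,
    unpack_contractions=True,
    segmenter='twitter',
    corrector='twitter',
    simplify_emoticons=True,
)
tokens, stats = custom_processor.pre_process("SOOO happy about #MachineLearning :)))",
                                             with_stats=True)
print(tokens)  # normalized tokens with annotation tags
print(stats)   # per-text counters, e.g. keys like 'annotate_allcaps' or 'unpack_hashtag'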
def spell_correct(text):
    sp = SpellCorrector(corpus="twitter").correct(text)
    return sp
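# Illustrative sketch (not part of the original snippet): spell_correct builds a new
# SpellCorrector (and loads the "twitter" statistics) on every call, so for batches it is
# cheaper to construct the corrector once, as most of the other snippets in this section
# do. spell_correct_many below is a hypothetical helper added only for this example.
from ekphrasis.classes.spellcorrect import SpellCorrector

sp = SpellCorrector(corpus="twitter")


def spell_correct_many(words):
    # reuse one corrector instance across many tokens
    return [sp.correct(w) for w in words]


print(spell_correct("tomorow"))
print(spell_correct_many(["tomorow", "folow", "grl"]))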