def hashtag_sentiment(tweet):
    hash_tag = re.findall("#([a-zA-Z0-9]{1,25})", tweet)
    seg = Segmenter()
    hashtag_polarity = []
    for hashtag in hash_tag:
        tokens = seg.segment(hashtag)
        ss = sid.polarity_scores(tokens)
        if 'not' not in tokens.split(' '):
            hashtag_polarity.append(ss['compound'])
        else:
            hashtag_polarity.append(-ss['compound'])
    sentiment = 0
    if len(hashtag_polarity) > 0:
        sentiment = round(
            float(sum(hashtag_polarity) / float(len(hashtag_polarity))), 2)
    return sentiment
def hashtag_sentiment(tweet):
    hash_tag = re.findall("#([a-zA-Z0-9]{1,25})", tweet)
    hashtag_polarity = []
    seg = Segmenter(corpus="twitter")
    for hashtag in hash_tag:
        tokens = seg.segment(hashtag)
        # polarity_scores method of SentimentIntensityAnalyzer
        # object gives a sentiment dictionary,
        # which contains pos, neg, neu, and compound scores.
        ss = sid.polarity_scores(tokens)
        if 'not' not in tokens.split(' '):
            hashtag_polarity.append(ss['compound'])
        else:
            hashtag_polarity.append(-ss['compound'])
    sentiment = 0
    if len(hashtag_polarity) > 0:
        sentiment = round(
            float(sum(hashtag_polarity) / float(len(hashtag_polarity))), 2)
    return sentiment
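# A minimal usage sketch for hashtag_sentiment. The imports and the
# module-level `sid` below are assumptions: the snippets above rely on
# `re`, `sid` (a VADER SentimentIntensityAnalyzer), and the ekphrasis
# Segmenter being defined elsewhere in their projects.
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from ekphrasis.classes.segmenter import Segmenter

sid = SentimentIntensityAnalyzer()  # requires nltk.download('vader_lexicon')

print(hashtag_sentiment("Loving this weather #sunnydays #nothappy"))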
def handle_tweets(df_tweets):
    seg_eng = Segmenter(corpus="english")
    texts = list(df_tweets["text"])
    # f = open(data_path + "abs_tweets.txt", "w")
    hashtags = []
    clean_tweets = []
    for t in texts:
        pattern = r'#\w+|#\w+$'
        remove = re.compile(pattern)
        removed_t = remove.sub(r'', t)
        matches = re.findall(pattern, t)
        hashes = [seg_eng.segment(i.lstrip('#').lower()) for i in matches]
        tweet = tokenizer(removed_t)
        clean_tweets.append(tweet)
        hashtags.append(hashes)
        # f.write(tweet)
        # f.write("\n")
    # f.close()
    return clean_tweets, hashtags
fhc = open('finalallcomments.txt', 'a+')
fhp = open('finalallposts.txt', 'a+')
fht = open('finalalltags.txt', 'a+')

for commentFILE, postFILE, tagFILE in zip(listOfFILEcomments, listOfFILEposts,
                                          listOfFILEtags):
    commentGenerator = open(commentFILE, 'r')
    postGenerator = open(postFILE, 'r')
    tagGenerator = open(tagFILE, 'r')
    for comment, post, tag in zip(commentGenerator, postGenerator,
                                  tagGenerator):
        if comment.strip() and post.strip() and tag.strip():
            fhc.write(comment)
            fhp.write(post)
            fht.write(';'.join(
                [seg_eng.segment(w) for w in tag.split(';') if w]))
    commentGenerator.close()
    postGenerator.close()
    tagGenerator.close()

# close the append handles so everything is flushed before re-reading
fhc.close()
fhp.close()
fht.close()

fhc = open('finalallcomments.txt', 'r')
fhp = open('finalallposts.txt', 'r')
fht = open('finalalltags.txt', 'r')
data = [(random.random(), line1, line2, line3)
        for line1, line2, line3 in zip(fhc, fhp, fht)]
fhc.close()
fhp.close()
fht.close()
def segmentation(self):
    from ekphrasis.classes.segmenter import Segmenter
    seg_eg = Segmenter(corpus="english")
    seg_tw = Segmenter(corpus="twitter")
    self.text = [seg_tw.segment(sent) for sent in self.text]
    return self.text
class TextPreProcessor:
    def __init__(self, **kwargs):
        """
        Kwargs:
            omit (list): choose what tokens you want to omit from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                Important Notes:
                    1 - put url at front, if you plan to use it.
                        Messes with the regexes!
                    2 - if you use hashtag then unpack_hashtags will
                        automatically be set to False

            normalize (list): choose what tokens you want to normalize
                from the text.
                possible values: ['email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'hashtag']
                for example: [email protected] will be transformed to <email>
                Important Notes:
                    1 - put url at front, if you plan to use it.
                        Messes with the regexes!
                    2 - if you use hashtag then unpack_hashtags will
                        automatically be set to False

            unpack_contractions (bool): replace *English* contractions in
                ``text`` str with their unshortened forms,
                for example: can't -> can not, wouldn't -> would not, and so on...

            unpack_hashtags (bool): split a hashtag to its constituent words,
                for example: #ilikedogs -> i like dogs

            annotate (list): add special tags to special tokens.
                possible values: ['hashtag', 'allcaps', 'elongated', 'repeated']
                for example: [email protected] -> [email protected] <email>

            tokenizer (callable): callable function that accepts a string and
                returns a list of strings. If no tokenizer is provided then
                the text will be tokenized on whitespace.

            segmenter (str): define the statistics of what corpus you would
                like to use [english, twitter]

            corrector (str): define the statistics of what corpus you would
                like to use [english, twitter]

            all_caps_tag (str): how to wrap the capitalized words.
                values [single, wrap, every]
                Note: applicable only when `allcaps` is included in annotate[]
                    - single: add a tag after the last capitalized word
                    - wrap: wrap all words with opening and closing tags
                    - every: add a tag after each word

            spell_correct_elong (bool): choose if you want to perform spell
                correction after the normalization of elongated words.
                * significantly affects performance (speed)

            spell_correction (bool): choose if you want to perform spell
                correction on the text.
                * significantly affects performance (speed)

            fix_text (bool): choose if you want to fix bad unicode terms and
                html entities.
        """
        self.omit = kwargs.get("omit", {})
        self.backoff = kwargs.get("normalize", {})
        self.include_tags = kwargs.get("annotate", {})
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.dicts = kwargs.get("dicts", None)
        self.spell_correction = kwargs.get("spell_correction", False)
        self.spell_correct_elong = kwargs.get("spell_correct_elong", False)
        self.fix_text = kwargs.get("fix_bad_unicode", False)
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.all_caps_tag = kwargs.get("all_caps_tag", "wrap")
        self.mode = kwargs.get("mode", "normal")

        if self.unpack_hashtags:
            self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        if self.mode != "fast":
            self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)

        self.regexes = ExManager().get_compiled()

        if 'hashtag' in self.omit or 'hashtag' in self.backoff:
            print("You can't omit/backoff and unpack hashtags!\n "
                  "unpack_hashtags will be set to False")
            self.unpack_hashtags = False

    def __copy__(self):
        return self

    def __deepcopy__(self, memo):
        return self

    @staticmethod
    def add_special_tag(m, tag, mode="single"):
        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag) for t in tokens])
            return " " + processed + " "

    @lru_cache(maxsize=4096)
    def handle_hashtag_match(self, m):
        """
        Break a string into its constituent words (using the Viterbi algorithm)
        """
        text = m.group()[1:]

        # todo: simplify routine
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
            # print(m.group(), " - ", expanded)
            # with open("analysis/segmenter_" +
            #           self.segmenter_corpus + ".txt", "a") as f:
            #     f.write(m.group() + "\t" + expanded + "\n")
        else:
            # split words following the CamelCase convention
            expanded = self.regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")
            # print(m.group(), " - ", expanded)

        if "hashtag" in self.include_tags:
            expanded = self.add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    def handle_elongated_match(self, m):
        text = m.group()

        # normalize to at most 2 repeating chars
        text = self.regexes["normalize_elong"].sub(r'\1\1', text)

        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized

        # try to spell correct the word
        if self.spell_correct_elong:
            text = self.spell_corrector.correct_word(text, assume_wrong=True,
                                                     fast=True)
            # with open("analysis/spell_corrector_" +
            #           self.corrector_corpus + ".txt", "a") as f:
            #     f.write(m.group() + " - " + text + "\n")
            # print(m.group(), "-", text)

        if "elongated" in self.include_tags:
            text = self.add_special_tag(text, "elongated")

        return text

    @lru_cache(maxsize=4096)
    def handle_repeated_puncts(self, m):
        """
        return the sorted set, so that random combinations of puncts
        will be mapped to the same token
        "!??!?!!", "?!!!!?!", "!!?", "!?!?" --> "?!"
        "!...", "...?!" --> ".!"
        :param m:
        :return:
        """
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))

        if "repeated" in self.include_tags:
            text = self.add_special_tag(text, "repeated")

        return text

    @lru_cache(maxsize=4096)
    def handle_generic_match(self, m, tag, mode="every"):
        """
        Args:
            m ():
            tag ():
            mode ():

        Returns:
        """
        text = m.group()
        text = self.add_special_tag(text, tag, mode=mode)
        return text

    @lru_cache(maxsize=4096)
    def handle_emphasis_match(self, m):
        """
        :param m:
        :return:
        """
        text = m.group().replace("*", "")

        if "emphasis" in self.include_tags:
            text = self.add_special_tag(text, "emphasis")

        return text

    @staticmethod
    def dict_replace(wordlist, _dict):
        return [_dict[w] if w in _dict else w for w in wordlist]

    @staticmethod
    def remove_hashtag_allcaps(wordlist):
        in_hashtag = False
        _words = []
        for word in wordlist:
            if word == "<hashtag>":
                in_hashtag = True
            elif word == "</hashtag>":
                in_hashtag = False
            elif word in {"<allcaps>", "</allcaps>"} and in_hashtag:
                continue
            _words.append(word)
        return _words

    @lru_cache(maxsize=4096)
    def handle_general_word_segment_and_spelling(self, m):
        """
        :param m:
        :return:
        """
        text = m.group()
        text = self.segmenter.segment(text)
        return text

    def pre_process_doc(self, doc):
        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces

        # ###########################
        # # fix bad unicode
        # ###########################
        # if self.fix_bad_unicode:
        #     doc = textacy.preprocess.fix_bad_unicode(doc)
        #
        # ###########################
        # # fix html leftovers
        # ###########################
        # doc = html.unescape(doc)

        ###########################
        # fix text
        ###########################
        if self.fix_text:
            doc = ftfy.fix_text(doc)

        ###########################
        # BACKOFF & OMIT
        ###########################
        for item in self.backoff:
            # better add an extra space after the match, just to be safe;
            # extra spaces will be normalized later anyway
            doc = self.regexes[item].sub(
                lambda m: " " + "<" + item + ">" + " ", doc)
        for item in self.omit:
            doc = doc.replace("<" + item + ">", '')

        ###########################
        # segment other words, not hashtags
        ###########################
        # doc = self.regexes['not_hashtag'].sub(
        #     lambda w: self.handle_general_word_segment_and_spelling(w), doc)
        # for word in doc.split(" "):
        #     if not word.startswith('#'):
        #         word = self.segmenter.segment(word)
        #         new_doc.append(word)
        # doc = " ".join(new_doc)

        ###########################
        # unpack hashtags
        ###########################
        if self.unpack_hashtags:
            doc = self.regexes["hashtag"].sub(
                lambda w: self.handle_hashtag_match(w), doc)

        ###########################
        # handle special cases
        ###########################
        if self.mode != "fast":
            if "allcaps" in self.include_tags:
                doc = self.regexes["allcaps"].sub(
                    lambda w: self.handle_generic_match(
                        w, "allcaps", mode=self.all_caps_tag), doc)
            if "elongated" in self.include_tags:
                doc = self.regexes["elongated"].sub(
                    lambda w: self.handle_elongated_match(w), doc)
            if "repeated" in self.include_tags:
                doc = self.regexes["repeat_puncts"].sub(
                    lambda w: self.handle_repeated_puncts(w), doc)
            if "emphasis" in self.include_tags:
                doc = self.regexes["emphasis"].sub(
                    lambda w: self.handle_emphasis_match(w), doc)
            if "censored" in self.include_tags:
                doc = self.regexes["censored"].sub(
                    lambda w: self.handle_generic_match(w, "censored"), doc)

        ###########################
        # unpack contractions: i'm -> i am, can't -> can not...
        ###########################
        # remove textacy dependency
        if self.unpack_contractions:
            doc = unpack_contractions(doc)

        # omit allcaps if inside hashtags
        doc = re.sub(r' +', ' ', doc)  # remove repeating spaces
        # doc = re.sub(r'<hashtag><allcaps>', '<hashtag>', doc)
        # doc = doc.replace('<hashtag> <allcaps>', '<hashtag>')
        # doc = doc.replace('</allcaps> </hashtag>', '</hashtag>')

        ###########################
        # Tokenize
        ###########################
        doc = self.remove_hashtag_allcaps(doc.split())
        doc = " ".join(doc)  # normalize whitespace
        if self.tokenizer:
            doc = self.tokenizer(doc)

        # Replace tokens with special dictionaries (slang, emoticons ...)
        # todo: add spell check before!
        if self.dicts:
            for d in self.dicts:
                doc = self.dict_replace(doc, d)

        return doc

    def pre_process_docs(self, docs, lazy=True):
        from tqdm import tqdm
        for d in tqdm(docs, desc="PreProcessing..."):
            yield self.pre_process_doc(d)
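# A minimal usage sketch for the TextPreProcessor class above. The supporting
# names it references (Segmenter, SpellCorrector, ExManager, lru_cache, ftfy,
# unpack_contractions) are assumed to be imported from ekphrasis and friends in
# the original project; only the two imports needed by this sketch are shown.
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date'],
    annotate={"hashtag", "allcaps", "elongated", "repeated"},
    unpack_hashtags=True,
    unpack_contractions=True,
    segmenter="twitter",
    corrector="twitter",
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

print(" ".join(text_processor.pre_process_doc(
    "CANT WAIT for #machinelearning tooooday!!! :D")))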
def clean_tweets(df):
    # define the text preprocessor
    text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'money', 'phone', 'time', 'date'],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own;
        # the tokenizer should take a string as input and return a list of tokens
        # tokenizer=SocialTokenizer(lowercase=True).tokenize,
        tokenizer=TweetTokenizer().tokenize,
        # list of dictionaries for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

    seg = Segmenter(corpus="twitter")
    tweet_text = df.tweet_text.to_list()
    clean_tweets = []
    for tweet in tweet_text:
        # manually tag usernames
        # ex: @DoctorChristian -> <user> doctor christian </user>
        match = re.findall(r'@\w+', tweet)
        try:
            for at in match:
                user_seg = seg.segment(at[1:])
                tweet = tweet.replace(at, '<user> ' + user_seg + ' </user>')
        except:
            pass

        # manually tag all caps so that the unpack_contractions function works
        match = re.findall(r"(?<![#@$])\b([A-Z][A-Z ,.']*[A-Z])\b", tweet)
        try:
            for all_caps in match:
                tweet = tweet.replace(
                    all_caps, '<allcaps> ' + all_caps.lower() + ' </allcaps>')
        except:
            pass

        # manually tag percentages
        match = re.findall(r"(\d+\.?\d?%)", tweet)
        try:
            for percent in match:
                tweet = tweet.replace(
                    percent,
                    '<percent> ' + percent[0:len(percent) - 1] + ' </percent>')
        except:
            pass

        # deal with contractions that the tool misses
        tweet = re.sub(
            r"(\b)([Ww]hat|[Ii]t|[Hh]e|[Ss]he|[Tt]hat|[Tt]here|[Hh]ow|[Ww]ho|[Hh]ere|[Ww]here|[Ww]hen)'s",
            r"\1\2 is", tweet)
        tweet = re.sub(r"(\b)([Aa]in)'t", r"is not", tweet)
        tweet = re.sub(r"(\b)([Ww]asn)'t", r"was not", tweet)
        tweet = re.sub(r"(\b)([Hh]e|[Ss]he|[Ii]|[Yy]ou|[Tt]hey|[Ww]e)'d",
                       r"\1\2 would", tweet)
        tweet = re.sub(r"(\b)([Ii]t|[Tt]hat|[Tt]his)'ll", r"\1\2 will", tweet)
        tweet = re.sub(r"(\b)([Cc])'mon", r"come on", tweet)

        # process the rest of the tweet with the nltk tweet tokenizer
        tweet = " ".join(text_processor.pre_process_doc(tweet)).lower()
        clean_tweets.append(tweet)

    # below is code to create the tsv file of cleaned tweets
    df['tweet_text'] = clean_tweets
    return df
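# Hypothetical usage sketch for clean_tweets: a tiny DataFrame with the
# `tweet_text` column the function expects. pandas and the names used inside
# clean_tweets (re, TweetTokenizer, TextPreProcessor, Segmenter, emoticons)
# are assumed to be imported elsewhere in the original project.
import pandas as pd

sample = pd.DataFrame({"tweet_text": [
    "@DoctorChristian I CANT believe it's 50% off!!! #amazingdeal"
]})
print(clean_tweets(sample).tweet_text[0])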
    # list of dictionaries for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons])

seg = Segmenter(corpus="twitter")

clean_tweets = []
for tweet in data:
    # manually tag usernames
    # ex: @DoctorChristian -> <user> doctor christian </user>
    match = re.findall(r'@\w+', tweet)
    try:
        for at in match:
            user_seg = seg.segment(at[1:])
            tweet = tweet.replace(at, '<user> ' + user_seg + ' </user>')
    except:
        pass

    # manually tag all caps so that the unpack_contractions function works
    match = re.findall(r"(?<![#@$])\b([A-Z][A-Z ,.']*[A-Z])\b", tweet)
    try:
        for all_caps in match:
            tweet = tweet.replace(
                all_caps, '<allcaps> ' + all_caps.lower() + ' </allcaps>')
    except:
        pass

    # deal with contractions that the tool misses
from ekphrasis.classes.segmenter import Segmenter

# segmenter using the word statistics from English Wikipedia
seg_eng = Segmenter(corpus="english")

# segmenter using the word statistics from Twitter
seg_tw = Segmenter(corpus="twitter")

# segmenter using the word statistics from Twitter (2018)
seg_tw_2018 = Segmenter(corpus="twitter_2018")

words = [
    "exponentialbackoff", "gamedev", "retrogaming", "thewatercooler",
    "panpsychism"
]
for w in words:
    print(w)
    print("(eng):", seg_eng.segment(w))
    print("(tw):", seg_tw.segment(w))
    print("(tw_2018):", seg_tw_2018.segment(w))
    print()
def preprocess_corpus(corpus, stemming=False, all_smilies=False, pos_smilies=False,
                      neg_smilies=False, other_smilies=False, hugs_and_kisses=False,
                      hearts=False, hashtag=False, hashtag_mention=False,
                      numbers=False, number_mention=False,
                      exclamation=False,  # NB: this one is not tested yet; possibly just remove it
                      set_to_not=False, segmentation_hash=False, spelling=False,
                      elongation=False, remove_signs=False):
    """
    Function used to apply preprocessing.

    Input:
        corpus: a corpus in the same format as the output of creat_corpus.
        stemming: if true, words are stemmed with the Porter stemmer. Default False.
        all_smilies: if true, same effect as if pos_smilies, neg_smilies, and
            other_smilies were all true. Default False.
        pos_smilies: if true, positive smilies such as : ), ; ), ( ;, :p, ;p, : p
            are replaced by "smile". Default False.
        neg_smilies: if true, negative smilies such as : (, ) : are replaced by
            "sad". Default False.
        other_smilies: if true, smilies such as ^_^ are replaced by a describing
            word. Default False.
        hugs_and_kisses: if true, words such as xxx, xoxo, etc. are replaced by
            "kiss", or "hug" and "kiss". Default False.
        hearts: if true, "<3" is replaced by "heart". Default False.
        hashtag: if true, hashtags are removed from the beginning of words, so
            #apple becomes apple. Default False.
        hashtag_mention: if true, and if hashtag is true, the word "hashtag" is
            added at the end of a tweet that used to contain one or more words
            beginning with a hashtag. Default False.
        numbers: if true, words that are purely numbers are removed. Default False.
        number_mention: if true, and if numbers is true, the word "number" is
            added at the end of a tweet that used to contain one or more words
            that were purely numbers. Default False.
        exclamation: if true, the word "exclamation" is added at the end of a
            tweet that contains one or more "!". Default False.
        set_to_not: if true, all words ending with "n't" are replaced by "not".
            Default False.
        segmentation_hash: if true, words starting with # that do not appear in
            the English dictionary are split into segments, e.g. '#iammoving'
            becomes 'i am moving'. Default False.
        spelling: if true, all words that are not part of the English dictionary
            are set to the most likely word within two alterations. Default False.
        elongation: if true, the length of all letter sequences in words that
            are not part of the English dictionary is capped at 2. Before words
            altered this way, the word 'elongation' is inserted. Default False.
        remove_signs: if true, signs such as ",", ".", ":", ";", "-" are removed.
            Default False.

    Output:
        new_corpus: a new corpus, in the same format as the input corpus.
    """
    start = time.time()

    # initialising the new corpus:
    new_corpus = []

    # Want to split the tweets using this tokenizer:
    tknzr = TweetTokenizer(reduce_len=True)

    if stemming:
        ps = PorterStemmer()
    if segmentation_hash or spelling or elongation:
        d = enchant.Dict("en_US")
    if segmentation_hash:
        # seg = Segmenter(corpus="english")
        seg = Segmenter(corpus="twitter")
    if spelling:
        sp = SpellCorrector(corpus="english")

    elapsed = time.time()
    print("Time in min before starting first for loop:", (elapsed - start) / 60)

    # Want to go through each line (tweet) in the corpus
    for k, line in enumerate(corpus):
        if hashtag_mention:
            there_is_hashtag = False
        if number_mention:
            there_is_number = False
        if exclamation:
            there_is_exclamation = False

        # Splitting the tweet using the chosen tokenizer.
        words = tknzr.tokenize(line)

        # Initializing for cleaned_tweet:
        cleaned_tweet = []

        for i, word in enumerate(words):
            # Indicating that the word has not been treated yet
            word_not_treated = True
            end_ = len(words) - 1

            if (pos_smilies or all_smilies) and word_not_treated:
                if (i > 0 and (word == 'd' and (words[i-1] == ':' or words[i-1] == ';'))) or word == ':d' or word == ';d':
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif (i > 0 and (word == 'p' and (words[i-1] == ':' or words[i-1] == ';'))) or word == ':p' or word == ';p':
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif i > 0 and word == 'd' and (words[i-1] == ':' or words[i-1] == ';' or words[i-1] == 'x'):
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif i > 0 and words[i-1] == '(' and (word == ':' or word == ';'):
                    cleaned_tweet.append('smile')
                    word_not_treated = False
                elif i > 0 and word == ')' and (words[i-1] == ':' or words[i-1] == ';'):
                    cleaned_tweet.append('smile')
                    word_not_treated = False

            if (neg_smilies or all_smilies) and word_not_treated:
                if i > 0 and words[i-1] == ')' and (word == ':' or word == ';'):
                    cleaned_tweet.append('sad')
                    word_not_treated = False
                elif i > 0 and word == '(' and (words[i-1] == ':' or words[i-1] == ';'):
                    cleaned_tweet.append('sad')
                    word_not_treated = False

            if (other_smilies or all_smilies) and word_not_treated:
                if i > 0 and i < end_ and word == '_' and words[i-1] == '^' and words[i+1] == '^':
                    cleaned_tweet.append('eyesmiley')
                    word_not_treated = False
                elif i > 0 and word == 'o' and words[i-1] == ':':
                    cleaned_tweet.append('openmouthface')
                    word_not_treated = False
                elif i > 0 and word == '/' and words[i-1] == ':':
                    cleaned_tweet.append('slashsmiely')
                    word_not_treated = False
                elif i > 0 and word == '*' and (words[i-1] == ':' or words[i-1] == ';'):
                    cleaned_tweet.append('kiss')
                    word_not_treated = False

            if hugs_and_kisses and word_not_treated:
                # want to find hearts, hugs, kisses, etc:
                if word == "xoxo" or word == "xo" or word == "xoxoxo" or word == "xxoo":
                    cleaned_tweet.append('hug')
                    cleaned_tweet.append('kiss')
                    word_not_treated = False
                elif word == 'xx' or word == 'xxx' or word == 'xxxx':
                    cleaned_tweet.append('kiss')
                    word_not_treated = False

            if hearts and word_not_treated:
                if word == "<3":
                    cleaned_tweet.append('heart')
                    word_not_treated = False

            if hashtag and word_not_treated:
                if word[0] == '#':
                    there_is_hashtag = True
                    if len(word) > 1 and segmentation_hash and not d.check(word[1:]):
                        cleaned_tweet.append(seg.segment(word[1:]))
                    else:
                        cleaned_tweet.append(word[1:])
                    word_not_treated = False

            if numbers and word_not_treated:
                if word.isdigit():
                    there_is_number = True
                    word_not_treated = False

            if exclamation and word_not_treated:
                if word == '!':
                    there_is_exclamation = True
                    cleaned_tweet.append(word)
                    word_not_treated = False

            if set_to_not and word_not_treated:
                if word[-3:] == 'n\'t':
                    cleaned_tweet.append('not')
                    word_not_treated = False

            if word_not_treated:
                if (not remove_signs) or (remove_signs and (
                        (word != '^' and word != ',' and word != '.'
                         and word != ':' and word != '-' and word != '´'
                         and word != ';' and word != ')' and word != '('
                         and word != '*'))):
                    if (not word[0].isdigit()) and elongation and not d.check(word) and len(word) > 2:
                        new = []
                        new.append(word[0])
                        for i, letter in enumerate(word):
                            if i > 0 and i < len(word) - 1:
                                if not (letter == word[i-1] == word[i+1]):
                                    new.append(letter)
                        new.append(word[-1])
                        new_word = ''.join(new)
                        if new_word != word:
                            cleaned_tweet.append('elongation')
                            word = new_word
                    if spelling and not d.check(word) and len(word) > 2:
                        word = sp.correct(word)
                    if stemming:
                        word = ps.stem(word)
                    cleaned_tweet.append(word)

        if hashtag_mention and there_is_hashtag:
            cleaned_tweet.append('hashtag')
        if number_mention and there_is_number:
            cleaned_tweet.append('number')
        if exclamation and there_is_exclamation:
            cleaned_tweet.append('exclamation')

        new_words = ' '.join(cleaned_tweet)
        new_words = new_words.encode('utf-8')
        new_corpus.append(new_words)

        if np.mod(k, 25000) == 1:
            elapsed = time.time()
            print("Time in min after", k, "tweets:", (elapsed - start) / 60)

    elapsed = time.time()
    print("Time in min total:", (elapsed - start) / 60)

    return new_corpus
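# A minimal usage sketch for preprocess_corpus under assumed imports: the
# original snippet relies on time, numpy, nltk, pyenchant, and ekphrasis being
# available at module level, so these imports are guesses at what it uses.
import time
import enchant
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.spellcorrect import SpellCorrector

sample_corpus = ["Sooo happy about this!!! #bestdayever <3",
                 "can't believe it ) : #fail 123"]
cleaned = preprocess_corpus(sample_corpus, all_smilies=True, hearts=True,
                            hashtag=True, segmentation_hash=True,
                            set_to_not=True, remove_signs=True)
print(cleaned)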
# ekphrasis is meant for sentiment analysis in particular; here it is used for hashtag segmentation


# Method to clean tweets: remove special characters, hashtags and URLs
def clean_tweet(tweet):
    tweet = re.sub(r"pic.\S+", "", tweet)
    return ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
               tweet).split())


# Query for the 20 most recent tweets
tweets = query_tweets_from_user("realDonaldTrump", 20)

hashtagArray = []

# Print the cleaned tweets
for tweet in tweets:
    print(clean_tweet(tweet.text))
    tweetHashtag = re.findall(r"#(\w+)", tweet.text)
    if len(tweetHashtag) != 0:
        hashtagArray.extend(tweetHashtag)
    print("\n")

# The corpus refers to the statistics used to segment the hashtags; in this case they come from Twitter
seg_tw = Segmenter(corpus="twitter")

print("Hashtags Segmentation:\n")
for hashtag in hashtagArray:
    print("(tw):", seg_tw.segment(hashtag))
    },
    fix_html=True,  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

seg_tw = Segmenter(corpus="twitter")
sp = SpellCorrector(corpus="twitter")

f1 = open('tokenized_tweets_golbeck.txt', 'w')
c = 1
for line in data:
    a = line.strip().split('\t')
    if len(a) >= 3:
        b = a[2]
        c = a[1]
        b = b.split()
        for i in range(len(b)):
            if b[i].startswith('http'):
                b[i] = '<url>'
        b = ' '.join(b)
        a = text_processor.pre_process_doc(b)
        for i in range(len(a)):
            if a[i].isalpha():
                a[i] = seg_tw.segment(sp.correct(a[i]))
        a = ' '.join(a)
        f1.write(a + ' ' + c + '\n')
    dicts=[emoticons, slangdict])

segmenter = Segmenter(corpus="twitter")

count = 0
all_texts = []
user_dict = defaultdict(lambda: None)
for file_name in sorted(os.listdir(tweet_path)):
    if file_name.endswith('.json'):
        print('processing ' + file_name)
        with open(tweet_path + file_name, 'r') as tweet_batch:
            tweets = json.load(tweet_batch)
        for tweet in tweets:
            # text = preprocess(tweet['content']['text'])
            text = tweet['content']['text']  # raw tweet text (field inferred from usage below)
            tokens = text_processor.pre_process_doc(text)
            tokens = [segmenter.segment(t) for t in tokens]
            text = " ".join(tokens)
            text = process_tags(text).strip()
            username = str(tweet['username'])
            if text:
                if user_dict[username]:
                    user_dict[username] = list(
                        set(user_dict[username]) | set(tweet['college']))
                else:
                    user_dict[username] = tweet['college']
                data.loc[count, 'username'] = str(tweet['username'])
                data.loc[count, 'id'] = str(tweet['id_str'])
                data.loc[count, 'conversation'] = str(
                    tweet['content']['conversation'])
                data.loc[count, 'text'] = text
class TextPreProcessor:
    """
    Kwargs:
        normalize (list)
            possible values: ['url', 'email', 'percent', 'money', 'phone',
                'user', 'time', 'date']

        annotate (list)
            possible values: ['hashtag', 'allcaps', 'elongated', 'repeated',
                'emphasis', 'censored']

        unpack_hashtags (bool)

        unpack_contractions (bool)

        segmenter (str): define the statistics of what corpus you would
            like to use [english, twitter]

        corrector (str): define the statistics of what corpus you would
            like to use [english, twitter]

        tokenizer (callable): callable function that accepts a string and
            returns a list of strings. If no tokenizer is provided then
            the text will be tokenized on whitespace.

        simplify_emoticons (bool)

        dictionaries (list)
    """

    def __init__(self, **kwargs):
        self.tokens_to_normalize = kwargs.get("normalize", [])
        self.annotate = kwargs.get("annotate", [])
        self.unpack_hashtags = kwargs.get("unpack_hashtags", False)
        self.unpack_contractions = kwargs.get("unpack_contractions", False)
        self.segmenter_corpus = kwargs.get("segmenter", "english")
        self.corrector_corpus = kwargs.get("corrector", "english")
        self.segmenter = Segmenter(corpus=self.segmenter_corpus)
        self.spell_corrector = SpellCorrector(corpus=self.corrector_corpus)
        self.tokenizer = kwargs.get("tokenizer", None)
        self.simplify_emoticons = kwargs.get("simplify_emoticons", False)
        self.dictionaries = kwargs.get("dictionaries", [])
        self.stats = {}
        self.preprocessed_texts = -1

    def pre_process(self, text: str, with_stats=False):
        self._increment_counter()
        text = self._remove_repeating_spaces(text)
        text = self._normalize(text)
        text = self._unpack_hashtags(text)
        text = self._annotate(text)
        text = self._unpack_contractions(text)
        text = self._remove_repeating_spaces(text)
        tokens = self._tokenize(text)
        tokens = self._simplify_emoticons(tokens)
        tokens = self._replace_using_dictionaries(tokens)
        if with_stats:
            return tokens, self._pre_processed_text_stats()
        else:
            return tokens

    def _pre_processed_text_stats(self):
        return self.stats[self.preprocessed_texts]

    def _increment_counter(self):
        self.preprocessed_texts += 1
        self.stats[self.preprocessed_texts] = {}

    def _normalize(self, text):
        for item in self.tokens_to_normalize:
            text = self._change_using_regexp(item, lambda m: f' <{item}> ',
                                             text, 'normalize')
        return text

    def _unpack_hashtags(self, text):
        if self.unpack_hashtags:
            return self._change_using_regexp(
                "hashtag",
                lambda w: self._handle_hashtag_match(w),
                text, "unpack")
        return text

    def _annotate(self, text):
        text = self._annotate_allcaps(text)
        text = self._annotate_elongated(text)
        text = self._annotate_repeated(text)
        text = self._annotate_emphasis(text)
        text = self._annotate_censored(text)
        return text

    def _annotate_allcaps(self, text):
        if "allcaps" in self.annotate:
            return self._change_using_regexp(
                "allcaps",
                lambda w: self._handle_generic_match(w, "allcaps", mode='wrap'),
                text, "annotate")
        return text

    def _annotate_elongated(self, text):
        if "elongated" in self.annotate:
            return self._change_using_regexp(
                "elongated",
                lambda w: self._handle_elongated_match(w),
                text, "annotate")
        return text

    def _annotate_repeated(self, text):
        if "repeated" in self.annotate:
            return self._change_using_regexp(
                "repeat_puncts",
                lambda w: self._handle_repeated_puncts(w),
                text, "annotate")
        return text

    def _annotate_emphasis(self, text):
        if "emphasis" in self.annotate:
            return self._change_using_regexp(
                "emphasis",
                lambda w: self._handle_emphasis_match(w),
                text, "annotate")
        return text

    def _annotate_censored(self, text):
        if "censored" in self.annotate:
            return self._change_using_regexp(
                "censored",
                lambda w:
                self._handle_generic_match(w, "censored"),
                text, "annotate")
        return text

    def _change_using_regexp(self, regexp_name, func, text, stats_name_prefix):
        changing_result = regexes[regexp_name].subn(func, text)
        self._update_stats(f'{stats_name_prefix}_{regexp_name}',
                           changing_result[1])
        return changing_result[0]

    def _unpack_contractions(self, text):
        if self.unpack_contractions:
            text = self._unpack_selected_contrations(
                r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|"
                r"[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n'?t",
                r"\1\2 not", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll",
                r"\1\2 will", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Tt]hey|[Ww]hat|[Ww]ho|[Yy]ou)ll", r"\1\2 will", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Tt]hey|[Ww]hat|[Yy]ou)re", r"\1\2 are", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Hh]e|[Ss]he)'s", r"\1\2 is", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)"
                r"'?ve", r"\1\2 have", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Cc]a)n't", r"\1\2n not", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Ii])'m", r"\1\2 am", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Ll]et)'?s", r"\1\2 us", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Ww])on'?t", r"\1\2ill not", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Ss])han'?t", r"\1\2hall not", text)
            text = self._unpack_selected_contrations(
                r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all", text)
        return text

    def _unpack_selected_contrations(self, regexp, replacement, text):
        unpacking_result = re.subn(regexp, replacement, text)
        self._update_stats("unpack_contrations", unpacking_result[1])
        return unpacking_result[0]

    def _tokenize(self, text):
        if self.tokenizer:
            return self.tokenizer(text)
        else:
            return text.split(' ')

    def _simplify_emoticons(self, tokens):
        if self.simplify_emoticons:
            result = []
            for token in tokens:
                if token in emoticons:
                    new_emoticon = emoticons[token]
                    if new_emoticon != token:
                        self._update_stats('emoticon_simplification', 1)
                    result.append(new_emoticon)
                else:
                    result.append(token)
            return result
        else:
            return tokens

    def _replace_using_dictionaries(self, tokens):
        if len(self.dictionaries) > 0:
            for dictionary in self.dictionaries:
                for idx, token in enumerate(tokens):
                    if token in dictionary:
                        value = dictionary[token]
                        if '<entity>' not in value:
                            tokens[idx] = value
                            self._update_stats('dictionary_replacement', 1)
            return ' '.join(tokens).split(' ')
        else:
            return tokens

    @lru_cache(maxsize=65536)
    def _handle_hashtag_match(self, m):
        text = m.group()[1:]
        if text.islower():
            expanded = self.segmenter.segment(text)
            expanded = " ".join(expanded.split("-"))
            expanded = " ".join(expanded.split("_"))
        else:
            expanded = regexes["camel_split"].sub(r' \1', text)
            expanded = expanded.replace("-", "")
            expanded = expanded.replace("_", "")

        if "hashtag" in self.annotate:
            expanded = self._add_special_tag(expanded, "hashtag", mode="wrap")

        return expanded

    @lru_cache(maxsize=65536)
    def _handle_generic_match(self, m, tag, mode="every"):
        text = m.group()
        if tag == 'allcaps':  # workaround for allcaps contractions like YOU'RE  TODO: refactor
            text = text.lower()
        text = self._add_special_tag(text, tag, mode=mode)
        return text

    def _handle_elongated_match(self, m):
        text = m.group()
        text = regexes["normalize_elong"].sub(r'\1\1',
                                              text)
        normalized = self.spell_corrector.normalize_elongated(text)
        if normalized:
            text = normalized
        text = self._add_special_tag(text, "elongated")
        return text

    @lru_cache(maxsize=65536)
    def _handle_repeated_puncts(self, m):
        text = m.group()
        text = "".join(sorted(set(text), reverse=True))
        text = self._add_special_tag(text, "repeated")
        return text

    @lru_cache(maxsize=65536)
    def _handle_emphasis_match(self, m):
        text = m.group().replace("*", "")
        text = self._add_special_tag(text, "emphasis")
        return text

    def _update_stats(self, key, value):
        if value > 0:
            stats_for_text = self.stats[self.preprocessed_texts]
            if key not in stats_for_text:
                stats_for_text[key] = 0
            stats_for_text[key] += value

    @staticmethod
    def _remove_repeating_spaces(text):
        return re.sub(r' +', ' ', text).strip()

    @staticmethod
    def _add_special_tag(m, tag, mode="single"):
        if isinstance(m, str):
            text = m
        else:
            text = m.group()

        if mode == "single":
            return " {} <{}> ".format(text, tag)
        elif mode == "wrap":
            return " ".join([" <{}> {} </{}> ".format(tag, text, tag)]) + " "
        elif mode == "every":
            tokens = text.split()
            processed = " ".join([" {} <{}> ".format(t, tag) for t in tokens])
            return " " + processed + " "
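# A minimal usage sketch for the refactored TextPreProcessor above. It assumes
# the module-level `regexes` and `emoticons` objects the class references are
# provided elsewhere in the original project, along with the ekphrasis imports;
# only the tokenizer import needed by this sketch is shown.
from ekphrasis.classes.tokenizer import SocialTokenizer

processor = TextPreProcessor(
    normalize=['url', 'user'],
    annotate=['hashtag', 'allcaps', 'elongated'],
    unpack_hashtags=True,
    unpack_contractions=True,
    segmenter="twitter",
    corrector="twitter",
    tokenizer=SocialTokenizer(lowercase=True).tokenize)

tokens, stats = processor.pre_process("SOOO happy about #machinelearning!!!",
                                      with_stats=True)
print(tokens)
print(stats)  # per-text counters, e.g. annotations applied and hashtags unpacked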
def tokenize_hashtags(hashtags):
    seg_eng = Segmenter(corpus="english")
    # segment each hashtag individually and join the results
    segmented = ' '.join(seg_eng.segment(h) for h in hashtags)
    return segmented
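# Hypothetical call; assumes the '#' characters were stripped beforehand,
# since the function segments the bare hashtag strings.
print(tokenize_hashtags(["gamedev", "thewatercooler"]))
# likely prints something like: "game dev the water cooler"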