def prune_emojis_emoticons(string):
    # At times this doesn't work, especially when an emoticon follows a
    # weird emoji like O.o that isn't registered.
    emojis = emot.emoji(string)
    # The original `"location" in ... .keys() is not None` was a chained
    # comparison; a plain membership test is what was meant.
    if "location" in emojis:
        # Delete from the rightmost match so earlier offsets stay valid.
        for loc in reversed(emojis['location']):
            string = string[0:loc[0]] + string[loc[1] + 1:]
    emoticons = emot.emoticons(string)
    if "location" in emoticons:
        for loc in reversed(emoticons['location']):
            string = string[0:loc[0]] + string[loc[1] + 1:]
    return string
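# Editor's usage sketch (hypothetical input), assuming the emot 2.x
# dict-style API where the result always carries a 'location' key holding
# inclusive [start, end] spans:
import emot
print(prune_emojis_emoticons("good game 🏁 :-)"))  # registered spans stripped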
def process_emoji(tweets):
    # Note: with the dict-style emot API, len() counts the result's keys,
    # not the matches; len(...['value']) would count actual matches.
    emoji_count = len(emot.emoji(tweets))
    emoticon_count = len(emot.emoticons(tweets))
    # Stop-word removal (disabled):
    # stop_words = get_stop_words("en")
    # text_token = tokenizer.tokenize(tweets)
    # non_stop_token = [word for word in text_token if word not in stop_words]
    # non_stop_text = " ".join(non_stop_token)
    # Keep only word characters and apostrophes, then collapse whitespace.
    new_string = re.sub(r"[^\w']", " ", tweets)
    new_string = re.sub(r"\s+", ' ', new_string)
    new_string = new_string.strip()
    # TextBlob: spell correction plus polarity/subjectivity analysis.
    text_blob = TextBlob(new_string)
    correct_string = str(text_blob.correct())
    polarity = text_blob.sentiment.polarity
    subjectivity = text_blob.sentiment.subjectivity
    emoji_result = emojiClass(correct_string, emoji_count, emoticon_count,
                              polarity, subjectivity)
    return emoji_result
def classByEmoji(text):
    global positive_emojis
    global negative_emojis
    global neutral_emojis
    emots = set()
    for map_emoji in emot.emoji(text):
        emots.add(map_emoji['value'])
    for map_emoticon in emot.emoticons(text):
        emots.add(map_emoticon['value'])
    positive_inter = emots.intersection(positive_emojis)
    negative_inter = emots.intersection(negative_emojis)
    neutral_inter = emots.intersection(neutral_emojis)
    # Return a label only when the match is unambiguous; '' means mixed or none.
    if positive_inter:
        if not negative_inter and not neutral_inter:
            return 1
    elif negative_inter:
        if not neutral_inter:
            return -1
    elif neutral_inter:
        return 0
    return ''
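# Quick check with hypothetical lexicons (the real sets live elsewhere);
# assumes the emot 1.x list-of-dicts API that classByEmoji iterates over.
positive_emojis = {'😀', ':-)'}
negative_emojis = {'😠'}
neutral_emojis = {'😐'}
print(classByEmoji("nice 😀"))  # -> 1 (only the positive set matches)
print(classByEmoji("hmm 😐"))   # -> 0 (only the neutral set matches)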
def test_emo():
    test = "I love it, 👨 :-) 🏁:-) :-)🏁 :-) 🏁 <3"
    print(emo.emoji(test))
    print(emo.emoticons(test))
    print(test[27:30])
    print(test[17])
    return None
def convert_emojis(old_text):
    smiley = emot.emoji(old_text)
    new_text = old_text
    if smiley['flag']:
        for i in range(len(smiley['value'])):
            # Use str.replace rather than re.sub: emoji values are literal
            # text, and some would be misread as regex metacharacters.
            new_text = new_text.replace(smiley['value'][i], smiley['mean'][i])
    return new_text
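# Sketch of the intended effect, assuming emot 2.x 'mean' strings such as
# ':thumbs_up:' (exact names vary by emot version):
print(convert_emojis("nice 👍"))  # -> "nice :thumbs_up:" or similar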
def _strip_emojis(text):
    emojis = {emoji['value'] for emoji in emot.emoji(text)}
    normalized = text
    for emoji in emojis:
        normalized = normalized.replace(emoji, '')
    return normalized
def count_emoji(file_name):
    emoji_map = defaultdict(float)
    path = 'Dataset\\Tweets\\'
    if file_name.startswith("dataset"):
        with open(path + file_name, 'r', encoding="utf-8") as file_to_read:
            for line in file_to_read:
                for emoji in emot.emoji(line):
                    emoji_map[emoji['value']] += 1
    return emoji_map
def is_emo(word: str) -> bool:
    """Use emot to detect whether something is an emoticon or emoji."""
    emoji = emot.emoji(word)['flag']
    emoticon = emot.emoticons(word)
    try:
        emoticon = emoticon['flag']
    except TypeError:
        # Some emot versions return a list of result dicts instead of a dict.
        emoticon = emoticon[0]['flag']
    # The annotation promises a bool, so don't return the int sum of two bools.
    return bool(emoji) or bool(emoticon)
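# Usage sketch, assuming an emot version that exposes a boolean 'flag':
print(is_emo(":-)"))    # True, a registered emoticon
print(is_emo("hello"))  # False, neither emoji nor emoticon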
def extract_emojis_emoticons(text):
    extracted = []
    vals = emot.emoticons(text)
    # `len(vals) > 1` was always true for emot's result dict (it has four
    # fixed keys); test the 'flag' field instead.
    if vals['flag']:
        extracted.extend(vals['value'])
    vals = emot.emoji(text)
    if vals['flag']:
        extracted.extend(vals['value'])
    return extracted
def find_emojis(text):
    """
    Find and remove emojis in text.

    Return the text without emojis and the emojis found.
    """
    emojis = []
    for emoji in emot.emoji(text):
        emojis.append(emoji['value'])
        text = text.replace(emoji['value'], '')
    return text, emojis
def formate_smiley(txt):
    """
    Uses the emot package to extract emojis (currently unused).

    Input: txt, a string.
    Output: the list of text emoticons and emoji characters.
    """
    lst = list(map(lambda x: x["value"], emot.emoji(txt)))
    lst2 = list(map(lambda x: x["value"], emot.emoticons(txt)))
    lst.extend(lst2)
    return lst
def isEmoji(word):
    x = emot.emoji(word)
    # emot may return a list of matches or a single result dict; the old
    # code left `val` unbound on an empty list and only kept the last flag.
    if isinstance(x, list):
        return any(v['flag'] for v in x)
    return bool(x['flag'])
def detect_emoticons_emojis(self, string):
    emoticons = emot.emoticons(string)
    emojis = emot.emoji(string)
    if len(emoticons) > 0:
        for emoticon in emoticons:
            value = emoticon['value']
            # The original `value != (')' or ':')` only ever compared
            # against ')'; check both characters.
            if value not in (')', ':'):
                self.emoticons.setdefault(value, set()).add(string)
    if len(emojis) > 0:
        for emoji in emojis:
            value = emoji['value']
            if value not in (')', ':'):
                self.emojis.setdefault(value, set()).add(string)
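# Why the original filter was wrong: `or` returns its first truthy operand,
# so the comparison above never involved ':'.
assert (')' or ':') == ')'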
def text_demojis(text, how_replace=""): emojis = emot.emoji(text) if isinstance(emojis, list): emojis = emojis[0] if emojis['flag']: for index in range(len(emojis["value"])): if how_replace == 'mean': source = emojis['value'][index] target = emojis['mean'][index].split(':')[1] text = text.replace(source, target) else: text = text.replace(emojis['value'][index], "") return text
def extract_emoji(text):
    try:
        emoticons_list = emot.emoticons(text)['value']
    except TypeError:
        emoticons_list = []
    try:
        emoji_list = emot.emoji(text)['value']
    except TypeError:
        emoji_list = []
    emo_list = emoticons_list + emoji_list
    return emo_list
def count_emojis(text):
    global skin_tones
    text = text.replace(':', '')
    emojis_counter = {}
    for map_emoji in emot.emoji(text):
        value = map_emoji['value']
        # Skip skin-tone modifiers; count everything else. dict.get replaces
        # the original try/except KeyError around the increment.
        if value not in skin_tones:
            emojis_counter[value] = emojis_counter.get(value, 0) + 1
    emojis_counter_sorted = sorted(emojis_counter.items(),
                                   key=operator.itemgetter(1), reverse=True)
    return emojis_counter_sorted
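# Quick run with a hypothetical skin_tones set (the real one is defined
# elsewhere); assumes the emot 1.x list-of-dicts API this loop expects.
skin_tones = {'🏻', '🏼', '🏽', '🏾', '🏿'}
print(count_emojis("fun 😀 😀 🎉"))  # -> [('😀', 2), ('🎉', 1)]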
def delete_emoji_and_emoticon(line):
    inline_emoji = emot.emoji(line)
    for data_x in inline_emoji:
        if data_x['value'] in line:
            line = line.replace(data_x['value'], ' ')
    for data_pe in pos_emoticons:
        if data_pe in line:
            line = line.replace(data_pe, ' ')
            emoticon_map[sentiment]["pos"] += 1
    for data_ne in neg_emoticons:
        if data_ne in line:
            line = line.replace(data_ne, ' ')
            emoticon_map[sentiment]["neg"] += 1
    for data_o in others:
        if data_o in line:
            line = line.replace(data_o, ' ')
    return line
def convert_emojis(text):
    def replace_emoji(txt, indexes, replacements):
        for (index, replacement) in zip(indexes, replacements):
            txt[index] = replacement
        return txt

    converted_emojis = emot.emoji(text)
    if converted_emojis["flag"] is True:
        text = [x for x in text]
        # Take the first element of each location: a single-char emoji.
        idx_emojis = [location[0] for location in converted_emojis["location"]]
        text = replace_emoji(text, idx_emojis, converted_emojis["mean"])
        text = "".join(text)
    return text
def base_emoji(text, flag):
    '''base_emoji returns the sentiment of the text based on the emojis and
    emoticons it contains.

    Args:
        text (str): Sentence or paragraph to score.
        flag (boolean): True  --> returns one of 5 classes 0,1,2,3,4, where
                                  2 is neutral, 4 very positive, 1 very negative.
                        False --> returns a polarity rounded to 4 decimal
                                  places between -1 (negative) and 1 (positive).

    Returns:
        __prob_sentiment: an int between 0 and 4 if flag is True,
                          a rounded float between -1 and 1 if flag is False.
    '''
    # Convert input to string.
    text = str(text)
    __temp_emoji = emot.emoji(text)
    __temp_emoti = emot.emoticons(text)
    __pre_final_text = ""
    # Replace emojis and emoticons found in the text with their meanings.
    if __temp_emoji['flag'] == True:
        for data in __temp_emoji['mean']:
            __pre_final_text = str(__pre_final_text) + str(data) + " "
    try:
        if __temp_emoti['flag'] == True:
            for data in __temp_emoti['mean']:
                __pre_final_text = str(__pre_final_text) + str(data) + " "
    except (TypeError, KeyError):
        # Some emot versions return a list here instead of a dict;
        # a bare except would also have swallowed unrelated errors.
        pass
    if len(__pre_final_text) < 2:
        __pre_final_text = text
    __analysis = TextBlob(__pre_final_text)
    # Choose the output format of the sentiment based on flag.
    if flag == False:
        __prob_sentiment = round(__analysis.sentiment.polarity, 4)
    else:
        __prob_sentiment = get_solid_setiment(__analysis.sentiment.polarity)
    return __prob_sentiment
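# Usage sketch: flag=False yields the rounded TextBlob polarity in [-1, 1];
# flag=True routes through get_solid_setiment (defined elsewhere) for a
# 0-4 class label.
print(base_emoji("I love it 😍", False))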
import pandas as pd
import emot
import re

dataset1 = pd.read_csv('dataset1_2.csv', dtype={'tweet_id': object})

# Detect emojis and emoticons - presence and counts
detect_emojis_and_emoticons = lambda x: len(emot.emoji(x)) + len(emot.emoticons(x))
dataset1['count_of_emoticons'] = dataset1.text.apply(detect_emojis_and_emoticons)

# Detect and clean non-ASCII characters; str.replace treats its argument as
# literal text, so the character-class pattern needs re.sub.
clean_non_unicode = lambda x: re.sub(r'[^\x00-\x7F]+', ' ', x).strip().lower()
dataset1['cleaned_text'] = dataset1.text.apply(clean_non_unicode)

# Detect and clean links
clean_links = lambda x: re.sub(r'http\S+', ' ', x).strip()
dataset1['cleaned_text'] = dataset1.cleaned_text.apply(clean_links)

# Clean hashtags
hashtags = ['#drunk', '#imdrunk', '#drank', '#sober', '#notdrunk', '#imnotdrunk']
hashtags_regex = re.compile('|'.join(map(re.escape, hashtags)))
clean_hashtags = lambda x: hashtags_regex.sub(' ', x)
dataset1['cleaned_text'] = dataset1.cleaned_text.apply(clean_hashtags)

# Remove punctuation and extra whitespace
clean_punctuations = lambda x: re.sub(r'[^\w\s]', ' ', x)
dataset1['cleaned_text'] = dataset1.cleaned_text.apply(clean_punctuations)
def produceWordEmbd(rawTweet):
    tweet = rawTweet
    # print(tweet)
    # Remove twitter handles' tags
    tweet = re.sub(r"@{1}[A-Za-z0-9_]+\s", ' ', tweet)
    # Remove web addresses
    tweet = re.sub(r"htt(p|ps)\S+", " ", tweet)
    # Remove email addresses
    emails = r'[a-zA-Z0-9._%-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}'
    tweet = re.sub(emails, " ", tweet)
    # Collect all emojis, stripping them from the tweet as literal text
    # (re.sub could misread some emoji as regex metacharacters).
    emojis = []
    for z in emot.emoji(tweet):
        emojis.append(z['value'])
        tweet = tweet.replace(z['value'], '')
    # print(tweet, emojis)
    # Tokenize on whitespace
    tokens = word_tokenize(tweet)
    # print(tokens)
    # Keep hashtags intact
    newTokens = []
    for i, x in enumerate(tokens):
        if x == '#' and i < len(tokens) - 1:
            newTokens.append(x + tokens[i + 1])
        else:
            if i > 0:
                if tokens[i - 1] != '#':
                    newTokens.append(x)
            else:
                newTokens.append(x)
    # Keep clitics intact
    finalTokens = []
    S = ["'s", "'re", "'ve", "'d", "'m", "'em", "'ll", "n't"]
    for j, x in enumerate(newTokens):
        if x in S:
            finalTokens.append(newTokens[j - 1] + x)
        else:
            if j < len(newTokens) - 1:
                if newTokens[j + 1] not in S:
                    finalTokens.append(x)
            else:
                finalTokens.append(x)
    # Eliminate case sensitivity
    for i, z in enumerate(finalTokens):
        finalTokens[i] = z.lower()
    # Get rid of stopwords
    stopwordSet = set(stopwords.words('english'))
    filteredFinalTokens = [z for z in finalTokens if z not in stopwordSet]
    for x in filteredFinalTokens:
        for m in re.split(r"\\n", x):
            vocabulary.append(m)
    # print(filteredFinalTokens)
    words = filteredFinalTokens
    word_vecs = []
    for word in words:
        fr = np.zeros(400)
        if word in w2v.vocab:
            tr = w2v[word]
            for k in range(400):
                fr[k] = tr[k]
            word_vecs.append(fr)
    for emoji in emojis:
        yr = np.zeros(400)
        if emoji in e2v.vocab:
            zr = e2v[emoji]
            # emoji2vec vectors are 300-dim; the tail stays zero-padded.
            for k in range(300):
                yr[k] = zr[k]
            word_vecs.append(yr)
    return sum(word_vecs) / (len(word_vecs) + 1)
def collectFeaturesFromTimeline(filename):
    user_feature = defaultdict(dict)
    tknzr = TweetTokenizer()
    tknzr_reduce = TweetTokenizer(strip_handles=True, reduce_len=True)
    stop_words = set(stopwords.words('english'))
    with gzip.open(filename, 'r') as inf:
        line_count = 0
        num_contr = 0
        num_tokens = 0
        # key: emoticon text, value: occurrence
        emoticon_use = defaultdict(int)
        emoji_use = defaultdict(int)
        num_ht = 0
        num_with_ht_tweets = 0
        all_tokens_set = set()
        reduced_all_tokens_set = set()
        tweet_text = []
        pos_bigrams = defaultdict(int)
        pos_trigrams = defaultdict(int)
        hashtags_occur = defaultdict(int)
        for line in inf:
            line_count += 1
            tweet = json.loads(line.decode('utf8'))
            userid_str = tweet['user']['id_str']
            # Filtering section: English and non-retweet only.
            if tweet['lang'] != 'en':
                continue
            if 'retweeted_status' in tweet.keys():
                continue
            # note: need to get untruncated tweets
            text = tweet['full_text']
            if text == '':
                continue
            tweet_text.append(text)
            # Tokenize (preserve_case defaults to True).
            tokens = tknzr.tokenize(text)
            num_tokens += len(tokens)
            all_tokens_set.update(set(tokens))
            # `is not 0` compared identity, not value.
            if len(tweet['entities']['hashtags']) != 0:
                num_with_ht_tweets += 1
                for ht in tweet['entities']['hashtags']:
                    hashtags_occur[ht['text']] += 1
                num_ht += len(tweet['entities']['hashtags'])
            # Lexical diversity: number of tokens in a tweet without URLs,
            # user mentions and stopwords, divided by the total number of tokens.
            text_without_url = re.sub(r"http\S+", "", text)
            reduced_tokens = tknzr_reduce.tokenize(text_without_url)
            # Eliminate a leading ':' (guard against empty token lists).
            if reduced_tokens and reduced_tokens[0] == ':':
                reduced_tokens.pop(0)
            reduced_tokens_set = set(reduced_tokens)
            for sw in intersectionTokens(reduced_tokens_set, stop_words):
                # note: remove could raise KeyError; order doesn't matter here
                reduced_tokens_set.remove(sw)
            reduced_all_tokens_set.update(reduced_tokens_set)
            # Lots of :/ with URLs in the text.
            emoticons_text = emot.emoticons(text_without_url)
            if type(emoticons_text) == dict:
                for ele in emoticons_text['value']:
                    emoticon_use[ele] += 1
            emoji_text = emot.emoji(text)
            for ele in emoji_text['value']:
                emoji_use[ele] += 1
            # Features that need a loop over all the tokens:
            # syntactic features and contractions (POS tagging).
            tags = nltk.pos_tag(tokens)
            for i in range(len(tokens)):
                if tags[i][0] in contractions:
                    num_contr += 1
                # For noun's-like contractions
                if tags[i][1] == 'NN' or tags[i][1] == 'NNP':
                    if len(tags[i][0]) <= 3:
                        continue
                    if tags[i][0][-2:] == "'s" or tags[i][0][-3:] == "'re":
                        if i + 1 < len(tokens):
                            # The original `and` on two different tag values
                            # could never hold; `or` matches the apparent intent.
                            if tags[i + 1][1] == 'VBN' or tags[i + 1][1] == 'VBG':
                                num_contr += 1
                # Not including BOS.
                if i < len(tokens) - 1:
                    pos_bigrams[(tags[i][1], tags[i + 1][1])] += 1
                if i < len(tokens) - 2:
                    pos_trigrams[(tags[i][1], tags[i + 1][1], tags[i + 2][1])] += 1
    # line_count is the number of tweets got.
    if line_count == 0 or num_tokens == 0:
        return {}
    user_feature[userid_str]['scraped_tweets_num'] = line_count
    # NOTE: store the user information for behavioral analysis;
    # store the last tweet per user.
    with open(os.path.join(outdir, "{}_user_unique.json".format(group)),
              'a', encoding='utf8') as uniquef:
        uniquef.write("{}\n".format(json.dumps(tweet)))
    # Collect features here.
    # Type-Token Ratio
    user_feature[userid_str]['type_token_ratio'] = len(all_tokens_set) / num_tokens
    # Usage of contractions
    user_feature[userid_str]['num_contractions_per_tweet'] = num_contr / line_count
    # Lexical diversity
    user_feature[userid_str]['lexical_diversity'] = (
        len(reduced_all_tokens_set) / len(all_tokens_set))
    # Hashtags: hashtag occurrence and use by number of users
    user_feature[userid_str]['hashtags_occur'] = hashtags_occur
    hashtags_user = {}
    for key in hashtags_occur.keys():
        hashtags_user[key] = 1
    user_feature[userid_str]['hashtags_user'] = hashtags_user
    user_feature[userid_str]['num_ht_per_tweet'] = num_ht / line_count
    # Emoticons and emoji
    user_feature[userid_str]['emoticon_use'] = emoticon_use
    user_feature[userid_str]['emojis'] = emoji_use
    # Syntactic features
    user_feature[userid_str]['pos_bigrams'] = pos_bigrams
    user_feature[userid_str]['pos_trigrams'] = pos_trigrams
    # Store the text for intangible feature use.
    if os.path.exists(os.path.join(outdir, 'tmp/{}/{}.txt'.format(group, userid_str))):
        return user_feature
    with open(os.path.join(outdir, 'tmp/{}/{}.txt'.format(group, userid_str)),
              'w', encoding='utf8') as outf:
        for tt in tweet_text:
            # NOTE: replace newlines with '.' in the text.
            tt = tt.replace("\n", ".")
            assert "\n" not in tt
            outf.write('{}\n'.format(tt))
    return user_feature
def produceWordEmbd(rawTweet):
    tweet = rawTweet
    print(tweet)
    # Remove twitter handles' tags
    tweet = re.sub(r"@{1}[A-Za-z0-9_]+\s", ' ', tweet)
    # Remove web addresses
    tweet = re.sub(r"htt(p|ps)\S+", " ", tweet)
    # Remove email addresses
    emails = r'[a-zA-Z0-9._%-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}'
    tweet = re.sub(emails, " ", tweet)
    # Tokenize on whitespace
    tokens = word_tokenize(tweet)
    print(tokens)
    # Keep hashtags intact
    newTokens = []
    for i, x in enumerate(tokens):
        if x == '#' and i < len(tokens) - 1:
            newTokens.append(x + tokens[i + 1])
        else:
            if i > 0:
                if tokens[i - 1] != '#':
                    newTokens.append(x)
            else:
                newTokens.append(x)
    # Keep clitics intact
    finalTokens = []
    S = ["'s", "'re", "'ve", "'d", "'m", "'em", "'ll", "n't"]
    for j, x in enumerate(newTokens):
        if x in S:
            finalTokens.append(newTokens[j - 1] + x)
        else:
            if j < len(newTokens) - 1:
                if newTokens[j + 1] not in S:
                    finalTokens.append(x)
            else:
                finalTokens.append(x)
    # Eliminate case sensitivity
    for i, z in enumerate(finalTokens):
        finalTokens[i] = z.lower()
    # Get rid of stopwords
    stopwordSet = set(stopwords.words('english'))
    filteredFinalTokens = [z for z in finalTokens if z not in stopwordSet]
    print(filteredFinalTokens)
    # Treat emojis: pad 300-dim emoji vectors up to the 400-dim word space.
    word_vecs = []
    for j, y in enumerate(filteredFinalTokens):
        z = emot.emoji(y)
        if z == []:
            if y in w2v.vocab:
                print("Adding word vector for " + y)
                word_vecs.append(w2v[y])
        else:
            w = re.findall(r"[A-Za-z0-9]+", y)
            # Start from zeros: np.ndarray(...) left the buffers uninitialized.
            s1 = np.zeros(300)
            s2 = np.zeros(400)
            if w != []:
                w = w[0]
                t = re.sub(w, '', y)
                if t in e2v.vocab:
                    print("Computing emoji vector for " + t)
                    s1 = e2v[t]
                    print(type(s1))
                if w in w2v.vocab:
                    print("Computing vector for word-part " + w)
                    s2 = w2v[w]
                # Zero-pad the 300-dim emoji vector to 400 dims.
                N = 100
                s1 = np.pad(s1, (0, N), 'constant')
                word_vecs.append(s1 + s2)
def n_total_emojis(string):
    result = emot.emoji(string)
    # The old check iterated the result and then indexed it as a dict;
    # test for the 'value' key directly and reuse the single lookup.
    if "value" in result:
        return len(result["value"])
    return 0
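# Usage sketch, assuming the dict-style emot API:
print(n_total_emojis("🏁 go 🏁"))   # -> 2
print(n_total_emojis("no emoji"))  # -> 0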
# to lower case
amazon_file['review'] = amazon_file['review'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))
amazon_file['review'].head()

# extract emoji
import emot
amazon_list = amazon_file['review']
l = []
for i in range(len(amazon_list)):
    # print(emot.emoji(reviewlist[i]).get("value", "none"))
    x = emot.emoji(amazon_list[i]).get("mean", "none")
    l.append(x)
amazon_file['emoji'] = l

emoji_file1 = pd.read_csv('C:\\Users\\gupta\\.spyder-py3\\amazon_emotions.csv')
# regex=True makes the regex intent explicit (newer pandas defaults to literal).
emoji_file1['emoji'] = emoji_file1['emoji'].str.replace(r'[^\w\s]', '', regex=True)
emoji_file1['emoji'].tail()

# punctuation removal
amazon_file['review'] = amazon_file['review'].str.replace(r'[^\w\s]', '', regex=True)
amazon_file['review'].tail()

# stop words removal
import nltk
for x in tqdm.trange(len(data)):
    s = data[x][0]
    for i in tqdm.trange(len(s)):
        print("Word embedding for ", i)
        produceWordEmbd(s[i])

L = len(vocabulary)
wordVectors = []
for l in tqdm.trange(L):
    y = vocabulary[l]
    z = emot.emoji(y)
    if z == []:
        if y in w2v.vocab:
            # print("Adding word vector for " + y)
            wordVectors.append(w2v[y])
    else:
        w = re.findall(r"[A-Za-z0-9]+", y)
        # Start from zeros: np.ndarray(...) left the buffers uninitialized.
        s1 = np.zeros(300)
        s2 = np.zeros(400)
        if w != []:
            w = w[0]
            t = re.sub(w, '', y)
            if t in e2v.vocab:
                # print("Computing emoji vector for " + t)
# to lower case
spotify_file['Review'] = spotify_file['Review'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))
spotify_file['Review'].head()

# extract emoji
import emot
spotify_list = spotify_file['Review']
l = []
for i in range(len(spotify_list)):
    # print(emot.emoji(reviewlist[i]).get("value", "none"))
    x = emot.emoji(spotify_list[i]).get("mean", "none")
    l.append(x)
spotify_file['emoji'] = l

# punctuation removal
spotify_file['Review'] = spotify_file['Review'].str.replace(r'[^\w\s]', '', regex=True)
spotify_file['Review'].tail()

# stop words removal
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def process_title(word=''):
    # punctuation_regex = r"[()]?[.,!?;~:]+"
    specials = [r':-?[\)\(]+']
    word = word.lower()
    others = []
    for o in specials:
        m = re.search(o, word)
        if m is not None:
            others.append(m.group(0))
            word = word.replace(m.group(0), '')
    emoji = emot.emoji(word)
    # Detect skin codes (disabled):
    # to_delete = []
    # for i, emo in enumerate(emoji):
    #     if u"\U0001F3FB" <= emo['value'] <= u"\U0001F3FF":
    #         emoji[i-1]['value'] += emo['value']
    #         to_delete.append(i)
    # for i in reversed(to_delete):
    #     del emoji[i]
    if not re.search('[a-zA-Z]', word):
        # Just manage emojis.
        emos = list(map(lambda emo: emo['value'], emoji))
        for emo in emos:
            word = word.replace(emo, '')
        return ' '.join(emos) + ' ' + word
    emoticons = emot.emoticons(word)
    # Skip all-word emoticons, normally wrong.
    emoticons = list(filter(lambda emo: re.search('[^a-z]', emo['value']), emoticons))
    # Merge single-char emoticons, normally wrong.
    previous = -2
    to_delete = []
    for i, emo in enumerate(emoticons):
        if len(emo['value']) < 2 and emo['location'][0] == previous - 1:
            emoticons[i - 1]['value'] += emo['value']
            to_delete.append(i)
        previous = emo['location'][1]
    for i in reversed(to_delete):
        del emoticons[i]
    # Remove remaining single-char emoticons.
    emoticons = list(filter(lambda emo: len(emo['value']) >= 2, emoticons))
    emos = list(map(lambda emo: emo['value'], emoji + emoticons))
    for emo in emos:
        word = word.replace(emo, '')
    # Punctuation (disabled):
    # punctuation = re.findall(punctuation_regex, word)
    # for p in punctuation:
    #     word = word.replace(p, ' ')
    # # parentheses
    # word = re.sub(r'[\(\)]', '', word)
    # Collapse multiple spaces (the double-space literal was lost in flattening).
    word = word.replace('  ', ' ').strip()
    # Separated letters (i.e. 'w o r k o u t' or 'r & b')
    if re.match(r'^([\w&] )+[\w&]$', word):
        word = word.replace(' ', '')
    # Hashtag
    word = re.sub(r'^#', '', word)
    # Remove stopwords (disabled):
    # stop_words = stopwords.words('english')
    # ' '.join([w for w in word.split(' ') if w not in stop_words])
    # Remove spaces
    word_no_spaces = word.replace(' ', '')
    # if len(punctuation) >= 1:
    #     print(punctuation)
    return ' '.join(emos + others) + ' ' + word + ' ' + word_no_spaces
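# Worked check of the separated-letters rule (pure Python, no emot needed):
import re
assert re.match(r'^([\w&] )+[\w&]$', 'w o r k o u t')
assert re.match(r'^([\w&] )+[\w&]$', 'r & b')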
spotify_file = pd.read_csv('spotify3.csv')
print(spotify_file)

# to lower case
spotify_file['Review'] = spotify_file['Review'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))
spotify_file['Review'].head()

# extract emoji (to be taken later)
import emot
spotify_list = spotify_file['Review']
l = []
for i in range(len(spotify_list)):
    x = emot.emoji(spotify_list[i]).get("value", "none")
    l.append(x)
spotify_file['emoji'] = l

# punctuation removal (note: not to be done)
spotify_file['Review'] = spotify_file['Review'].str.replace(r'[^\w\s]', '', regex=True)
spotify_file['Review'].tail()

# stop words removal
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def place_emoji(self, insert_emoji):
    emoji = emot.emoji(insert_emoji)
    return emoji
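# Editor's note: the snippets above target at least three emot result shapes
# (1.x list of match dicts, 2.x module-level dict of parallel lists, 3.x
# emot.core.emot() object API). A small normalizer, assuming those shapes,
# that always yields a plain list of emoji strings:
import emot

def emoji_values(text):
    try:
        result = emot.core.emot().emoji(text)  # 3.x object API
    except AttributeError:
        result = emot.emoji(text)              # 1.x/2.x module-level call
    if isinstance(result, dict):
        return list(result.get('value', []))   # 2.x/3.x parallel lists
    return [match['value'] for match in result]  # 1.x list of dicts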