def emoji_to_text(text):
    # Map every emoji in the text to its description and dump them to CSV.
    # (The original assigned demoji.findall twice and then read an undefined
    # name, demoji_text; fixed to a single lookup.)
    demoji_text = demoji.findall(text)
    temp_list = demoji_text.values()
    no_emoji_df = pd.DataFrame(temp_list, columns=['emoji_Text'])
    print(no_emoji_df)
    no_emoji_df.to_csv(
        r'C:\Users\heromero\Desktop\Stefanini\Desarollo_Redes_sociales\Nueva carpeta\IGphotoDownloader\emoji_to_text.csv'
    )
def get_stats_frame(self):
    raw_data = self._get_raw_data()
    USER = []
    TOTAL_MSG = []
    TOTAL_MEDIA = []
    TOTAL_LINK = []
    # USER_PROFILES maps each unique user to a DataFrame holding only that
    # user's messages, so per-user statistics can be computed independently.
    USER_PROFILES = {
        user: raw_data[raw_data.User == user]
        for user in raw_data.User.unique()
    }
    for user in USER_PROFILES.keys():
        total_msg_with_media = USER_PROFILES[user].shape[0]
        media = list(USER_PROFILES[user].loc[
            USER_PROFILES[user].Msg.str.contains('<Media omitted>'),
            'Msg'].index)
        link = list(USER_PROFILES[user].loc[
            USER_PROFILES[user].Msg.str.contains('https'), 'Msg'].index)
        USER.append(user)
        TOTAL_MSG.append(total_msg_with_media - len(media))
        TOTAL_MEDIA.append(len(media))
        TOTAL_LINK.append(len(link))
    # Collect the emojis each user sent. Join the full message column rather
    # than str(Series), which would only scan pandas' truncated repr.
    NAME = []
    EMOJIES = []
    EMOJIES_LEN = []
    for user in USER_PROFILES.keys():
        NAME.append(user)
        emojis = list(demoji.findall(
            ' '.join(USER_PROFILES[user].Msg.astype(str))).keys())
        EMOJIES.append(emojis)
        EMOJIES_LEN.append(len(emojis))
    Stat_data = {
        "User": USER,
        "Total_msg": TOTAL_MSG,
        "Total_media": TOTAL_MEDIA,
        "Total_link": TOTAL_LINK,
        "Emojies": EMOJIES,
        "Total_emojies": EMOJIES_LEN
    }
    Stat_data_frame = pd.DataFrame(Stat_data)
    return Stat_data_frame
def rep_emojis(t):
    d = demoji.findall(t)
    if d:
        for k, v in d.items():
            t = t.replace(k, ' xxemoji ')
            # t = t.replace(k, ' xxemoji xx' + v.replace(' ', '_') + ' ')
    return t
def preProcessing(text):
    text = text.lower()
    lemmatizer = WordNetLemmatizer()
    words = text.split(' ')
    stopSet = stopwords.words('english')
    text = " ".join([i for i in words if i not in stopSet])
    # Replace each emoji-bearing token with the description of its first emoji.
    words = text.split(' ')
    idx = 0
    for word in words:
        emojiDict = demoji.findall(word)
        emojiText = list(emojiDict.keys())
        if len(emojiText) > 0:
            words[idx] = emojiDict[emojiText[0]]
        idx += 1
    text = ''
    for w in words:
        nWord = w.replace("n't", 'not')
        text += ' ' + lemmatizer.lemmatize(nWord)
    # Remove non-ASCII characters, then break on punctuation.
    text = unidecode(text)
    seperators = list(string.punctuation)
    for i in seperators:
        text = text.replace(i, ' ')
    return text
def emoji2text(connection, df_tweets):
    """Replace each emoji inside the tweets with its corresponding text."""
    cursor = connection.cursor()
    for i in range(len(df_tweets)):
        id = df_tweets.loc[i, 'ID']
        content = df_tweets.loc[i, 'TRCONTENT'] if df_tweets.loc[
            i, 'TRCONTENT'] else df_tweets.loc[i, 'CONTENT']
        # Find the emojis inside the text; demoji returns a dict mapping
        # each emoji to its text description.
        emoDict = demoji.findall(content)
        convertedText = df_tweets.loc[i, 'FCONTENT']
        # Replace all emojis with their corresponding text.
        if len(emoDict):
            print('ORIGINAL: ', content)
            for emo, emoText in emoDict.items():
                # Leading and padding spaces separate the emoji text from
                # the surrounding words.
                emoText = ' ' + emoText + ' '
                convertedText = convertedText.replace(emo, emoText)
            df_tweets.loc[i, 'FCONTENT'] = convertedText
            print('CONVERTED: ', df_tweets.loc[i, 'FCONTENT'])
            # Parameterized query instead of string concatenation, which broke
            # on quotes inside the text and invited SQL injection (adjust the
            # placeholder style to your DB driver's paramstyle).
            query = 'UPDATE GoldenSet SET FCONTENT = ? WHERE ID = ?;'
            cursor.execute(query, (str(df_tweets.at[i, 'FCONTENT']), id))
            print('-' * 40 + '\n')
    cursor.close()
    print("\n\n\n----------------------- EMOJI TO TEXT FINISHED -------------------------\n\n\n")
def demojify(input_text):
    # Regex patterns for classic text emoticons.
    text_emojis = {
        ':-*\)': 'smile',
        ':-*]': 'smile',
        ':-*d': 'smile',
        ':-*\(': 'frown',
        ':-*\[': 'frown',
        ':-*/': 'unsure',
        ':-*o': 'astonish',
        ':-*0': 'astonish',
        'xd': 'laugh',
        ';-*\)': 'wink',
        ":'\(": 'cry',
        ':3': 'smile',
        '<3': 'love',
    }
    # Find all icon emojis. Escape them, since re.sub treats dict keys as
    # regex patterns and a few emojis (e.g. the keycap '*') are metacharacters.
    icon_emojis = {re.escape(k): v for k, v in demoji.findall(input_text).items()}
    emojis = {**text_emojis, **icon_emojis}
    for emoji, emoji_text in emojis.items():
        # Add extra space to avoid combining the text with the next word.
        # Extra space is removed later.
        input_text = re.sub(emoji, f' {emoji_text} ', input_text)
    return input_text
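# A rough smoke test for demojify; a sketch only, since the exact demoji
# descriptions depend on the installed emoji database (older demoji versions
# need demoji.download_codes() first).
print(demojify("well played xd <3"))
# -> roughly: "well played  laugh   love "  (extra spaces are removed later)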
def df_punct(df, emoji='on'):
    # Import necessary libraries
    import re
    import demoji
    # demoji.download_codes()

    # Reset index
    df = df.reset_index(drop=True)
    # Delete all urls from the strings, which are almost solely used to retweet
    df['text'] = [re.sub(r'http\S+', "", txt) for txt in df['text']]
    # Locate retweets and assign a dummy variable to them
    df['rt'] = [1 if 'RT @' in txt else 0 for txt in df['text']]
    # Replace all emojis with word representations
    if emoji == 'on':
        big_str = ' '.join(df['text'])
        emj_dct = demoji.findall(big_str)
        for emj in emj_dct:
            # regex=False: the emoji is a literal string, not a pattern.
            df['text'] = df['text'].str.replace(emj, emj_dct[emj], regex=False)
    elif emoji == 'off':
        pass
    # Delete the 'RT' retweet marker; a word boundary keeps letters inside
    # ordinary words (e.g. 'part') intact.
    df['text'] = [re.sub(r'\bRT\b', "", txt) for txt in df['text']]
    # Delete punctuation
    df['text'] = [
        re.sub(r'[^\w\s]', '', str(txt).lower().strip()) for txt in df['text']
    ]
    return df
def text_process(tweet):
    tweet = tweet.lower()
    # Removing mentions
    tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet, flags=re.MULTILINE)
    # Removing url links
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Removing numbers
    tweet = ''.join([i for i in tweet if not i.isdigit()])
    # Converting some negated contractions to 'not'
    tweet = re.sub(r"\bdidn't\b", "not", tweet)
    tweet = re.sub(r"\bdoesn't\b", "not", tweet)
    tweet = re.sub(r"\bdon't\b", "not", tweet)
    # Converting emojis to their meaning
    # demoji.download_codes()
    l = demoji.findall(tweet)
    for key, value in l.items():
        tweet = tweet.replace(key, value)
    # Removing punctuation
    nopunc = [char for char in tweet if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    # Separating words (wordsegment's segment; requires load() beforehand)
    nopunc = ' '.join(segment(nopunc))
    # Returning the tweet without the stopwords (sw and stemmer are module-level)
    tokens = [word for word in nopunc.split() if word.lower() not in sw]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
def _process_unicode_emojis(m, emoji_class):
    e = m.group()
    title = UNICODE_LIST.get(e, demoji.findall(e)[e])
    if '\u200d' not in e:
        # If there isn't a zero width joiner, strip out variation selectors
        e = re.sub(r'[\U0000FE00-\U0000FE0F]$', '', e)
    codepoint = "-".join(['%04x' % ord(_c) for _c in e]).lstrip('0')
    return f'<img class="{emoji_class}" title=":{title}:" ' \
           f'src="https://twemoji.maxcdn.com/2/svg/{codepoint}.svg" alt="{e}">'
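# A worked example of the codepoint construction above (a sketch, assuming
# demoji recognizes the emoji):
#   e = "🔥" (U+1F525: no ZWJ, no variation selector)
#   "-".join(['%04x' % ord(c) for c in e]).lstrip('0')  ->  "1f525"
#   => https://twemoji.maxcdn.com/2/svg/1f525.svg
# For "©️" (U+00A9 + VS16) the trailing selector is stripped and "00a9" is
# left-trimmed to "a9", matching twemoji's file naming.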
def find_emojis(tweet_list):
    emoji_count = 0
    for tweet in tweet_list:
        ems = demoji.findall(tweet)
        if len(ems):
            print(ems)
            emoji_count += 1
    print(f"Found {emoji_count} tweets with emojis.")
def get_stats_frame(raw_data):
    USER = []
    TOTAL_MSG = []
    TOTAL_MEDIA = []
    TOTAL_LINK = []
    # One sub-DataFrame per unique user.
    users = {
        user: raw_data[raw_data.User == user]
        for user in raw_data.User.unique()
    }
    for user in users.keys():
        total_msg_with_media = users[user].shape[0]
        media = list(
            users[user].loc[users[user].Msg.str.contains('<Media omitted>'),
                            'Msg'].index)
        link = list(users[user].loc[users[user].Msg.str.contains('https'),
                                    'Msg'].index)
        USER.append(user)
        TOTAL_MSG.append(total_msg_with_media - len(media))
        TOTAL_MEDIA.append(len(media))
        TOTAL_LINK.append(len(link))
    # Collect the emojis each user sent; join the message column instead of
    # str(Series), which would only scan pandas' truncated repr.
    NAME = []
    EMOJIES = []
    EMOJIES_LEN = []
    for user in users.keys():
        NAME.append(user)
        emojis = list(demoji.findall(' '.join(users[user].Msg.astype(str))).keys())
        EMOJIES.append(emojis)
        EMOJIES_LEN.append(len(emojis))
    Stat_data = {
        "User": USER,
        "Total_msg": TOTAL_MSG,
        "Total_media": TOTAL_MEDIA,
        "Total_link": TOTAL_LINK,
        "Emojies": EMOJIES,
        "Total_emojies": EMOJIES_LEN
    }
    Stat_data_frame = pd.DataFrame(Stat_data)
    return Stat_data_frame
def preprocess_cleaning(df):
    '''
    Convert non-ASCII to ASCII.
    Count URLs, emojis, punctuation, hashtags, mentions.
    Convert emojis to text.
    Split camel-case hashtags into words.
    Lowercase the text.
    '''
    stop_words = stopwords.words('english')
    # One findall over the whole corpus: maps every emoji that occurs
    # anywhere in the data to its description.
    EMOJI_TO_TEXT = demoji.findall(' '.join(df['tweet_text'].to_list()))
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB,
                   "J": wordnet.ADJ, "R": wordnet.ADV}

    def lemmatize_words(text):
        pos_tagged_text = nltk.pos_tag(text.split())
        return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
                         for word, pos in pos_tagged_text])

    def clean_text(text):
        '''Make text lowercase, remove text in square brackets, remove links,
        remove user mentions, remove punctuation, remove numbers and remove
        words containing numbers.'''
        text = re.sub('(#[A-Z][a-z]+)', r' \1',
                      re.sub('([A-Z]+)', r' \1', text))  # split camel case
        text = text.lower()
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r'<.*?>+', '', text)
        text = re.sub(r'@\w+', '', text)  # mentions
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
        text = re.sub('\n', '', text)
        text = re.sub(r'(.)\1+', r'\1\1', text)  # chars repeated more than twice, e.g. hellllp -> hellp
        return text

    def emoji_to_text(text):
        # Only replaces emojis that stand alone as space-delimited tokens.
        return ' '.join([EMOJI_TO_TEXT.get(i, i) for i in text.split(' ')])

    df['num_url'] = df['tweet_text'].apply(lambda x: x.count('URL'))
    df['num_user_id'] = df['tweet_text'].apply(lambda x: x.count('USERID'))
    # Character-level count: multi-codepoint emojis are counted per codepoint.
    df['num_emoji'] = df['tweet_text'].apply(lambda x: len([i for i in x if i in EMOJI_TO_TEXT]))
    df['tweet_text'] = df['tweet_text'].apply(emoji_to_text)
    df['tweet_text'] = df['tweet_text'].apply(unidecode)
    df['tweet_text'] = df['tweet_text'].apply(lemmatize_words)
    df['has_url'] = (df['num_url'] > 0).astype(int)
    df['has_emoji'] = (df['num_emoji'] > 0).astype(int)
    df['num_hashtags'] = df['tweet_text'].str.findall(r'#(\w+)').apply(len)
    df['num_user_mention'] = df['tweet_text'].str.findall(r'@(\w+)').apply(len)
    df['num_punctuation'] = df['tweet_text'].str.replace(r'[\w\s#]+', '', regex=True).apply(len)
    df['text_cleaned'] = df['tweet_text'].apply(clean_text)
    # Remove stop words
    df['text_cleaned'] = (df['text_cleaned'].str.split()
                          .apply(lambda x: [word for word in x if word not in stop_words])
                          .apply(' '.join))
    return df
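# An illustration of the emoji_to_text lookup above (hypothetical tweet;
# replacement only fires when the emoji is a standalone space-delimited token):
#   demoji.findall("good luck 🔥")  ->  {"🔥": "fire"}
#   emoji_to_text("good luck 🔥")   ->  "good luck fire"
#   emoji_to_text("good luck🔥")    ->  "good luck🔥"  (glued emoji untouched)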
def contains_emoji(text):
    """Return True if text contains an emoji, False otherwise.

    Examples:
    >>> contains_emoji("🦠 coronavirus. 😷😷😷")
    True
    >>> contains_emoji("No emojis here. :( )")
    False
    """
    return bool(demoji.findall(text))
def preprocessing(i, file, userList):
    tweet_dic = json.loads(i)
    # date: normalize created_at to the top of the hour, then shift by one hour
    date = time.strftime(
        '%Y-%m-%dT%H:00:00Z',
        time.strptime(tweet_dic["created_at"], "%a %b %d %H:%M:%S +0000 %Y"))
    format_str = '%Y-%m-%dT%H:00:00Z'
    dt = datetime.strptime(date, format_str)
    final = dt + timedelta(hours=1)
    tweet_dic['tweet_date'] = final.strftime(format_str)
    # reply_text, poi_id, poi_name
    if 'full_text' in tweet_dic.keys():
        print("tweet has full_text")
    elif 'text' in tweet_dic.keys():
        print("has text instead of full_text")
        tweet_dic['full_text'] = tweet_dic['text']
    else:
        print("No full_text or text")
        return
    if tweet_dic['in_reply_to_status_id'] is not None:
        if tweet_dic['in_reply_to_screen_name'] not in userList:
            tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
            tweet_dic['poi_id'] = tweet_dic['user']['id']
        else:
            tweet_dic['poi_name'] = tweet_dic['in_reply_to_screen_name']
            tweet_dic['poi_id'] = tweet_dic['in_reply_to_user_id']
        tweet_dic['reply_text'] = tweet_dic['full_text']
    else:
        tweet_dic['poi_name'] = tweet_dic['user']['screen_name']
        tweet_dic['poi_id'] = tweet_dic['user']['id']
        tweet_dic['reply_text'] = None
    print(tweet_dic['poi_name'])
    # country (india_list, usa_list, brazil_list are module-level)
    screen_name = tweet_dic['poi_name']
    tweet_dic['country'] = "India"
    if screen_name in india_list:
        tweet_dic['country'] = "India"
    elif screen_name in usa_list:
        tweet_dic['country'] = 'USA'
    elif screen_name in brazil_list:
        tweet_dic['country'] = 'brazil'
    else:
        print("error poi {}".format(tweet_dic['id']))
    # Strip emojis from the text copy and keep them in a separate field.
    full_text = tweet_dic['full_text']
    tweet_dic['text_copy'] = demoji.replace(full_text)
    tweet_dic['tweet_emotions'] = list(demoji.findall(full_text).keys())
    json.dump(tweet_dic, file, ensure_ascii=False)
    file.write("\n")
def remove_emoji(transient_tweet_text):
    # Despite the name, this substitutes each emoji with its demoji
    # description rather than deleting it.
    tweet_tokenizer = TweetTokenizer()
    tokenized_tweet = tweet_tokenizer.tokenize(transient_tweet_text)
    emojis_present = demoji.findall(transient_tweet_text)
    tweet_no_emoji = ''
    for i, s in enumerate(tokenized_tweet):
        if s in emojis_present.keys():
            tweet_no_emoji = tweet_no_emoji + ' ' + emojis_present[s]
        else:
            tweet_no_emoji = tweet_no_emoji + ' ' + s
    return tweet_no_emoji
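# A quick sketch of remove_emoji's behavior (description assumed from demoji;
# note the leading space the token join produces):
remove_emoji("so cool 🔥")  # -> " so cool fire"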
def rep_emojis(message):
    butter = []
    bandit = demoji.findall(message)
    # Walk the message character by character; only single-codepoint emojis
    # can match (multi-codepoint sequences are split apart).
    no = [char for char in message]
    for word in no:
        if word in bandit.keys():
            butter.append(' ')
            # word.upper().replace(word, ...) simplified: it always reduced
            # to the description itself.
            butter.append(bandit[word])
        else:
            butter.append(word)
    return ''.join(butter)
def emojiReplace_v2(text_string):
    emoji_dict = demoji.findall(text_string)
    for emoji in emoji_dict.keys():
        # Join the words of the description with a normal letter 'x' because
        # BERT's tokenizer splits on special tokens like '%' and '$'.
        emoji_token = 'x'.join(re.split(r'\W+', emoji_dict[emoji])) + ' '
        text_string = text_string.replace(emoji, emoji_token)
        # Controlling for multiple emojis in a row
        pattern = '(' + emoji_token + ')' + '{2,}'
        text_string = re.sub(pattern, 'mult' + emoji_token + ' ', text_string)
    return text_string
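# An approximate trace of emojiReplace_v2, assuming demoji describes 🙂 as
# "slightly smiling face":
emojiReplace_v2("nice 🙂🙂🙂")
# each 🙂 -> "slightlyxsmilingxface "; the run of three then collapses to
# -> roughly "nice multslightlyxsmilingxface  "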
def parse_emoji(sentence: str, sub: bool):
    # emj is assumed to be the `emoji` package (import emoji as emj).
    emojis = demoji.findall(sentence)
    for emoji in emojis:
        if sub:
            desc = emj.demojize(emoji)
            desc = re.sub(r"[:_]", " ", desc).strip()
            sentence = sentence.replace(emoji, f" EMOJI:[{desc}] ")
        else:
            sentence = sentence.replace(emoji, f" {emoji} ")
    sentence = re.sub(r"\s{2,}", " ", sentence)
    return sentence
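# The two modes of parse_emoji, sketched (descriptions depend on the emoji
# package's data):
parse_emoji("deal 🔥", sub=True)   # -> "deal EMOJI:[fire] " (roughly)
parse_emoji("deal 🔥", sub=False)  # -> "deal 🔥 " (emoji kept, re-spaced)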
def cleanwords(sentence):
    # Collect the descriptions of any emojis in the sentence (currently unused).
    sentence_emogis = demoji.findall(sentence)
    sentence_emogis_short = " "
    for value in sentence_emogis.values():
        sentence_emogis_short = sentence_emogis_short + (str(value) + " ")
    sentence = deEmojify(sentence)
    words = word_tokenize(sentence)
    # pos_tag expects a list of tokens, so wrap each word in a list
    # (the original pos_tag(word) tagged individual characters).
    words = [
        lemmatizer.lemmatize(word, pos=get_simple_POS(pos_tag([word])[0][1])).lower()
        for word in words
        if not word.lower() in stop and not word.isdigit()
    ]
    return " ".join(words)
def replace_emojis(self):
    """Replace any emojis/emoticons found in the text with their descriptions."""
    emojis = demoji.findall(self.text)
    if not emojis:
        return self.text, False
    tokenized_text = tokenizer.tokenize(self.text)  # was tokenize(text), an undefined name
    for i, s in enumerate(tokenized_text):
        if s in emojis.keys():
            tokenized_text[i] = emojis[s]
    self.text = ' '.join(tokenized_text)
    self.text = self.erase_emojis()
    return self.text, list(emojis.keys())
def preprocessed(textData):
    allfile = 'lastFinishedPreprocess.csv'
    with open(textData, newline='') as csvfile:
        files = csv.reader(csvfile, delimiter=' ', quotechar='|')
        for text in files:
            newFile = np.char.lower(text)
            newFile = remove_stopwords(str(newFile))
            newFile = re.sub(r'http[s]?://\S+', '', str(newFile))
            newFile = np.char.replace(newFile, '\\n', '')
            newFile = np.char.replace(newFile, '\\r', '')
            symbols = "!#$%&()*+-"
            for i in symbols:
                newFile = np.char.replace(newFile, i, ' ')
            symbols = "!./:;<=>?@[\\]^_`{|}~"
            for i in range(len(symbols)):
                newFile = np.char.replace(newFile, symbols[i], ' ')
            newFile = np.char.replace(newFile, ',', '')
            newFile = np.char.replace(newFile, "  ", " ")
            newFile = np.char.replace(str(newFile), "’", "")
            newFile = re.sub(r"\b[a-zA-Z]\b", "", str(newFile))
            # ps = PorterStemmer()
            # newFile = [ps.stem(word) for word in newFile]
            lemmatizer = WordNetLemmatizer()
            newFile = word_tokenize(newFile)
            newFile = [lemmatizer.lemmatize(word) for word in newFile]
            newFile = ' '.join([
                num2words.num2words(i) if i.isdigit() else i for i in newFile
            ])
            # Character-wise pass: only single-codepoint emojis can match here.
            for i in newFile:
                emojis = demoji.findall(i)
                if i in emojis:
                    newFile = newFile.replace(i, emojis[i])
            with open(allfile, 'a', newline='') as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow([newFile])
def replace_emojis(features):
    """Replace each emoji in every feature string with its <description>."""
    features_clean = []
    for f in features:
        f_res = demoji.findall(f)
        if len(f_res) > 0:
            for x, y in f_res.items():
                f = f.replace(x, f"<{y}>")
            features_clean.append(f)
        else:
            features_clean.append(f)
    return features_clean
def preprocess(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    emojis = demoji.findall(text)
    cleaned = []
    for token in tokens:
        if 'http' in token:
            cleaned.append('[URL]')
        elif '@' in token:
            cleaned.append('[USER]')
        elif token in emojis:
            cleaned.append(':' + ''.join(emojis[token].split()) + ':')
        else:
            cleaned.append(token.lower())
    return ' '.join(cleaned)
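# A sketch of preprocess on a synthetic tweet (emoji description assumed):
preprocess("Check https://t.co/x @bob 🔥")
# -> "check [URL] [USER] :fire:"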
async def poll(self, ctx, *, arg):
    emoji = []
    if 'votes=' in arg:
        return await send_message(
            ctx,
            "You don't need to do votes= for emoji anymore, "
            "I'll pull them automatically.",
            error=True)
    # emoji = list(re.findall(emojiPattern, arg, flags=re.DOTALL)) + list(re.findall(customEmojiPattern, arg, flags=re.DOTALL))
    emoji = list(demoji.findall(arg).keys()) + list(
        re.findall(customEmojiPattern, arg, flags=re.DOTALL))
    msg = await ctx.send(f"**Poll time! <@{ctx.author.id}> asks:**\n{arg}")
    for reaction in emoji:
        await msg.add_reaction(reaction.strip('<> '))
def emoji(textData):
    # Flag emojis (e.g. the US flag) are not converted here because they are
    # multi-codepoint sequences, and this loop walks single characters.
    noEmo = 'tempEmo.csv'
    with open(textData, newline='') as csvfile:
        files = csv.reader(csvfile, delimiter=' ', quotechar='|')
        for text in files:
            for row in text:
                for i in row:
                    emojis = demoji.findall(i)
                    if i in emojis:
                        row = row.replace(i, emojis[i])
                row = ' '.join(i for i in row.split())
                with open(noEmo, 'a', newline='') as outfile:
                    csvwriter = csv.writer(outfile)
                    csvwriter.writerow([row])
def clean_sentence(sentence):
    # hashtag and links are module-level compiled patterns.
    if re.search(hashtag, sentence) is not None:
        sentence = re.sub(hashtag, r'\1', sentence)
    sentence = re.sub(links, 'URL', sentence)
    reference = demoji.findall(sentence)
    emoticons = emot.emoticons(sentence)
    if isinstance(emoticons, list):
        emoticons = emoticons[0]
    if len(reference) > 0:
        for key, value in reference.items():
            sentence = sentence.replace(key, value + " ")
    if emoticons['flag']:
        for i in range(len(emoticons['value'])):
            sentence = sentence.replace(emoticons['value'][i],
                                        extract_emotion(emoticons['mean'][i]))
    return sentence
def get_emoji_list(string):
    # NOTE: emoji_pattern is built but never used; demoji does the matching.
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
    # demoji.findall always returns a dict (possibly empty), never None, so
    # the original `is null` guard (a NameError in Python) is unnecessary.
    emoticons_dict = demoji.findall(string)
    emo_list = []
    for emo in emoticons_dict.keys():
        emo_list.append(emo)
    return emo_list
def clean_tweets(tweet, translate=False, tokenize=False):
    '''Process tweets:
    - tweet preprocessor to get rid of url links, mentions, and hashtags
    - remove punctuation
    - optionally translate to English
    - to lowercase
    - remove common stopwords
    - stem instead of lemmatize (faster, although the drawback is that stemmed
      words sometimes do not look like real words)
    - process emojis, e.g. :) --> smiley face'''
    tweet = str(tweet)  # sometimes not a string type, i.e. float
    orig_tweet = copy.deepcopy(tweet)
    # Remove urls, mentions, and hashtags
    try:
        tweet = p.clean(tweet)
    except Exception as e:
        print(e)
    # 1. Remove anything that is not a letter or whitespace (the original
    #    pattern '[^[a-z|A-Z|\s]*' was malformed).
    tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)
    if translate:
        try:
            # Translate to English. The original called translate(tweet), which
            # shadows the boolean flag; a separate helper (hypothetical name
            # translate_to_english) is assumed here.
            tweet = translate_to_english(tweet)
        except json.decoder.JSONDecodeError:
            print(tweet)
    stop = stopwords.words('english')  # common stop words
    stop_words = set(stop)
    # 2. Convert to lower case and tokenize
    tweet = tweet.lower()
    tweet_tokens = word_tokenize(tweet)
    # 3. Remove stopwords, stemming (stemmer hoisted out of the loop)
    stemmer = PorterStemmer()
    ret = []
    for word in tweet_tokens:
        if word not in stop_words:
            ret.append(stemmer.stem(word))
    # 4. Append emoji descriptions from the original (uncleaned) tweet
    ret.extend(list(demoji.findall(orig_tweet).values()))
    if not tokenize:
        ret = ' '.join(ret)
    return ret
def clean_sentence(sentence):
    """
    Replaces all emojis and emoticons with their text equivalent.
    :param sentence: str, raw text
    :return: clean text
    """
    reference = demoji.findall(sentence)
    emoticons = emot.emoticons(sentence)
    if isinstance(emoticons, list):
        emoticons = emoticons[0]
    if len(reference) > 0:
        for key, value in reference.items():
            sentence = sentence.replace(key, value + " ")
    if emoticons['flag']:
        for i in range(len(emoticons['value'])):
            sentence = sentence.replace(emoticons['value'][i],
                                        extract_emotion(emoticons['mean'][i]))
    return sentence
def emoji(cleaned_text, Emoji_emo):
    # Emoji_emo is assumed to hold demoji-style descriptions, one emotion per
    # column; each emoji found in the text bumps the matching emotion scores.
    emoji = demoji.findall(cleaned_text)
    score = pd.DataFrame(np.zeros(shape=(1, 6)).astype(int),
                         columns=['Happy', 'Excitement', 'Pleasant',
                                  'Surprise', 'Fear', 'Angry'])
    for emo in emoji:
        # regex=False: descriptions are literal strings, not patterns.
        if Emoji_emo["Happy"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Happy"] += 1
        if Emoji_emo["Excitement"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Excitement"] += 1
        if Emoji_emo["Pleasant"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Pleasant"] += 1
        if Emoji_emo["Surprise"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Surprise"] += 1
        if Emoji_emo["Fear"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Fear"] += 1
        if Emoji_emo["Angry"].str.contains(emoji[emo], regex=False).sum() > 0:
            score["Angry"] += 1
    return score
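# The lexicon layout emoji() assumes: a DataFrame with one column per emotion
# whose cells hold demoji-style descriptions (hypothetical example values):
Emoji_emo = pd.DataFrame({
    'Happy':      ['grinning face', 'smiling face with smiling eyes'],
    'Excitement': ['fire', 'party popper'],
    'Pleasant':   ['relieved face', 'sparkles'],
    'Surprise':   ['astonished face', 'exploding head'],
    'Fear':       ['fearful face', 'face screaming in fear'],
    'Angry':      ['angry face', 'pouting face'],
})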