def _preprocess_series(series):
    """Apply the shared text-normalisation steps to one pandas Series of strings.

    Steps, in order:
      1. remove every "#" character (literal match),
      2. run ``em.decode`` on each entry,
      3. lower-case the text,
      4. replace each text by the space-joined lemmas produced by ``nlp``,
         skipping tokens whose lemma is "-PRON-",
      5. replace every space-separated word containing "@" by "someone".

    ``em`` and ``nlp`` are module-level names defined outside this block
    (presumably the emoji helper module and a spaCy pipeline -- confirm).
    """
    series = series.str.replace("#", "", regex=False)
    series = series.map(lambda x: em.decode(x))
    series = series.str.lower()
    series = series.map(lambda x: " ".join(
        token.lemma_ for token in nlp(x) if token.lemma_ != "-PRON-"))
    series = series.map(lambda x: " ".join(
        "someone" if "@" in word else word for word in x.split(" ")))
    return series


def preprocessing(train, test, dev):
    """Normalise the train, test and dev text Series with the same pipeline.

    Args:
        train, test, dev: pandas Series of raw text.

    Returns:
        A ``(train, test, dev)`` tuple of new, processed Series.
    """
    print("Preprocessing ....")
    # One helper instead of three pasted copies keeps the splits in sync.
    return (_preprocess_series(train),
            _preprocess_series(test),
            _preprocess_series(dev))
def clean_twitter_text(tweets_data):
    """Clean the ``text`` column of a tweets DataFrame in place and return it.

    The frame is modified in place:
      * ``text`` is lower-cased, stripped of commas, periods, the "…"
        character, hyperlinks, ``$``-prefixed tags and ``@`` mentions, and
        has newlines replaced by spaces;
      * ``Datetime_eastern`` holds the index converted to US/Eastern (the
        index must therefore be timezone-aware);
      * ``text_dec`` holds the cleaned text with emojis decoded to
        ``:alias:`` form, padded with spaces so they split as words.

    Args:
        tweets_data: DataFrame with a ``text`` column and a tz-aware
            datetime index.

    Returns:
        The same DataFrame object, with the columns above set.
    """
    # Change all text to lowercase
    text = tweets_data['text'].str.lower()
    # Literal matching is required here: as a regular expression '.' matches
    # every character, which would erase the whole tweet on pandas versions
    # where ``str.replace`` defaults to regex=True.
    text = text.str.replace(',', '', regex=False)
    text = text.str.replace('.', '', regex=False)
    text = text.str.strip()
    # Remove the '...'
    text = text.str.replace(r'…', '', regex=True)
    # Remove hyperlinks
    text = text.replace(r'http\S+', '', regex=True)
    # Replace \n with a space
    text = text.replace(r'\n', ' ', regex=True)
    # Remove stock tags
    text = text.replace(r'\$\S+', '', regex=True)
    # Remove tags
    text = text.replace(r'\@\S+', '', regex=True)
    tweets_data['text'] = text
    # Add eastern standard time column
    tweets_data['Datetime_eastern'] = tweets_data.index.tz_convert(
        'US/Eastern')
    # Create decoded version of text field
    decoded = text.map(lambda x: emojis.decode(x))
    # Add spaces around emojis so they can be separated as words
    tweets_data['text_dec'] = decoded.replace(r'(:[a-z]+:)', ' \\1 ',
                                              regex=True)
    return tweets_data
def count_tweet_emoticons(tweets):
    """Tally emoji usage separately for hate and non-hate tweets.

    Tweets without any emoji are ignored. Each remaining tweet is counted
    once in its category total, and each distinct emoji it contains adds one
    to that category's tally, keyed by the decoded ``:alias:`` form.

    Returns:
        dict with, per category, a list of ``(alias, count)`` pairs sorted
        by count then alias in descending order, plus the ``hate_total`` and
        ``non_hate_total`` tweet counts.
    """
    tallies = {CATEGORY_HATE: {}, CATEGORY_NON_HATE: {}}
    tweet_totals = {CATEGORY_HATE: 0, CATEGORY_NON_HATE: 0}

    for tweet in tweets:
        found = emojis.get(util.get_tweet_text(tweet))
        if not found:
            # skip tweet if no emojis present
            continue

        label = CATEGORY_HATE if tweet["hate_speech"] else CATEGORY_NON_HATE
        tweet_totals[label] += 1

        bucket = tallies[label]
        for symbol in found:
            alias = emojis.decode(symbol)
            bucket[alias] = bucket.get(alias, 0) + 1

    def count_then_alias(pair):
        return (pair[1], pair[0])

    # Order by count, ties broken by alias, both descending
    for label in (CATEGORY_HATE, CATEGORY_NON_HATE):
        tallies[label] = sorted(tallies[label].items(),
                                key=count_then_alias,
                                reverse=True)

    tallies['hate_total'] = tweet_totals[CATEGORY_HATE]
    tallies['non_hate_total'] = tweet_totals[CATEGORY_NON_HATE]
    return tallies
def get_media_type(self, message):
    """Describe the media attached to ``message`` for display.

    Returns:
        ``(label, saved_suffix)``. ``label`` is "Photo", "<alt> Sticker",
        "Document (<file name>)", "Document (<mime type>)", or the raw media
        type name for anything else. ``saved_suffix`` is " (saved)" when the
        message id is listed in the selected chat's "downloads", else "".
        ``(None, None)`` when the message carries no media.
    """
    view = self.main_view
    media = message.media
    if not media:
        return (None, None)

    label = media.to_dict()["_"]
    if label == "MessageMediaPhoto":
        label = "Photo"
    elif label == "MessageMediaDocument":
        document = media.document
        attributes = document.attributes

        def of_kind(kind):
            # Attributes whose serialised type tag equals ``kind``.
            return [att for att in attributes if att.to_dict()["_"] == kind]

        named = of_kind("DocumentAttributeFilename")
        stickers = of_kind("DocumentAttributeSticker")
        if stickers:
            # A sticker attribute takes precedence over a file name.
            alt_text = stickers[0].alt
            if view.text_emojis:
                alt_text = emojis.decode(alt_text)
            label = "{} Sticker".format(alt_text)
        elif named:
            label = "Document ({})".format(named[0].to_dict()["file_name"])
        else:
            label = "Document ({})".format(document.mime_type)

    saved_ids = view.dialogs[view.selected_chat]["downloads"]
    saved_suffix = " (saved)" if message.id in saved_ids else ""
    return (label, saved_suffix)
async def countCurrency(*dates):
    """Count currency-emoji occurrences per author in channel CHANNEL1.

    Looks at up to 500 messages of the channel whose id is stored in the
    ``CHANNEL1`` environment variable. Every message is emoji-decoded and
    the occurrences of each entry of the module-level ``currencies``
    collection are summed per author name.

    Args:
        *dates: optional history bounds. ``dates[0]`` is passed as
            ``before`` and, when present, ``dates[1]`` as ``after``.
            With no dates the latest messages are used.

    Returns:
        dict mapping author name to total count, ordered by descending
        count. Authors with no currency in any message are not listed.
    """
    chan = await client.fetch_channel(os.getenv("CHANNEL1"))

    history_kwargs = {"limit": 500}
    if dates:
        history_kwargs["before"] = dates[0]
        # A single date used to raise IndexError on dates[1]; treat it as an
        # upper bound only.
        if len(dates) > 1:
            history_kwargs["after"] = dates[1]
    messages = await chan.history(**history_kwargs).flatten()

    pairs = {}
    for msg in messages:
        decoded = emojis.decode(msg.content)
        currCount = sum(decoded.count(currency) for currency in currencies)
        if currCount:
            author = msg.author.name
            pairs[author] = pairs.get(author, 0) + currCount

    # sorted() is stable, so authors with equal counts keep first-seen order.
    return dict(sorted(pairs.items(), key=lambda kv: kv[1], reverse=True))
def df_emojis(df: pd.DataFrame, n=5):
    """Print the most used emojis overall and per author, and plot the top ones.

    Reads the "emojis" and "author" columns of every row. Prints the ``n``
    most frequent emojis overall, draws a bar chart (via ``bar``) of the
    ``2 * n`` most frequent ones labelled by their decoded alias, then prints
    the ``n`` most frequent emojis of each author.
    """
    print("EMOJI ANALYSIS")

    per_author = {name: {} for name in df_get_author_list(df)}
    overall = {}

    for _, record in df.iterrows():
        used = record["emojis"]
        who = record["author"]
        if not used:
            continue
        for symbol in used:
            per_author[who][symbol] = per_author[who].get(symbol, 0) + 1
            overall[symbol] = overall.get(symbol, 0) + 1

    def by_count(item):
        return item[1]

    def show_top(ranking):
        # Print at most n "emoji -- count" lines from a ranked list of pairs.
        for rank, (symbol, uses) in enumerate(ranking, start=1):
            if rank > n:
                break
            print(symbol, "--", uses)

    overall_ranked = sorted(overall.items(), key=by_count, reverse=True)

    print("OVERALL:")
    show_top(overall_ranked)

    charted = overall_ranked[:(n * 2)]
    bar([emojis.decode(symbol) for symbol, _ in charted],
        [uses for _, uses in charted],
        "Emojis",
        "Number of times used",
        rotation='')

    for who, tally in per_author.items():
        print(who)
        show_top(sorted(tally.items(), key=by_count, reverse=True))
def draw_chats(self):
    """Draw the chat-list pane: a frame plus one entry per visible dialog.

    ``self.chats_num`` dialogs are drawn, starting at
    ``self.main_view.selected_chat_offset`` in ``self.main_view.dialogs``.
    Each entry takes two text rows and advances ``y`` by three. Any
    exception raised while drawing is handed to ``show_stacktrace``.
    """
    # Position of the selected chat relative to the first drawn entry.
    selected_chat_index = self.main_view.selected_chat - self.main_view.selected_chat_offset
    offset = self.main_view.selected_chat_offset
    try:
        self.draw_frame(0, 0, self.chats_height, self.chats_width)
        index = 0
        y = 1
        for index in range(self.chats_num):
            dialog = self.main_view.dialogs[index + offset]
            # Use the first cached message when the dialog has a "messages"
            # entry, otherwise the message attached to the dialog object.
            message = dialog["messages"][0] if "messages" in dialog else dialog["dialog"].message
            message_string = message.text if message.text else "[Non-text object]"
            if self.main_view.text_emojis:
                message_string = emojis.decode(message_string)
            chat_name = get_display_name(dialog["dialog"].entity)
            from_string = get_display_name(message.sender)
            unread = dialog["unread_count"]
            unread_string = f"({unread} new)" if unread else ""
            date = dialog["dialog"].date
            date = date.astimezone()
            date_string = self._datestring(date)
            # NOTE(review): `pinned` is computed but never used below.
            pinned = "* " if dialog["dialog"].pinned else " "
            selected = selected_chat_index == index
            # First row: online marker, chat name, optional index, unread
            # counter and date.
            self.draw_text([
                self.format("o" if dialog["online"] else " ",
                            attributes=self.main_view.colors["secondary"]),
                # NOTE(review): this parses as
                # (colors["primary"] | A_STANDOUT) if selected else A_BOLD,
                # so unselected rows get A_BOLD without the primary colour --
                # confirm that is intended.
                self.format(
                    chat_name,
                    attributes=self.main_view.colors["primary"] | curses.A_STANDOUT if selected else curses.A_BOLD,
                    width=int(0.5 * self.chats_width)),
                self.format(f" {str(index)} " if self.show_indices else "",
                            attributes=self.main_view.colors["standout"]),
                self.format(unread_string,
                            attributes=self.main_view.colors["error"],
                            alignment="right"),
                self.format(date_string,
                            alignment="right",
                            attributes=self.main_view.colors["primary"]),
            ], y, 2, maxwidth=self.chats_width - 2)
            # Second row: sender name followed by the message preview.
            self.draw_text([
                self.format(f"{from_string}:"),
                self.format(
                    message_string,
                    width=self.chats_width - len(f"{from_string}: ") - 3)
            ], y + 1, 2, maxwidth=self.chats_width - 2)
            y += 3
            # Has no effect on iteration: the for loop rebinds `index`.
            index += 1
    except Exception:
        show_stacktrace()
def convert_emojis2names(top_emojis_list):
    """Map each emoji to its alias name (the text between the colons).

    Every entry is decoded with ``emojis.decode`` and the first ``:name:``
    match is kept. Entries that yield no match fall back to
    'black_small_square' (a single manual exception).
    """

    def first_alias(symbol):
        aliases = re.findall(':(.*?):', emojis.decode(symbol))
        return aliases[0] if aliases else 'black_small_square'

    return [first_alias(symbol) for symbol in top_emojis_list]
async def _handle_key(self, key): if self.main_view.command_box: try: n = int(self.main_view.command_box) except: return self.main_view.edit_message = self.main_view.dialogs[self.main_view.selected_chat]["messages"][n] self.main_view.mode = "edit" self.main_view.inputs = emojis.decode(self.main_view.edit_message.text) self.main_view.inputs_cursor = len(self.main_view.inputs) self.main_view.command_box = ""
def checkemoji(incoming_msg):
    """Build a short English sentence describing the emoji in a message.

    The message is decoded to ``:alias:`` form, the colons are stripped and
    the remainder is looked up as an emoji alias. Messages that contain no
    colon after decoding, or whose stripped text is not a known alias
    (plain text containing ":", several emojis, emoji mixed with text),
    yield "Ok".

    Returns:
        "This is ...", "These are ..." etc. for a recognised emoji,
        otherwise "Ok".
    """
    # Function to check an emoji
    decoded = emojis.decode(incoming_msg)
    if ':' not in decoded:
        return ("Ok")

    emj = decoded.replace(':', '')
    emoji_info = emojis.db.get_emoji_by_alias(emj)
    if emoji_info is None:
        # Previously this crashed with AttributeError on ``None.category``.
        return ("Ok")

    emjc = emoji_info.category
    emj = emj.replace('_', ' ')
    emjs = emj.split()
    first_word = emjs[0]
    if emjc == 'Flags':
        return ("This is " + emj)
    elif first_word[-1] == 's':
        return ("These are the " + emj)
    elif 'men' in emj:
        return ("These are " + emj)
    # NOTE(review): "u" is not in the vowel list -- confirm whether that is
    # deliberate.
    elif first_word[0] in "aeio":
        return ("This is an " + emj)
    else:
        return ("This is a " + emj)
def process_tweet(tweet):
    """Clean, tokenize and stem a tweet.

    Processing order: strip a leading "RT", decode emojis to ``:alias:``
    text, remove hyperlinks, "#" signs, underscores, digits and @usernames,
    replace remaining punctuation with spaces, tokenize with
    ``TweetTokenizer`` (lower-casing, handle stripping, length reduction),
    drop tokens that are a run of ``string.punctuation`` characters, and stem
    each token with ``PorterStemmer``.

    Args:
        tweet: the raw tweet text.

    Returns:
        list of stemmed tokens.
    """
    # remove old style retweet text "RT"
    new_tweet = re.sub(r'^RT[\s]+', '', tweet)
    # decode emojis to text descriptions
    new_tweet = emojis.decode(new_tweet)
    # remove hyperlinks (raw strings: "\." and "\s" are invalid escapes in
    # ordinary string literals)
    new_tweet = re.sub(
        r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', new_tweet)
    new_tweet = re.sub(r'http\S+', '', new_tweet)
    # remove hashtags
    new_tweet = re.sub(r'#', '', new_tweet)
    # remove underscores
    new_tweet = re.sub(r'_', '', new_tweet)
    # remove all numbers
    new_tweet = re.sub(r'[0-9]', '', new_tweet)
    # remove usernames
    new_tweet = re.sub(r'@[^\s]+', '', new_tweet)
    # remove punctuation even in the middle of a string "in.the.middle"
    new_tweet = re.sub(r'[^\w\s]', ' ', new_tweet)

    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(new_tweet)

    # drop tokens that are a substring of string.punctuation
    tweets_clean = [
        word for word in tweet_tokens if word not in string.punctuation
    ]

    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tweets_clean]
def translate_comment(col):
    """Translate every comment of collection ``col`` to English.

    All documents first get ``Comment_English`` reset to 'none'. Then each
    document's ``comment_text`` is emoji-decoded, translated with
    ``translate_client`` and the result stored in that same document's
    ``Comment_English`` field.

    Args:
        col: name of the collection inside ``mydb``.
    """
    print('开始%s翻译' % col)
    collection = mydb[col]
    collection.update_many({}, {'$set': {'Comment_English': 'none'}})
    # The projection keeps ``_id`` (included by default), which is needed to
    # address each document below.
    comment_array = collection.find({}, {'comment_text': 1})
    print('已获取所有评论,准备开始翻译')
    for i in comment_array:
        ch_comment = emojis.decode(i['comment_text'])
        result = translate_client.translate(ch_comment, target_language='en')
        # Update the document being iterated. Filtering on
        # Comment_English == 'none' only hit "some untranslated document",
        # which is not guaranteed to be this one.
        collection.update_one(
            {'_id': i['_id']},
            {'$set': {
                'Comment_English': result['translatedText']
            }})
def extract_emoji(text: str) -> list:
    """Return all Unicode emojis contained in string

    The text is decoded so every emoji becomes ``:alias:``; each
    colon-delimited candidate that is a known alias is encoded back to its
    Unicode form. Order and duplicates are preserved.
    """
    # Change Unicode character to :emoji:
    text = emojis.decode(text)
    # Match every candidate, including overlapping ones: a lookahead does
    # not consume the closing colon, so a stray ":" before an emoji can no
    # longer swallow that emoji's opening colon. Aliases never contain
    # whitespace or colons.
    possible_emojis = re.findall(r"(?=(:[^:\s]+:))", text)
    # Might have matched even non-emoji (if text contained ':not_emoji:' for
    # example), so keep only actual aliases.
    return [
        emojis.encode(emoji) for emoji in possible_emojis
        if emojis.db.get_emoji_by_alias(emoji[1:-1]) is not None
    ]
def emojiget(string):
    """Return the names of all emojis used in a Discord message string.

    Custom Discord emojis (``<:name:id>`` or ``<a:name:id>``) come first and
    contribute their name; standard Unicode emojis follow and contribute
    their alias without the surrounding colons. Duplicates are kept.
    """
    # Custom emoji ids are Discord snowflakes; accept 17-20 digits rather
    # than exactly 18 so ids of other lengths are recognised too.
    custom_matches = re.findall(r"<(a?):(\w*):(\d{17,20})>", string)
    emojilist = [name for _animated, name, _emoji_id in custom_matches]
    # Standard (Unicode) emojis found in the string.
    emojilist.extend(
        emojis.decode(emoji).replace(":", "")
        for emoji in emojis.iter(string))
    return emojilist
async def setemoji(self, ctx, emoji):
    """Lets a user define the emoji the user have to react with in order to let the bot quote it

    Usage: !setemoji {emoji}"""
    # Anything not starting with "<" is treated as a normal (non-custom)
    # emoji and stored in its decoded alias form.
    is_custom_emoji = emoji.startswith("<")
    if not is_custom_emoji:
        emoji = emojis.decode(emoji)

    settings_row = dict(guild_id=ctx.guild.id, emoji=emoji)
    update_settings("Settings", settings_row)

    confirmation = f"New reaction emoji successfully set to {emoji}"
    try:
        await ctx.send(confirmation)
    except discord.Forbidden:
        # The setting is already stored; a refused confirmation is ignored.
        pass
    return 0
async def OnAddQuip(self, ctx, type: QuipType, *quip):
    """Adds a quip the bot can respond with when mentioned

    **string|int:** <type>
    The type of quip you want to have.
    Available results (not case sensitive):
    - 0 (Regular)
    - 1 (Guild Emoji)
    - 2 (Specific User)
    - regular (Regular)
    - r (Regular)
    - emoji (Guild Emoji)
    - e (Guild Emoji)
    - user (Specific User)
    - u (Specific User)

    !! If Specific User Selected !!
    **discord.User:** <user>
    The discord user you want this quip to be specific to

    **string:** <quip>
    The quip you want to add.
    """
    # Argument validation; the order of these checks decides which error the
    # caller sees.
    if (type == QuipType.INVALID):
        raise InvalidQuipType(type)
    if (type == QuipType.SPECIFIC_USER and len(quip) == 0):
        raise commands.errors.MissingRequiredArgument(
            inspect.Parameter('user', inspect.Parameter.POSITIONAL_ONLY))
    if (type == QuipType.SPECIFIC_USER and len(quip) < 2):
        raise EmptyQuip()
    if (len(quip) == 0):
        raise EmptyQuip()
    if (botSettings.guild is None):
        raise InvalidGuild()

    combinedQuip = emojis.decode(' '.join(quip))
    if (type == QuipType.GUILD_EMOJI and
            not discord.utils.get(botSettings.guild.emojis, name=combinedQuip)):
        raise InvalidGuildEmoji(combinedQuip)

    user = None
    additionalInfo = ' '
    if (type == QuipType.SPECIFIC_USER):
        # The first word names the user; the rest is the quip itself.
        converter = commands.UserConverter()
        user = await converter.convert(ctx, quip[0])
        additionalInfo = '[{}] '.format(user.mention)
        quip = quip[1:]
        # Decode again after dropping the user argument, so user-specific
        # quips are stored in the same decoded form as every other type.
        combinedQuip = emojis.decode(' '.join(quip))

    botSettings.AddQuip(combinedQuip, type.value, user)
    message = '[{}]{}Quip added `{}`'.format(type.name, additionalInfo,
                                             combinedQuip)
    await SendMessage(ctx, description=message, color=discord.Color.blue())
def demojify(self, text):
    """Replace emojis in a pandas Series of strings by their alias words.

    Each entry is emoji-decoded to ``:alias:`` form, then every ":" and "_"
    is replaced by a space (note this also affects colons and underscores
    that were already in the text).

    Args:
        text: pandas Series of strings.

    Returns:
        A new Series with the converted text.
    """
    # The converted Series used to be computed and then dropped, so callers
    # always received None.
    return text.apply(
        lambda x: emojis.decode(x).replace(':', ' ').replace('_', ' '))
def remove_emoji(self, src_str) -> str:
    """Return ``src_str`` with its emojis removed.

    The string is decoded so each emoji becomes ``:alias:``, then every
    colon-delimited run without whitespace is deleted.

    NOTE(review): text that already looks like ``:word:`` (for example the
    ":30:" in "10:30:45") is removed as well.
    """
    decode_str = emojis.decode(src_str)
    # Match one alias at a time. The former greedy ":.*:" deleted everything
    # between the first and the last colon, including ordinary text between
    # two emojis.
    return re.sub(r":[^:\s]+:", "", decode_str)
# Script fragment: accumulates per-emoji statistics from "emoji_1.txt" (UTF-16),
# then starts the same pass over "emoji_2.txt" (that second loop continues
# beyond this fragment).
# For every line, TextBlob polarity and subjectivity of the cleaned text are
# rounded to 5 places. For every emoji of the line that is in `save`, `num` is
# its occurrence count and `position` the mean of its first and last index,
# relative to the line length.
# Layout of cont[emoji]: [decoded alias, total count, [polarities],
# [subjectivities], [positions]].
# `save` and `cont` are defined outside this fragment.
# NOTE(review): the files are opened without `with`, so they are never closed
# explicitly.
# NOTE(review): the original indentation is lost; the code line is kept
# verbatim because the nesting of `print(1)` cannot be determined from here.
for line in open("emoji_1.txt", encoding='utf-16'): clean_tweet = ' '.join( re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", line).split()) analysis = TextBlob(clean_tweet) polarity = round(analysis.polarity, 5) subjectivity = round(analysis.subjectivity, 5) for i in emojis.get(line): if i in save: num = line.count(i) position = round( (line.find(i) / len(line) + line.rfind(i) / len(line)) / 2, 5) if i not in cont.keys(): cont[i] = [emojis.decode(i)] cont[i].append(num) cont[i].append([polarity]) cont[i].append([subjectivity]) cont[i].append([position]) else: cont[i][1] = cont[i][1] + num cont[i][2].append(polarity) cont[i][3].append(subjectivity) cont[i][4].append(position) print(1) for line in open("emoji_2.txt", encoding='utf-16'): clean_tweet = ' '.join( re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", line).split()) analysis = TextBlob(clean_tweet)
def decode_emojis(text: str) -> str:
    '''Return ``text`` with every Unicode emoji replaced by its ``:alias:``.

    Thin wrapper around ``emojis.decode``; see
    https://emojis.readthedocs.io/en/latest/api.html#sample-code
    '''
    # The decoded string used to be assigned to a local and dropped, so the
    # function always returned None.
    return emojis.decode(text)
def draw_message(self, main_view, chat_idx):
    """Render message ``chat_idx`` of the selected chat as display lines.

    Text (optionally emoji-decoded) and a "[media]" line are cut into pieces
    of at most ``int(self.single_chat_fraction * self.W) - 2`` characters.
    Outgoing messages are right-aligned, incoming ones get a one-space
    indent. The first output line is a header with index, sender and date;
    a reply marker is appended last for replies.

    Returns:
        ``(out, message)``: the list of display lines and the message object.
    """
    chat_messages = main_view.dialogs[main_view.selected_chat]["messages"]
    message = chat_messages[chat_idx]
    width = int(self.single_chat_fraction * self.W) - 2

    def wrap(text_line):
        # Cut text_line into consecutive pieces of `width` characters.
        pieces = int(math.ceil(len(text_line) / width))
        return [text_line[k * width:(k + 1) * width] for k in range(pieces)]

    body = []
    if message.text:
        for raw_line in message.text.split("\n"):
            shown = emojis.decode(raw_line) if main_view.text_emojis else raw_line
            # Keep blank lines: wrapping "" would produce no piece at all.
            body.extend(wrap(shown) if shown != "" else [""])
    if message.media:
        media_type, downloaded = self.get_media_type(message)
        body.extend(wrap(f"[{media_type}]{downloaded}"))

    reply_tag = ""
    if message.is_reply:
        target_id = message.reply_to_msg_id
        # Placeholder used when the replied-to message is not loaded.
        reply_tag = " r?? "
        for pos, candidate in enumerate(
                main_view.dialogs[main_view.selected_chat]["messages"]):
            if candidate.id == target_id:
                reply_tag = f"r{pos:02d}"
                break

    sender = "******" if message.out else get_display_name(message.sender)
    forwarded = (f" via {get_display_name(message.forward.sender)}"
                 if message.forward else "")
    header = "{} {}{} {}".format(chat_idx, sender, forwarded,
                                 self._datestring(message.date.astimezone()))

    if message.out:
        out = [header.rjust(width)]
        out.extend(piece.rjust(width - 4) for piece in body)
    else:
        out = [header]
        out.extend(" " + piece for piece in body)
    if message.is_reply:
        out.append(reply_tag)
    return (out, message)
def draw_message(self, main_view, message, chat_idx):
    """Render ``message`` as a list of display lines.

    Text (optionally emoji-decoded) is cut into pieces of at most
    ``int(self.single_chat_fraction * self.W) - 2`` characters; attached
    media adds one "[...]" line. Outgoing messages are right-aligned,
    incoming ones get a one-space indent. The first output line is a header
    with ``chat_idx``, sender and date; a reply marker is appended last for
    replies.

    Returns:
        ``(out, message)``: the list of display lines and the message object.
    """
    width = int(self.single_chat_fraction * self.W) - 2

    def wrap(text_line):
        # Cut text_line into consecutive pieces of `width` characters.
        pieces = int(math.ceil(len(text_line) / width))
        return [text_line[k * width:(k + 1) * width] for k in range(pieces)]

    body = []
    if message.text:
        for raw_line in message.text.split("\n"):
            shown = emojis.decode(raw_line) if main_view.text_emojis else raw_line
            # Keep blank lines: wrapping "" would produce no piece at all.
            body.extend(wrap(shown) if shown != "" else [""])

    if message.media:
        kind = message.media.to_dict()["_"]
        if kind == "MessageMediaPhoto":
            kind = "Photo"
        elif kind == "MessageMediaDocument":
            document = message.media.document
            named = [
                att for att in document.attributes
                if att.to_dict()["_"] == "DocumentAttributeFilename"
            ]
            if named:
                kind = "Document ({})".format(named[0].to_dict()["file_name"])
            else:
                kind = "Document ({})".format(document.mime_type)
        body.append(f"[{kind}]")

    reply_tag = ""
    if message.is_reply:
        target_id = message.reply_to_msg_id
        # Placeholder used when the replied-to message is not loaded.
        reply_tag = " r?? "
        for pos, candidate in enumerate(
                main_view.dialogs[main_view.selected_chat]["messages"]):
            if candidate.id == target_id:
                reply_tag = f"r{pos:02d}"
                break

    sender = "******" if message.out else get_display_name(message.sender)
    forwarded = (f" via {get_display_name(message.forward.sender)}"
                 if message.forward else "")
    header = "{} {}{} {}".format(chat_idx, sender, forwarded,
                                 self._datestring(message.date.astimezone()))

    if message.out:
        out = [header.rjust(width)]
        out.extend(piece.rjust(width - 4) for piece in body)
    else:
        out = [header]
        out.extend(" " + piece for piece in body)
    if message.is_reply:
        out.append(reply_tag)
    return (out, message)
def unicodeEmojis(listaEmojis):
    """Return the ``emojis.decode`` result for every entry, in order.

    Args:
        listaEmojis: iterable of strings (typically single emojis).

    Returns:
        list with one decoded (``:alias:``) string per input entry.
    """
    # Iterate directly instead of indexing through range(len(...)); this
    # also accepts iterables that do not support indexing.
    return [emojis.decode(emoji) for emoji in listaEmojis]
def draw_chats(self):
    """Draw the chat-list pane: a frame plus one entry per visible dialog.

    Dialogs are taken from ``self.main_view.dialogs`` starting at
    ``self.main_view.selected_chat_offset``. Archived dialogs are skipped
    without using up one of the ``self.chats_num`` slots. A horizontal rule
    is drawn where ``index`` reaches ``num_pinned - offset``. Each entry
    takes two text rows and advances ``y`` by three. Any exception raised
    while drawing is handed to ``show_stacktrace``.
    """
    # Position of the selected chat relative to the first drawn entry.
    selected_chat_index = self.main_view.selected_chat - self.main_view.selected_chat_offset
    offset = self.main_view.selected_chat_offset
    try:
        self.draw_frame(0, 0, self.chats_height + 1, self.chats_width)
        index = 0
        y = 1
        # Upper bound for `index`; raised by one for every archived dialog
        # that gets skipped.
        chats_to_draw = self.chats_num
        while index < chats_to_draw:
            # only draw if messages are pinned and pins are viewable (at top)
            if index != 0 and index == self.main_view.num_pinned - offset:
                self.draw_text([
                    self.format("─" * (self.chats_width // 2 - 1),
                                alignment="center"),
                ], y, 1, maxwidth=self.chats_width - 2)
                y += 2
            dialog = self.main_view.dialogs[index + offset]
            if dialog["dialog"].archived:
                index += 1
                chats_to_draw += 1
                continue
            # Use the first cached message when there is one, otherwise the
            # message attached to the dialog object.
            message = dialog["messages"][0] if len(
                dialog["messages"]) > 0 else dialog["dialog"].message
            message_string = message.text if message.text else "[Non-text object]"
            if self.main_view.text_emojis:
                message_string = emojis.decode(message_string)
            chat_name = get_display_name(dialog["dialog"].entity)
            if self.main_view.text_emojis:
                chat_name = emojis.decode(chat_name)
            from_string = get_display_name(message.sender)
            unread = dialog["unread_count"]
            unread_string = f"({unread} new)" if unread else ""
            date = dialog["dialog"].date
            date = date.astimezone()
            date_string = self._datestring(date)
            # NOTE(review): `pinned` is computed but never used below.
            pinned = "* " if dialog["dialog"].pinned else " "
            # NOTE(review): `index` also counts skipped archived dialogs, so
            # it can differ from the on-screen row -- confirm the comparison
            # with selected_chat_index is intended.
            selected = selected_chat_index == index
            # First row: online marker, chat name, optional index, unread
            # counter and date.
            self.draw_text([
                self.format("o" if dialog["online"] else " ",
                            attributes=self.main_view.colors["secondary"]),
                # NOTE(review): this parses as
                # (colors["primary"] | A_STANDOUT) if selected else A_BOLD,
                # so unselected rows get A_BOLD without the primary colour --
                # confirm that is intended.
                self.format(
                    chat_name,
                    attributes=self.main_view.colors["primary"] | curses.A_STANDOUT if selected else curses.A_BOLD,
                    width=int(0.5 * self.chats_width)),
                self.format(f" {str(index)} " if self.show_indices else "",
                            attributes=self.main_view.colors["standout"]),
                self.format(unread_string,
                            attributes=self.main_view.colors["error"],
                            alignment="right"),
                self.format(date_string,
                            alignment="right",
                            attributes=self.main_view.colors["primary"]),
            ], y, 2, maxwidth=self.chats_width - 2)
            debug(f"{self.chats_width=}")
            # Second row: sender name (capped at half the pane width)
            # followed by the message preview in the remaining width.
            self.draw_text([
                self.format(f"{from_string}:",
                            width=min(self.chats_width // 2,
                                      len(from_string) + 1)),
                self.format(message_string,
                            width=self.chats_width -
                            min(self.chats_width // 2,
                                len(f"{from_string}: ") + 1) - 3)
            ], y + 1, 2, maxwidth=self.chats_width - 2)
            y += 3
            index += 1
    except Exception:
        show_stacktrace()
location = tweet.place.name elif tweet.author.location != None: location = tweet.author.location else: location = tweet.user.location likes = tweet.favorite_count retweets = tweet.retweet_count clean_tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet.text).split()) analysis = TextBlob(clean_tweet) polarity = analysis.polarity subjectivity = analysis.subjectivity emoji_all = emojis.get(tweet.text) emoji_see = emojis.decode(str(emoji_all)) emoji_num = emojis.count(tweet.text) tweet = tweet.text.encode("unicode_escape") s = pd.Series({'id':my_id,'date':date,'author':author,'location':location,'likes':likes,'retweets':retweets,'polarity':polarity,'subjectivity':subjectivity,'emoji_all':emoji_all,'emoji_see':emoji_see,'emoji_num':emoji_num,'tweet':tweet}) df = df.append(s, ignore_index=True) df.to_csv('foo1.csv') #print(df.head(5))
def _review_field_text(ix, span_condition):
    """Return the text of one <span> inside the ``ix``-th review card, or ''.

    ``span_condition`` is the XPath predicate selecting the span. Lookup
    failures yield '' because not every review shows every field.
    """
    try:
        return driver.find_element_by_xpath(
            "//div[contains(@class,'_2wrUUKlw _3hFEdNs8')][" + ix +
            "]//span[" + span_condition + "]").text
    except Exception:
        return ''


def _review_rating(ix):
    """Return the rating (1 to 5) of the ``ix``-th review card, or ''.

    The rating is the second-to-last character of the bubble span's class
    attribute.
    """
    try:
        rating_value = driver.find_element_by_xpath(
            "//div[contains(@class,'_2wrUUKlw _3hFEdNs8')][" + ix +
            "]//span[contains(@class, 'ui_bubble')]")
        rating_class = rating_value.get_attribute("class")
        return int(rating_class[len(rating_class) - 2])
    except Exception:
        return ''


def _scrape_review_page(hotel_name, city):
    """Expand, scrape and store every review on the page currently shown."""
    insert_table = "REPLACE INTO reviews (Name, City, Rating, Review, Hometown, Date_of_stay, Trip_type) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    # Click the "read more" link so the full review texts are present.
    info_plus = driver.find_element_by_xpath(
        "//div[contains(@class,'XUVJZtom')]//span[contains(text(),'Scopri di pi')]"
    )
    info_plus.click()
    time.sleep(seconds)
    all_reviews = driver.find_elements_by_xpath(
        "//q[contains(@class, 'IRsGHoPm')]")
    for i, review_element in enumerate(all_reviews):  # loop over reviews
        review = emojis.decode(review_element.text)
        ix = str(i + 1)  # XPath positions are 1-based
        time.sleep(seconds)
        rating = _review_rating(ix)
        hometown = _review_field_text(
            ix, "contains(@class,'default _3J15flPT small')")
        date = _review_field_text(
            ix, "contains(@class, '_34Xs-BQm')").replace(
                'Data del soggiorno:', '')
        triptype = _review_field_text(
            ix, "contains(@class, '_2bVY3aT5')").replace(
                'Tipo di viaggio:', '')
        records_to_insert = [(hotel_name, city, rating, review, hometown,
                              date, triptype)]
        cursor.executemany(insert_table, records_to_insert)
        connection.commit()
        print(cursor.rowcount, "record in Reviews")


def reviews():
    """Scrape the reviews of the hotel page open in ``driver`` into the DB.

    Opens the review section, selects all languages and walks through the
    result pages (``args.pr`` pages when set, otherwise the number shown by
    the pager). Each review is written to the ``reviews`` table with the
    hotel name, city (``args.place``), rating, text, hometown, date of stay
    and trip type. Scraping is best-effort: a failure ends the walk for this
    hotel and is reported on stdout instead of being raised.
    """
    hotel_name = driver.find_element_by_xpath("//h1[@id='HEADING']").text
    try:
        go_review = driver.find_element_by_xpath(
            "//span[contains(@class, '_33O9dg0j')]")
        go_review.click()  # jump down to the reviews
        city = str(args.place)
        driver.find_element_by_xpath(
            "//span[contains(text(),'Tutte le lingue')]").click()
        time.sleep(seconds)
        number_pages = driver.find_element_by_xpath(
            "//a[contains(@class, 'pageNum')][position() = last()]").text
        pages_review = int(number_pages)  # conversion
        if args.pr:
            pages_review = args.pr
        for j in range(0, pages_review):
            if j < (pages_review - 1):
                # Locate the "next" button before scraping, then use it to
                # move on once the page is done.
                go_on = driver.find_element_by_xpath(
                    "//a[contains(text(),'Avanti')]")  # button
                _scrape_review_page(hotel_name, city)
                go_on.click()
                time.sleep(seconds)
            else:  # last page
                _scrape_review_page(hotel_name, city)
    except Exception as exc:
        # Still best-effort, but no longer silent, and Ctrl-C is no longer
        # swallowed.
        print("Stopped scraping reviews for", hotel_name, "-", repr(exc))
# handle_key(key, redraw=True): central key dispatcher; routes one key press
# according to self.mode.
#   - In "popupmessage" mode the previous mode is first restored from
#     self.modestack. Nothing is handled while self.ready is false.
#     "RESIZE" only triggers drawtool.resize().
#   - While self.macro_recording is set, every key except "q" is appended to
#     self.macro_sequence.
#   - "search": ESCAPE/RETURN leave the mode; other keys edit self.search_box
#     and re-run search_chats()/search_next().
#   - "vimmode": RETURN runs call_command(); other keys edit self.vimline_box.
#   - "normal": digits accumulate in self.command_box (redraw, early return);
#     the remaining keys are single-letter commands (send, quit, macros, chat
#     navigation, emoji toggle, mark read, delete/edit/reply/media by message
#     index, search, insert mode, index display).
#   - "popup": restores the previous mode, then awaits the stored popup action.
#   - "edit" / "insert": edit self.inputs; ESCAPE in "edit" asks whether to
#     save the edit.
#   - redraw=False is used when replaying a macro.
# NOTE(review): record_macro tests `"a" < key.lower() < "z"` with strict
# comparisons, so registers "a" and "z" are rejected although the message says
# [a-zA-Z] -- confirm.
# NOTE(review): the bare `except:` clauses around int(...) also catch
# unrelated errors.
# NOTE(review): saving an edit passes self.inputs as-is, while replying passes
# emojis.encode(self.inputs) -- confirm the difference is intended.
# NOTE(review): the original indentation is lost; the code below is kept
# verbatim because the nesting of several statements cannot be determined
# from here.
async def handle_key(self, key, redraw = True): if self.mode == "popupmessage": self.mode = self.modestack.pop() if not self.ready: return if key == "RESIZE": await self.drawtool.resize() return if self.macro_recording: if key != "q": self.macro_sequence.append(key) if self.mode == "search": if key == "ESCAPE" or key == "RETURN": self.mode = "normal" elif key == "BACKSPACE": if self.search_box == "": self.mode = "normal" else: self.search_box = self.search_box[0:-1] self.search_chats() self.search_next() else: self.search_box += key self.search_chats() self.search_next() elif self.mode == "vimmode": if key == "ESCAPE": self.mode = "normal" elif key == "RETURN": await self.call_command() self.vimline_box = "" self.mode = "normal" elif key == "BACKSPACE": if self.vimline_box == "": self.mode = "normal" else: self.vimline_box = self.vimline_box[0:-1] else: self.vimline_box += key elif self.mode == "normal": num = None try: num = int(key) except: pass if num is not None: self.command_box += str(num) await self.drawtool.redraw() return elif key == ":": self.mode = "vimmode" self.vimline_box = "" elif key == "RETURN" or key == "y": await self.send_message() elif key == "Q": await self.quit() elif key == "q": if self.macro_recording == None: # start macro recording async def record_macro(self, key): if "a" < key.lower() < "z": self.macro_recording = key self.popup_message(f"recording into {key}") else: self.popup_message(f"Register must be [a-zA-Z]") self.spawn_popup(record_macro, "Record into which register?") else: # end macro recording self.macros[self.macro_recording] = self.macro_sequence self.macro_recording = None self.macro_sequence = [] elif key == "@": # execute macro async def ask_macro(self, key): if key in self.macros.keys(): macro = self.macros[key] debug(macro) for k in macro: await self.handle_key(k, redraw = False) else: self.popup_message(f"No such macro @{key}") self.spawn_popup(ask_macro, "Execute which macro?") elif key == "C": self.select_prev_chat() 
elif key == "c": self.select_next_chat() elif key == "E": self.text_emojis ^= True elif key == "R": await self.mark_read() elif key == "d": if self.command_box: try: n = int(self.command_box) except: return if n >= len(self.dialogs[self.selected_chat]["messages"]): #TODO: alert user self.popup_message("No message by that id.") await self.drawtool.redraw() return async def action_handler(self, key): if key in ["y","Y"]: to_delete = self.dialogs[self.selected_chat]["messages"][n] await to_delete.delete() self.dialogs[self.selected_chat]["messages"].pop(n) self.command_box = "" self.mode = "normal" question = f"Are you really sure you want to delete message {n}? [y/N]" self.spawn_popup(action_handler, question) await self.drawtool.redraw() elif key == "e": if self.command_box: try: n = int(self.command_box) except: return self.edit_message = self.dialogs[self.selected_chat]["messages"][n] self.mode = "edit" self.inputs = emojis.decode(self.edit_message.text) self.command_box = "" elif key == "r": if self.command_box: try: n = int(self.command_box) except: return reply_to = self.dialogs[self.selected_chat]["messages"][n] s = emojis.encode(self.inputs) reply = await reply_to.reply(s) await self.on_message(reply) self.command_box = "" self.inputs = "" elif key == "m": if self.command_box: try: n = int(self.command_box) except: return self.command_box = "" await self.show_media(n) elif key == "M": self.center_selected_chat() elif key == "HOME" or key == "g": self.select_chat(0) elif key == "END" or key == "G": self.select_chat(-1) elif key == "i": self.mode = "insert" elif key == "n": self.search_next() elif key == "N": self.search_prev() elif key == "/": self.mode = "search" self.search_box = "" elif key == " ": self.drawtool.show_indices ^= True elif self.mode == "popup": action, _ = self.popup # I think this could break self.mode = self.modestack.pop() await action(self, key) elif self.mode == "edit": if key == "ESCAPE": async def ah(self, key): if key in ["Y", "y", 
"RETURN"]: edit = await self.edit_message.edit(self.inputs) await self.on_message(edit) # TODO: update message in chat # this on_message call does not work reliably self.mode = "normal" else: self.popup_message("Edit discarded.") self.mode = "normal" self.spawn_popup(ah, "Do you want to save the edit? [Y/n]") elif key == "LEFT": self.insert_move_left() elif key == "RIGHT": self.insert_move_right() elif key == "BACKSPACE": self.inputs = self.inputs[0:-1] elif key == "RETURN": self.inputs += "\n" else: self.inputs += key elif self.mode == "insert": if key == "ESCAPE": self.mode = "normal" elif key == "LEFT": self.insert_move_left() elif key == "RIGHT": self.insert_move_right() elif key == "BACKSPACE": self.inputs = self.inputs[0:-1] elif key == "RETURN": self.inputs += "\n" else: self.inputs += key self.command_box = "" if redraw: await self.drawtool.redraw()
else: location = tweet.user.location likes = tweet.favorite_count retweets = tweet.retweet_count #use TextBlob to do sentiment analysis clean_tweet = ' '.join( re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet.text).split()) analysis = TextBlob(clean_tweet) polarity = analysis.polarity subjectivity = analysis.subjectivity #use emojis to do emoji process emoji_all = emojis.get(tweet.text) emoji_see = emojis.decode(" ".join(emoji_all)) emoji_num = emojis.count(tweet.text) #previous tweet text data, use unicode to encode, also can .decode("unicode_escape") tweet = tweet.text.encode("unicode_escape") s = pd.Series({ 'id': my_id, 'date': date, 'author': author, 'location': location, 'likes': likes, 'retweets': retweets, 'polarity': polarity, 'subjectivity': subjectivity, 'emoji_all': emoji_all,
# Notebook fragment: export the collected video/comment table, print every
# comment, then start the text pre-processing section.
# Write the full frame to an Excel file (no index column, with header row).
video_frame.to_excel('Video_frame.xlsx', index=False, header=True)
video_frame.head(100)
# Keep only the columns used for comment analysis.
comment_frame = video_frame.loc[:,['videoTitle','textDisplay','likeCount','replyCount']]
for i in comment_frame.index:
    print(comment_frame.loc[i,'textDisplay']+"\n")
"""## Data PreProcessing"""
# IPython shell escape: only valid inside a notebook, not in a plain .py file.
! pip install emojis
import emojis
# Turn emojis into words: decode to :alias: form, then replace every ':' and
# '_' with a space (this also affects colons/underscores already in the text).
comment_frame.textDisplay = comment_frame.textDisplay.apply(lambda x: emojis.decode(x).replace(':', ' ').replace('_', ' '))
import nltk
nltk.download('stopwords')
# imports
from bs4 import BeautifulSoup
import unicodedata
# from contractions import CONTRACTION_MAP # from contractions.py
import re
import string
import nltk
import spacy
# NOTE(review): the 'en' shortcut and the parse/tag/entity keyword arguments
# look like an older spaCy API -- confirm against the installed version.
nlp = spacy.load('en',parse=True,tag=True, entity=True)
from nltk.tokenize import ToktokTokenizer
tokenizer = ToktokTokenizer()