import csv
import os
import time
from abc import abstractmethod
from datetime import datetime

import numpy as np
from twython import TwythonRateLimitError

# Project-local names (TweetDB, TweetModel, TwitterData, Loggable, emotions,
# the SingleEmotion* classifiers, consolidate_classifiers,
# save_classified_replies, count_repeated_emotions) come from this
# repository's own modules and are assumed to be in scope.


def _classify_replies_with_emojis(prefix=''):
    emotions = ['anger', 'fear', 'joy', 'sadness']
    emojis = [
        ['😠', '😡', '😤', '🤬'],  # anger
        ['😰', '😱', '😨', '😟'],  # fear
        ['😂', '😁', '😄', '😊'],  # joy (candidates once considered: '🤣', '😀', '😃', '😆', '😍', '😋')
        ['💔', '😢', '😭', '😔'],  # sadness
    ]
    tdb = TweetDB()
    tweets_data = []
    for i, emoji_list in enumerate(emojis):
        tweets_with_an_emoji = []
        # Emojis belonging to every other emotion: a reply is kept only if it
        # contains none of them, so each reply gets a single emotion label.
        other_emojis = [e for el in emojis for e in el if e not in emoji_list]
        for emoji in emoji_list:
            tweets_list = filter(
                lambda t: not any(e in t.text for e in other_emojis),
                list(tdb.all_replies_like(emoji)))
            tweets_with_an_emoji += tweets_list
        tweets_with_emoji = list(set(tweets_with_an_emoji))  # de-duplicate
        tweets_data += [['', t.text, f'{prefix}{emotions[i]}']
                        for t in tweets_with_emoji]
    return emotions, tweets_data
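
# Hedged usage sketch: dumping the weakly-labeled replies to a training CSV.
# The function name and output path are hypothetical; the (id, text, label)
# columns mirror the rows built above.
def write_emoji_training_csv(path='output/emoji_labeled_replies.csv'):
    _, tweets_data = _classify_replies_with_emojis()
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'text', 'label'])
        writer.writerows(tweets_data)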
def get_parent_tweets_metrics(tweets_with_emotion, emotion):
    tdb = TweetDB()
    parent_tweet = lambda tid: tdb.get_by(str(tid))
    parents_tweets = list(
        set(parent_tweet(tweet.parent_tweet_id)
            for tweet in tweets_with_emotion))
    parents_retweets_count = np.array(
        [int(t.retweet_count) for t in parents_tweets])
    print('%d on %s category' % (len(tweets_with_emotion), emotion))
    if not len(parents_retweets_count):
        return  # No parents found: median and percentiles would be undefined
    parents_retweets_count_median = np.median(parents_retweets_count)
    parents_retweets_75_top = np.percentile(parents_retweets_count, 75)
    parents_retweets_90_top = np.percentile(parents_retweets_count, 90)
    parents_retweets_95_top = np.percentile(parents_retweets_count, 95)
    # Retweet counts of the top 20% most-retweeted parent ("spreader") tweets
    parents_retweets_count_sorted = np.sort(parents_retweets_count)[::-1]
    top_20_len = int(len(parents_retweets_count) * 0.2)
    parents_retweets_top_20 = parents_retweets_count_sorted[:top_20_len]
    print('%d spreader tweets with retweets median: %.2f, '
          '75%% percentile: %.2f, 90%% percentile: %.2f, '
          '95%% percentile: %.2f, and responses average: %.2f' %
          (len(parents_tweets), parents_retweets_count_median,
           parents_retweets_75_top, parents_retweets_90_top,
           parents_retweets_95_top,
           len(tweets_with_emotion) / len(parents_tweets)))
    if len(parents_retweets_top_20):
        print('top 20%%: avg: %.2f, median: %.2f' %
              (np.mean(parents_retweets_top_20),
               np.median(parents_retweets_top_20)))
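
# A self-contained sanity check of the percentile math used above, on
# synthetic retweet counts (numpy's default linear interpolation):
def percentile_sanity_check():
    counts = np.array([1, 2, 3, 7, 10, 15, 42, 60, 250, 800])
    assert np.median(counts) == 12.5
    assert np.percentile(counts, 75) == 55.5   # 42 + 0.75 * (60 - 42)
    assert np.percentile(counts, 90) == 305.0  # 250 + 0.1 * (800 - 250)
    top_20 = np.sort(counts)[::-1][:int(len(counts) * 0.2)]
    assert list(top_20) == [800, 250]  # the two most-retweeted parents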
def classify_replies_tweets(self):
    tdb = TweetDB()
    tweets = list(tdb.all_replies())
    tweets_with_emotion_indexes, total = self.classify_sentences(
        [t.text for t in tweets])
    # Retweet counts of the parent tweets whose replies matched this emotion
    tweets_with_emotion = [
        int(tweets[i].parent_tweet.retweet_count)
        for i in tweets_with_emotion_indexes
    ]
    tweets_with_emotion_mean = (np.array(tweets_with_emotion).mean()
                                if tweets_with_emotion else 0)
    self.logger.info(
        '%d of %d on %s category (retweets average: %.2f)' %
        (len(tweets_with_emotion), total, self.emotion_label,
         tweets_with_emotion_mean))
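
# Hedged usage sketch: classify_replies_tweets is assumed to live on the
# single-emotion classifiers used elsewhere in this module, so reporting
# reply statistics means one pass per emotion. The pickle filenames follow
# the 'output/<emotion>-tec.pickle' convention from predict_replies below.
def log_reply_stats_per_emotion():
    for emotion in emotions:
        nb = SingleEmotionTECNaiveBayes(
            emotion_name=emotion,
            filename=os.path.join('output', '%s-tec.pickle' % emotion))
        nb.classify_replies_tweets()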
def predict_replies(filepath, **kwargs):
    identified_replies = []
    files_tec = [
        os.path.join('output', '%s-tec.pickle' % e) for e in emotions
    ]
    for i, f in enumerate(files_tec):
        # nb = SingleEmotionSemEvalNaiveBayes(emotion_index=i, filename=f)
        nb = SingleEmotionTECNaiveBayes(emotion_name=emotions[i], filename=f)
        tdb = TweetDB()
        tweets = list(tdb.all_replies())
        tweets_with_emotion_indexes, total = nb.classify_sentences(
            [t.text for t in tweets])
        identified_replies.append(tweets_with_emotion_indexes)
        print('%d of %d on %s category' %
              (len(tweets_with_emotion_indexes), total, emotions[i]))
    # Count how many additional emotions each tweet was tagged with, and
    # print the average and median per emotion.
    for index, tweets_indexes in enumerate(identified_replies):
        other_tweets_indexes = [
            ti for tis in identified_replies for ti in tis
            if tis != tweets_indexes
        ]
        repeated_tweets_indexes = [
            other_tweets_indexes.count(ti) for ti in tweets_indexes
            if ti in other_tweets_indexes
        ]
        one_emotion_tweets = [
            tweets[ti] for ti in tweets_indexes
            if ti not in other_tweets_indexes
        ]
        if repeated_tweets_indexes:  # np.mean/np.median on [] would be NaN
            print('%s: %d out of %d have been identified with another '
                  'emotion as well (avg: %.2f, mdn: %.2f)' %
                  (emotions[index], len(repeated_tweets_indexes),
                   len(tweets_indexes), np.mean(repeated_tweets_indexes),
                   np.median(repeated_tweets_indexes)))
        print('%d tweets have been identified only with %s' %
              (len(tweets_indexes) - len(repeated_tweets_indexes),
               emotions[index]))
        get_parent_tweets_metrics(one_emotion_tweets, emotions[index])
    count_repeated_emotions(tweets, identified_replies)
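
# Optional sketch: the overlap counting above is quadratic because of
# list.count inside a comprehension. A Counter gives the same per-index
# counts in a single linear pass (helper name is hypothetical):
from collections import Counter

def overlap_counts(identified_replies, index):
    other_counts = Counter(
        ti for i, tis in enumerate(identified_replies) if i != index
        for ti in tis)
    return [other_counts[ti] for ti in identified_replies[index]
            if ti in other_counts]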
def get_tweets_ids_from_csv(self):
    yield [
        str(t.id) for t in TweetDB().all_sorted_by(
            sort=TweetModel.retweet_count.desc(), source="politifact")
    ]
    yield [
        str(t.id) for t in TweetDB().all_sorted_by(
            sort=TweetModel.retweet_count.desc(), source="gossipcop")
    ]
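
# Hedged usage note: despite the *_from_csv name, the IDs above come from
# TweetDB. The generator yields exactly two lists, in source order, so a
# caller can unpack it directly:
#
#   politifact_ids, gossipcop_ids = self.get_tweets_ids_from_csv()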
def predict_top_retweeted_fake_news_tweets(models: list, consolidate=2):
    number_of_tweets_to_evaluate = 500
    tdb = TweetDB()
    # most_popular_tweets = list(tdb.all_sorted_by(sort=TweetModel.retweet_count.desc()))
    most_popular_tweets = list(
        tdb.all_sorted_by_eager_loading(sort=TweetModel.retweet_count.desc()))
    classified_init = False
    get_model_name = lambda model: model.__name__.replace("SingleEmotion", "")
    with open('output/popular_tweets.csv', 'w', newline='') as f:
        csv_writer = csv.writer(f)
        csv_top_row = [
            'TweetID',
            'Tweet retweet count',
            'Tweet replies count',
            'Tweet source',
        ]
        for model, _ in models:
            for emotion in emotions:
                csv_top_row.append(f'{get_model_name(model)} {emotion}')
            for emotion in emotions:
                csv_top_row.append(f'{get_model_name(model)} {emotion}%')
        if consolidate:
            for emotion in emotions:
                csv_top_row.append(f'Consolidation {emotion}')
            for emotion in emotions:
                csv_top_row.append(f'Consolidation {emotion}%')
        csv_writer.writerow(csv_top_row)

        # One classifier instance per (model, emotion) pair
        model_instances = {}
        for model, filename_template in models:
            model_instances[get_model_name(model)] = []
            for emotion in emotions:
                model_instances[get_model_name(model)].append(
                    model(emotion_name=emotion,
                          filename=filename_template.format(emotion)))

        tweet_count = 0
        for tweet in most_popular_tweets:
            if tweet_count >= number_of_tweets_to_evaluate:
                break  # Stop once the evaluation budget is reached
            replies = tweet.replies
            if not len(replies):
                continue
            tweet_count += 1
            print(f'Found {len(replies)} replies for tweet {tweet.id} '
                  f'with {tweet.retweet_count} retweets')
            datasets_lists = {}
            csv_row = [
                tweet.id, tweet.retweet_count, len(replies), tweet.source
            ]
            for model_name in model_instances.keys():
                # Index lists of the replies classified with each emotion
                replies_with_emotions = []
                for model_instance in model_instances[model_name]:
                    tweets_with_emotion_indexes, _ = (
                        model_instance.classify_sentences(
                            [t.text for t in replies]))
                    replies_with_emotions.append(tweets_with_emotion_indexes)
                csv_row += [len(x) for x in replies_with_emotions]
                csv_row += [
                    len(x) / len(replies) for x in replies_with_emotions
                ]
                datasets_lists[model_name] = replies_with_emotions
            if consolidate:
                consolidate_results = []
                for index_emotion, _ in enumerate(emotions):
                    consolidate_results.append(
                        consolidate_classifiers(consolidate, replies, [
                            x[index_emotion] for x in datasets_lists.values()
                        ]))
                csv_row += [len(x) for x in consolidate_results]
                csv_row += [len(x) / len(replies) for x in consolidate_results]
                datasets_lists['Consolidate'] = consolidate_results
            csv_writer.writerow(csv_row)
            save_classified_replies(replies, emotions, datasets_lists,
                                    classified_init)
            classified_init = True
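
# Hypothetical invocation: each entry in `models` pairs a classifier class
# with a pickle filename template, matching the naming in predict_replies.
if __name__ == '__main__':
    predict_top_retweeted_fake_news_tweets(
        models=[(SingleEmotionTECNaiveBayes, 'output/{}-tec.pickle')],
        consolidate=2)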
class TweetCollector(Loggable):
    """
    Helper class to handle API calls and database connections
    """

    def __init__(self):
        self.twitter = TwitterData()
        self.tweet_db = TweetDB()
        self.reply_id = False
        super().__init__()

    def get_tweet(self, tweet_id: str):
        tweets = self._get_tweet(tweet_id) or []
        self.logger.info("Got %d tweets for %s" % (len(tweets), tweet_id))
        for tweet in tweets:
            self._enqueue_tweet(tweet, tweet.get('id'))
        self.tweet_db.commit()

    def get_last_tweet_id(self):
        return False

    def wrap_up(self):
        self.logger.info("Finished collecting tweets")

    def close(self):
        try:
            self.tweet_db.commit()
        except Exception:
            self.tweet_db.rollback()
        self.tweet_db.close()
        self.logger.debug("Queue completed")

    @abstractmethod
    def _request_tweet(self, tweet_id):
        pass

    def _db_has_tweet_id(self, db_conn, tweet_id) -> TweetModel:
        return db_conn.session.query(TweetModel).get(tweet_id)

    def _get_tweet(self, tweet_id):
        # Retry on rate limiting; give up on any other error
        while True:
            try:
                return self._request_tweet(tweet_id)
            except TwythonRateLimitError as e:
                self._wait_retry_after(e.retry_after)
            except Exception as e:
                self.logger.exception(str(e))
                return None

    def _wait_retry_after(self, retry_after: str):
        self.logger.debug("Retry-after: %s" % retry_after)
        try:
            # The header value is treated as a Unix timestamp: sleep until then
            sleep_time = datetime.fromtimestamp(
                int(retry_after)) - datetime.now()
            sleep_time = sleep_time.total_seconds()
        except Exception as e:
            self.logger.debug(str(e))
            sleep_time = 60 * 15  # Fall back to a full 15-minute rate window
        self.logger.info("Sleeping for %d seconds" % sleep_time)
        time.sleep(sleep_time)

    def _enqueue_tweet(self, tweet: dict, tid: str):
        if not tweet:
            self.logger.warning("Could not get tweet %s" % tid)
            return
        if self.reply_id:
            if str(tweet['in_reply_to_status_id']) != str(self.reply_id):
                self.logger.warning(
                    "Tweet %s is not a reply to %s, skipping..." %
                    (tid, self.reply_id))
                return
        else:
            # Do not treat these tweets as replies, even if they are
            tweet['in_reply_to_status_id'] = None
        if self._db_has_tweet_id(self.tweet_db, tid):
            self.logger.info("Tweet %s already saved, skipping..." % tid)
        else:
            self.tweet_db.save_tweet(tweet)
            self.logger.info("Tweet %s saved" % tid)
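
# Minimal concrete subclass sketch: TweetCollector leaves _request_tweet
# abstract, so a collector only needs to turn a tweet ID into a list of raw
# tweet dicts. The show_status call is an assumption about what the
# TwitterData wrapper exposes (Twython has a method of that name).
class SingleTweetCollector(TweetCollector):
    def _request_tweet(self, tweet_id):
        return [self.twitter.show_status(id=tweet_id)]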