def collect_tweets_by_manual_keywords(self):
    """Collect tweets for every configured keyword term and store them.

    The terms and their ids come from
    ``_convert_keywords_to_terms_by_given_dict``; the new terms and the
    topics built from the term/id mapping are written through
    ``self._db.addPosts`` before any tweet is fetched.

    Tweets are requested for the window ending today and starting
    ``self._month_interval`` months earlier, where a month is taken as
    365 / 12 days. For each term the converted posts and their claim/post
    connections are written through ``self._db.addPosts``.
    """
    (_previous_terms,
     new_terms,
     config_terms,
     term_to_id) = self._convert_keywords_to_terms_by_given_dict()
    topics = self._create_topics_by_given_dict(term_to_id)
    self._db.addPosts(new_terms)
    self._db.addPosts(topics)

    now = datetime.datetime.today()
    look_back = timedelta(self._month_interval * 365 / 12)
    window_start = now - look_back
    start_date = date_to_str(window_start, "%Y-%m-%d")
    end_date = date_to_str(now, "%Y-%m-%d")

    for position, term in enumerate(config_terms):
        # "\r" rewinds the cursor so the progress text overwrites itself.
        progress = "\rTerm: {0} {1}/{2}".format(
            term, position, len(config_terms))
        print(progress, end='')

        tweets = self._collect_tweets_by_term(term, start_date, end_date)
        summary = "\rTerm: {0} Num tweets retrieved: {1}".format(
            term, len(tweets))
        print(summary, end='')

        term_id = term_to_id[term]
        posts, connections = self._convert_tweets_to_posts(
            tweets, term_id, self._domain)
        self._db.addPosts(posts)
        self._db.addPosts(connections)
def _retrieve_old_tweets(self, claim, content):
    """Fetch tweets for ``content`` in a window centred on the claim's verdict date.

    The window spans ``self._month_interval`` months (a month taken as
    365 / 12 days) before and after ``claim.verdict_date``; both bounds are
    passed to ``_retrieve_tweets_between_dates`` as ``%Y-%m-%d`` strings.

    Retrieval is best-effort: if the lookup raises, the error is printed
    and an empty list is returned instead of propagating the failure.

    :param claim: object exposing a ``verdict_date`` datetime.
    :param content: query text forwarded to the retrieval call.
    :return: whatever ``_retrieve_tweets_between_dates`` returns, or ``[]``
        when it raised.
    """
    verdict_date = claim.verdict_date
    month_interval = timedelta(self._month_interval * 365 / 12)
    start_date = date_to_str(verdict_date - month_interval, "%Y-%m-%d")
    end_date = date_to_str(verdict_date + month_interval, "%Y-%m-%d")
    try:
        return self._retrieve_tweets_between_dates(
            claim, content, start_date, end_date)
    except Exception as e:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still stop the crawl; repr() keeps both the
        # exception type and its message in the log line.
        print("tweet content: {0}, error:{1}".format(content, repr(e)))
        return []
def append_comments(self, submission, origin_post_id, publish_date):
    """Record the GUIDs of a submission's comments made around ``publish_date``.

    The accepted window is ``publish_date`` plus/minus
    ``self._month_interval`` months (a month taken as 365 / 12 days). All
    bounds are converted to epoch seconds with ``time.mktime`` so they can
    be compared with ``comment.created``.

    ``MoreComments`` entries and comments outside the window are skipped.
    A remaining comment is kept only if its score is non-negative and its
    body has more than three space-separated tokens. A kept comment is
    passed to ``convert_comment_to_post`` (the return value is not used
    here) and its GUID is added, under ``origin_post_id``, to the "before"
    dict when it was created at or before the publish time, otherwise to
    the "after" dict.

    :param submission: object exposing ``comments`` and ``url``.
    :param origin_post_id: key under which the comment GUIDs are stored.
    :param publish_date: datetime that splits "before" from "after".
    """
    window = timedelta(self._month_interval * 365 / 12)
    window_start = time.mktime((publish_date - window).timetuple())
    window_end = time.mktime((publish_date + window).timetuple())
    publish_ts = time.mktime(publish_date.timetuple())

    for comment in submission.comments:
        if isinstance(comment, MoreComments):
            continue
        if comment.created > window_end or comment.created < window_start:
            continue

        created_at = datetime.fromtimestamp(comment.created)
        url = unicode(submission.url + comment.id)
        comment_guid = compute_post_guid(url, comment.id,
                                         date_to_str(created_at))

        # Choose the bucket first; a comment created exactly at the window
        # start matches neither half-open range and is dropped.
        if window_start < comment.created <= publish_ts:
            bucket = self._post_id_tweets_id_before_dict
        elif publish_ts < comment.created <= window_end:
            bucket = self._post_id_tweets_id_after_dict
        else:
            continue

        if comment.score >= 0 and len(comment.body.split(' ')) > 3:
            self.convert_comment_to_post(comment, submission)
            bucket[origin_post_id].add(comment_guid)
def _convert_tweet_to_post(self, tweet, post_type):
    """Build a ``Post`` from a tweet object.

    Reads ``id``, ``date``, ``favorites``, ``retweets``, ``text``,
    ``username`` and ``permalink`` from ``tweet``. The post GUID is derived
    from the permalink, the author name and the formatted creation date,
    and is stored in both ``guid`` and ``post_id``.

    :param tweet: source tweet object.
    :param post_type: value stored in ``post.post_type``.
    :return: the populated ``Post``.
    """
    post = Post()
    post.post_osn_id = unicode(tweet.id)

    tweet_date = tweet.date
    date_text = unicode(date_to_str(tweet_date))
    post.created_at = date_text
    post.date = tweet_date

    post.favorite_count = tweet.favorites
    post.retweet_count = tweet.retweets
    post.content = unicode(tweet.text)

    username = unicode(tweet.username)
    post.author = username
    # post.author_guid is not populated here; the call to
    # compute_author_guid_by_author_name(username) is disabled.

    permalink = tweet.permalink
    post.url = unicode(permalink)
    post.guid = post.post_id = compute_post_guid(permalink, username,
                                                 date_text)

    post.domain = self._domain
    post.post_type = post_type
    return post
def _add_post(self, author, date, post_osn_id, score=0, upvote_ratio=-1):
    """Create a ``Post`` / ``RedditPost`` pair and store both.

    With the default ``upvote_ratio`` of -1 the pair is typed
    ``'reddit_comment'`` and ``ups``, ``downs`` and ``upvote_ratio`` are
    all set to -1. With any other ratio the pair is typed ``'reddit_post'``
    and the vote counts are reconstructed from ``score`` and the ratio.

    :param author: author identifier; stored as ``str(author)``.
    :param date: date string in ``%d/%m/%Y %H:%M`` format.
    :param post_osn_id: id stored on the post and used in its GUID.
    :param score: value stored in ``reddit_post.score``.
    :param upvote_ratio: ratio of upvotes, or -1 for "not a submission".
    :return: tuple ``(post, reddit_post)``.
    """
    has_ratio = upvote_ratio != -1
    kind = 'reddit_post' if has_ratio else 'reddit_comment'

    post = Post()
    post.post_osn_id = post_osn_id
    post.author = str(author)
    post.author_guid = compute_author_guid_by_author_name(post.author)
    post.created_at = str_to_date(date, formate="%d/%m/%Y %H:%M")
    # just for test
    post.url = 'https://www.reddit.com{}'.format(post.author)
    post.guid = compute_post_guid(post.url, post.post_osn_id,
                                  date_to_str(post.created_at))
    post.domain = kind
    post.post_type = kind
    post.post_id = post.guid

    reddit_post = RedditPost()
    reddit_post.post_id = post.post_id
    reddit_post.guid = post.guid
    reddit_post.score = score

    if not has_ratio:
        reddit_post.ups = -1
        reddit_post.downs = -1
        reddit_post.upvote_ratio = -1
    else:
        reddit_post.upvote_ratio = upvote_ratio
        # Solving score = ups - downs and ratio = ups / (ups + downs) for
        # ups gives ratio * score / (2 * ratio - 1). That is undefined at
        # ratio == 0.5, where half of the score is used instead.
        if reddit_post.upvote_ratio != 0.5:
            estimated_ups = round(
                (reddit_post.upvote_ratio * reddit_post.score)
                / (2 * reddit_post.upvote_ratio - 1))
        else:
            estimated_ups = round(reddit_post.score / 2)
        reddit_post.ups = int(estimated_ups)
        reddit_post.downs = reddit_post.ups - reddit_post.score

    self._db.addPosts([post, reddit_post])
    return post, reddit_post
def _convert_row_to_post(self, row):
    """Turn one CSV row into a ``Post`` and a claim/post connection.

    Fields read from ``row``: ``site``, ``social_id``, ``username_hash``,
    ``comment_time`` and ``comment_tokens``.

    The new objects are appended to ``self._posts`` and
    ``self._claim_tweet_connections``. Whenever the running row counter
    reaches a multiple of ``self._max_posts_without_save``,
    ``_save_posts_and_connections`` is called.

    :param row: mapping-like row holding the fields listed above.
    """
    progress = "\rInsert post to DataFrame {0}/{1}".format(
        self._current_row, len(self.posts_csv_df))
    print(progress, end="")
    self._current_row += 1

    comment_date = datetime.datetime.fromtimestamp(row['comment_time'])
    post = Post()
    # The claim id is derived by hashing the row's social_id with the
    # author-guid helper.
    claim_id = compute_author_guid_by_author_name(str(row['social_id']))
    guid_source = row['site'] + str(claim_id)
    post.post_id = str(compute_post_guid(guid_source,
                                         row['username_hash'],
                                         date_to_str(comment_date)))
    post.content = str(row['comment_tokens'])
    post.author = str(row['username_hash'])
    post.author_guid = str(row['username_hash'])
    post.domain = str(row['site'])
    post.date = comment_date
    self._posts.append(post)

    connection = Claim_Tweet_Connection()
    connection.claim_id = str(claim_id)
    connection.post_id = str(post.post_id)
    self._claim_tweet_connections.append(connection)

    reached_save_point = (
        self._current_row % self._max_posts_without_save == 0)
    if reached_save_point:
        self._save_posts_and_connections()
def test_retrieve_tweets_by_content_between_dates_before(self):
    """Tweets retrieved for a date range all fall inside that range.

    Adds a claim with verdict date 2017-02-03, enables both date limits on
    the crawler, retrieves tweets from 2016-08-03 up to the verdict date,
    and checks that every tweet date lies in
    ``[since_date, claim_date)`` and that at most 100 tweets came back.
    """
    claim_text = u"The Rock Running for President"
    self._add_claim(u"post0", claim_text, u"2017-02-03 00:00:00")
    self._db.commit()

    claim = self._claims[u"post0"]
    claim_date = claim.verdict_date
    since_date = str_to_date(u"2016-08-03 00:00:00")

    self.tweets_crawler._limit_start_date = True
    self.tweets_crawler._limit_end_date = True
    tweets = self.tweets_crawler._retrieve_tweets_between_dates(
        claim,
        claim_text,
        date_to_str(since_date, "%Y-%m-%d"),
        date_to_str(claim_date, "%Y-%m-%d"))

    self.assertTrue(
        all(since_date <= tweet.date < claim_date for tweet in tweets))
    # NOTE(review): this bounds the result from above (len <= 100), exactly
    # as the original assertGreaterEqual(100, len(tweets)) did; confirm an
    # upper bound, not a minimum count, is what was intended.
    self.assertLessEqual(len(tweets), 100)
def _json_comment_to_db_comment_converter(self, post, domain="Instagram_comment"):
    """Convert a comment dict into a ``Post``.

    Keys read from ``post``: ``id``, ``created_at`` (passed to
    ``datetime.fromtimestamp``), ``owner['id']``, ``shortcode`` and
    ``text``. The GUID is derived from the post URL, the id and the
    formatted creation date, and is stored in both ``guid`` and
    ``post_id``.

    :param post: dict describing the comment.
    :param domain: value stored in both ``domain`` and ``post_type``.
    :return: the populated ``Post``.
    """
    rpost = Post()

    osn_id = str(post['id'])
    created = datetime.fromtimestamp(post['created_at'])
    owner_id = post['owner']['id']

    rpost.post_osn_id = osn_id
    rpost.created_at = created
    rpost.author = owner_id
    rpost.author_guid = compute_author_guid_by_author_name(owner_id)

    link = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))
    rpost.url = link
    rpost.content = post['text']

    guid = compute_post_guid(link, osn_id, date_to_str(created))
    rpost.guid = guid
    rpost.domain = domain
    rpost.post_type = domain
    rpost.post_id = guid
    return rpost
def _json_post_to_db_post_converter(self, post, domain="Instagram_post"):
    """Convert a post dict into a ``Post``.

    Keys read from ``post``: ``id``, ``taken_at_timestamp`` (passed to
    ``datetime.fromtimestamp``), ``owner['id']``, ``shortcode`` and
    ``edge_media_to_caption['edges']``. The content is the ``', '``-joined
    ``node['text']`` of every caption edge. The GUID is derived from the
    post URL, the id and the formatted creation date, and is stored in
    both ``guid`` and ``post_id``.

    :param post: dict describing the post.
    :param domain: value stored in both ``domain`` and ``post_type``.
    :return: the populated ``Post``.
    """
    rpost = Post()

    osn_id = str(post['id'])
    taken_at = datetime.fromtimestamp(post['taken_at_timestamp'])
    owner_id = post['owner']['id']

    rpost.post_osn_id = osn_id
    rpost.created_at = taken_at
    rpost.author = owner_id
    rpost.author_guid = compute_author_guid_by_author_name(owner_id)

    link = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))
    rpost.url = link

    captions = [edge['node']['text']
                for edge in post['edge_media_to_caption']['edges']]
    rpost.content = ', '.join(captions)

    guid = compute_post_guid(link, osn_id, date_to_str(taken_at))
    rpost.guid = guid
    rpost.domain = domain
    rpost.post_type = domain
    rpost.post_id = guid
    return rpost
def convert_comment_to_post(self, comment, submission, domain=u"Reddit"):
    """Convert a comment into a ``Post``, tracking its author.

    When the comment has a truthy ``author``, that author is appended to
    ``self._redditors`` and its name becomes the post author. Otherwise the
    post date (as a string) is appended to ``self._deleted_redditors`` and
    the author is the empty string.

    The GUID is computed from a shortened URL built from path segments
    3..6 of the permalink; only afterwards is ``post.url`` replaced by the
    full permalink URL. The shortened form is kept exactly as it was so
    that GUIDs stay stable.

    :param comment: comment object exposing ``id``, ``created``,
        ``permalink`` and optionally ``author`` and ``body``.
    :param submission: object whose ``title`` becomes the post title.
    :param domain: value stored in both ``domain`` and ``post_type``.
    :return: the populated ``Post``.
    """
    post = Post()
    post.post_osn_id = unicode(comment.id)

    created = datetime.fromtimestamp(comment.created)
    post.created_at = created
    post.date = created

    author = getattr(comment, 'author', None)
    if author:
        post.author = unicode(author.name)
        self._redditors.append(author)
    else:
        self._deleted_redditors.append(str(post.date))
        post.author = unicode('')
    post.author_guid = compute_author_guid_by_author_name(post.author)

    permalink_parts = getattr(comment, 'permalink', '').split('/')[3:7]
    post.url = unicode('https://www.reddit.com' + '/'.join(permalink_parts))
    post.title = unicode(submission.title)
    # Convert to text first and strip afterwards. Encoding to UTF-8 bytes
    # and passing those back through unicode() decodes them with the
    # default ASCII codec, which raises UnicodeDecodeError on any
    # non-ASCII comment body.
    post.content = unicode(getattr(comment, 'body', '')).strip()
    post.guid = compute_post_guid(post.url, post.post_osn_id,
                                  date_to_str(post.created_at))
    post.domain = domain
    post.post_type = domain
    post.post_id = post.guid
    # The GUID above is based on the shortened URL; expose the full
    # permalink to consumers of the post.
    post.url = u'https://www.reddit.com{}'.format(comment.permalink)
    return post