def _add_post(self, author, date, post_osn_id, score=0, upvote_ratio=-1):
    """Create a Post plus its RedditPost companion, store both and return them.

    The pair is tagged 'reddit_comment' by default. When ``upvote_ratio`` is
    anything other than -1 the pair is tagged 'reddit_post' and the up/down
    vote counts are derived from ``score`` and ``upvote_ratio``; otherwise
    ups, downs and upvote_ratio are all stored as -1.

    :param author: author name; stored after conversion with ``str``.
    :param date: creation date string, parsed with format "%d/%m/%Y %H:%M".
    :param post_osn_id: identifier stored on the post as ``post_osn_id``.
    :param score: value stored as ``reddit_post.score``.
    :param upvote_ratio: upvote ratio, or -1 (the default) for a comment.
    :return: tuple ``(post, reddit_post)``.
    """
    has_ratio = upvote_ratio != -1
    kind = 'reddit_post' if has_ratio else 'reddit_comment'

    post = Post()
    post.post_osn_id = post_osn_id
    post.author = str(author)
    post.author_guid = compute_author_guid_by_author_name(post.author)
    post.created_at = str_to_date(date, formate="%d/%m/%Y %H:%M")
    # just for test: the URL is built from the author name only
    post.url = 'https://www.reddit.com{}'.format(post.author)
    post.guid = compute_post_guid(post.url, post.post_osn_id,
                                  date_to_str(post.created_at))
    post.domain = kind
    post.post_type = kind
    post.post_id = post.guid

    if has_ratio:
        if upvote_ratio != 0.5:
            rounded_ups = round((upvote_ratio * score) / (2 * upvote_ratio - 1))
        else:
            # The general formula would divide by zero at a ratio of 0.5.
            rounded_ups = round(score / 2)
        ups = int(rounded_ups)
        downs = ups - score
        stored_ratio = upvote_ratio
    else:
        ups = -1
        downs = -1
        stored_ratio = -1

    reddit_post = RedditPost()
    reddit_post.post_id = post.post_id
    reddit_post.guid = post.guid
    reddit_post.score = score
    reddit_post.upvote_ratio = stored_ratio
    reddit_post.ups = ups
    reddit_post.downs = downs

    self._db.addPosts([post, reddit_post])
    return post, reddit_post
def _convert_tweet_to_post(self, tweet, post_type):
    """Copy the fields of a tweet object onto a new Post.

    Reads ``id``, ``date``, ``favorites``, ``retweets``, ``text``,
    ``username`` and ``permalink`` from ``tweet``. The GUID is computed from
    the permalink, the author name and the formatted creation date, and is
    stored as both ``guid`` and ``post_id``. ``author_guid`` is not set here.

    :param tweet: object exposing the attributes listed above.
    :param post_type: value stored as ``post.post_type``.
    :return: the populated Post.
    """
    post = Post()
    post.post_osn_id = unicode(tweet.id)

    published = tweet.date
    published_text = unicode(date_to_str(published))
    post.created_at = published_text
    post.date = published

    post.favorite_count = tweet.favorites
    post.retweet_count = tweet.retweets
    post.content = unicode(tweet.text)

    username = unicode(tweet.username)
    post.author = username

    permalink = tweet.permalink
    post.url = unicode(permalink)

    guid = compute_post_guid(permalink, username, published_text)
    post.guid = guid
    post.post_id = guid

    post.domain = self._domain
    post.post_type = post_type
    return post
def _create_post_by_row(self, record_dict):
    """Build a Post from one record dict and remember it in ``self._post_dict``.

    Most fields are copied from ``record_dict`` through
    ``self._convert_to_unicode_value``. The creation date comes from
    ``created_time_epoch`` when that value is not None, otherwise the date
    string returned by ``self._set_start_date()`` is used for the GUID.
    Unless ``post_content`` equals the literal 'NULL', it is parsed as JSON
    and its 'title' and 'text' entries (when present) are concatenated into
    ``post.content``.

    :param record_dict: mapping holding the ``post_*``, ``tumblog_id`` and
        ``created_time_epoch`` keys read below.
    :return: the populated Post; it is also stored in ``self._post_dict``
        under its ``post_osn_id`` if that key is not there yet.
    """
    to_unicode = self._convert_to_unicode_value

    post = Post()

    osn_id = to_unicode(record_dict["post_id"])
    post.post_osn_id = osn_id
    post.post_id = osn_id

    blog_name = to_unicode(record_dict["tumblog_id"])
    post.author = blog_name

    short_url = to_unicode(record_dict["post_short_url"])
    self._set_post_url(short_url, blog_name, post)

    epoch_value = to_unicode(record_dict["created_time_epoch"])
    post.created_at = epoch_value
    if epoch_value is None:
        creation_date_text = self._set_start_date()
    else:
        creation_date, creation_date_text = convert_epoch_timestamp_to_datetime(
            epoch_value)
        post.date = creation_date

    post.guid = compute_post_guid(post.url, blog_name, creation_date_text)
    post.post_osn_guid = post.guid

    post.title = to_unicode(record_dict["post_title"])

    raw_content = record_dict["post_content"]
    if raw_content != 'NULL':
        parsed = json.loads(raw_content.decode("utf-8"))
        combined = ""
        # Title first, then text, each only when the key is present.
        for key in ('title', 'text'):
            if key in parsed.keys():
                combined += parsed[key]
        post.content = to_unicode(combined)

    post.domain = self._domain
    post.author_guid = compute_author_guid_by_author_name(blog_name)
    post.post_type = to_unicode(record_dict["post_type"])
    post.post_format = to_unicode(record_dict["post_format"])
    post.reblog_key = to_unicode(record_dict["post_reblog_key"])
    post.tags = to_unicode(record_dict["post_tags"])
    post.state = to_unicode(record_dict["post_state"])

    # Keep the first Post seen for a given id; later ones do not replace it.
    if post.post_osn_id not in self._post_dict:
        self._post_dict[post.post_osn_id] = post
    return post
def _generate_comment(self, instagram_comment, post):
    """Create a Post of type 'comment' from an Instagram comment dict.

    :param instagram_comment: dict with 'created_at', 'id', 'text' and an
        'owner' dict holding 'username' and 'id'.
    :param post: the parent post; only its ``url`` is read, as the prefix of
        the comment URL.
    :return: the populated comment Post (no GUID is assigned here).
    """
    comment = Post()
    comment.date = datetime.datetime.fromtimestamp(
        instagram_comment['created_at'])

    comment_osn_id = instagram_comment['id']
    comment.post_osn_id = comment_osn_id
    comment.content = str(instagram_comment['text'])

    owner = instagram_comment['owner']
    comment.author = str(owner['username'])
    comment.author_guid = str(owner['id'])

    comment.url = '{}{}/'.format(post.url, comment_osn_id)
    comment.domain = 'Instagram'
    comment.post_type = 'comment'
    comment.post_id = str(comment_osn_id)
    return comment
def _json_comment_to_db_comment_converter(self, post, domain="Instagram_comment"):
    """Convert an Instagram comment dict into a Post.

    The owner id is stored as the author, and the GUID (also used as
    ``post_id``) is computed from the URL, the comment id and the formatted
    creation date.

    :param post: dict with 'id', 'created_at', 'owner' (containing 'id'),
        'shortcode' and 'text'.
    :param domain: value stored as both ``domain`` and ``post_type``.
    :return: the populated Post.
    """
    created = datetime.fromtimestamp(post['created_at'])
    owner_id = post['owner']['id']
    url = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))

    rpost = Post()
    rpost.post_osn_id = str(post['id'])
    rpost.created_at = created
    rpost.author = owner_id
    rpost.author_guid = compute_author_guid_by_author_name(owner_id)
    rpost.url = url
    rpost.content = post['text']

    guid = compute_post_guid(url, rpost.post_osn_id, date_to_str(created))
    rpost.guid = guid
    rpost.post_id = guid

    rpost.domain = domain
    rpost.post_type = domain
    return rpost
def _json_post_to_db_post_converter(self, post, domain="Instagram_post"):
    """Convert an Instagram post dict into a Post.

    The content is the text of every caption edge joined with ', '. The GUID
    (also used as ``post_id``) is computed from the URL, the post id and the
    formatted creation date.

    :param post: dict with 'id', 'taken_at_timestamp', 'owner' (containing
        'id'), 'shortcode' and 'edge_media_to_caption' -> 'edges'.
    :param domain: value stored as both ``domain`` and ``post_type``.
    :return: the populated Post.
    """
    created = datetime.fromtimestamp(post['taken_at_timestamp'])
    owner_id = post['owner']['id']
    url = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))
    captions = [edge['node']['text']
                for edge in post['edge_media_to_caption']['edges']]

    rpost = Post()
    rpost.post_osn_id = str(post['id'])
    rpost.created_at = created
    rpost.author = owner_id
    rpost.author_guid = compute_author_guid_by_author_name(owner_id)
    rpost.url = url
    rpost.content = ', '.join(captions)

    guid = compute_post_guid(url, rpost.post_osn_id, date_to_str(created))
    rpost.guid = guid
    rpost.post_id = guid

    rpost.domain = domain
    rpost.post_type = domain
    return rpost
def _create_post_citation_by_row(self, reblogged_from_metadata):
    """Build a Post for the parent of a reblog and cache it in ``self._post_dict``.

    :param reblogged_from_metadata: mapping with 'parent_post_id',
        'parent_post_blog_id' and 'parent_post_short_url'.
    :return: the newly built Post. It is stored in ``self._post_dict`` under
        the parent post id only when that id is not already present; the
        new object is returned either way.
    """
    parent_id = reblogged_from_metadata["parent_post_id"]
    parent_blog = reblogged_from_metadata["parent_post_blog_id"]

    original_post = Post()
    original_post.post_osn_id = parent_id
    original_post.post_id = parent_id
    original_post.author = parent_blog
    original_post.author_guid = compute_author_guid_by_author_name(parent_blog)
    original_post.domain = self._domain

    short_url = self._convert_to_unicode_value(
        reblogged_from_metadata["parent_post_short_url"])
    self._set_post_url(short_url, parent_blog, original_post)

    if parent_id not in self._post_dict:
        self._post_dict[parent_id] = original_post
    return original_post
def convert_comment_to_post(self, comment, submission, domain=u"Reddit"):
    """Convert a Reddit comment into a Post (Python 2: relies on ``unicode``).

    Comments with a truthy ``author`` have that author appended to
    ``self._redditors``. Otherwise the creation date string is appended to
    ``self._deleted_redditors`` and the author is stored as an empty string.

    :param comment: object exposing ``id``, ``created`` and ``permalink``,
        and optionally ``author`` and ``body``.
    :param submission: object whose ``title`` becomes the post title.
    :param domain: value stored as both ``domain`` and ``post_type``.
    :return: the populated Post.
    """
    # Convert the timestamp once; created_at and date hold the same value.
    created = datetime.fromtimestamp(comment.created)

    post = Post()
    post.post_osn_id = unicode(comment.id)
    post.created_at = created
    post.date = created

    redditor = getattr(comment, 'author', None)
    if redditor:
        post.author = unicode(redditor.name)
        self._redditors.append(redditor)
    else:
        self._deleted_redditors.append(str(post.date))
        post.author = unicode('')
    post.author_guid = compute_author_guid_by_author_name(post.author)

    # The GUID is derived from a shortened form of the permalink. The string
    # is built exactly as before so that computed GUIDs do not change.
    permalink_parts = getattr(comment, 'permalink', '').split('/')[3:7]
    guid_url = unicode('https://www.reddit.com' + '/'.join(permalink_parts))

    post.title = unicode(submission.title)

    # Previously the body was encoded to UTF-8 bytes and passed back through
    # unicode(), which decodes with the default (ASCII) codec and fails on
    # non-ASCII text. Stay in unicode and strip only the ASCII whitespace
    # that the byte-string strip() removed.
    body = unicode(getattr(comment, 'body', u''))
    post.content = body.strip(u' \t\n\r\x0b\x0c')

    post.guid = compute_post_guid(guid_url, post.post_osn_id,
                                  date_to_str(post.created_at))
    post.domain = domain
    post.post_type = domain
    post.post_id = post.guid
    # The stored URL is the full permalink, not the shortened GUID input.
    post.url = u'https://www.reddit.com{}'.format(comment.permalink)
    return post
def photo_xml_to_post(self, child):
    """Convert a photo XML element into a Post with domain 'flickr'.

    Tags are optional: when they cannot be read the ``tags`` attribute is
    left untouched. The text of the 'comments' element is stored in
    ``retweet_count``, and the computed GUID is stored in ``post_id``.

    :param child: XML element exposing ``find``/``findall``/``get``
        (presumably an ElementTree element — confirm against the caller)
        with 'title', 'urls/url', 'dates', 'owner' and 'comments' children.
    :return: the populated Post.
    """
    p = Post()
    p.title = str(child.find('title').text)
    p.url = str(child.find('urls').find('url').text)
    try:
        p.tags = ','.join(tag.text for tag in child.find('tags').findall('tag'))
    except (AttributeError, TypeError):
        # AttributeError: there is no 'tags' element, so find() gave None.
        # TypeError: a tag has no text, so join() received a non-string.
        # Tags are best-effort; leave them unset instead of failing.
        pass
    p.created_at = str(child.find('dates').get('posted'))
    p.date = datetime.datetime.fromtimestamp(int(p.created_at))
    p.author = str(child.find('owner').get('nsid'))
    p.domain = 'flickr'
    p.author_guid = compute_author_guid_by_author_name(p.author)
    p.retweet_count = int(child.find('comments').text)
    p.post_id = compute_post_guid(p.url, p.author, date_to_str(p.date))
    p.post_osn_id = str(child.get('id'))

    # Look the optional 'labels' element up once and reuse it.
    labels = child.find('labels')
    if labels is not None:
        p.post_type = ','.join(tag.text for tag in labels.findall('label'))
    return p
def create_dummy_post(self):
    """Return a Post filled with fixed placeholder values.

    Only ``post_id`` and ``author_guid`` come from the instance
    (``self.post_id`` and ``self.author_guid``); ``guid`` is freshly
    generated on every call and everything else is a constant.

    :return: the populated Post.
    """
    post = Post()

    # Identity
    post.post_id = unicode(self.post_id)
    post.guid = unicode(generate_random_guid())
    post.post_osn_id = 123455678
    post.author = u"author"
    post.author_guid = unicode(self.author_guid)

    # Content
    post.title = u"title"
    post.content = u"text"
    post.url = u"http://google.com"
    post.domain = u"Google"

    # Timestamps
    post.date = str_to_date("2016-08-24 10:00:15")
    post.created_at = u"2016-08-24 10:00:15"

    # Flags and counters
    post.is_detailed = True
    post.is_LB = False
    post.is_valid = True
    post.retweet_count = 11
    post.favorite_count = 10
    return post
def _convert_tweet_dict_to_post(self, tweet_dict):
    """Build a Post from a tweet dictionary.

    The author is looked up in ``self._author_osn_id_author_dict`` by the
    tweet's 'author_osn_id', and that author's screen name is used for the
    post author, the URL and the GUID.

    :param tweet_dict: dict with 'id_str', 'author_osn_id', 'created_at',
        'favorite_count', 'retweet_count', 'reply_count', 'full_text' and
        'lang'.
    :return: the populated Post; ``guid`` and ``post_id`` hold the same value.
    """
    post = Post()

    tweet_id = tweet_dict['id_str']
    post.post_osn_id = tweet_id

    tweet_author = self._author_osn_id_author_dict[tweet_dict['author_osn_id']]
    screen_name = tweet_author.author_screen_name
    post.author = screen_name
    post.author_guid = compute_author_guid_by_author_name(screen_name)

    raw_created_at = tweet_dict['created_at']
    post.created_at = raw_created_at
    published_text = extract_tweet_publiction_date(raw_created_at)
    post.date = str_to_date(published_text)

    post.favorite_count = tweet_dict['favorite_count']
    post.retweet_count = tweet_dict['retweet_count']
    post.reply_count = tweet_dict['reply_count']
    post.content = str(tweet_dict['full_text'])
    post.domain = self._domain
    post.language = str(tweet_dict['lang'])

    tweet_url = "https://twitter.com/{0}/status/{1}".format(screen_name, tweet_id)
    post.url = tweet_url

    guid = compute_post_guid(tweet_url, screen_name, published_text)
    post.guid = guid
    post.post_id = guid
    return post
def _generate_post(self, instagram_post):
    """Create a Post of type 'post' from an Instagram post dict.

    The caption is optional: when the first caption edge cannot be read,
    ``content`` is left untouched. The comment count is stored in
    ``retweet_count`` and the like count in ``favorite_count``.

    :param instagram_post: dict with 'owner' (containing 'id'),
        'taken_at_timestamp', 'id', 'shortcode', 'display_url',
        'edge_media_to_comment' and 'edge_media_preview_like' (each with a
        'count'), and optionally 'edge_media_to_caption'.
    :return: the populated Post (no GUID is assigned here).
    """
    post = Post()
    post.author_guid = str(instagram_post['owner']['id'])
    post.date = datetime.datetime.fromtimestamp(
        instagram_post['taken_at_timestamp'])
    post.post_osn_id = instagram_post['id']
    try:
        post.content = (
            instagram_post['edge_media_to_caption']['edges'][0]['node']['text'])
    except (KeyError, IndexError, TypeError):
        # No caption: a key is missing, the edge list is empty, or one of
        # the levels is None. Skip the caption and keep the rest of the post.
        pass
    post.retweet_count = instagram_post['edge_media_to_comment']['count']
    post.favorite_count = instagram_post['edge_media_preview_like']['count']
    post.url = 'https://www.instagram.com/p/{}/'.format(
        instagram_post['shortcode'])

    display_url = instagram_post['display_url']
    image_name = self._get_image_name_from_url(display_url)
    post.media_path = str(display_url)
    post.post_format = '{}'.format(image_name)

    post.domain = 'Instagram'
    post.post_type = 'post'
    post.post_id = str(post.post_osn_id)
    return post