def extract_post(self, data, post_type):
    """Build a ``Post`` from a single article record.

    ``data`` is indexed like a mapping and is read for the keys
    ``publish_date``, ``url``, ``source``, ``title``, ``text``,
    ``keywords`` and ``meta_data``.

    :param data: the record to convert.
    :param post_type: stored unchanged in ``post.post_type``.
    :return: the populated ``Post``.
    """
    post = Post()

    publish_date = data['publish_date']
    if publish_date is not None:
        epoch_millis = publish_date['$date']
    else:
        # No publication date on the record: fall back to the current time,
        # expressed in milliseconds like the '$date' value.
        epoch_millis = calendar.timegm(time.gmtime()) * 1000

    # fromtimestamp() without a tz argument yields local time.
    # NOTE(review): confirm local time (not UTC) is what downstream expects.
    published_at = datetime.datetime.fromtimestamp(epoch_millis / 1000)
    date_str = published_at.strftime('%Y-%m-%d %H:%M:%S')

    url = data['url']
    source = data['source']

    # The post id doubles as the guid.
    post.post_id = compute_post_guid(url, source, date_str)
    post.guid = post.post_id

    # The source name is used as author, author-guid seed and source url.
    post.author_guid = compute_author_guid_by_author_name(source)
    post.author = str(source)
    post.date = convert_str_to_unicode_datetime(date_str)
    post.title = str(data['title'])
    post.url = str(url)
    post.source_url = str(source)
    post.content = str(data['text'])
    post.tags = ','.join(data['keywords'])
    post.domain = self._domain
    post.post_type = post_type

    meta = data['meta_data']
    post.description = str(meta['description']) if 'description' in meta else ""
    return post
def _convert_row_to_post(self, row):
    """Translate one claim row into a ``Post``.

    ``row`` is read for the keys ``claim_id``, ``title``, ``description``,
    ``url``, ``publication_date``, ``keywords`` and ``post_type``.  Note that
    the row's title is stored in ``post.content``; ``post.title`` is not set.

    :param row: mapping-like record describing a claim.
    :return: the populated ``Post``.
    """
    post = Post()

    claim_id = unicode(row['claim_id'])
    # Title and description are decoded with errors='replace', so
    # undecodable bytes do not raise.
    post.content = unicode(row['title'], errors='replace')
    post.description = unicode(row['description'], errors='replace')
    post.url = unicode(row['url'])

    published = row['publication_date']
    # `date` is a helper from outside this block.
    # NOTE(review): presumably parses the raw publication date - confirm.
    post.date = date(published)

    # One guid, derived from the raw publication date, fills all three
    # identifier fields.
    guid = compute_post_guid(self._social_network_url, claim_id, published)
    post.guid = post.post_id = post.post_osn_guid = guid

    post.domain = self._domain
    post.author = self._author_name
    post.author_guid = compute_author_guid_by_author_name(self._author_name)

    post.tags = unicode(row['keywords'])
    post.post_type = unicode(row['post_type'])
    return post
def _create_post_by_row(self, record_dict):
    """Create a ``Post`` from one record and remember it in ``_post_dict``.

    Every field except ``post_content`` is passed through
    ``self._convert_to_unicode_value`` before being stored.  The post is
    added to ``self._post_dict`` under its ``post_osn_id`` only when that
    id is not already present.

    :param record_dict: mapping with the keys ``post_id``, ``tumblog_id``,
        ``post_short_url``, ``created_time_epoch``, ``post_title``,
        ``post_content``, ``post_type``, ``post_format``,
        ``post_reblog_key``, ``post_tags`` and ``post_state``.
    :return: the newly built ``Post`` (returned even when an earlier post
        with the same id is kept in ``_post_dict``).
    """
    post = Post()
    to_unicode = self._convert_to_unicode_value

    osn_id = to_unicode(record_dict["post_id"])
    post.post_osn_id = osn_id
    post.post_id = osn_id

    author_name = to_unicode(record_dict["tumblog_id"])
    post.author = author_name

    short_url = to_unicode(record_dict["post_short_url"])
    # The guid below reads post.url, so the helper is expected to set it.
    self._set_post_url(short_url, author_name, post)

    created_epoch = to_unicode(record_dict["created_time_epoch"])
    post.created_at = created_epoch
    if created_epoch is None:
        # No creation time: post.date stays unset and the guid is derived
        # from whatever _set_start_date() returns.
        date_str = self._set_start_date()
    else:
        post.date, date_str = convert_epoch_timestamp_to_datetime(
            created_epoch)

    post.guid = compute_post_guid(post.url, author_name, date_str)
    post.post_osn_guid = post.guid
    post.title = to_unicode(record_dict["post_title"])

    raw_content = record_dict["post_content"]
    if raw_content != 'NULL':
        # The content column holds a JSON document; its optional 'title'
        # and 'text' entries are concatenated in that order.
        parsed = json.loads(raw_content.decode("utf-8"))
        combined = ""
        for field in ('title', 'text'):
            if field in parsed.keys():
                combined += parsed[field]
        post.content = to_unicode(combined)

    post.domain = self._domain
    post.author_guid = compute_author_guid_by_author_name(author_name)
    post.post_type = to_unicode(record_dict["post_type"])
    post.post_format = to_unicode(record_dict["post_format"])
    post.reblog_key = to_unicode(record_dict["post_reblog_key"])
    post.tags = to_unicode(record_dict["post_tags"])
    post.state = to_unicode(record_dict["post_state"])

    # First post seen for an id wins; later duplicates are not stored.
    if post.post_osn_id not in self._post_dict:
        self._post_dict[post.post_osn_id] = post
    return post
def _add_post(self, post_id, content, tags, date_str, domain=u'Microblog'):
    """Create a post for the current author and hand it to the database.

    The post is passed to ``self._db.addPost`` and the author's
    ``statuses_count`` is then incremented by one.  Nothing is returned.

    :param post_id: used as the post's id, guid and title.
    :param content: stored in ``post.content``.
    :param tags: stored in ``post.tags``.
    :param date_str: converted with ``convert_str_to_unicode_datetime`` and
        stored in both ``post.date`` and ``post.created_at``.
    :param domain: stored in ``post.domain``.
    """
    post = Post()

    # NOTE(review): the author field receives the author *guid*, not a
    # display name - confirm this is intended.
    author_guid = self._author.author_guid
    post.author_guid = author_guid
    post.author = author_guid

    post.post_id = post_id
    post.guid = post.post_id
    post.title = post_id

    post.content = content
    post.tags = tags
    post.domain = domain

    post.date = convert_str_to_unicode_datetime(date_str)
    post.created_at = post.date

    self._db.addPost(post)
    self._author.statuses_count += 1
def convert_claim_to_post(claim):
    """Return a ``Post`` that mirrors the given claim.

    Field mapping: ``claim_id`` -> ``post_id``, ``title`` -> ``content``,
    ``description`` -> ``description``, ``url`` -> ``url``,
    ``verdict_date`` -> ``date``, ``keywords`` -> ``tags`` and
    ``verdict`` -> ``post_type``.  The domain is fixed to ``'Claim'`` and
    both author fields to the placeholder ``'no author'``.

    :param claim: object exposing the claim attributes listed above.
    :return: the populated ``Post``.
    """
    # Imported at call time, as in the original.
    # NOTE(review): presumably to sidestep a circular import - confirm.
    from DB.schema_definition import Post

    placeholder_author = 'no author'

    post = Post()
    post.post_id = claim.claim_id
    post.content = claim.title
    post.description = claim.description
    post.url = claim.url
    post.date = claim.verdict_date
    post.domain = 'Claim'
    post.author = post.author_guid = placeholder_author

    # The guid is derived from the url, the placeholder author and the
    # verdict date rendered as a string; it also serves as the OSN guid.
    post.guid = compute_post_guid(
        claim.url, post.author, date_to_str(post.date))
    post.post_osn_guid = post.guid

    post.tags = claim.keywords
    post.post_type = claim.verdict
    return post
def photo_xml_to_post(self, child):
    """Convert one photo XML element into a ``Post``.

    ``child`` is used through ``find``/``findall``/``get``/``.text``, i.e.
    an ElementTree-style element.  The sub-elements ``title``,
    ``urls/url``, ``dates``, ``owner`` and ``comments`` are required;
    ``tags`` and ``labels`` are optional.

    :param child: the photo element to convert.
    :return: the populated ``Post``.  ``tags`` is left unset when it
        cannot be extracted, ``post_type`` when there is no ``labels``
        element.
    :raises AttributeError: if a required sub-element is missing.
    """
    p = Post()
    p.title = str(child.find('title').text)
    p.url = str(child.find('urls').find('url').text)

    # Tags are best-effort.  Only the two expected failures are swallowed:
    # a missing 'tags' element (find() gives None -> AttributeError) and a
    # tag without text (join() rejects None -> TypeError).  The previous
    # bare ``except`` also hid KeyboardInterrupt/SystemExit and real bugs.
    try:
        p.tags = ','.join(
            tag.text for tag in child.find('tags').findall('tag'))
    except (AttributeError, TypeError):
        pass

    # 'posted' is kept verbatim as a string and also parsed as epoch
    # seconds; fromtimestamp() without tz yields local time.
    p.created_at = str(child.find('dates').get('posted'))
    p.date = datetime.datetime.fromtimestamp(int(p.created_at))

    p.author = str(child.find('owner').get('nsid'))
    p.domain = 'flickr'
    p.author_guid = compute_author_guid_by_author_name(p.author)

    # The comment count is stored in the retweet_count field.
    p.retweet_count = int(child.find('comments').text)

    p.post_id = compute_post_guid(p.url, p.author, date_to_str(p.date))
    p.post_osn_id = str(child.get('id'))

    # Look the optional 'labels' element up once instead of twice.
    labels = child.find('labels')
    if labels is not None:
        p.post_type = ','.join(
            label.text for label in labels.findall('label'))
    return p