def write_party_name(self, csv_data, feed_ids):
    """Write one CSV row mapping each feed id to its flattened party name.

    Looks up the party name for every feed, flattens it with the shared
    DELIMITER, pads the mapping with the empty-commentator columns, and
    emits the result as a single row on *csv_data*.
    """
    processor = TextProcessor()
    row = {}
    for feed_id in feed_ids:
        party_name = self.get_party_name_if_exists(feed_id)
        row[feed_id] = processor.flatten_text(party_name, delimiter=DELIMITER)
    row = self.add_empty_commentator_rows(row, commentator_id='MK_ID')
    csv_data.writerow(row)
def handle(self, *args, **options):
    """Run the processcommentcontent manage.py command.

    Accepts one or more status ids, processes the text of every comment
    belonging to those statuses, and saves each comment's
    processed_content field after text manipulation. With --workers the
    per-comment work is fanned out to a thread pool; otherwise it runs
    sequentially.
    """
    statuses = self.parse_statuses(args, options)
    processor = TextProcessor()
    worker_count = options['workers']
    counter = 1  # 1-based running index passed to each worker call
    if worker_count:
        with futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
            for status in statuses:
                for comment in status.comments.all():
                    executor.submit(self.worker, counter, comment, status,
                                    processor)
                    counter += 1
    else:
        for status in statuses:
            for comment in status.comments.all():
                self.worker(counter, comment, status, processor)
                counter += 1
    logging.getLogger('django').info("Successfully saved all statuses to db")
    self.stdout.write('Successfully saved all statuses to db.')
def handle(self, *args, **options):
    """Export every parsed comment plus its metadata to '<file_path>_full_data.csv'.

    One header row, then one row per comment with the comment text (raw,
    processed, optionally translated), its parent status details, and the
    commentator's id/like relationship to the status.

    Fixes over the previous version:
    - comments.count() is evaluated once before the loop instead of once
      per row (each call issued a fresh database query).
    - The output file is opened with a context manager so it is closed
      even if processing a comment raises.
    """
    print('Start.')
    comments = self.parse_comments(options)
    out_name = '{}_full_data.csv'.format(options['file_path'].split('.csv')[0])
    field_names = [
        'comment_id',
        'mk_id',
        'mk_name',
        'parent_status_id',
        'parent_status_content',
        'parent_status_link',
        'comment_link',
        'content',
        'content_processed',
        'published',
        'commentator_id',
        'commentator_also_liked_status',
        'like_count',
        'comment_count',
    ]
    # Hoisted loop invariant: .count() on a queryset hits the DB each call.
    total = comments.count()
    # 'wb' kept as-is — the surrounding code uses Python-2-style binary csv IO.
    with open(out_name, 'wb') as f:
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        csv_data.writerow({field_name: field_name for field_name in field_names})
        processor = TextProcessor()
        for i, comment in enumerate(comments):
            processed_text = processor.text_manipulation_mk_names(
                text=comment.content, context_status=comment.parent)
            if options['translate']:
                processed_text = processor.text_manipulation_translate_text(
                    text=processed_text)
            processed_text = processor.text_manipulation_emojis(
                text=processed_text)
            print('writing comment {} of {}'.format(i + 1, total))
            dict_row = {
                'comment_id': comment.comment_id,
                'mk_id': comment.parent.feed.persona.content_object.id,
                'mk_name': processor.text_manipulation_flatten_text(
                    comment.parent.feed.persona.content_object.name,
                    delimiter=DELIMITER),
                'parent_status_id': comment.parent.status_id,
                'parent_status_content': processor.text_manipulation_flatten_text(
                    comment.parent.content, delimiter=DELIMITER),
                'parent_status_link': comment.parent.get_link,
                'comment_link': 'www.facebook.com/{}'.format(comment.comment_id),
                'content': processor.text_manipulation_flatten_text(
                    comment.content, delimiter=DELIMITER),
                'content_processed': processor.text_manipulation_flatten_text(
                    processed_text, delimiter=DELIMITER),
                'published': comment.published,
                'commentator_id': comment.comment_from.facebook_id,
                'commentator_also_liked_status': comment.comment_from.likes.filter(
                    status__status_id=comment.parent.status_id).exists(),
                'like_count': comment.like_count,
                'comment_count': comment.comment_count
            }
            csv_data.writerow(dict_row)
    print('Done.')
def handle(self, *args, **options):
    """Dump processed comment content, one flattened comment per output row.

    Writes 'content_only_<timestamp>.txt'. Comments whose ids appear in
    the optional --exclude_from_path CSV are skipped; --from_db reuses the
    stored processed_content instead of reprocessing the raw text.

    Fixes over the previous version:
    - excluded ids are kept in a set (O(1) membership) instead of a list
      scanned once per comment.
    - the output file is opened with a context manager so it is closed
      even if processing raises.
    """
    print('Start.')
    file_name = 'content_only_{}.txt'.format(
        timezone.now().strftime('%Y_%m_%d_%H_%M_%S'))
    field_names = [
        'content',
    ]
    processor = TextProcessor()
    excluded_ids = set()
    if options['exclude_from_path']:
        with open(options['exclude_from_path'], 'rb') as g:
            excluded_ids = {x['comment_id'] for x in DictReader(g)}
    i = 0
    with open(file_name, 'wb') as f:
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        for status in Facebook_Status.objects_no_filters.filter(is_comment=False):
            for comment in status.comments.all():
                if comment.comment_id in excluded_ids:
                    continue
                if options['from_db']:
                    processed_text = comment.processed_content
                else:
                    processed_text = comment.content
                # NOTE(review): original indentation was lost; the mk-name /
                # translate / emoji steps are assumed to apply to both
                # branches — confirm against revision history.
                processed_text = processor.replace_mk_names(
                    text=processed_text, context_status=comment.parent)
                if options['translate']:
                    processed_text = processor.request_translated_text_from_google(
                        text=processed_text)
                processed_text = processor.replace_emojis_to_named_text(
                    text=processed_text)
                print('writing comment {}'.format(i + 1))
                i += 1
                dict_row = {
                    'content': processor.flatten_text(processed_text,
                                                      delimiter=DELIMITER),
                }
                csv_data.writerow(dict_row)
    print('Done.')
def handle(self, *args, **options):
    """Dump processed comment content, one flattened comment per output row.

    Writes 'content_only_<timestamp>.txt'. Comments whose ids appear in
    the optional --exclude_from_path CSV are skipped; --from_db reuses the
    stored processed_content instead of reprocessing the raw text.

    Fixes over the previous version:
    - excluded ids are kept in a set (O(1) membership) instead of a list
      scanned once per comment.
    - the output file is opened with a context manager so it is closed
      even if processing raises.
    """
    print('Start.')
    file_name = 'content_only_{}.txt'.format(
        timezone.now().strftime('%Y_%m_%d_%H_%M_%S'))
    field_names = [
        'content',
    ]
    processor = TextProcessor()
    excluded_ids = set()
    if options['exclude_from_path']:
        with open(options['exclude_from_path'], 'rb') as g:
            excluded_ids = {x['comment_id'] for x in DictReader(g)}
    i = 0
    with open(file_name, 'wb') as f:
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        for status in Facebook_Status.objects_no_filters.filter(
                is_comment=False):
            for comment in status.comments.all():
                if comment.comment_id in excluded_ids:
                    continue
                if options['from_db']:
                    processed_text = comment.processed_content
                else:
                    processed_text = comment.content
                # NOTE(review): original indentation was lost; the mk-name /
                # translate / emoji steps are assumed to apply to both
                # branches — confirm against revision history.
                processed_text = processor.replace_mk_names(
                    text=processed_text, context_status=comment.parent)
                if options['translate']:
                    processed_text = processor.request_translated_text_from_google(
                        text=processed_text)
                processed_text = processor.replace_emojis_to_named_text(
                    text=processed_text)
                print('writing comment {}'.format(i + 1))
                i += 1
                dict_row = {
                    'content': processor.flatten_text(processed_text,
                                                      delimiter=DELIMITER),
                }
                csv_data.writerow(dict_row)
    print('Done.')
def write_party_name(self, csv_data, feed_ids):
    """Emit a single CSV row of flattened party names keyed by feed id.

    Each feed id is mapped to its (possibly missing) party name, flattened
    with the shared DELIMITER; the empty-commentator columns are then
    merged in before the row is written.
    """
    processor = TextProcessor()
    mapping = dict(
        (feed_id,
         processor.flatten_text(self.get_party_name_if_exists(feed_id),
                                delimiter=DELIMITER))
        for feed_id in feed_ids
    )
    mapping = self.add_empty_commentator_rows(mapping, commentator_id='MK_ID')
    csv_data.writerow(mapping)
def handle(self, *args, **options):
    """Export per-comment feature rows to '<file_path>_full_data.csv'.

    With --second_stage the output file gets a '_2nd_stage' suffix and a
    wide research-feature schema; otherwise the basic first-stage columns
    are written. One header row, then one row per parsed comment.

    Fixes over the previous version:
    - comments.count() is evaluated once before the loop instead of once
      per row (each call issued a fresh database query).
    - The output file is opened with a context manager so it is closed
      even if processing a comment raises.
    """
    print('Start.')
    comments = self.parse_comments(options)
    file_name = '{}_full_data.csv'.format(
        options['file_path'].split('.csv')[0])
    if options['second_stage']:
        # Insert the stage marker before the extension.
        file_name = ''.join([file_name.split('.')[0], '_2nd_stage.',
                             file_name.split('.')[1]])
    if options['second_stage']:
        field_names = [
            'comment_id',
            'MK_ID',
            'mk_name',
            'post_status_id',
            'post_content',
            'post_link',
            'comment_link',
            'comment_content',
            'comment_content_processed',
            'comment_time_of_publication',
            'COMMENT_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
            'post_like_count',
            'post_comment_count',
            'post_share_count',
            'comment_like_count',
            'comment_comment_count',
            'comment_main_language',
            'POST_LEN_MESSAGE',
            'COMMENT_LEN_MESSAGE',
            'COMMENTATOR_LIKED_POST',
            'HAS_NAME_OF_POST_WRITER_MK_IN_COMMENT',
            'IDS_OF_MKS_MENTIONED_IN_COMMENT',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ON_POST',
            'COMMENTATOR_ID',
            'POLITICAL_WING_HATNUA_LEFT',
            'POLITICAL_WING_HATNUA_CENTER',
            'IS_COALITION',
            'PARTY_NAME',
            'IS_FEMALE',
            'AGE',
            'MK_POLITICAL_STATUS',
            'MK_POLITICAL_SENIORITY',
            'IS_CURRENT_OR_PAST_PARTY_LEADER',
            'IS_CURRENT_OR_PAST_PM_CANDIDATE',
            'IS_PM',
            'POST_PUBLICATION_TIMESTAMP',
            'POST_PUBLICATION_DATE',
            'POST_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
            'POST_WITH_PHOTO',
            'POST_WITH_LINK',
            'POST_WITH_VIDEO',
            'POST_WITH_STATUS',
            'POST_WITH_TEXT_ONLY',
            'POST_IN_HEBREW',
            'POST_IN_ENGLISH',
            'POST_IN_ARABIC',
            'POST_IN_OTHER',
            'DAYS_FROM_ELECTION',
            'DAYS_FROM_THREE_TEENAGER_KIDNAP',
            'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_START_DATE',
            'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_END_DATE',
            'DAYS_FROM_DUMA_ARSON_ATTACK',
            'DAYS_FROM_THIRD_INTIFADA_START_DATE',
            'DAYS_FROM_MK_BIRTHDAY',
            'POST_PUBLISHED_ON_SATURDAY',
            'COMMENT_PUBLISHED_ON_SATURDAY',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
        ]
    else:
        field_names = [
            'comment_id',
            'mk_id',
            'mk_name',
            'parent_status_id',
            'parent_status_content',
            'parent_status_link',
            'comment_link',
            'content',
            'content_processed',
            'published',
            'commentator_id',
            'commentator_also_liked_status',
            'like_count',
            'comment_count',
            'language',
        ]
    # Hoisted loop invariant: .count() on a queryset hits the DB each call.
    total = comments.count()
    with open(file_name, 'wb') as f:
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        csv_data.writerow({field_name: field_name for field_name in field_names})
        processor = TextProcessor()
        for i, comment in enumerate(comments):
            processed_text = processor.replace_mk_names(
                text=comment.content, context_status=comment.parent)
            if options['translate']:
                processed_text = processor.request_translated_text_from_google(
                    text=processed_text)
            processed_text = processor.replace_emojis_to_named_text(
                text=processed_text)
            print('writing comment {} of {}'.format(i + 1, total))
            if options['second_stage']:
                dict_row = self.get_second_stage_comment_features(
                    comment, processed_text, processor)
            else:
                dict_row = self.get_first_stage_comment_features(
                    comment, processed_text, processor)
            csv_data.writerow(dict_row)
    print('Done.')
def handle(self, *args, **options):
    """Export per-comment feature rows (optionally for ALL comments) to CSV.

    Output is '<name>_full_data.csv' where <name> is 'all_comments' under
    --all_comments or the input file's base name; --second_stage switches
    to the wide research-feature schema and adds a '_2nd_stage' suffix.
    Each comment is delegated to self.handle_comment for row writing.

    Fixes over the previous version:
    - comments_in_file.count() is evaluated once before the loop instead
      of once per row (each call issued a fresh database query).
    - The output file is opened with a context manager so it is closed
      even if processing a comment raises.
    """
    print('Start.')
    comments_in_file = self.parse_comments(options)
    if options['all_comments']:
        file_name_part = 'all_comments'
    else:
        file_name_part = options['file_path'].split('.csv')[0]
    file_name = '{}_full_data.csv'.format(file_name_part)
    if options['second_stage']:
        # Insert the stage marker before the extension.
        file_name = ''.join([
            file_name.split('.')[0], '_2nd_stage.',
            file_name.split('.')[1]
        ])
    if options['second_stage']:
        field_names = [
            'comment_id',
            'MK_ID',
            'mk_name',
            'post_status_id',
            'post_content',
            'post_link',
            'comment_link',
            'comment_content',
            'comment_content_processed',
            'comment_time_of_publication',
            'COMMENT_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
            'post_like_count',
            'post_comment_count',
            'post_share_count',
            'comment_like_count',
            'comment_comment_count',
            'comment_main_language',
            'POST_LEN_MESSAGE',
            'COMMENT_LEN_MESSAGE',
            'COMMENTATOR_LIKED_POST',
            'HAS_NAME_OF_POST_WRITER_MK_IN_COMMENT',
            'IDS_OF_MKS_MENTIONED_IN_COMMENT',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ON_POST',
            'COMMENTATOR_ID',
            'POLITICAL_WING_HATNUA_LEFT',
            'POLITICAL_WING_HATNUA_CENTER',
            'IS_COALITION',
            'PARTY_NAME',
            'IS_FEMALE',
            'AGE',
            'MK_POLITICAL_STATUS',
            'MK_POLITICAL_SENIORITY',
            'IS_CURRENT_OR_PAST_PARTY_LEADER',
            'IS_CURRENT_OR_PAST_PM_CANDIDATE',
            'IS_PM',
            'POST_PUBLICATION_TIMESTAMP',
            'POST_PUBLICATION_DATE',
            'POST_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
            'POST_WITH_PHOTO',
            'POST_WITH_LINK',
            'POST_WITH_VIDEO',
            'POST_WITH_STATUS',
            'POST_WITH_TEXT_ONLY',
            'POST_IN_HEBREW',
            'POST_IN_ENGLISH',
            'POST_IN_ARABIC',
            'POST_IN_OTHER',
            'DAYS_FROM_ELECTION',
            'DAYS_FROM_THREE_TEENAGER_KIDNAP',
            'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_START_DATE',
            'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_END_DATE',
            'DAYS_FROM_DUMA_ARSON_ATTACK',
            'DAYS_FROM_THIRD_INTIFADA_START_DATE',
            'DAYS_FROM_MK_BIRTHDAY',
            'POST_PUBLISHED_ON_SATURDAY',
            'COMMENT_PUBLISHED_ON_SATURDAY',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
            'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            'is_train',
        ]
    else:
        field_names = [
            'comment_id',
            'mk_id',
            'mk_name',
            'parent_status_id',
            'parent_status_content',
            'parent_status_link',
            'comment_link',
            'content',
            'content_processed',
            'published',
            'commentator_id',
            'commentator_also_liked_status',
            'like_count',
            'comment_count',
            'language',
            'is_train',
        ]
    with open(file_name, 'wb') as f:
        csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
        csv_data.writerow({field_name: field_name for field_name in field_names})
        processor = TextProcessor()
        if options['all_comments']:
            i = 0
            for status in Facebook_Status.objects_no_filters.filter(
                    is_comment=False):
                for comment in status.comments.all():
                    print('writing comment {}'.format(i + 1))
                    i += 1
                    self.handle_comment(comment, comments_in_file, csv_data,
                                        options, processor)
        else:
            # Hoisted loop invariant: .count() re-queries the DB each call.
            total = comments_in_file.count()
            for i, comment in enumerate(comments_in_file):
                print('writing comment {} '
                      'of {}'.format(i + 1, total))
                self.handle_comment(comment, comments_in_file, csv_data,
                                    options, processor)
    print('Done.')