def handle(self, *args, **options):
        """Write the processed text of every comment to a one-column file.

        The output goes to ``content_only_<timestamp>.txt`` in the current
        working directory, one row per comment, with a single ``content``
        field and no header row.

        Options read:
            exclude_from_path: optional path to a CSV with a ``comment_id``
                column; comments whose id appears there are skipped.
            from_db: when truthy, start from ``comment.processed_content``
                instead of running ``replace_mk_names`` on the raw content.
            translate: when truthy, pass the text through
                ``request_translated_text_from_google``.
        """
        print('Start.')

        file_name = 'content_only_{}.txt'.format(
            timezone.now().strftime('%Y_%m_%d_%H_%M_%S'))
        field_names = [
            'content',
        ]

        # NOTE(review): 'wb'/'rb' match the Python 2 csv module; under
        # Python 3 csv expects text mode -- confirm the target interpreter.
        # The context manager guarantees the file is closed even when a
        # processing or translation step raises part-way through.
        with open(file_name, 'wb') as f:
            csv_data = DictWriter(
                f, fieldnames=field_names, delimiter=DELIMITER)
            processor = TextProcessor()

            # A set keeps the per-comment exclusion test O(1).
            excluded_ids = set()
            if options['exclude_from_path']:
                with open(options['exclude_from_path'], 'rb') as g:
                    excluded_ids = {
                        row['comment_id'] for row in DictReader(g)
                    }

            written = 0
            statuses = Facebook_Status.objects_no_filters.filter(
                is_comment=False)
            for status in statuses:
                for comment in status.comments.all():
                    if comment.comment_id in excluded_ids:
                        continue

                    if options['from_db']:
                        processed_text = comment.processed_content
                    else:
                        processed_text = processor.replace_mk_names(
                            text=comment.content,
                            context_status=comment.parent)
                    if options['translate']:
                        processed_text = (
                            processor.request_translated_text_from_google(
                                text=processed_text))
                    processed_text = processor.replace_emojis_to_named_text(
                        text=processed_text)

                    # Only comments that are actually written are counted.
                    written += 1
                    print('writing comment {}'.format(written))
                    csv_data.writerow({
                        'content': processor.flatten_text(
                            processed_text, delimiter=DELIMITER),
                    })

        print('Done.')
    def handle(self, *args, **options):
        """Export each comment's processed text as a single-column file.

        Rows are written to ``content_only_<timestamp>.txt`` (current
        working directory) with one ``content`` field per comment; no
        header row is emitted.

        Options read:
            exclude_from_path: optional CSV path; any comment whose id is
                listed in its ``comment_id`` column is skipped.
            from_db: use the stored ``comment.processed_content`` rather
                than re-running ``replace_mk_names`` on ``comment.content``.
            translate: additionally run the text through
                ``request_translated_text_from_google``.
        """
        print('Start.')

        file_name = 'content_only_{}.txt'.format(timezone.now().strftime('%Y_%m_%d_%H_%M_%S'))
        field_names = [
            'content',
        ]

        # NOTE(review): binary modes suit the Python 2 csv module; Python 3
        # needs text mode here -- confirm which interpreter runs this.
        # Using `with` closes the output file even if a row fails mid-export.
        with open(file_name, 'wb') as f:
            csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
            processor = TextProcessor()

            # Set membership avoids rescanning a list for every comment.
            excluded_ids = set()
            if options['exclude_from_path']:
                with open(options['exclude_from_path'], 'rb') as g:
                    excluded_ids = {x['comment_id'] for x in DictReader(g)}

            written_count = 0
            for status in Facebook_Status.objects_no_filters.filter(is_comment=False):
                for comment in status.comments.all():
                    if comment.comment_id in excluded_ids:
                        continue

                    if options['from_db']:
                        processed_text = comment.processed_content
                    else:
                        processed_text = processor.replace_mk_names(text=comment.content,
                                                                    context_status=comment.parent)
                    if options['translate']:
                        processed_text = processor.request_translated_text_from_google(text=processed_text)
                    processed_text = processor.replace_emojis_to_named_text(text=processed_text)

                    # Skipped comments do not advance the progress counter.
                    written_count += 1
                    print('writing comment {}'.format(written_count))
                    dict_row = {
                        'content': processor.flatten_text(processed_text, delimiter=DELIMITER),
                    }
                    csv_data.writerow(dict_row)

        print('Done.')
Ejemplo n.º 3
0
    def handle(self, *args, **options):
        """Export the comments selected by ``parse_comments`` to a CSV file.

        The output path is derived from ``options['file_path']``: the part
        before ``.csv`` gets ``_full_data.csv`` appended, or
        ``_full_data_2nd_stage.csv`` when ``options['second_stage']`` is
        truthy. A header row is written first, then one row per comment
        built by ``get_second_stage_comment_features`` or
        ``get_first_stage_comment_features``.

        Options read:
            file_path: input path used to derive the output file name.
            second_stage: selects the second-stage column set and row
                builder.
            translate: when truthy, the text is passed through
                ``request_translated_text_from_google``.
        """
        print('Start.')

        comments = self.parse_comments(options)

        # Build the name from the stripped base path instead of splitting
        # the finished name on every '.', which mangled paths that contain
        # other dots (e.g. './data/x.csv').
        base_path = options['file_path'].split('.csv')[0]
        if options['second_stage']:
            file_name = '{}_full_data_2nd_stage.csv'.format(base_path)
        else:
            file_name = '{}_full_data.csv'.format(base_path)

        if options['second_stage']:
            field_names = [
                'comment_id',
                'MK_ID',
                'mk_name',
                'post_status_id',
                'post_content',
                'post_link',
                'comment_link',
                'comment_content',
                'comment_content_processed',
                'comment_time_of_publication',
                'COMMENT_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
                'post_like_count',
                'post_comment_count',
                'post_share_count',
                'comment_like_count',
                'comment_comment_count',
                'comment_main_language',
                'POST_LEN_MESSAGE',
                'COMMENT_LEN_MESSAGE',
                'COMMENTATOR_LIKED_POST',
                'HAS_NAME_OF_POST_WRITER_MK_IN_COMMENT',
                'IDS_OF_MKS_MENTIONED_IN_COMMENT',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ON_POST',
                'COMMENTATOR_ID',
                'POLITICAL_WING_HATNUA_LEFT',
                'POLITICAL_WING_HATNUA_CENTER',
                'IS_COALITION',
                'PARTY_NAME',
                'IS_FEMALE',
                'AGE',
                'MK_POLITICAL_STATUS',
                'MK_POLITICAL_SENIORITY',
                'IS_CURRENT_OR_PAST_PARTY_LEADER',
                'IS_CURRENT_OR_PAST_PM_CANDIDATE',
                'IS_PM',
                'POST_PUBLICATION_TIMESTAMP',
                'POST_PUBLICATION_DATE',
                'POST_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
                'POST_WITH_PHOTO',
                'POST_WITH_LINK',
                'POST_WITH_VIDEO',
                'POST_WITH_STATUS',
                'POST_WITH_TEXT_ONLY',
                'POST_IN_HEBREW',
                'POST_IN_ENGLISH',
                'POST_IN_ARABIC',
                'POST_IN_OTHER',
                'DAYS_FROM_ELECTION',
                'DAYS_FROM_THREE_TEENAGER_KIDNAP',
                'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_START_DATE',
                'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_END_DATE',
                'DAYS_FROM_DUMA_ARSON_ATTACK',
                'DAYS_FROM_THIRD_INTIFADA_START_DATE',
                'DAYS_FROM_MK_BIRTHDAY',
                'POST_PUBLISHED_ON_SATURDAY',
                'COMMENT_PUBLISHED_ON_SATURDAY',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            ]

        else:
            field_names = [
                'comment_id',
                'mk_id',
                'mk_name',
                'parent_status_id',
                'parent_status_content',
                'parent_status_link',
                'comment_link',
                'content',
                'content_processed',
                'published',
                'commentator_id',
                'commentator_also_liked_status',
                'like_count',
                'comment_count',
                'language',
            ]

        # NOTE(review): 'wb' matches the Python 2 csv module; Python 3 csv
        # expects a text-mode file -- confirm the target interpreter.
        # The context manager closes the file even if a row fails to build.
        with open(file_name, 'wb') as f:
            csv_data = DictWriter(f, fieldnames=field_names, delimiter=DELIMITER)
            csv_data.writeheader()

            processor = TextProcessor()

            # Evaluate the total once instead of once per written row.
            total = comments.count()
            for i, comment in enumerate(comments):
                processed_text = processor.replace_mk_names(text=comment.content, context_status=comment.parent)
                if options['translate']:
                    processed_text = processor.request_translated_text_from_google(text=processed_text)
                processed_text = processor.replace_emojis_to_named_text(text=processed_text)
                print('writing comment {} of {}'.format(i + 1, total))
                if options['second_stage']:
                    dict_row = self.get_second_stage_comment_features(comment, processed_text, processor)
                else:
                    dict_row = self.get_first_stage_comment_features(comment, processed_text, processor)
                csv_data.writerow(dict_row)

        print('Done.')
    def handle(self, *args, **options):
        """Write a feature CSV for the comments returned by ``parse_comments``.

        Output file: the portion of ``options['file_path']`` before ``.csv``
        followed by ``_full_data.csv``, or by ``_full_data_2nd_stage.csv``
        when ``options['second_stage']`` is truthy. The file starts with a
        header row; each following row comes from
        ``get_second_stage_comment_features`` (second stage) or
        ``get_first_stage_comment_features`` (otherwise).

        Options read:
            file_path: source path the output name is derived from.
            second_stage: picks the column set and the row builder.
            translate: when truthy, text is also passed through
                ``request_translated_text_from_google``.
        """
        print('Start.')

        comments = self.parse_comments(options)

        # Compose the name from the stripped base rather than re-splitting
        # the finished name on '.', which broke for paths containing extra
        # dots (directories such as './out' or versioned names).
        name_suffix = '_full_data.csv'
        if options['second_stage']:
            name_suffix = '_full_data_2nd_stage.csv'
        file_name = '{}{}'.format(
            options['file_path'].split('.csv')[0], name_suffix)

        if options['second_stage']:
            field_names = [
                'comment_id',
                'MK_ID',
                'mk_name',
                'post_status_id',
                'post_content',
                'post_link',
                'comment_link',
                'comment_content',
                'comment_content_processed',
                'comment_time_of_publication',
                'COMMENT_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
                'post_like_count',
                'post_comment_count',
                'post_share_count',
                'comment_like_count',
                'comment_comment_count',
                'comment_main_language',
                'POST_LEN_MESSAGE',
                'COMMENT_LEN_MESSAGE',
                'COMMENTATOR_LIKED_POST',
                'HAS_NAME_OF_POST_WRITER_MK_IN_COMMENT',
                'IDS_OF_MKS_MENTIONED_IN_COMMENT',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ON_POST',
                'COMMENTATOR_ID',
                'POLITICAL_WING_HATNUA_LEFT',
                'POLITICAL_WING_HATNUA_CENTER',
                'IS_COALITION',
                'PARTY_NAME',
                'IS_FEMALE',
                'AGE',
                'MK_POLITICAL_STATUS',
                'MK_POLITICAL_SENIORITY',
                'IS_CURRENT_OR_PAST_PARTY_LEADER',
                'IS_CURRENT_OR_PAST_PM_CANDIDATE',
                'IS_PM',
                'POST_PUBLICATION_TIMESTAMP',
                'POST_PUBLICATION_DATE',
                'POST_PUBLICATION_DAYS_FROM_RESEARCH_START_DATE',
                'POST_WITH_PHOTO',
                'POST_WITH_LINK',
                'POST_WITH_VIDEO',
                'POST_WITH_STATUS',
                'POST_WITH_TEXT_ONLY',
                'POST_IN_HEBREW',
                'POST_IN_ENGLISH',
                'POST_IN_ARABIC',
                'POST_IN_OTHER',
                'DAYS_FROM_ELECTION',
                'DAYS_FROM_THREE_TEENAGER_KIDNAP',
                'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_START_DATE',
                'DAYS_FROM_PROTECTIVE_EDGE_OFFICIAL_END_DATE',
                'DAYS_FROM_DUMA_ARSON_ATTACK',
                'DAYS_FROM_THIRD_INTIFADA_START_DATE',
                'DAYS_FROM_MK_BIRTHDAY',
                'POST_PUBLISHED_ON_SATURDAY',
                'COMMENT_PUBLISHED_ON_SATURDAY',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_GIVEN_MK_POSTS',
                'NUM_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'NUM_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'RATIO_OF_COMMENTS_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
                'RATIO_OF_LIKES_BY_COMMENTATOR_ID_ON_ALL_MK_POSTS',
            ]

        else:
            field_names = [
                'comment_id',
                'mk_id',
                'mk_name',
                'parent_status_id',
                'parent_status_content',
                'parent_status_link',
                'comment_link',
                'content',
                'content_processed',
                'published',
                'commentator_id',
                'commentator_also_liked_status',
                'like_count',
                'comment_count',
                'language',
            ]

        # NOTE(review): binary mode suits the Python 2 csv module; on
        # Python 3 the csv writer needs text mode -- confirm the runtime.
        # `with` ensures the handle is released if any row raises.
        with open(file_name, 'wb') as f:
            csv_data = DictWriter(
                f, fieldnames=field_names, delimiter=DELIMITER)
            csv_data.writeheader()

            processor = TextProcessor()

            # Hoisted so the count is not recomputed for every comment.
            total_comments = comments.count()
            for i, comment in enumerate(comments):
                processed_text = processor.replace_mk_names(
                    text=comment.content, context_status=comment.parent)
                if options['translate']:
                    processed_text = (
                        processor.request_translated_text_from_google(
                            text=processed_text))
                processed_text = processor.replace_emojis_to_named_text(
                    text=processed_text)
                print('writing comment {} of {}'.format(
                    i + 1, total_comments))
                if options['second_stage']:
                    dict_row = self.get_second_stage_comment_features(
                        comment, processed_text, processor)
                else:
                    dict_row = self.get_first_stage_comment_features(
                        comment, processed_text, processor)
                csv_data.writerow(dict_row)

        print('Done.')