Beispiel #1
0
	def testSmallCSVFile(self):
		"""
		Write a small CSV from one /x/ thread plus the first /x/ index page and sanity-check it.

		Every line of the written file must contain a comma; the file is then
		handed to ``ensure_csv_has_no_empty_fields`` with ``count=4``.
		"""
		output_csv_filepath = 'out/testcase-output-small-example.csv'
		scraped = {}

		# A specific thread: http://archive.4plebs.org/x/thread/23732801/
		thread_json = httpGET_json(gen_thread_api_url('x', 23732801))
		scraped.update(**thread_json)

		# The first index page of /x/
		index_json = httpGET_json(gen_index_api_url('x', 1))
		scraped.update(**index_json)

		# Convert the merged JSON dict into Post objects and write them out
		posts = FourPlebsAPI_Post.from_post_json(scraped)
		CSVPostWriter.write_posts_to_csv(posts, output_csv_filepath, ALL_CONTENT_FLAGGERS)

		# No line of the CSV may be comma-free
		with open(output_csv_filepath, 'r') as csv_file:
			for line_number, line in enumerate(csv_file):
				print("line {}".format(line_number))
				self.assertIn(',', line)

		# Re-open so the emptiness check starts from the top of the file
		with open(output_csv_filepath, 'r') as csv_file:
			self.ensure_csv_has_no_empty_fields(csv_file, count=4)
def generate_small_example_csv():
    """
    Fetch a small sample of posts and write them to 'out/post-output-small-example.csv'.

    The sample is merged, in this order, from one specific /x/ thread, /pol/
    index pages 1 through 9, and /x/ index page 1. Later responses overwrite
    earlier ones on duplicate top-level keys, because they are merged with
    ``dict.update``.
    """
    merged_json = {}

    def merge_response(api_url):
        # Fetch one API URL and fold its JSON dict into the running result.
        merged_json.update(**httpGET_json(api_url))

    # A specific thread: http://archive.4plebs.org/x/thread/23732801/
    merge_response(gen_thread_api_url('x', 23732801))

    # NOTE(review): range(1, 10) stops at page 9; an earlier comment said
    # "page 1-10" -- confirm which was intended.
    for page_number in range(1, 10):
        merge_response(gen_index_api_url('pol', page_number))

    # The first index page of /x/
    merge_response(gen_index_api_url('x', 1))

    # Convert the merged JSON dict into Post objects and write them out
    posts = FourPlebsAPI_Post.from_post_json(merged_json)
    CSVPostWriter.write_posts_to_csv(
        posts,
        'out/post-output-small-example.csv',
        ALL_CONTENT_FLAGGERS,
    )
 def gen_thread_api_url(self):
     """Return the thread API URL built from this object's ``board_code`` and ``thread_num``."""
     # Assuming this def sits inside a class, the bare name below resolves to
     # the module-level helper of the same name, not to this method.
     board, thread = self.board_code, self.thread_num
     return gen_thread_api_url(board, thread)
Beispiel #4
0
    def write_posts_to_stream(
            threads: List,
            stream: TextIO,
            content_flaggers: List[ContentFlagger] = None) -> None:
        """
        Write each thread's opening post, followed by its replies, to ``stream`` as CSV.

        Besides the fixed post columns, every row gets one column per content
        flagger (holding whatever ``flag_content`` returned for the comment) and
        a 'No content flagger tripped' column that is True only when none of the
        flagger values for that row is truthy.

        :param threads: The list of FourPlebsAPI_Post objects to save. Each one's
            ``subposts`` is iterated as a sequence of reply dicts.
        :param stream: A TextIO object.
        :param content_flaggers: Optional. The list of ContentFlagger objects that should flag posts.
            Use ALL_CONTENT_FLAGGERS to use all content flaggers that are defined by default.
            When omitted (None), no flagger columns are written.
        :return: None
        """
        # The signature defaults to None, so normalise it before iterating.
        # Materialising also lets a one-shot iterable be walked once per row.
        flaggers = list(content_flaggers) if content_flaggers is not None else []

        summary_column = 'No content flagger tripped'

        # Fields we want to save in the CSV
        fieldnames = [
            'board',
            'post_id',
            'post_url',
            'thread_id',
            'thread_url',
            'full_comment',
            'thread_api_url',
            'post_api_url',
            'op',
            'country_code',
            'timestamp_epoch',
            'timestamp_ISO8601',
            summary_column,  # no flagger tripped, presumably benign
        ]
        # Add our flagger descriptions
        fieldnames += [flagger.csv_description for flagger in flaggers]

        writer = csv.DictWriter(stream, fieldnames=fieldnames)
        writer.writeheader()

        def write_row(row, comment):
            # Shared tail for OP and reply rows: run every flagger on the raw
            # comment, derive the summary column, then write the row.
            for flagger in flaggers:
                row[flagger.csv_description] = flagger.flag_content(comment)
            # Truthiness is used for both kinds of row, so the summary can
            # never contradict a flagger column that holds a truthy value.
            tripped = any(row[flagger.csv_description] for flagger in flaggers)
            row[summary_column] = not tripped
            writer.writerow(row)

        for thread in threads:
            # The opening post is an object with attributes and URL helpers.
            write_row(
                {
                    'board': thread.board_code,
                    'post_id': thread.post_id,
                    'post_url': thread.gen_post_url(),
                    'post_api_url': thread.gen_post_api_url(),
                    'thread_id': thread.thread_num,
                    'thread_url': thread.gen_thread_url(),
                    'thread_api_url': thread.gen_thread_api_url(),
                    'country_code': thread.poster_country,
                    'full_comment': csv_safe_string(thread.comment),
                    'op': True,
                    'timestamp_ISO8601': epoch_to_ISO8601(thread.timestamp),
                    'timestamp_epoch': thread.timestamp,
                },
                thread.comment,
            )

            # Replies are plain dicts, so their URLs come from the
            # module-level generators rather than from methods.
            for reply in thread.subposts:
                board = reply['board']['shortname']
                post_num = reply['num']
                thread_num = reply['thread_num']
                write_row(
                    {
                        'board': board,
                        'post_id': post_num,
                        'post_url': gen_post_url(board, thread_num, post_num),
                        'post_api_url': gen_post_api_url(board, post_num),
                        'thread_id': thread_num,
                        'thread_url': gen_thread_url(board, thread_num),
                        'thread_api_url': gen_thread_api_url(board, thread_num),
                        'country_code': reply['poster_country'],
                        'full_comment': csv_safe_string(reply['comment']),
                        'op': False,
                        'timestamp_ISO8601': epoch_to_ISO8601(reply['timestamp']),
                        'timestamp_epoch': reply['timestamp'],
                    },
                    reply['comment'],
                )
Beispiel #5
0
	def write_posts_to_csv(posts: List, filepath: str,
						   content_flaggers: List[ContentFlagger] = None) -> None:
		"""
		Write each post, followed by its subposts, to a CSV file at ``filepath``.

		Every post and subpost becomes exactly one row: the fixed post columns
		plus one column per content flagger holding whatever ``flag_content``
		returned for that comment. The enclosing directory is created when it
		does not exist, and the absolute path of the file is printed at the end.

		:param posts: The list of FourPlebsAPI_Post objects to save. Each one's
			``subposts`` is iterated as a sequence of subpost dicts.
		:param filepath: The filepath of the CSV file to write; must end in ``csv``.
		:param content_flaggers: Optional. The list of ContentFlagger objects that should flag posts.
			Use ALL_CONTENT_FLAGGERS to use all content flaggers that are defined by default.
			When omitted (None), no flagger columns are written.
		:raises ValueError: if the text after the last ``.`` in ``filepath`` is not ``csv``.
		:return: None
		"""
		# Validate first so a rejected path leaves no directories behind.
		if filepath.split('.')[-1] != 'csv':
			raise ValueError("File doesn't end in `csv`!")

		# The signature defaults to None, so normalise it before iterating.
		flaggers = list(content_flaggers) if content_flaggers is not None else []

		# A bare filename has an empty dirname, and os.makedirs('') raises,
		# so only create the enclosing directory when there is one.
		parent_dir = os.path.dirname(filepath)
		if parent_dir:
			os.makedirs(parent_dir, exist_ok=True)

		with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:

			# Fields we want to save in the CSV
			fieldnames = [
				'board',
				'post_id',
				'post_url',
				'thread_id',
				'thread_url',
				'full_comment',
				'thread_api_url',
				'post_api_url',
				'op',
				'country_code',
				'timestamp_epoch',
				'timestamp',
			]
			# Add our flagger descriptions
			fieldnames += [flagger.csv_description for flagger in flaggers]

			writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
			writer.writeheader()

			def write_row(row, comment):
				# Merge the flagger verdicts into the post's own row. Writing
				# them as separate rows would leave every other column of
				# those rows empty.
				for flagger in flaggers:
					row[flagger.csv_description] = flagger.flag_content(comment)
				writer.writerow(row)

			for post in posts:
				# The top-level post is an object with attributes and URL helpers.
				write_row(
					{
						'board': post.board_code,
						'post_id': post.post_id,
						'post_url': post.gen_post_url(),
						'post_api_url': post.gen_post_api_url(),
						'thread_id': post.thread_num,
						'thread_url': post.gen_thread_url(),
						'thread_api_url': post.gen_thread_api_url(),
						'country_code': post.poster_country,
						'full_comment': csv_safe_string(post.comment),
						'op': True,
						'timestamp': epoch_to_human_date(post.timestamp),
						'timestamp_epoch': post.timestamp,
					},
					post.comment,
				)

				# Subposts are plain dicts, so their URLs come from the
				# module-level generators rather than from methods.
				for subpost in post.subposts:
					board = subpost['board']['shortname']
					post_num = subpost['num']
					thread_num = subpost['thread_num']
					write_row(
						{
							'board': board,
							'post_id': post_num,
							'post_url': gen_post_url(board, thread_num, post_num),
							'post_api_url': gen_post_api_url(board, post_num),
							'thread_id': thread_num,
							'thread_url': gen_thread_url(board, thread_num),
							'thread_api_url': gen_thread_api_url(board, thread_num),
							'country_code': subpost['poster_country'],
							'full_comment': csv_safe_string(subpost['comment']),
							'op': False,
							'timestamp': epoch_to_human_date(subpost['timestamp']),
							'timestamp_epoch': subpost['timestamp'],
						},
						subpost['comment'],
					)

		print("Enjoy your CSV file located at {}!".format(
			os.path.abspath(filepath),
		))
    def write_posts_to_stream(
            threads: List,
            stream: TextIO,
            content_flaggers: List[ContentFlagger] = None) -> None:
        """
        Write each thread's opening post, followed by its replies, to ``stream`` as CSV.

        Besides the fixed post columns, every row gets one column per content
        flagger holding whatever ``flag_content`` returned for the comment.
        Nothing other than the CSV itself is written or printed.

        :param threads: The list of FourPlebsAPI_Post objects to save. Each one's
            ``subposts`` is iterated as a sequence of reply dicts.
        :param stream: A TextIO object.
        :param content_flaggers: Optional. The list of ContentFlagger objects that should flag posts.
            Use ALL_CONTENT_FLAGGERS to use all content flaggers that are defined by default.
            When omitted (None), no flagger columns are written.
        :return: None
        """
        # The signature defaults to None, so normalise it before iterating.
        # Materialising also lets a one-shot iterable be walked once per row.
        flaggers = list(content_flaggers) if content_flaggers is not None else []

        # Fields we want to save in the CSV
        fieldnames = [
            'board',
            'post_id',
            'post_url',
            'thread_id',
            'thread_url',
            'full_comment',
            'thread_api_url',
            'post_api_url',
            'op',
            'country_code',
            'timestamp_epoch',
            'timestamp',
        ]
        # Add our flagger descriptions
        fieldnames += [flagger.csv_description for flagger in flaggers]

        writer = csv.DictWriter(stream, fieldnames=fieldnames)
        writer.writeheader()

        def write_row(row, comment):
            # Shared tail for OP and reply rows: run every flagger on the raw
            # comment, then write the completed row.
            for flagger in flaggers:
                row[flagger.csv_description] = flagger.flag_content(comment)
            writer.writerow(row)

        for thread in threads:
            # The opening post is an object with attributes and URL helpers.
            write_row(
                {
                    'board': thread.board_code,
                    'post_id': thread.post_id,
                    'post_url': thread.gen_post_url(),
                    'post_api_url': thread.gen_post_api_url(),
                    'thread_id': thread.thread_num,
                    'thread_url': thread.gen_thread_url(),
                    'thread_api_url': thread.gen_thread_api_url(),
                    'country_code': thread.poster_country,
                    'full_comment': csv_safe_string(thread.comment),
                    'op': True,
                    'timestamp': epoch_to_human_date(thread.timestamp),
                    'timestamp_epoch': thread.timestamp,
                },
                thread.comment,
            )

            # Replies are plain dicts, so their URLs come from the
            # module-level generators rather than from methods.
            # No per-reply debug printing here: stdout may be the very
            # stream the CSV is being written to.
            for reply in thread.subposts:
                board = reply['board']['shortname']
                post_num = reply['num']
                thread_num = reply['thread_num']
                write_row(
                    {
                        'board': board,
                        'post_id': post_num,
                        'post_url': gen_post_url(board, thread_num, post_num),
                        'post_api_url': gen_post_api_url(board, post_num),
                        'thread_id': thread_num,
                        'thread_url': gen_thread_url(board, thread_num),
                        'thread_api_url': gen_thread_api_url(board, thread_num),
                        'country_code': reply['poster_country'],
                        'full_comment': csv_safe_string(reply['comment']),
                        'op': False,
                        'timestamp': epoch_to_human_date(reply['timestamp']),
                        'timestamp_epoch': reply['timestamp'],
                    },
                    reply['comment'],
                )