def testSmallCSVFile(self):
    """Smoke-test CSV output for one /x/ thread plus the first /x/ index page.

    Verifies that every emitted CSV line contains at least one comma and
    that the first few rows have no empty fields.
    """
    results = {}
    # Add a specific thread, http://archive.4plebs.org/x/thread/23732801/
    results.update(**httpGET_json(gen_thread_api_url('x', 23732801)))
    # First page of /x/
    results.update(**httpGET_json(gen_index_api_url('x', 1)))
    # Turn that json dict into a list of Post objects
    postList = FourPlebsAPI_Post.from_post_json(results)
    output_csv_filepath = 'out/testcase-output-small-example.csv'
    CSVPostWriter.write_posts_to_csv(postList, output_csv_filepath, ALL_CONTENT_FLAGGERS)
    # All lines in this CSV should contain commas!
    with open(output_csv_filepath, 'r') as f:
        # enumerate replaces the hand-rolled `i = 0 ... i += 1` counter
        for i, line in enumerate(f):
            print("line {}".format(i))
            self.assertIn(',', line)
    with open(output_csv_filepath, 'r') as f:
        self.ensure_csv_has_no_empty_fields(f, count=4)
def generate_small_example_csv():
    """Scrape a small sample of posts and write them to a CSV file.

    Fetches one specific /x/ thread, /pol/ index pages 1-9, and /x/ index
    page 1, then writes all posts (with content-flagger columns) to
    out/post-output-small-example.csv.
    """
    results = {}
    # Add a specific thread, http://archive.4plebs.org/x/thread/23732801/
    results.update(**httpGET_json(gen_thread_api_url('x', 23732801)))
    # Get the posts from pages 1-9 of /pol/ (range's upper bound is
    # exclusive; the old comment claimed 1-10, which the code never did).
    for page in range(1, 10):
        results.update(**httpGET_json(gen_index_api_url('pol', page)))
    # Add on the posts from page 1 of /x/
    results.update(**httpGET_json(gen_index_api_url('x', 1)))
    # Turn that json dict into a list of Post objects
    postList = FourPlebsAPI_Post.from_post_json(results)
    CSVPostWriter.write_posts_to_csv(
        postList,
        'out/post-output-small-example.csv',
        ALL_CONTENT_FLAGGERS,
    )
def gen_thread_api_url(self):
    """Return the 4plebs API URL for this post's parent thread.

    Delegates to the module-level gen_thread_api_url() helper, passing this
    post's board code and thread number.
    """
    return gen_thread_api_url(self.board_code, self.thread_num)
def _flagger_columns(comment, content_flaggers):
    """Run every flagger over *comment* and return its CSV columns as a dict.

    Also computes the 'No content flagger tripped' pseudo-column, which is
    True only when no flagger returned a truthy result (presumably benign).
    This replaces two divergent hand-rolled trip counters (one compared
    truthiness, the other compared `is True`).
    """
    columns = {
        flagger.csv_description: flagger.flag_content(comment)
        for flagger in content_flaggers
    }
    columns['No content flagger tripped'] = not any(columns.values())
    return columns


def write_posts_to_stream(
        threads: List,
        stream: TextIO,
        content_flaggers: List[ContentFlagger] = None) -> None:
    """Write each thread (OP post plus all replies) to *stream* as CSV rows.

    :param threads: The list of FourPlebsAPI_Post objects to save.
    :param stream: A TextIO object.
    :param content_flaggers: Optional. The list of ContentFlagger objects that
        should flag posts. Use ALL_CONTENT_FLAGGERS to use all content
        flaggers that are defined by default.
    :return:
    """
    # The old code iterated the default None and raised TypeError;
    # treat a missing flagger list as "no flaggers".
    if content_flaggers is None:
        content_flaggers = []

    # Fields we want to save in the CSV
    fieldnames = [
        'board',
        'post_id',
        'post_url',
        'thread_id',
        'thread_url',
        'full_comment',
        'thread_api_url',
        'post_api_url',
        'op',
        'country_code',
        'timestamp_epoch',
        'timestamp_ISO8601',
        'No content flagger tripped',  # True when no flagger fired, presumably benign
    ]
    # Add our flagger descriptions
    fieldnames += [flagger.csv_description for flagger in content_flaggers]

    writer = csv.DictWriter(stream, fieldnames=fieldnames)
    writer.writeheader()

    for thread in threads:
        # Row for the OP post itself (attribute access on the Post object).
        row = {
            'board': thread.board_code,
            'post_id': thread.post_id,
            'post_url': thread.gen_post_url(),
            'post_api_url': thread.gen_post_api_url(),
            'thread_id': thread.thread_num,
            'thread_url': thread.gen_thread_url(),
            'thread_api_url': thread.gen_thread_api_url(),
            'country_code': thread.poster_country,
            'full_comment': csv_safe_string(thread.comment),
            'op': True,
            'timestamp_ISO8601': epoch_to_ISO8601(thread.timestamp),
            'timestamp_epoch': thread.timestamp,
        }
        row.update(_flagger_columns(thread.comment, content_flaggers))
        writer.writerow(row)

        # One row per reply; replies are raw API dicts, not Post objects,
        # hence the key-based access below.
        for reply in thread.subposts:
            shortname = reply['board']['shortname']
            row = {
                'board': shortname,
                'post_id': reply['num'],
                'post_url': gen_post_url(shortname, reply['thread_num'], reply['num']),
                'post_api_url': gen_post_api_url(shortname, reply['num']),
                'thread_id': reply['thread_num'],
                'thread_url': gen_thread_url(shortname, reply['thread_num']),
                'thread_api_url': gen_thread_api_url(shortname, reply['thread_num']),
                'country_code': reply['poster_country'],
                'full_comment': csv_safe_string(reply['comment']),
                'op': False,
                'timestamp_ISO8601': epoch_to_ISO8601(reply['timestamp']),
                'timestamp_epoch': reply['timestamp'],
            }
            row.update(_flagger_columns(reply['comment'], content_flaggers))
            writer.writerow(row)
def write_posts_to_csv(posts: List, filepath: str, content_flaggers: List[ContentFlagger] = None) -> None:
    """Write posts (and their subposts) to a CSV file at *filepath*.

    :param posts: The list of FourPlebsAPI_Post objects to save.
    :param filepath: The filepath of the CSV scraper.
    :param content_flaggers: Optional. The list of ContentFlagger objects that
        should flag posts. Use ALL_CONTENT_FLAGGERS to use all content
        flaggers that are defined by default.
    :return:
    :raises Exception: if *filepath* does not end in ``csv``.
    """
    # The old code iterated the default None and raised TypeError;
    # treat a missing flagger list as "no flaggers".
    if content_flaggers is None:
        content_flaggers = []

    # Ensure that the enclosing directory exists. The `parent` guard also
    # fixes a crash on bare filenames, where dirname() is '' and
    # os.makedirs('') raises.
    parent = os.path.dirname(filepath)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    if not filepath.split('.')[-1] == 'csv':
        raise Exception("File doesn't end in `csv`!")

    with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
        # Fields we want to save in the CSV
        fieldnames = [
            'board',
            'post_id',
            'post_url',
            'thread_id',
            'thread_url',
            'full_comment',
            'thread_api_url',
            'post_api_url',
            'op',
            'country_code',
            'timestamp_epoch',
            'timestamp',
        ]
        # Add our flagger descriptions
        fieldnames += [flagger.csv_description for flagger in content_flaggers]

        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for post in posts:
            row = {
                'board': post.board_code,
                'post_id': post.post_id,
                'post_url': post.gen_post_url(),
                'post_api_url': post.gen_post_api_url(),
                'thread_id': post.thread_num,
                'thread_url': post.gen_thread_url(),
                'thread_api_url': post.gen_thread_api_url(),
                'country_code': post.poster_country,
                'full_comment': csv_safe_string(post.comment),
                'op': True,
                'timestamp': epoch_to_human_date(post.timestamp),
                'timestamp_epoch': post.timestamp,
            }
            # BUG FIX: flagger results used to be emitted with a separate
            # writer.writerow() per flagger, producing extra mostly-empty
            # rows. Merge them into the post's own row instead, matching
            # write_posts_to_stream.
            for flagger in content_flaggers:
                row[flagger.csv_description] = flagger.flag_content(post.comment)
            writer.writerow(row)

            # Subposts are raw API dicts, not Post objects.
            for subpost in post.subposts:
                shortname = subpost['board']['shortname']
                row = {
                    'board': shortname,
                    'post_id': subpost['num'],
                    'post_url': gen_post_url(shortname, subpost['thread_num'], subpost['num']),
                    'post_api_url': gen_post_api_url(shortname, subpost['num']),
                    'thread_id': subpost['thread_num'],
                    'thread_url': gen_thread_url(shortname, subpost['thread_num']),
                    'thread_api_url': gen_thread_api_url(shortname, subpost['thread_num']),
                    'country_code': subpost['poster_country'],
                    'full_comment': csv_safe_string(subpost['comment']),
                    'op': False,
                    'timestamp': epoch_to_human_date(subpost['timestamp']),
                    'timestamp_epoch': subpost['timestamp'],
                }
                # Same fix as above: flagger results belong on this row.
                for flagger in content_flaggers:
                    row[flagger.csv_description] = flagger.flag_content(subpost['comment'])
                writer.writerow(row)

    print("Enjoy your CSV file located at {}!".format(
        os.path.abspath(filepath),
    ))
def write_posts_to_stream(
        threads: List,
        stream: TextIO,
        content_flaggers: List[ContentFlagger] = None) -> None:
    """Write each thread (OP post plus all replies) to *stream* as CSV rows.

    :param threads: The list of FourPlebsAPI_Post objects to save.
    :param stream: A TextIO object.
    :param content_flaggers: Optional. The list of ContentFlagger objects that
        should flag posts. Use ALL_CONTENT_FLAGGERS to use all content
        flaggers that are defined by default.
    :return:
    """
    # The old code iterated the default None and raised TypeError;
    # treat a missing flagger list as "no flaggers".
    if content_flaggers is None:
        content_flaggers = []

    # Fields we want to save in the CSV
    fieldnames = [
        'board',
        'post_id',
        'post_url',
        'thread_id',
        'thread_url',
        'full_comment',
        'thread_api_url',
        'post_api_url',
        'op',
        'country_code',
        'timestamp_epoch',
        'timestamp',
    ]
    # Add our flagger descriptions
    fieldnames += [flagger.csv_description for flagger in content_flaggers]

    writer = csv.DictWriter(stream, fieldnames=fieldnames)
    writer.writeheader()

    for thread in threads:
        # Row for the OP post itself (attribute access on the Post object).
        row = {
            'board': thread.board_code,
            'post_id': thread.post_id,
            'post_url': thread.gen_post_url(),
            'post_api_url': thread.gen_post_api_url(),
            'thread_id': thread.thread_num,
            'thread_url': thread.gen_thread_url(),
            'thread_api_url': thread.gen_thread_api_url(),
            'country_code': thread.poster_country,
            'full_comment': csv_safe_string(thread.comment),
            'op': True,
            'timestamp': epoch_to_human_date(thread.timestamp),
            'timestamp_epoch': thread.timestamp,
        }
        # For every flagger, apply its analysis to the post's comment.
        row.update({
            flagger.csv_description: flagger.flag_content(thread.comment)
            for flagger in content_flaggers
        })
        writer.writerow(row)

        # One row per reply; replies are raw API dicts, not Post objects.
        # (Removed a leftover debug print of every reply dict.)
        for reply in thread.subposts:
            shortname = reply['board']['shortname']
            row = {
                'board': shortname,
                'post_id': reply['num'],
                'post_url': gen_post_url(shortname, reply['thread_num'], reply['num']),
                'post_api_url': gen_post_api_url(shortname, reply['num']),
                'thread_id': reply['thread_num'],
                'thread_url': gen_thread_url(shortname, reply['thread_num']),
                'thread_api_url': gen_thread_api_url(shortname, reply['thread_num']),
                'country_code': reply['poster_country'],
                'full_comment': csv_safe_string(reply['comment']),
                'op': False,
                'timestamp': epoch_to_human_date(reply['timestamp']),
                'timestamp_epoch': reply['timestamp'],
            }
            # For every flagger, apply its analysis to the reply's comment.
            row.update({
                flagger.csv_description: flagger.flag_content(reply['comment'])
                for flagger in content_flaggers
            })
            writer.writerow(row)