def prep_redditor(data):
    """
    Prepare Redditor data.

    Calls previously defined public methods:

        CleanData.count_words()
        PrepMutts.prep_mutts()

    Parameters
    ----------
    data: dict
        Dictionary containing extracted scrape data

    Returns
    -------
    frequencies: dict
        Dictionary containing finalized word frequencies
    """

    status = Status(
        "Finished Redditor analysis.",
        "Analyzing Redditor scrape.",
        "white"
    )

    plt_dict = dict()

    status.start()
    for interactions in data["interactions"].values():
        PrepMutts.prep_mutts(interactions, plt_dict)

    status.succeed()

    return plt_dict

def _prep_raw(data, plt_dict):
    """
    Prepare raw submission comments.

    Calls previously defined public method:

        CleanData.count_words()

    Parameters
    ----------
    data: list
        List containing extracted scrape data
    plt_dict: dict
        Dictionary containing frequency data

    Returns
    -------
    None
    """

    status = Status(
        "Finished raw submission comments analysis.",
        "Analyzing raw submission comments scrape.",
        "white"
    )

    status.start()
    for comment in data:
        CleanData.count_words("body", comment, plt_dict)

    status.succeed()

def prep_subreddit(data):
    """
    Prepare Subreddit data.

    Calls previously defined public method:

        CleanData.count_words()

    Parameters
    ----------
    data: list
        List containing extracted scrape data

    Returns
    -------
    frequencies: dict
        Dictionary containing finalized word frequencies
    """

    status = Status(
        "Finished Subreddit analysis.",
        "Analyzing Subreddit scrape.",
        "white"
    )

    plt_dict = dict()

    status.start()
    for submission in data:
        CleanData.count_words("selftext", submission, plt_dict)
        CleanData.count_words("title", submission, plt_dict)

    status.succeed()

    return plt_dict

def __init__(self, args, submission, url):
    """
    Initialize variables used in later methods:

        self._args: Namespace object containing all arguments defined in the CLI
        self._submission: PRAW submission object
        self._url: the submission's URL

    Calls replace_more() method on submission object to get nested comments.

    Parameters
    ----------
    args: Namespace
        Namespace object containing all arguments that were defined in the CLI
    submission: PRAW submission object
    url: str
        String denoting the submission's url

    Returns
    -------
    None
    """

    self._args = args
    self._url = url

    more_comments_status = Status(
        "Finished resolving instances of MoreComments.",
        Fore.CYAN + Style.BRIGHT + "Resolving instances of MoreComments. This may take a while. Please wait.",
        "cyan"
    )

    more_comments_status.start()
    self._submission = submission
    self._submission.comments.replace_more(limit = None)
    more_comments_status.succeed()

def format_json(args, skeleton, submissions, subreddit):
    """
    Format submission metadata for JSON export.

    Parameters
    ----------
    args: Namespace
        Namespace object containing all arguments that were defined in the CLI
    skeleton: dict
        Dictionary containing all Subreddit scrape data
    submissions: list
        List containing submission objects
    subreddit: PRAW Subreddit object

    Returns
    -------
    None
    """

    format_status = Status(
        "Finished formatting data for JSON export.",
        "Formatting data for JSON export.",
        "white"
    )

    format_status.start()
    skeleton["data"] = submissions

    if args.rules:
        FormatJSON._add_subreddit_rules(skeleton, subreddit)

    format_status.succeed()

def format_csv(submissions):
    """
    Format submission metadata for CSV export.

    Parameters
    ----------
    submissions: list
        List containing submission objects

    Returns
    -------
    overview: dict
        Dictionary containing submission data
    """

    format_status = Status(
        "Finished formatting data for CSV export.",
        "Formatting data for CSV export.",
        "white"
    )

    overview = dict()

    format_status.start()
    for submission in submissions:
        for field, metadata in submission.items():
            if field not in overview.keys():
                overview[field] = []

            overview[field].append(metadata)

    format_status.succeed()
    return overview

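### A minimal standalone sketch (not part of URS) illustrating the pivot that
### format_csv() performs: a list of per-submission dictionaries becomes a
### dictionary of per-field columns, which maps directly onto CSV columns.
### The field names and values below are hypothetical example data.
def _format_csv_sketch():
    submissions = [
        {"title": "First post", "score": 10},
        {"title": "Second post", "score": 3},
    ]

    overview = dict()
    for submission in submissions:
        for field, metadata in submission.items():
            overview.setdefault(field, []).append(metadata)

    ### overview == {"title": ["First post", "Second post"], "score": [10, 3]}
    return overview
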
def prep_subreddit(data, file):
    """
    Prepare Subreddit data.

    Calls previously defined public method:

        CleanData.count_words()

    Parameters
    ----------
    data: dict
        Dictionary containing extracted scrape data
    file: str
        String denoting the filepath

    Returns
    -------
    frequencies: dict
        Dictionary containing finalized word frequencies
    """

    status = Status(
        "Finished Subreddit analysis.",
        "Analyzing Subreddit scrape.",
        "white"
    )

    plt_dict = dict()

    status.start()
    for submission in data:
        CleanData.count_words("selftext", submission, plt_dict)
        CleanData.count_words("title", submission, plt_dict)

    status.succeed()

    return dict(
        sorted(plt_dict.items(), key=lambda item: item[1], reverse=True)
    )

def generate(args):
    """
    Generate frequencies.

    Calls previously defined public methods:

        ExportFrequencies.export()
        PrintConfirm().confirm()
        Sort().create_csv()
        Sort().create_json()
        Sort().get_data()
        Sort().name_and_create_dir()

    Calls public methods from external modules:

        AnalyticsTitles.f_title()

    Parameters
    ----------
    args: Namespace
        Namespace object containing all arguments used in the CLI

    Returns
    -------
    None
    """

    AnalyticsTitles.f_title()

    for file in args.frequencies:
        f_type, filename = Sort().name_and_create_dir(args, file)
        plt_dict = Sort().get_data(file)

        Halo().info("Generating frequencies.")
        print()

        data = Sort().create_csv(plt_dict) \
            if args.csv \
            else Sort().create_json(file, plt_dict)

        export_status = Status(
            Style.BRIGHT + Fore.GREEN + "Frequencies exported to %s." % "/".join(filename.split("/")[filename.split("/").index("scrapes"):]),
            "Exporting frequencies.",
            "white"
        )

        export_status.start()
        ExportFrequencies.export(data, f_type, filename)
        export_status.succeed()
        print()

def save_wordcloud(self, analytics_dir, scrape_file, wc):
    """
    Save wordcloud to file.

    Calls a public method from an external module:

        GetPath.name_file()

    Parameters
    ----------
    analytics_dir: str
        String denoting the path to the directory in which the analytical
        data will be written
    scrape_file: list
        List containing scrape files and file formats to generate wordcloud with
    wc: WordCloud
        Wordcloud instance

    Returns
    -------
    new_filename: str
        String denoting the filename for the exported wordcloud
    """

    filename = GetPath.name_file(analytics_dir, scrape_file[0])

    split_path = list(Path(filename).parts)
    split_filename = split_path[-1].split(".")
    split_filename[-1] = scrape_file[-1]

    split_path[-1] = ".".join(split_filename)
    new_filename = "/".join(split_path)

    export_status = Status(
        Style.BRIGHT + Fore.GREEN + f"Wordcloud exported to {new_filename}.",
        "Exporting wordcloud.",
        "white"
    )

    export_status.start()
    wc.to_file(new_filename)
    export_status.succeed()
    print()

    return new_filename

def _create_directory_tree(date_dir, tree):
    """
    Create the directory Tree based on the date_dir Path using iterative
    depth-first search.

    Parameters
    ----------
    date_dir: str
        String denoting the path to the date directory
    tree: Tree
        Tree instance

    Returns
    -------
    None
    """

    build_tree_status = Status(
        "Displaying directory tree.",
        f"Building directory tree for {date_dir}.",
        "cyan"
    )

    stack = DateTree._create_stack(date_dir, tree)

    visited = set()
    visited.add(Path(date_dir))

    build_tree_status.start()

    while stack:
        current = stack.pop(0)
        current_path, current_tree = current[0], current[1]

        if current_path in visited:
            continue
        elif current_path.is_dir():
            sub_tree = current_tree.add(f"[bold blue]{current_path.name}")
            sub_paths = DateTree._create_stack(current_path, sub_tree)

            stack = sub_paths + stack
        elif current_path.is_file():
            file_size = current_path.stat().st_size
            current_tree.add(f"[bold]{current_path.name} [{decimal(file_size)}]")

        visited.add(current_path)

    build_tree_status.succeed()
    print()

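### A minimal standalone sketch (not part of URS) of the same iterative
### depth-first traversal without the Tree and Status dependencies. Directories
### are pushed onto the front of the list so their contents are fully explored
### before moving on to sibling entries. The helper name is hypothetical.
from pathlib import Path

def _walk_depth_first(root):
    stack = sorted(Path(root).iterdir())

    while stack:
        current_path = stack.pop(0)

        if current_path.is_dir():
            ### Prepending the subdirectory's entries keeps the walk depth-first.
            stack = sorted(current_path.iterdir()) + stack
            print(f"{current_path}/")
        elif current_path.is_file():
            print(f"{current_path} [{current_path.stat().st_size} bytes]")
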
def sort_structured(submission, url):
    """
    Sort all comments in structured format.

    Calls previously defined public methods:

        CommentNode()
        Forest()
        Forest().seed()
        CreateComment.create()

    Calls a public method from an external module:

        EncodeNode().encode()

    Parameters
    ----------
    submission: PRAW submission object
    url: str
        String denoting the submission's url

    Returns
    -------
    replies: list
        List containing `CommentNode`s
    """

    forest = Forest(submission, url)

    seed_status = Status(
        "Forest has fully matured.",
        Fore.CYAN + Style.BRIGHT + "Seeding Forest.",
        "cyan"
    )

    seed_status.start()
    for comment in submission.comments.list():
        comment_node = CommentNode(CreateComment.create(comment))
        EncodeNode().encode(comment_node)

        forest.seed(comment_node)

    seed_status.succeed()

    return forest.root.replies

def prep_redditor(data, file):
    """
    Prepare Redditor data.

    Calls previously defined public method:

        CleanData.count_words()

    Parameters
    ----------
    data: dict
        Dictionary containing extracted scrape data
    file: str
        String denoting the filepath

    Returns
    -------
    frequencies: dict
        Dictionary containing finalized word frequencies
    """

    status = Status(
        "Finished Redditor analysis.",
        "Analyzing Redditor scrape.",
        "white"
    )

    plt_dict = dict()

    status.start()
    for interactions in data["interactions"].values():
        for obj in interactions:
            ### Indicates there is valid data in this field.
            if isinstance(obj, dict):
                if obj["type"] == "submission":
                    CleanData.count_words("selftext", obj, plt_dict)
                    CleanData.count_words("title", obj, plt_dict)
                elif obj["type"] == "comment":
                    CleanData.count_words("body", obj, plt_dict)

            ### Indicates this field is forbidden.
            elif isinstance(obj, str):
                continue

    status.succeed()

    return dict(
        sorted(plt_dict.items(), key=lambda item: item[1], reverse=True)
    )

def _prep_structured(data, plt_dict):
    """
    An iterative implementation of depth-first search to prepare structured
    comments.

    Parameters
    ----------
    data: list
        List containing extracted scrape data
    plt_dict: dict
        Dictionary containing frequency data

    Returns
    -------
    None
    """

    status = Status(
        "Finished structured submission comments analysis.",
        "Analyzing structured submission comments scrape.",
        "white"
    )

    status.start()
    for comment in data:
        CleanData.count_words("body", comment, plt_dict)

        stack = []
        stack.append(comment)

        visited = []
        visited.append(comment)

        while stack:
            current_comment = stack.pop(0)

            for reply in current_comment["replies"]:
                CleanData.count_words("body", reply, plt_dict)

                if reply not in visited:
                    stack.insert(0, reply)
                    visited.append(reply)

    status.succeed()

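### A minimal standalone sketch (not part of URS) of the same traversal with a
### plain word counter standing in for CleanData.count_words(). Each structured
### comment is assumed to be a dictionary with a "body" string and a "replies"
### list of nested comments of the same shape; the comments below are
### hypothetical example data.
def _count_structured_sketch():
    data = [
        {
            "body": "great post",
            "replies": [
                {"body": "great reply", "replies": []}
            ]
        }
    ]

    frequencies = {}

    def count_words(comment):
        for word in comment["body"].split():
            frequencies[word] = frequencies.get(word, 0) + 1

    for comment in data:
        count_words(comment)

        stack = [comment]
        while stack:
            current_comment = stack.pop(0)

            for reply in current_comment["replies"]:
                count_words(reply)
                stack.insert(0, reply)

    ### frequencies == {"great": 2, "post": 1, "reply": 1}
    return frequencies
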
def prep_livestream(data):
    """
    Prepare livestream data.

    Parameters
    ----------
    data: list
        List containing extracted scrape data

    Returns
    -------
    frequencies: dict
        Dictionary containing finalized word frequencies
    """

    status = Status(
        "Finished livestream analysis.",
        "Analyzing livestream scrape.",
        "white"
    )

    plt_dict = {}

    status.start()
    PrepMutts.prep_mutts(data, plt_dict)
    status.succeed()

    return plt_dict

def initialize_wordcloud(file, scrape_type):
    """
    Initialize wordcloud by setting dimensions, max font size, and generating
    it from word frequencies.

    Calls a public method from an external module:

        PrepData.prep()

    Parameters
    ----------
    file: list
        List containing scrape files and file formats to generate wordcloud with
    scrape_type: str
        String denoting the scrape type

    Returns
    -------
    wc: WordCloud
        WordCloud instance
    """

    frequencies = PrepData.prep(file[0], scrape_type)

    initialize_status = Status(
        "Generated wordcloud.",
        "Generating wordcloud.",
        "white"
    )

    initialize_status.start()
    wordcloud = WordCloud(
        height = 1200,
        max_font_size = 400,
        width = 1600
    ).generate_from_frequencies(frequencies)
    initialize_status.succeed()

    return wordcloud

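### A minimal standalone sketch (not part of URS) showing the same wordcloud
### API calls without the PrepData and Status dependencies. Assumes the
### third-party `wordcloud` package is installed; the frequency dictionary and
### output filename are hypothetical example data.
from wordcloud import WordCloud

def _wordcloud_sketch():
    frequencies = {"python": 42, "reddit": 17, "scraper": 9}

    wc = WordCloud(
        height = 1200,
        max_font_size = 400,
        width = 1600
    ).generate_from_frequencies(frequencies)

    wc.to_file("example_wordcloud.png")
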
def save_wordcloud(self, file, wc):
    """
    Save wordcloud to file.

    Calls public methods from external modules:

        GetPath.name_file()
        InitializeDirectory.make_analytics_directory()

    Parameters
    ----------
    file: list
        List containing scrape files and file formats to generate wordcloud with
    wc: WordCloud
        Wordcloud instance

    Returns
    -------
    filename: str
        String denoting the filename for the exported wordcloud
    """

    date_dir, filename = GetPath.name_file(file[1], file[0], "wordclouds")

    export_status = Status(
        Style.BRIGHT + Fore.GREEN + "Wordcloud exported to %s." % "/".join(filename.split("/")[filename.split("/").index("scrapes"):]),
        "Exporting wordcloud.",
        "white"
    )

    export_status.start()
    InitializeDirectory.make_analytics_directory(date_dir, "wordclouds")
    wc.to_file(filename)
    export_status.succeed()
    print()

    return filename

def validate(object_list, reddit, scraper_type):
    """
    Check if Subreddit(s), Redditor(s), or submission(s) exist and catch PRAW
    exceptions. Log invalid Reddit objects to `urs.log` if applicable.

    Calls previously defined public method:

        Validation.check_existence()

    Parameters
    ----------
    object_list: list
        List of Reddit objects to check
    reddit: Reddit object
        Reddit instance created by PRAW API credentials
    scraper_type: str
        String denoting the scraper type

    Returns
    -------
    invalid: list
        List of invalid Reddit objects
    valid: list
        List of valid Reddit objects
    """

    object_type = "submission" \
        if scraper_type == "comments" \
        else scraper_type.capitalize()

    check_status = Status(
        "Finished %s validation." % object_type,
        "Validating %s(s)" % object_type,
        "white"
    )

    check_status.start()

    logging.info("Validating %s(s)..." % object_type)
    logging.info("")

    invalid, valid = Validation.check_existence(object_list, reddit, scraper_type)

    check_status.succeed()
    print()

    if invalid:
        warning_message = "The following %ss were not found and will be skipped:" % object_type

        print(Fore.YELLOW + Style.BRIGHT + warning_message)
        print(Fore.YELLOW + Style.BRIGHT + "-" * len(warning_message))
        print(*invalid, sep = "\n")

        logging.warning("Failed to validate the following %ss:" % object_type)
        logging.warning("%s" % (invalid))
        logging.warning("Skipping.")
        logging.info("")

    if not valid:
        logging.critical("ALL %sS FAILED VALIDATION." % object_type.upper())
        Errors.n_title(object_type + "s")
        logging.critical("NO %sS LEFT TO SCRAPE." % object_type.upper())
        logging.critical("ABORTING URS.\n")

        quit()

    return invalid, valid

def _make_json_skeleton(args, limit, submission, url):
    """
    Create a skeleton for JSON export. Include scrape details at the top.

    Parameters
    ----------
    args: Namespace
        Namespace object containing all arguments that were defined in the CLI
    limit: str
        Integer of string type denoting n_results or RAW format
    submission: PRAW submission object
    url: str
        String denoting submission URL

    Returns
    -------
    skeleton: dict
        Dictionary containing scrape settings and all scrape data
    """

    metadata_status = Status(
        "Extracted submission metadata.",
        "Extracting submission metadata.",
        "white"
    )

    metadata_status.start()
    skeleton = {
        "scrape_settings": {
            "n_results": int(limit)
                if int(limit) > 0
                else "all",
            "style": "structured"
                if not args.raw
                else "raw",
            "url": url
        },
        "data": {
            "submission_metadata": {
                "author": "u/" + submission.author.name
                    if hasattr(submission.author, "name")
                    else "[deleted]",
                "created_utc": convert_time(submission.created_utc),
                "distinguished": submission.distinguished,
                "edited": submission.edited
                    if submission.edited == False
                    else convert_time(submission.edited),
                "is_original_content": submission.is_original_content,
                "is_self": submission.is_self,
                "link_flair_text": submission.link_flair_text,
                "locked": submission.locked,
                "num_comments": submission.num_comments,
                "nsfw": submission.over_18,
                "permalink": submission.permalink,
                "score": submission.score,
                "selftext": submission.selftext,
                "spoiler": submission.spoiler,
                "stickied": submission.stickied,
                "subreddit": submission.subreddit.display_name,
                "title": submission.title,
                "upvote_ratio": submission.upvote_ratio
            },
            "comments": None
        }
    }
    metadata_status.succeed()

    return skeleton