import os
import socket
import threading
import traceback
from queue import Queue

import praw  # this example uses the PRAW 3.x API (praw.helpers.submission_stream)

# Project-local helpers (GeneralUtils, RedditData, StaticTemplates,
# ExternalDownload) and the module-level configparser instance `config`
# are assumed to be imported/defined elsewhere in the package.


class RedditScraper(GeneralUtils):

    def __init__(self, reddit_data, save_path, num_threads, is_just_json):
        super().__init__('root')
        self.base_dir = self.norm_path(save_path)

        # Do we only want the json files?
        self.just_json = is_just_json

        # Worker thread count and the (unbounded) work queue
        self.num_threads = num_threads
        self.q = Queue(maxsize=0)

        scraper_name = socket.gethostname()  # Name of scraper to put in the user agent
        self.reddit = RedditData(reddit_data, scraper_name)
        self.reddit.login()

        # We only need the static files if we are downloading the content as well
        if self.just_json is False:
            # Create a temp downloads folder
            self.download_path = self.create_save_path("temp", "downloads")

            # Add static templates to use
            self.static = StaticTemplates()

            # Create/update static assets
            self.gen_static_files()

            # Setup external scraper
            self.ed = ExternalDownload(self.base_dir, self.download_path, 'root')

            # Path to the CSV log of domains that failed to download
            self.failed_domain_file = os.path.join(self.base_dir, 'logs', 'failed_domains.csv')

        # Dict of users and subreddits to scrape
        self.scrape = {}

        # load content into self.scrape
        self.load_scrape_config()

        # Run parser
        self.main()

        # Clean up
        self.cleanup()

    def main(self):
        ###
        # Thread processing of each streamed post
        ###
        for _ in range(self.num_threads):
            worker = threading.Thread(target=self.post_worker)
            worker.daemon = True
            worker.start()

        try:
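            # PRAW 3.x helper: yields submissions from /r/all as they are posted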
            stream = praw.helpers.submission_stream(self.reddit.r, 'all', None, 0)
            for item in stream:
                self.q.put(item)
            self.q.join()
        except KeyboardInterrupt:
            return

    def post_worker(self):
        """
        Function to be used as the thread worker
        """
        while True:
            item = self.q.get()
            try:
                self.parse_post(item)
            except Exception as e:
                self.log("Exception in post worker: " + str(e) + "\n" + str(traceback.format_exc()), level='critical')
            finally:
                # Always mark the item done so q.join() cannot hang on a failed post
                self.q.task_done()

    def load_scrape_config(self):
        """
        Load scrape.ini config file into self.scrape
        This will run every n seconds to get any updates to the config in its own thread
        """
        # Read the scrape config file
        scrape_config_file = './configs/scrape.ini'
        if not os.path.isfile(scrape_config_file):
            self.cprint("\nScrape config file not found: " + scrape_config_file, log=True)
            return
        config.read(scrape_config_file)

        temp_scrape = {'subreddits': [], 'users': [], 'content': {}}

        # Break down the params in the user and subreddit lists
        for feed in ['users', 'subreddits']:
            for entry in config['scrape'][feed].split("\n"):
                option = entry.lower().split(',')
                temp_scrape[feed].append(option[0].strip())
                if len(option) > 1:
                    temp_scrape['content'][option[0].strip()] = option[1].strip().lower()

        # Copy temp_scrape to self.scrape
        self.scrape = temp_scrape.copy()

        self.log("Reloaded scape config: " + str(self.scrape['subreddits']), level='debug')

        # If both the subreddit and user lists are blank there is nothing to scrape,
        #   so warn the user
        no_users = len(temp_scrape['users']) == 1 and temp_scrape['users'][0] == ''
        no_subreddits = len(temp_scrape['subreddits']) == 1 and temp_scrape['subreddits'][0] == ''
        if no_users and no_subreddits:
            self.cprint("You have no users or subreddits in ./configs/scrape.ini", log=True)
        else:
            self.cprint("Searching for posts", log=True)

        # Reload again in n seconds
        t_reload = threading.Timer(10, self.load_scrape_config)
        t_reload.daemon = True
        t_reload.start()

    def parse_post(self, raw_post):
        """
        Process post
        """
        post = vars(raw_post)
        # Convert objects to strings
        if raw_post.author:
            post['author'] = raw_post.author.name
        else:
            post['author'] = '[deleted]'
        post['subreddit'] = str(raw_post.subreddit).lower()

        # Check if we even want this post
        if 'all' not in self.scrape['subreddits']:
            if post['subreddit'] not in self.scrape['subreddits'] and \
               post['author'].lower() not in self.scrape['users']:
                # This is not the post we are looking for, move along
                return

        # Check if we want only sfw or nsfw content from this subreddit
        if 'all' not in self.scrape['content']:
            if post['subreddit'] in self.scrape['content']:
                if self.scrape['content'][post['subreddit']] == 'nsfw' and post['over_18'] is False:
                    return
                elif self.scrape['content'][post['subreddit']] == 'sfw' and post['over_18'] is True:
                    return
        else:
            if self.scrape['content']['all'] == 'nsfw' and post['over_18'] is False:
                return
            elif self.scrape['content']['all'] == 'sfw' and post['over_18'] is True:
                return

        # Remove, we do not need this
        post.pop('reddit_session')

        self.cprint("Checking post: " + post['id'])

        created = self.get_datetime(post['created_utc'])
        y = str(created.year)
        m = str(created.month)
        d = str(created.day)
        utc_str = str(int(post['created_utc']))

        # Check here if we just want the json
        #   If we do save `post` to json file and move on
        if self.just_json:
            # Guard against subreddit names whose full name or 3-letter prefix
            #   collides with a reserved folder name (see self.bad_folders)
            sub = post['subreddit'][0:3]
            sub_dir = sub
            # Mangle the prefix if either the prefix or the full name is reserved
            if sub in self.bad_folders or post['subreddit'] in self.bad_folders:
                sub_dir = sub + "_r_" + sub
            # Check if full sub name is in bad_folders
            if post['subreddit'] in self.bad_folders:
                post['subreddit_original'] = post['subreddit']
                post['subreddit'] = sub_dir

            # Build the .json save path; the filename will be <created_utc>_<id>.json
            # Directories are nested 3 letters deep (the minimum length of a subreddit name)
            #   so that a subreddit cannot create a reserved folder name such as `con` on Windows
            self.log("Saving just json for subreddit: " + post['subreddit'], level='info')
            jjson_save_path = self.create_base_path('subreddits',
                                                    post['subreddit'][0:1],
                                                    post['subreddit'][0:2],
                                                    sub_dir,
                                                    post['subreddit'],
                                                    y, m, d
                                                    )
            # Save json data
            jjson_save_file = os.path.join(jjson_save_path, utc_str + "_" + post['id'] + ".json")
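            # e.g. <base_dir>/subreddits/p/pi/pic/pics/2015/6/1/1433116800_abc123.json
            #   (values illustrative; assumes create_base_path nests the given parts under base_dir)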
            try:
                self.save_file(jjson_save_file, post, content_type='json')
            except Exception as e:
                self.log("Exception [just_json]: " + post['subreddit'] + "\n" + str(e) + " " + post['id'] + "\n" + str(traceback.format_exc()), level='critical')
            # We are done here
            return

        # Check for bad folder names, only care about authors if we are saving content
        if post['author'] in self.bad_folders:
            post['author_original'] = post['author']
            post['author'] = post['author'] + "_u_" + post['author']

        if post['subreddit'] in self.bad_folders:
            post['subreddit_original'] = post['subreddit']
            post['subreddit'] = post['subreddit'] + "_r_" + post['subreddit']

        ###
        # Used for linking on other pages
        ###
        post['user_web_path'] = self.create_web_path(post['author'], path_type="user")
        post['post_web_path'] = self.create_web_path(post['author'], y, m, d, utc_str, path_type="post")
        ###
        # Used to save files/content
        ###
        post['user_save_path'] = self.create_base_path(post['user_web_path'])
        post['post_save_path'] = self.create_base_path(post['post_web_path'])

        post_json_file = os.path.join(post['post_save_path'], "post.json")

        ###
        # If we already have the post then skip it
        ###
        if os.path.isfile(post_json_file):
            return True

        ###
        # If there is no user json file, create new user
        ###
        if not os.path.isfile(os.path.join(post['user_save_path'], "user.json")):
            self.add_new_user(post)

        self.cprint("Getting post " + post['id'] + " by: " + post['author'])

        ###
        # Download thumbnail if there is one
        ###
        if len(post['thumbnail']) > 0 and post['thumbnail'].startswith('http'):
            post['thumbnail_original'] = post['thumbnail']
            download_response = self.ed.download(post['thumbnail_original'], post['user_save_path'])
            # If the thumbnail did not download, download_response will have length 0
            if len(download_response) > 0:
                thumbnail_download = download_response[0]
                post['thumbnail'] = self.save_to_web_path(thumbnail_download)

        ###
        # Process post data and download any media needed
        ###
        if post['is_self'] is False:
            # Try to save the content
            post = self.download_content(post)

        ###
        # Now save post data to json
        ###
        self.save_file(post_json_file, post, content_type='json')

        ###
        # Create post html file
        ###
        self.save_file(os.path.join(post['post_save_path'], "index.html"), self.static.gen_frame('post_viewer'), content_type='html')

        url_appends = []
        ###
        # Add post to user urls
        ###
        user_post_base = self.create_base_path('user', post['author'][0], post['author'], 'posts')
        url_appends.append(self.create_joined_path(user_post_base, y))
        url_appends.append(self.create_joined_path(user_post_base, y, m))
        url_appends.append(self.create_joined_path(user_post_base, y, m, d))

        ###
        # Add post to subreddit urls
        ###
        subreddit_post_base = self.create_base_path('subreddit', post['subreddit'][0], post['subreddit'])
        url_appends.append(self.create_joined_path(subreddit_post_base, y))
        url_appends.append(self.create_joined_path(subreddit_post_base, y, m))
        url_appends.append(self.create_joined_path(subreddit_post_base, y, m, d))

        ###
        # Append urls to correct urls.csv files
        ###
        for path in url_appends:
            self.append_file(os.path.join(path, 'urls.csv'), post['post_web_path'])
            self.check_view_index(path)
            # self.log("Added " + post['post_web_path'] + " to " + path, level='debug')

        # Done doing things here
        return True

    def add_new_user(self, post):
        """
        Add new user to the system
        """
        # self.log("Adding new user: "******"index.html"), self.static.gen_redirect("./posts"), content_type='html')

    def check_view_index(self, path):
        """
        Check if there is an index.html in each of year, month, and day directories
        If not, create one
        """
        index_view_file = os.path.join(path, 'index.html')
        if not os.path.isfile(index_view_file):
            # self.log("Creating view index at: " + index_view_file, level='debug')
            self.save_file(index_view_file, self.static.gen_frame('csv_viewer'), content_type='html')

    def create_web_path(self, base, *args, path_type=''):
        """
        Creates absolute path that will be used on the web server
        """
        path = ''
        if path_type == 'user' or path_type == 'post':
            path = "/user/" + base[0] + "/" + base + "/"
            if path_type == 'post':
                path += "posts/" + "/".join(args) + "/"
        else:
            path = "/" + "/".join(args)

        return path

    def gen_static_files(self):
        """
        Every run, create/update the static files
        """
        save_path_js = self.create_base_path("assets", "js")
        self.copy_file("./static_assets/js/jquery.js", os.path.join(save_path_js, "jquery.js"))
        self.copy_file("./static_assets/js/csvToArray.js", os.path.join(save_path_js, "csvToArray.js"))
        self.copy_file("./static_assets/js/functions.js", os.path.join(save_path_js, "functions.js"))

        save_path_css = self.create_base_path("assets", "css")
        self.copy_file("./static_assets/css/styles.css", os.path.join(save_path_css, "styles.css"))

        save_path_templates = self.create_base_path("assets", "templates")
        self.copy_file("./static_assets/templates/csv_viewer.html", os.path.join(save_path_templates, "csv_viewer.html"))
        self.copy_file("./static_assets/templates/post_viewer.html", os.path.join(save_path_templates, "post_viewer.html"))

    def cleanup(self):
        self.reddit.close()
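

# Minimal usage sketch (not part of the original example). The shape of
# `reddit_data` is whatever RedditData() expects; the values below are
# placeholders only.
if __name__ == '__main__':
    RedditScraper(
        reddit_data={},            # placeholder: credentials/config consumed by RedditData
        save_path='./reddit_archive',
        num_threads=4,
        is_just_json=True,
    )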