def check_user_existence(self, name: str):
    """Check that the Reddit account ``name`` can be looked up.

    The account is obtained from ``self.reddit_instance`` and its ``id``
    attribute is read; the errors that read can raise are translated into
    downloader exceptions.

    Args:
        name: The Reddit user name to look up.

    Raises:
        errors.BulkDownloaderException: If reading ``id`` raises
            ``prawcore.exceptions.NotFound``, or if it raises
            ``AttributeError`` while the redditor object exposes an
            ``is_suspended`` attribute.
    """
    redditor = self.reddit_instance.redditor(name=name)
    try:
        if redditor.id:
            return
    except prawcore.exceptions.NotFound:
        raise errors.BulkDownloaderException(f'Could not find user {name}')
    except AttributeError:
        # A missing ``id`` is only treated as an error when the account
        # object carries the ``is_suspended`` marker.
        if not hasattr(redditor, 'is_suspended'):
            return
        raise errors.BulkDownloaderException(f'User {name} is banned')
def check_subreddit_status(subreddit: praw.models.Subreddit):
    """Verify that ``subreddit`` can be read before it is scraped.

    The sources ``all`` and ``friends`` are accepted without any check.
    For every other source the ``id`` attribute is read, and the errors that
    read can raise are translated into downloader exceptions.

    Args:
        subreddit: The subreddit object to check.

    Raises:
        errors.BulkDownloaderException: If reading ``id`` raises
            ``prawcore.NotFound`` (source missing) or ``prawcore.Forbidden``
            (source private).
    """
    if subreddit.display_name in ('all', 'friends'):
        return
    try:
        # Read the attribute with a plain statement rather than ``assert``:
        # assert statements are removed when Python runs with ``-O``, which
        # would silently skip this check altogether.
        _ = subreddit.id
    except prawcore.NotFound as err:
        raise errors.BulkDownloaderException(
            f'Source {subreddit.display_name} does not exist or cannot be found'
        ) from err
    except prawcore.Forbidden as err:
        raise errors.BulkDownloaderException(
            f'Source {subreddit.display_name} is private and cannot be scraped'
        ) from err
def sanitise_subreddit_name(subreddit: str) -> str:
    """Extract the bare subreddit name from a user-supplied reference.

    An optional leading ``https://www.reddit.com/``, an optional ``r/``
    prefix and a single optional trailing slash are stripped; whatever
    remains is returned unchanged.

    Args:
        subreddit: A subreddit name, ``r/name`` form, or full URL.

    Returns:
        The captured subreddit name.

    Raises:
        errors.BulkDownloaderException: If the pattern does not match.
    """
    name_pattern = re.compile(r'^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)/?$')
    found = name_pattern.match(subreddit)
    if found:
        return found.group(1)
    raise errors.BulkDownloaderException(
        f'Could not find subreddit name in string {subreddit}')
def create_file_logger(self):
    """Attach a rotating file handler to the root logger.

    The log file is ``self.args.log`` when that is set, otherwise
    ``log_output.txt`` inside ``self.config_directory``. The number of
    backups kept is read from the ``backup_log_count`` config option
    (default 3). If the log file exists once the handler has been created,
    it is rolled over immediately.

    Raises:
        errors.BulkDownloaderException: If the parent directory of the
            user-supplied log path does not exist.
        PermissionError: Re-raised when the existing log file cannot be
            rolled over.
    """
    main_logger = logging.getLogger()
    if self.args.log is None:
        log_path = Path(self.config_directory, 'log_output.txt')
    else:
        # expanduser() has to run before resolve(): resolve() makes the path
        # absolute, after which a leading "~" is no longer at the front of
        # the path and would never be expanded.
        log_path = Path(self.args.log).expanduser().resolve()
        if not log_path.parent.exists():
            raise errors.BulkDownloaderException(
                'Designated location for logfile does not exist')

    backup_count = self.cfg_parser.getint('DEFAULT', 'backup_log_count', fallback=3)
    file_handler = logging.handlers.RotatingFileHandler(
        log_path,
        mode='a',
        backupCount=backup_count,
    )
    if log_path.exists():
        try:
            file_handler.doRollover()
        except PermissionError:
            logger.critical(
                'Cannot rollover logfile, make sure this is the only '
                'BDFR process or specify alternate logfile location')
            raise

    formatter = logging.Formatter(
        '[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
    file_handler.setFormatter(formatter)
    # Level 0 (NOTSET) lets the handler accept records of every level.
    file_handler.setLevel(0)
    main_logger.addHandler(file_handler)
class RedditConnector(metaclass=ABCMeta):
    """Prepare configuration, logging, filters and a Reddit session.

    Construction runs the whole setup sequence and then collects the Reddit
    listings to process into ``self.reddit_lists``. Several helpers that are
    called here (for example ``create_download_filter``, ``get_subreddits``
    or ``create_file_logger``) are not defined in this class body and must
    be provided elsewhere.
    """

    def __init__(self, args: Configuration):
        """Store ``args``, run the setup sequence and retrieve the listings."""
        self.args = args
        self.config_directories = appdirs.AppDirs('bdfr', 'BDFR')
        self.run_time = datetime.now().isoformat()
        self._setup_internal_objects()

        self.reddit_lists = self.retrieve_reddit_lists()

    def _setup_internal_objects(self):
        """Build every helper object the connector needs, in dependency order.

        Directories and the config parser come first because the file logger
        and the later steps read from them.
        """
        self.determine_directories()
        self.load_config()

        self.create_file_logger()

        self.read_config()

        self.parse_disabled_modules()

        self.download_filter = self.create_download_filter()
        logger.log(9, 'Created download filter')
        self.time_filter = self.create_time_filter()
        logger.log(9, 'Created time filter')
        self.sort_filter = self.create_sort_filter()
        logger.log(9, 'Created sort filter')
        self.file_name_formatter = self.create_file_name_formatter()
        logger.log(9, 'Create file name formatter')

        self.create_reddit_instance()
        # Names that resolve to a falsy value are dropped from the list.
        self.args.user = list(
            filter(None, [self.resolve_user_name(user) for user in self.args.user]))

        self.excluded_submission_ids = set.union(
            self.read_id_files(self.args.exclude_id_file),
            set(self.args.exclude_id),
        )

        self.args.link = list(
            itertools.chain(self.args.link, self.read_id_files(self.args.include_id_file)))

        self.master_hash_list = {}
        self.authenticator = self.create_authenticator()
        logger.log(9, 'Created site authenticator')

        self.args.skip_subreddit = self.split_args_input(self.args.skip_subreddit)
        self.args.skip_subreddit = {sub.lower() for sub in self.args.skip_subreddit}

    def read_config(self):
        """Read any cfg values that need to be processed.

        Options left unset in ``self.args`` are filled in from the config
        parser, after which the parser state is written back to
        ``self.config_location``.
        """
        if self.args.max_wait_time is None:
            self.args.max_wait_time = self.cfg_parser.getint(
                'DEFAULT', 'max_wait_time', fallback=120)
            logger.debug(
                f'Setting maximum download wait time to {self.args.max_wait_time} seconds'
            )
        if self.args.time_format is None:
            option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO')
            # A value made up only of whitespace and quote characters is
            # treated the same as a missing one.
            if re.match(r'^[\s\'\"]*$', option):
                option = 'ISO'
            logger.debug(f'Setting datetime format string to {option}')
            self.args.time_format = option
        if not self.args.disable_module:
            self.args.disable_module = [
                self.cfg_parser.get('DEFAULT', 'disabled_modules', fallback='')
            ]
        # Update config on disk
        with open(self.config_location, 'w') as file:
            self.cfg_parser.write(file)

    def parse_disabled_modules(self):
        """Normalise ``self.args.disable_module`` into a set of lower-case names."""
        disabled_modules = self.split_args_input(self.args.disable_module)
        self.args.disable_module = {name.strip().lower() for name in disabled_modules}
        logger.debug(
            f'Disabling the following modules: {", ".join(self.args.disable_module)}'
        )

    def create_reddit_instance(self):
        """Create ``self.reddit_instance`` and record whether it is authenticated.

        With ``self.args.authenticate`` set, a user token is obtained through
        OAuth2 and stored in the config file if the config has none yet, and
        the instance is built with a token manager. Otherwise an instance
        using only the client credentials is built.
        """
        if self.args.authenticate:
            logger.debug('Using authenticated Reddit instance')
            if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
                logger.log(9, 'Commencing OAuth2 authentication')
                scopes = self.cfg_parser.get(
                    'DEFAULT', 'scopes', fallback='identity, history, read, save')
                scopes = OAuth2Authenticator.split_scopes(scopes)
                oauth2_authenticator = OAuth2Authenticator(
                    scopes,
                    self.cfg_parser.get('DEFAULT', 'client_id'),
                    self.cfg_parser.get('DEFAULT', 'client_secret'),
                )
                token = oauth2_authenticator.retrieve_new_token()
                self.cfg_parser['DEFAULT']['user_token'] = token
                # Persist the new token so later runs can reuse it.
                with open(self.config_location, 'w') as file:
                    self.cfg_parser.write(file, True)
            token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location)

            self.authenticated = True
            self.reddit_instance = praw.Reddit(
                client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
                client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
                user_agent=socket.gethostname(),
                token_manager=token_manager,
            )
        else:
            logger.debug('Using unauthenticated Reddit instance')
            self.authenticated = False
            self.reddit_instance = praw.Reddit(
                client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
                client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
                user_agent=socket.gethostname(),
            )

    def retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
        """Collect the listings from subreddits, multireddits, users and links."""
        master_list = []
        master_list.extend(self.get_subreddits())
        logger.log(9, 'Retrieved subreddits')
        master_list.extend(self.get_multireddits())
        logger.log(9, 'Retrieved multireddits')
        master_list.extend(self.get_user_data())
        logger.log(9, 'Retrieved user data')
        master_list.extend(self.get_submissions_from_link())
        logger.log(9, 'Retrieved submissions for given links')
        return master_list

    def determine_directories(self):
        """Set the download and config directories, creating them if needed."""
        # expanduser() must precede resolve(): once the path has been made
        # absolute a leading "~" is no longer at the front and is not expanded.
        self.download_directory = Path(self.args.directory).expanduser().resolve()
        self.config_directory = Path(self.config_directories.user_config_dir)

        self.download_directory.mkdir(exist_ok=True, parents=True)
        self.config_directory.mkdir(exist_ok=True, parents=True)

    def load_config(self):
        """Locate a configuration file and load it into ``self.cfg_parser``.

        Search order: the path in ``self.args.config`` (if it exists), then
        ``config.cfg`` / ``default_config.cfg`` in the working directory,
        then the same names in ``self.config_directory``. When none exists,
        the default config packaged with ``bdfr`` is copied into
        ``self.config_directory`` and that copy is used.

        Raises:
            errors.BulkDownloaderException: If no configuration file location
                could be determined.
        """
        self.cfg_parser = configparser.ConfigParser()
        if self.args.config:
            if (cfg_path := Path(self.args.config)).exists():
                self.cfg_parser.read(cfg_path)
                self.config_location = cfg_path
                return
        possible_paths = [
            Path('./config.cfg'),
            Path('./default_config.cfg'),
            Path(self.config_directory, 'config.cfg'),
            Path(self.config_directory, 'default_config.cfg'),
        ]
        self.config_location = None
        for path in possible_paths:
            if path.exists():
                self.config_location = path
                logger.debug(f'Loading configuration from {path}')
                break
        if not self.config_location:
            user_copy = Path(self.config_directory, 'default_config.cfg')
            with importlib.resources.path('bdfr', 'default_config.cfg') as packaged:
                shutil.copy(packaged, user_copy)
            # Use the copy, not the packaged resource: the resource path may
            # be a temporary file that disappears when the context manager
            # exits, and later config writes (e.g. the user token) must land
            # in the file that the next run will actually find and load.
            self.config_location = user_copy
        if not self.config_location:
            raise errors.BulkDownloaderException(
                'Could not find a configuration file to load')
        self.cfg_parser.read(self.config_location)
class RedditDownloader:
    """Prepare configuration, logging, filters and a Reddit session for downloading.

    Construction runs the whole setup sequence and then collects the Reddit
    listings to process into ``self.reddit_lists``. Several helpers that are
    called here (for example ``_create_download_filter``,
    ``_get_subreddits`` or ``_create_file_logger``) are not defined in this
    class body and must be provided elsewhere.
    """

    def __init__(self, args: Configuration):
        """Store ``args``, run the setup sequence and retrieve the listings."""
        self.args = args
        self.config_directories = appdirs.AppDirs('bdfr', 'BDFR')
        self.run_time = datetime.now().isoformat()
        self._setup_internal_objects()

        self.reddit_lists = self._retrieve_reddit_lists()

    def _setup_internal_objects(self):
        """Build every helper object the downloader needs, in dependency order.

        Directories and the config parser come first because the file logger
        and the later steps read from them.
        """
        self._determine_directories()
        self._load_config()

        self._create_file_logger()

        self._read_config()

        self.download_filter = self._create_download_filter()
        logger.log(9, 'Created download filter')
        self.time_filter = self._create_time_filter()
        logger.log(9, 'Created time filter')
        self.sort_filter = self._create_sort_filter()
        logger.log(9, 'Created sort filter')
        self.file_name_formatter = self._create_file_name_formatter()
        logger.log(9, 'Create file name formatter')

        self._create_reddit_instance()
        self._resolve_user_name()

        self.excluded_submission_ids = self._read_excluded_ids()

        if self.args.search_existing:
            self.master_hash_list = self.scan_existing_files(self.download_directory)
        else:
            self.master_hash_list = {}
        self.authenticator = self._create_authenticator()
        logger.log(9, 'Created site authenticator')

        self.args.skip_subreddit = self._split_args_input(self.args.skip_subreddit)
        self.args.skip_subreddit = {sub.lower() for sub in self.args.skip_subreddit}

    def _read_config(self):
        """Read any cfg values that need to be processed.

        When no maximum wait time was given in ``self.args``, it is taken
        from the config, storing a default of 120 there first if the option
        is missing. The parser state is then written back to
        ``self.config_location``.
        """
        if self.args.max_wait_time is None:
            if not self.cfg_parser.has_option('DEFAULT', 'max_wait_time'):
                self.cfg_parser.set('DEFAULT', 'max_wait_time', '120')
                logger.log(
                    9, 'Wrote default download wait time download to config file')
            self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time')
            logger.debug(
                f'Setting maximum download wait time to {self.args.max_wait_time} seconds'
            )
        # Update config on disk
        with open(self.config_location, 'w') as file:
            self.cfg_parser.write(file)

    def _create_reddit_instance(self):
        """Create ``self.reddit_instance`` and record whether it is authenticated.

        With ``self.args.authenticate`` set, a user token is obtained through
        OAuth2 and stored in the config file if the config has none yet, and
        the instance is built with a token manager. Otherwise an instance
        using only the client credentials is built.
        """
        if self.args.authenticate:
            logger.debug('Using authenticated Reddit instance')
            if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
                logger.log(9, 'Commencing OAuth2 authentication')
                # Same fallback scopes as RedditConnector, so a config file
                # without a "scopes" option does not abort authentication.
                scopes = self.cfg_parser.get(
                    'DEFAULT', 'scopes', fallback='identity, history, read, save')
                scopes = OAuth2Authenticator.split_scopes(scopes)
                oauth2_authenticator = OAuth2Authenticator(
                    scopes,
                    self.cfg_parser.get('DEFAULT', 'client_id'),
                    self.cfg_parser.get('DEFAULT', 'client_secret'),
                )
                token = oauth2_authenticator.retrieve_new_token()
                self.cfg_parser['DEFAULT']['user_token'] = token
                # Persist the new token so later runs can reuse it.
                with open(self.config_location, 'w') as file:
                    self.cfg_parser.write(file, True)
            token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location)

            self.authenticated = True
            self.reddit_instance = praw.Reddit(
                client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
                client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
                user_agent=socket.gethostname(),
                token_manager=token_manager,
            )
        else:
            logger.debug('Using unauthenticated Reddit instance')
            self.authenticated = False
            self.reddit_instance = praw.Reddit(
                client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
                client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
                user_agent=socket.gethostname(),
            )

    def _retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
        """Collect the listings from subreddits, multireddits, users and links."""
        master_list = []
        master_list.extend(self._get_subreddits())
        logger.log(9, 'Retrieved subreddits')
        master_list.extend(self._get_multireddits())
        logger.log(9, 'Retrieved multireddits')
        master_list.extend(self._get_user_data())
        logger.log(9, 'Retrieved user data')
        master_list.extend(self._get_submissions_from_link())
        logger.log(9, 'Retrieved submissions for given links')
        return master_list

    def _determine_directories(self):
        """Set the download and config directories, creating them if needed."""
        # expanduser() must precede resolve(): once the path has been made
        # absolute a leading "~" is no longer at the front and is not expanded.
        self.download_directory = Path(self.args.directory).expanduser().resolve()
        self.config_directory = Path(self.config_directories.user_config_dir)

        self.download_directory.mkdir(exist_ok=True, parents=True)
        self.config_directory.mkdir(exist_ok=True, parents=True)

    def _load_config(self):
        """Locate a configuration file and load it into ``self.cfg_parser``.

        Search order: the path in ``self.args.config`` (if it exists), then
        ``config.cfg`` / ``default_config.cfg`` in the working directory,
        then the same names in ``self.config_directory``. When none exists,
        the default config packaged with ``bdfr`` is copied into
        ``self.config_directory`` and that copy is used.

        Raises:
            errors.BulkDownloaderException: If no configuration file location
                could be determined.
        """
        self.cfg_parser = configparser.ConfigParser()
        if self.args.config:
            if (cfg_path := Path(self.args.config)).exists():
                self.cfg_parser.read(cfg_path)
                self.config_location = cfg_path
                return
        possible_paths = [
            Path('./config.cfg'),
            Path('./default_config.cfg'),
            Path(self.config_directory, 'config.cfg'),
            Path(self.config_directory, 'default_config.cfg'),
        ]
        self.config_location = None
        for path in possible_paths:
            if path.exists():
                self.config_location = path
                logger.debug(f'Loading configuration from {path}')
                break
        if not self.config_location:
            user_copy = Path(self.config_directory, 'default_config.cfg')
            # Enter the context manager properly instead of draining its
            # private ``.gen`` generator: draining runs the cleanup first, so
            # the yielded path may already be gone when it is copied.
            with importlib.resources.path('bdfr', 'default_config.cfg') as packaged:
                shutil.copy(packaged, user_copy)
            # Use the copy, not the packaged resource, so that later config
            # writes land in the file the next run will find and load.
            self.config_location = user_copy
        if not self.config_location:
            raise errors.BulkDownloaderException(
                'Could not find a configuration file to load')
        self.cfg_parser.read(self.config_location)