def format_path(
    self,
    resource: Resource,
    destination_directory: Path,
    index: Optional[int] = None,
) -> Path:
    """Build the destination path for a resource.

    The directory is ``destination_directory`` followed by one component per
    entry of ``self.directory_format_string``, each rendered with
    ``self._format_name`` against ``resource.source_submission``. The file
    name is ``self.file_format_string`` rendered the same way and then passed,
    together with the ending (index suffix + extension), through
    ``self._limit_file_name_length``.

    Args:
        resource: The resource to name; its ``source_submission``,
            ``extension`` and ``url`` attributes are read.
        destination_directory: Root directory the path is built under.
        index: Optional number; when truthy, ``_<index>`` is placed before
            the extension. ``None`` and ``0`` add nothing.

    Returns:
        The sub-folder joined with the resulting file name.

    Raises:
        BulkDownloaderException: If the resource has no extension, or if
            joining the path components raises ``TypeError``.
    """
    directory_parts = []
    for part in self.directory_format_string:
        directory_parts.append(
            self._format_name(resource.source_submission, part))
    subfolder = Path(destination_directory, *directory_parts)

    # ``index`` is rebound to its textual suffix; the error message further
    # down reports this suffix rather than the original number.
    if index:
        index = '_' + str(index)
    else:
        index = ''

    if not resource.extension:
        raise BulkDownloaderException(
            f'Resource from {resource.url} has no extension')
    ending = index + resource.extension

    formatted_name = self._format_name(
        resource.source_submission, self.file_format_string)
    file_name = self._limit_file_name_length(str(formatted_name), ending)

    try:
        return Path(subfolder, file_name)
    except TypeError:
        raise BulkDownloaderException(
            f'Could not determine path name: {subfolder}, {index}, {resource.extension}'
        )
def retry_download(url: str, max_wait_time: int) -> Optional[bytes]:
    """Download ``url``, retrying with a growing delay on transient failures.

    A 2xx response with a non-empty body is returned immediately. A 408 or
    429 response, or a ``requests`` connection error, is treated as transient:
    the function sleeps for the current wait time and tries again. The wait
    starts at 60 seconds and grows by 60 seconds per failed attempt until it
    reaches ``max_wait_time``, at which point the last error is re-raised.

    Previously the wait time was reset to 60 on every recursive call and
    never increased, so any ``max_wait_time`` above 60 retried forever (until
    the recursion limit) and the give-up branch was unreachable.

    Args:
        url: Address of the resource to fetch.
        max_wait_time: Upper bound, in seconds, for the delay between
            attempts; once the delay reaches it, no further retry is made.

    Returns:
        The response body.

    Raises:
        BulkDownloaderException: For any other status code, and also for a
            2xx response whose body is empty.
        requests.exceptions.ConnectionError: When the retry budget is
            exhausted.
    """
    wait_time = 60
    while True:
        try:
            response = requests.get(url)
            if re.match(r'^2\d{2}', str(response.status_code)) and response.content:
                return response.content
            elif response.status_code in (408, 429):
                # Timeout / rate limiting: route through the retry handler.
                raise requests.exceptions.ConnectionError(
                    f'Response code {response.status_code}')
            else:
                raise BulkDownloaderException(
                    f'Unrecoverable error requesting resource: HTTP Code {response.status_code}'
                )
        except requests.exceptions.ConnectionError as e:
            logger.warning(
                f'Error occured downloading from {url}, waiting {wait_time} seconds: {e}'
            )
            time.sleep(wait_time)
            if wait_time < max_wait_time:
                # Back off a little more before the next attempt.
                wait_time += 60
            else:
                logger.error(
                    f'Max wait time exceeded for resource at url {url}')
                raise
def http_download(url: str, download_parameters: dict) -> Optional[bytes]:
    """Fetch ``url`` over HTTP, backing off and retrying on transient errors.

    Args:
        url: Address of the resource to fetch.
        download_parameters: Options for the request. ``'headers'`` (if
            present) is forwarded to ``requests.get``; ``'max_wait_time'``
            caps the delay between attempts and defaults to 300 seconds.

    Returns:
        The response body of the first 2xx response that has content.

    Raises:
        BulkDownloaderException: For a status that is neither 2xx-with-content
            nor 408/429.
        requests.exceptions.ConnectionError,
        requests.exceptions.ChunkedEncodingError: Re-raised once the delay
            has reached the maximum wait time.
    """
    headers = download_parameters.get('headers')
    max_wait_time = download_parameters.get('max_wait_time', 300)
    transient_errors = (
        requests.exceptions.ConnectionError,
        requests.exceptions.ChunkedEncodingError,
    )

    current_wait_time = 60
    while True:
        try:
            response = requests.get(url, headers=headers)
            is_success = re.match(r'^2\d{2}', str(response.status_code))
            if is_success and response.content:
                return response.content
            if response.status_code in (408, 429):
                # Timeout / rate limiting is handled like a dropped connection.
                raise requests.exceptions.ConnectionError(
                    f'Response code {response.status_code}')
            raise BulkDownloaderException(
                f'Unrecoverable error requesting resource: HTTP Code {response.status_code}'
            )
        except transient_errors as e:
            logger.warning(
                f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}'
            )
            time.sleep(current_wait_time)
            if current_wait_time >= max_wait_time:
                logger.error(
                    f'Max wait time exceeded for resource at url {url}')
                raise
            # Wait one more minute before each subsequent attempt.
            current_wait_time += 60
def _format_name(submission: (Comment, Submission), format_string: str) -> str:
    """Fill the ``{key}`` placeholders of ``format_string`` from a Reddit object.

    The attribute dictionary comes from
    ``FileNameFormatter._generate_name_dict_from_submission`` or
    ``FileNameFormatter._generate_name_dict_from_comment`` depending on the
    type of ``submission``. Placeholders are matched case-insensitively.
    After substitution every ``/`` is removed from the result, and on Windows
    the result is additionally passed through
    ``FileNameFormatter._format_for_windows``.

    Args:
        submission: A ``Submission`` or ``Comment`` to take values from.
        format_string: Template containing ``{key}`` placeholders.

    Returns:
        The formatted name.

    Raises:
        BulkDownloaderException: If ``submission`` is neither a ``Submission``
            nor a ``Comment``.
    """
    if not isinstance(submission, (Submission, Comment)):
        raise BulkDownloaderException(
            f'Cannot name object {type(submission).__name__}')
    if isinstance(submission, Submission):
        build_attributes = FileNameFormatter._generate_name_dict_from_submission
    else:
        build_attributes = FileNameFormatter._generate_name_dict_from_comment
    attributes = build_attributes(submission)

    formatted = format_string
    for key in attributes.keys():
        if not re.search(fr'(?i).*{{{key}}}.*', formatted):
            continue
        replacement = FileNameFormatter._convert_unicode_escapes(
            str(attributes.get(key, 'unknown')))
        # Double the backslashes so re.sub inserts the value literally instead
        # of interpreting escape sequences or group references in it.
        replacement = replacement.replace('\\', '\\\\')
        formatted = re.sub(fr'(?i){{{key}}}', replacement, formatted)

    formatted = formatted.replace('/', '')

    on_windows = platform.system() == 'Windows'
    if on_windows:
        formatted = FileNameFormatter._format_for_windows(formatted)
    return formatted
def __init__(self, file_format_string: str, directory_format_string: str):
    """Store the file and directory naming templates.

    Args:
        file_format_string: Template for file names; it must pass
            ``self.validate_string``.
        directory_format_string: Template for the directory structure. It is
            split on ``/`` and stored as a list with one entry per path level.

    Raises:
        BulkDownloaderException: If ``file_format_string`` is rejected by
            ``self.validate_string``.
    """
    format_is_valid = self.validate_string(file_format_string)
    if not format_is_valid:
        raise BulkDownloaderException(
            f'"{file_format_string}" is not a valid format string')

    self.file_format_string = file_format_string
    directory_levels: list[str] = directory_format_string.split('/')
    self.directory_format_string = directory_levels
def _check_scopes(wanted_scopes: set[str]):
    """Verify that every requested scope appears in Reddit's scope listing.

    The list of known scopes is fetched from
    ``https://www.reddit.com/api/v1/scopes.json``; the wildcard ``*`` is
    accepted in addition to the names returned there.

    Args:
        wanted_scopes: Scope names to validate.

    Raises:
        BulkDownloaderException: For the first scope encountered that is not
            in the listing.
    """
    response = requests.get(
        'https://www.reddit.com/api/v1/scopes.json',
        headers={'User-Agent': 'fetch-scopes test'},
    )
    # Only the scope names (the keys of the JSON object) matter here.
    known_scopes = [*response.json().keys(), '*']

    for scope in wanted_scopes:
        if scope in known_scopes:
            continue
        raise BulkDownloaderException(
            f'Scope {scope} is not known to reddit')
def download(self, max_wait_time: int):
    """Populate ``self.content`` (and the hash) if not already present.

    When ``self.content`` is empty, ``self.retry_download`` is called with
    ``self.url`` and ``max_wait_time``; a truthy result is stored as the
    content. Afterwards, if there is content but no hash yet,
    ``self.create_hash`` is called.

    Args:
        max_wait_time: Forwarded unchanged to ``self.retry_download``.

    Raises:
        BulkDownloaderException: If the download raises a ``requests``
            connection error; a ``BulkDownloaderException`` raised by the
            download itself propagates unchanged.
    """
    if not self.content:
        try:
            fetched = self.retry_download(self.url, max_wait_time)
        except requests.exceptions.ConnectionError as e:
            raise BulkDownloaderException(
                f'Could not download resource: {e}')
        # Any BulkDownloaderException from the download propagates as-is.
        if fetched:
            self.content = fetched

    if self.hash or not self.content:
        return
    self.create_hash()
def download(self, download_parameters: Optional[dict] = None):
    """Populate ``self.content`` (and the hash) if not already present.

    When ``self.content`` is empty, ``self.download_function`` is called with
    the parameters dictionary; a truthy result is stored as the content.
    Afterwards, if there is content but no hash yet, ``self.create_hash`` is
    called.

    Args:
        download_parameters: Options handed to ``self.download_function``.
            ``None`` is replaced by a new empty dictionary.

    Raises:
        BulkDownloaderException: If the download raises a ``requests``
            connection error; a ``BulkDownloaderException`` raised by the
            download itself propagates unchanged.
    """
    parameters = {} if download_parameters is None else download_parameters

    if not self.content:
        try:
            fetched = self.download_function(parameters)
        except requests.exceptions.ConnectionError as e:
            raise BulkDownloaderException(
                f'Could not download resource: {e}')
        # Any BulkDownloaderException from the download propagates as-is.
        if fetched:
            self.content = fetched

    if self.hash or not self.content:
        return
    self.create_hash()