def format_path(
        self,
        resource: Resource,
        destination_directory: Path,
        index: Optional[int] = None,
    ) -> Path:
        """Build the full destination path for ``resource``.

        Each entry of ``self.directory_format_string`` is formatted with
        ``self._format_name`` and appended to ``destination_directory`` to
        form the sub-folder. The file name is produced from
        ``self.file_format_string`` and then passed, together with the
        ending (optional index suffix plus the resource extension), through
        ``self._limit_file_name_length``.

        Args:
            resource: The resource to name; its ``source_submission``,
                ``extension`` and ``url`` attributes are read.
            destination_directory: Root directory the sub-folder is placed in.
            index: Optional number added to the ending as ``_<index>``. A
                falsy value (``None`` or ``0``) adds no suffix.

        Returns:
            The sub-folder joined with the length-limited file name.

        Raises:
            BulkDownloaderException: If the resource has no extension, or if
                building the final ``Path`` raises ``TypeError``.
        """
        directory_parts = [
            self._format_name(resource.source_submission, part)
            for part in self.directory_format_string
        ]
        subfolder = Path(destination_directory, *directory_parts)

        # From here on ``index`` holds the textual suffix, not the number.
        if index:
            index = '_' + str(index)
        else:
            index = ''

        if not resource.extension:
            raise BulkDownloaderException(
                f'Resource from {resource.url} has no extension')

        ending = index + resource.extension
        raw_name = self._format_name(resource.source_submission,
                                     self.file_format_string)
        file_name = self._limit_file_name_length(str(raw_name), ending)

        try:
            return Path(subfolder, file_name)
        except TypeError:
            raise BulkDownloaderException(
                f'Could not determine path name: {subfolder}, {index}, {resource.extension}'
            )
 def retry_download(url: str, max_wait_time: int) -> Optional[bytes]:
     """Download ``url``, retrying with a growing delay on connection errors.

     The delay between attempts starts at 60 seconds and grows by 60 seconds
     after every failed attempt. Once a failure occurs while the delay has
     already reached ``max_wait_time``, the error is re-raised.

     The retry state is kept in a local loop variable. The earlier recursive
     form reset the delay to 60 on every call, so with ``max_wait_time``
     above 60 the limit was never reached and the recursion never ended.

     Args:
         url: Address passed to ``requests.get``.
         max_wait_time: Delay, in seconds, at or above which no further
             retry is attempted.

     Returns:
         The response body when the status code is 2xx and the body is
         non-empty.

     Raises:
         BulkDownloaderException: For any response that is neither a
             non-empty 2xx nor a 408/429.
         requests.exceptions.ConnectionError: When the connection keeps
             failing (or the server keeps answering 408/429) until the
             delay reaches ``max_wait_time``.
     """
     wait_time = 60
     while True:
         try:
             response = requests.get(url)
             if re.match(r'^2\d{2}', str(
                     response.status_code)) and response.content:
                 return response.content
             elif response.status_code in (408, 429):
                 # Timeout / rate limit: treat like a connection error so
                 # the retry logic below handles it.
                 raise requests.exceptions.ConnectionError(
                     f'Response code {response.status_code}')
             else:
                 raise BulkDownloaderException(
                     f'Unrecoverable error requesting resource: HTTP Code {response.status_code}'
                 )
         except requests.exceptions.ConnectionError as e:
             logger.warning(
                 f'Error occured downloading from {url}, waiting {wait_time} seconds: {e}'
             )
             time.sleep(wait_time)
             if wait_time < max_wait_time:
                 # Back off a little longer before the next attempt.
                 wait_time += 60
             else:
                 logger.error(
                     f'Max wait time exceeded for resource at url {url}')
                 raise
 def http_download(url: str, download_parameters: dict) -> Optional[bytes]:
     """Fetch ``url`` over HTTP, retrying with a growing delay on failure.

     Args:
         url: Address passed to ``requests.get``.
         download_parameters: Options for the download. ``'headers'`` is
             forwarded to ``requests.get``; ``'max_wait_time'`` (default
             300) is the delay in seconds at or above which no further
             retry is attempted.

     Returns:
         The response body when the status code is 2xx and the body is
         non-empty.

     Raises:
         BulkDownloaderException: For any response that is neither a
             non-empty 2xx nor a 408/429.
         requests.exceptions.ConnectionError: When retries are exhausted
             after connection errors or 408/429 responses.
         requests.exceptions.ChunkedEncodingError: When retries are
             exhausted after chunked-encoding errors.
     """
     request_headers = download_parameters.get('headers')
     max_wait_time = download_parameters.get('max_wait_time', 300)
     retryable_errors = (requests.exceptions.ConnectionError,
                         requests.exceptions.ChunkedEncodingError)
     current_wait_time = 60

     while True:
         try:
             response = requests.get(url, headers=request_headers)
             status_is_success = re.match(r'^2\d{2}',
                                          str(response.status_code))
             if status_is_success and response.content:
                 return response.content
             if response.status_code in (408, 429):
                 # Timeout / rate limit: hand over to the retry handler.
                 raise requests.exceptions.ConnectionError(
                     f'Response code {response.status_code}')
             raise BulkDownloaderException(
                 f'Unrecoverable error requesting resource: HTTP Code {response.status_code}'
             )
         except retryable_errors as e:
             logger.warning(
                 f'Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}'
             )
             time.sleep(current_wait_time)
             if current_wait_time < max_wait_time:
                 # Wait a minute longer before the next attempt.
                 current_wait_time += 60
                 continue
             logger.error(
                 f'Max wait time exceeded for resource at url {url}')
             raise
    def _format_name(submission: (Comment, Submission),
                     format_string: str) -> str:
        """Fill the ``{key}`` placeholders of ``format_string`` from ``submission``.

        The attribute dictionary is generated by
        ``FileNameFormatter._generate_name_dict_from_submission`` or
        ``FileNameFormatter._generate_name_dict_from_comment`` depending on
        the type of ``submission``. Placeholders are matched
        case-insensitively. Each value is converted with ``str`` and passed
        through ``FileNameFormatter._convert_unicode_escapes`` before being
        inserted.

        All placeholders are replaced in one pass over ``format_string``, so
        text inserted from one attribute (for example a title that happens to
        contain ``{upvotes}``) is never re-scanned and expanded as a
        placeholder itself.

        Args:
            submission: The ``Submission`` or ``Comment`` to take values from.
            format_string: Template containing ``{key}`` placeholders.

        Returns:
            The filled-in string with every ``/`` removed and, when running
            on Windows, passed through
            ``FileNameFormatter._format_for_windows``.

        Raises:
            BulkDownloaderException: If ``submission`` is neither a
                ``Submission`` nor a ``Comment``.
        """
        if isinstance(submission, Submission):
            attributes = FileNameFormatter._generate_name_dict_from_submission(
                submission)
        elif isinstance(submission, Comment):
            attributes = FileNameFormatter._generate_name_dict_from_comment(
                submission)
        else:
            raise BulkDownloaderException(
                f'Cannot name object {type(submission).__name__}')

        result = format_string
        if attributes:
            # Placeholders are case-insensitive, so look keys up by their
            # lower-cased name; on a clash the first key wins.
            keys_by_name = {}
            for key in attributes:
                keys_by_name.setdefault(str(key).lower(), key)

            # Keys are escaped so they are always matched literally.
            placeholder = re.compile(
                r'\{(' + '|'.join(
                    re.escape(str(key)) for key in attributes) + r')\}',
                re.IGNORECASE)

            def fill_placeholder(match):
                key = keys_by_name.get(match.group(1).lower())
                if key is None:
                    # Regex case-folding matched something str.lower() does
                    # not map back to a key; leave the text untouched.
                    return match.group(0)
                key_value = str(attributes.get(key, 'unknown'))
                # The return value of a replacement function is inserted
                # literally, so backslashes need no extra escaping.
                return FileNameFormatter._convert_unicode_escapes(key_value)

            result = placeholder.sub(fill_placeholder, format_string)

        # A slash inside a name would otherwise create an extra directory.
        result = result.replace('/', '')

        if platform.system() == 'Windows':
            result = FileNameFormatter._format_for_windows(result)

        return result
 def __init__(self, file_format_string: str, directory_format_string: str):
     """Store the file and directory format strings.

     Args:
         file_format_string: Template for file names; it must pass
             ``self.validate_string``.
         directory_format_string: Template for the directory structure. It
             is split on ``/`` and stored as a list with one entry per
             directory level.

     Raises:
         BulkDownloaderException: If ``file_format_string`` is rejected by
             ``self.validate_string``.
     """
     file_format_is_valid = self.validate_string(file_format_string)
     if not file_format_is_valid:
         error_message = f'"{file_format_string}" is not a valid format string'
         raise BulkDownloaderException(error_message)

     self.file_format_string = file_format_string
     directory_parts: list[str] = directory_format_string.split('/')
     self.directory_format_string = directory_parts
Example #6
0
 def _check_scopes(wanted_scopes: set[str]):
     """Verify that every requested scope is known to reddit.

     The scope names are fetched from reddit's public ``scopes.json``
     endpoint; the wildcard ``'*'`` is always accepted in addition.

     Args:
         wanted_scopes: Scope names to validate.

     Raises:
         BulkDownloaderException: For the first scope encountered that is
             neither ``'*'`` nor a key of the fetched scope listing.
     """
     scopes_response = requests.get(
         'https://www.reddit.com/api/v1/scopes.json',
         headers={'User-Agent': 'fetch-scopes test'},
     )
     valid_scopes = {*scopes_response.json().keys(), '*'}
     for scope in wanted_scopes:
         if scope in valid_scopes:
             continue
         raise BulkDownloaderException(
             f'Scope {scope} is not known to reddit')
 def download(self, max_wait_time: int):
     """Populate ``self.content`` and the hash for this resource.

     Nothing is fetched when ``self.content`` is already set. Otherwise
     ``self.retry_download`` is called with ``self.url`` and a truthy result
     is stored in ``self.content``. Finally, if content is present and no
     hash exists yet, ``self.create_hash`` is called.

     Args:
         max_wait_time: Forwarded unchanged to ``self.retry_download``.

     Raises:
         BulkDownloaderException: If ``self.retry_download`` raises it, or
             raises ``requests.exceptions.ConnectionError`` (which is
             wrapped).
     """
     if not self.content:
         try:
             fetched = self.retry_download(self.url, max_wait_time)
         except requests.exceptions.ConnectionError as e:
             raise BulkDownloaderException(
                 f'Could not download resource: {e}')
         # A BulkDownloaderException from the download simply propagates.
         if fetched:
             self.content = fetched

     if self.hash or not self.content:
         return
     self.create_hash()
 def download(self, download_parameters: Optional[dict] = None):
     """Populate ``self.content`` and the hash for this resource.

     Nothing is fetched when ``self.content`` is already set. Otherwise
     ``self.download_function`` is called with the parameters and a truthy
     result is stored in ``self.content``. Finally, if content is present
     and no hash exists yet, ``self.create_hash`` is called.

     Args:
         download_parameters: Options forwarded to
             ``self.download_function``; ``None`` is replaced by an empty
             dict.

     Raises:
         BulkDownloaderException: If ``self.download_function`` raises it,
             or raises ``requests.exceptions.ConnectionError`` (which is
             wrapped).
     """
     parameters = {} if download_parameters is None else download_parameters

     if not self.content:
         try:
             fetched = self.download_function(parameters)
         except requests.exceptions.ConnectionError as e:
             raise BulkDownloaderException(
                 f'Could not download resource: {e}')
         # A BulkDownloaderException from the download simply propagates.
         if fetched:
             self.content = fetched

     if self.hash or not self.content:
         return
     self.create_hash()