def download_file_stream(url: str, file_stream: IO, block_size: int = 1024 * 8,
                         with_progress_bar: bool = True, fatal: bool = True) -> bool:
    try:
        download_stream = urllib.request.urlopen(url)
    except Exception as e:
        log(f'Failed on: {url}', fatal=False, log_type=LogType.ERROR)
        log(e, fatal=fatal, log_type=LogType.ERROR)
        return False

    with download_stream:
        res_headers: HTTPMessage = download_stream.info()
        total_size = int(res_headers.get('Content-Length', failobj=0))

        progress_bar = None
        if with_progress_bar and total_size > 0:
            progress_bar = DownloadProgressBar(total_size)

        read = 0
        while True:
            block: bytes = download_stream.read(block_size)
            if not block:
                break

            read += len(block)
            file_stream.write(block)

            if progress_bar and not progress_bar.run(len(block)):
                break

    if total_size >= 0 and read < total_size:
        return False

    return True
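# Usage sketch (hypothetical URL; any writable binary stream works, e.g. io.BytesIO):
#
#   import io
#   buffer = io.BytesIO()
#   ok = download_file_stream('https://example.com/archive.zip', buffer, with_progress_bar=False)
#   data = buffer.getvalue() if ok else None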
def move_file(old: str, new: str, make_dirs: bool = True,
              duplicate_handler: Optional[DuplicateHandler] = None) -> str:
    if make_dirs:
        os.makedirs(Path(new).parent, exist_ok=True)

    if duplicate_handler and Path(new).exists():
        if duplicate_handler == DuplicateHandler.FIND_VALID_FILE:
            new = get_valid_filename(new)
        elif duplicate_handler == DuplicateHandler.THROW_ERROR:
            log(f'File "{new}" already exists', log_type=LogType.ERROR)
        elif duplicate_handler == DuplicateHandler.OVERWRITE:
            os.remove(new)
        elif duplicate_handler == DuplicateHandler.SKIP:
            return new
        elif duplicate_handler == DuplicateHandler.HASH_COMPARE:
            # Identical content: treat the move as already done. Different
            # content: fall back to a new, valid filename.
            old_file = get_sha1_hash_file(old)
            new_file = get_sha1_hash_file(new)

            if old_file == new_file:
                return new

            new = get_valid_filename(new)

    file = str(Path(old).rename(new)).replace('\\', '/')
    return file
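# Usage sketch (hypothetical paths): with HASH_COMPARE, an identical existing target
# short-circuits the move instead of duplicating the file.
#
#   moved_to = move_file('out/tmp/report.pdf', 'out/reports/report.pdf',
#                        duplicate_handler=DuplicateHandler.HASH_COMPARE)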
def json_parse(json, key, default=None, fatal=False):
    if key in json:
        return json[key]

    if fatal:
        log(f'Cannot find {key} in {json}.', log_type=LogType.ERROR)

    return default
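# Usage sketch: a missing key falls back to the default instead of raising.
#
#   json_parse({'retries': 3}, 'retries')              # -> 3
#   json_parse({'retries': 3}, 'timeout', default=30)  # -> 30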
def get_config(file=CONFIG_FILE):
    config_file = Path(file)

    if not config_file.is_file():
        log(f'{file} not found in path', log_type=LogType.ERROR)

    with open(file, 'r') as file_obj:
        json_obj = json.load(file_obj)

    return json_to_config(json_obj)
def add_to_download_cache(download_cache, *urls, headers: HTTPMessage = None, filename: str = None,
                          result=DownloadedFileResult.SUCCESS) -> Optional[DownloadedFile]:
    if len(urls) == 0:
        # log() is expected to abort here; otherwise urls[0] below raises IndexError.
        log('Cache fail, no url sent.', log_type=LogType.ERROR)

    downloaded_file = DownloadedFile(filename=filename, url=urls[0], headers=headers, result=result)

    for url in urls:
        download_cache[url] = downloaded_file

    return downloaded_file
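# Usage sketch (hypothetical shelve file): register one DownloadedFile under both the
# original and the redirected URL so later lookups hit on either.
#
#   with shelve.open('download_cache') as cache:
#       add_to_download_cache(cache, 'https://example.com/a.bin', 'https://cdn.example.com/a.bin',
#                             filename='out/a.bin')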
def split_filename(s: str, fatal=False, include_ext_period: bool = False) -> Tuple[str, Optional[str]]:
    split = s.split('.')

    if len(split) == 1:
        if fatal:
            log(f'No file extension found: {s}', log_type=LogType.ERROR)

        return split[0], None

    ext = handle_extension_period(split[-1], include_ext_period=include_ext_period)
    return '.'.join(split[:-1]), ext
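# Usage sketch: only the last period is treated as the extension separator.
#
#   split_filename('archive.tar.gz')  # -> ('archive.tar', 'gz')
#   split_filename('README')          # -> ('README', None)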
def json_parse_enum(obj, json_val, class_type, fatal=False):
    val = json_parse(obj, json_val, default=None, fatal=fatal)

    if not val:
        return None

    val = str(val).upper()

    if val not in class_type.__dict__.keys():
        log(f'Invalid Enum: {val}, Keys: {class_type.__dict__.keys()}', log_type=LogType.ERROR)

    return class_type.__dict__[val]
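# Usage sketch (hypothetical enum): the raw value is upper-cased before the member
# lookup, so JSON can carry lower-case names.
#
#   class Mode(Enum):
#       FAST = 1
#       SLOW = 2
#
#   json_parse_enum({'mode': 'fast'}, 'mode', Mode)  # -> Mode.FAST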
def __init__(self, dictionary: dict = None, name: str = '', value: str = '',
             domain: str = '', path: str = ''):
    self.name = name
    self.value = value
    self.domain = domain
    self.path = path

    if dictionary:
        for key in dictionary:
            setattr(self, key, dictionary[key])

    if not self.name or not self.value or not self.domain or not self.path:
        log('Invalid Cookie', log_type=LogType.ERROR)
def validate_path(directory: str, default_path: str = join_path(os.getcwd(), '/out'),
                  fatal: bool = False) -> str:
    if is_blank(directory):
        if fatal:
            log(f'Path "{directory}" is blank or missing.', log_type=LogType.ERROR)

        directory = default_path

    path = directory.replace('\\', '/')

    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)

    return path
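# Usage sketch: a blank path falls back to default_path, and the directory is
# created on demand.
#
#   out_dir = validate_path('')             # -> '<cwd>/out'
#   out_dir = validate_path('out\\images')  # -> 'out/images', created if missing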
def driver_go_and_wait(driver: WebDriver, url: str, scroll_pause_time: float, fail: int = 0):
    # log() is expected to abort after five failed attempts; otherwise the
    # recursion below would keep retrying.
    if fail >= 5:
        log(f'URL does not ever match, {url} never becomes {driver.current_url}', log_type=LogType.ERROR)

    driver.get(url)
    wait_page_load(driver)

    if not is_url_exact(driver.current_url, url):
        driver_go_and_wait(driver, url, scroll_pause_time, fail=fail + 1)

    scroll_to_bottom(driver, scroll_pause_time=scroll_pause_time)
def get_content_type_head(url: str):
    try:
        req = urllib.request.Request(url, method='HEAD')

        with urllib.request.urlopen(req) as response:
            res_headers: HTTPMessage = response.info()
            content_type = get_content_type_from_headers(res_headers)
    except HTTPError as e:
        # Some servers reject HEAD requests but still report a Content-Type
        # on the error response.
        if e and e.headers:
            content_type_tmp = e.headers.get('Content-Type', None)
            content_type = get_content_type_from_header(content_type_tmp)
        else:
            log(e, extra=f'URL: {url}', fatal=False, log_type=LogType.ERROR)
            content_type = None
    except Exception as e:
        log(e, extra=f'General Exception URL: {url}', fatal=False, log_type=LogType.ERROR)
        content_type = None

    return content_type
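# Usage sketch (hypothetical URL): a HEAD request resolves the Content-Type without
# downloading the body.
#
#   get_content_type_head('https://example.com/image.png')  # e.g. 'image/png'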
def extract_json_from_text(s: str):
    # -1 means "not found"; the previous 0 sentinel made strings that begin
    # with JSON return None.
    start = -1

    if s.startswith('[') or s.startswith('{'):
        start = 0
    else:
        bracket_i = brace_i = 1
        open_bracket = open_brace = 0

        while (open_bracket != -1 and open_brace != -1) and start == -1:
            open_bracket = find_nth(s, '[', bracket_i)
            open_brace = find_nth(s, '{', brace_i)

            lower, higher, lower_val, higher_val = min_val(open_brace, open_bracket, '{', '[', min_possible_val=0)
            prev_lower_char = find_first_previous_char(s, lower, exclude=[' '])

            # Only accept a bracket preceded by '=' or '(' as the start of real
            # data; otherwise skip ahead to the next occurrence.
            if s[prev_lower_char] != '=' and s[prev_lower_char] != '(':
                if lower_val == '{':
                    brace_i += 1
                elif lower_val == '[':
                    bracket_i += 1
            else:
                start = lower

    if start == -1:
        return None

    # Walk the braces to find the matching closer. Note: only object braces
    # are balanced here, not array brackets.
    end = 0
    stack = LifoQueue()

    for i in range(start, len(s)):
        if s[i] == '{':
            stack.put('{')

        if s[i] == '}':
            stack.get()

            if stack.empty():
                end = i
                break

    if end == 0:
        log('Error parsing JSON', log_type=LogType.ERROR)

    json = s[start:end + 1]
    return json
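# Usage sketch: an embedded opening brace must follow '=' or '(' (e.g. a JS
# assignment) to be accepted as the start of real data.
#
#   extract_json_from_text('var data = {"id": 1};')  # -> '{"id": 1}'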
def download_file_impl(url: str, filename: str, download_cache: Optional[shelve.DbfilenameShelf],
                       block_size: int = 1024 * 8, with_progress_bar: bool = True) \
        -> Union[Tuple[str, str, HTTPMessage], DownloadedFile, None]:
    try:
        download_stream = urllib.request.urlopen(url)
    except Exception:
        return None

    old_url: str = ''

    with download_stream:
        res_headers: HTTPMessage = download_stream.info()
        new_url: str = download_stream.geturl()

        # urlopen() follows redirects, so the final URL is the cache key.
        if download_cache and new_url in download_cache:
            return download_cache[new_url]

        if url != new_url:
            old_url = url
            url = new_url

        total_size = int(res_headers.get('Content-Length', failobj=0))

        progress_bar = None
        if with_progress_bar and total_size > 0:
            progress_bar = DownloadProgressBar(total_size,
                                               on_complete=lambda x: log(f'Downloaded {url} to {filename}'))

        read = 0
        with open(filename, 'w+b') as file_stream:
            while True:
                block: bytes = download_stream.read(block_size)
                if not block:
                    break

                read += len(block)
                file_stream.write(block)

                if progress_bar and not progress_bar.run(len(block)):
                    break

        if total_size >= 0 and read < total_size:
            log(f'File download incomplete, received {read} out of {total_size} bytes. '
                f'URL: {url}, filename: {filename}', fatal=False, log_type=LogType.ERROR)

    return url, old_url, res_headers
def json_parse_class(json: dict, class_type: type):
    signature = inspect.signature(class_type.__init__)
    args = signature.parameters.keys()
    args = [arg for arg in args if arg != 'self']

    d = {}

    for arg in args:
        arg_type = signature.parameters[arg].annotation

        if arg in SAFE_PARAMETER_MAPPING:
            temp_arg = SAFE_PARAMETER_MAPPING[arg]
        else:
            temp_arg = arg

        if temp_arg not in json:
            continue

        if arg_type in PRIMITIVE_TYPES:
            generated_arg_obj = json[temp_arg]
        elif inspect.isclass(arg_type) and issubclass(arg_type, Enum):
            # isclass() guard: issubclass() raises TypeError for typing
            # generics such as List[str].
            generated_arg_obj = json_parse_enum(json, temp_arg, arg_type, fatal=True)
        elif get_origin(arg_type) and get_origin(arg_type) == list:
            list_type = first_or_none(get_args(arg_type))

            if not list_type:
                log(f'List Type {arg_type} was None, origin: {get_origin(arg_type)}', log_type=LogType.ERROR)

            generated_arg_obj = json_parse_class_list(json[temp_arg], list_type, fatal=True)
        else:
            generated_arg_obj = json_parse_class(json[temp_arg], arg_type)

        d[arg] = generated_arg_obj

    obj = class_type(**d)
    return obj
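# Usage sketch (hypothetical class, assuming str and int are in PRIMITIVE_TYPES):
# constructor parameters are matched to JSON keys by name and filled by annotation.
#
#   class Job:
#       def __init__(self, name: str = '', retries: int = 0):
#           self.name = name
#           self.retries = retries
#
#   job = json_parse_class({'name': 'scrape', 'retries': 2}, Job)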
def get_sub_directory_path(base_url: str, new_url: str, prepend_dir: str = None,
                           prepend_slash: bool = True, append_slash: bool = True) -> str:
    if not base_url:
        log(f'Invalid params: {base_url}, {new_url}, {prepend_dir}.', name_of(get_sub_directory_path),
            log_type=LogType.ERROR)

    if new_url.endswith('/'):
        new_url = new_url[:-1]

    base_url = get_base_url(base_url)

    if base_url not in new_url:
        log(f'Invalid params: {base_url}, {new_url}, {prepend_dir}.', name_of(get_sub_directory_path),
            log_type=LogType.ERROR)

    sub_dir = new_url[new_url.index(base_url) + len(base_url):]

    if not sub_dir.startswith('/'):
        sub_dir = f'/{sub_dir}'

    if append_slash:
        if not sub_dir.endswith('/'):
            sub_dir += '/'
    else:
        if sub_dir.endswith('/'):
            sub_dir = sub_dir[:-1]

    if prepend_dir:
        if prepend_dir.endswith('/'):
            prepend_dir = prepend_dir[:-1]

        return prepend_dir + sub_dir

    if prepend_slash:
        if not sub_dir.startswith('/'):
            sub_dir = f'/{sub_dir}'
    else:
        if sub_dir.startswith('/'):
            sub_dir = sub_dir[1:]

    return sub_dir
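# Usage sketch (assuming get_base_url() strips down to the site root): the part of
# new_url beyond the base comes back slash-delimited.
#
#   get_sub_directory_path('https://example.com', 'https://example.com/docs/api')
#   # -> '/docs/api/'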
import sys

from src.config import CONFIG_FILE, get_config
from src.post_scrape import run_post_scrape
from src.scrape import scrape
from src.util.generic import log

'''
sys.argv = [
    __file__, 'job3.json'
]
'''

if __name__ == '__main__':
    config_file = CONFIG_FILE if len(sys.argv) == 1 else sys.argv[1]
    config = get_config(file=config_file)

    if not config.post_scrape_jobs_only:
        scrape(config)

    run_post_scrape(config)

    log('Completed everything. The program will now exit.')