def fetch_bag_files(bag,
                    keychain_file=DEFAULT_KEYCHAIN_FILE,
                    config_file=DEFAULT_CONFIG_FILE,
                    force=False,
                    callback=None,
                    filter_expr=None,
                    **kwargs):
    auth = read_keychain(keychain_file)
    config = read_config(config_file)
    cookies = get_request_cookies(config) if kwargs.get("cookie_scan", True) else None

    success = True
    current = 0
    total = 0 if not callback else len(set(bag.files_to_be_fetched()))
    start = datetime.datetime.now()
    for entry in map(FetchEntry._make, bag.fetch_entries()):
        filename = urlunquote(entry.filename)
        if filter_expr:
            if not filter_dict(filter_expr, entry._asdict()):
                continue
        output_path = os.path.normpath(os.path.join(bag.path, filename))
        local_size = os.path.getsize(output_path) if os.path.exists(output_path) else None
        try:
            remote_size = int(entry.length)
        except ValueError:
            remote_size = None
        missing = True
        if local_size is not None:
            if local_size == remote_size or remote_size is None:
                missing = False

        if not force and not missing:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("Not fetching already present file: %s" % output_path)
        else:
            result_path = fetch_file(entry.url, output_path, auth,
                                     size=entry.length, config=config, cookies=cookies, **kwargs)
            if not result_path:
                success = False

        if callback:
            current += 1
            if not callback(current, total):
                logger.warning("Fetch cancelled by user...")
                success = False
                break

    elapsed = datetime.datetime.now() - start
    logger.info("Fetch complete. Elapsed time: %s" % elapsed)
    cleanup_transports()

    return success
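# Illustrative usage sketch (not part of the module): driving fetch_bag_files() with a
# progress callback and a filter expression. The bag argument is assumed to be a bag
# object exposing fetch_entries()/files_to_be_fetched() as used above; the
# "filename^*data/" expression ("filename starts with data/") follows the filter_dict
# syntax exercised in the tests below.
def example_fetch_with_progress(bag):
    def progress(current, total):
        logger.info("Fetched %d of %d remote files" % (current, total))
        return True  # returning False cancels the remaining fetches

    # Fetch only payload files under data/, using the default keychain and config.
    return fetch_bag_files(bag, filter_expr="filename^*data/", callback=progress)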
def test_filter_dict(self):
    logger.info(self.getTestHeader('test filter function'))
    msg = "evaluating filter expression: %s"
    test_url = "http://example.com/files/examples/README.txt"
    test_length = 250624
    test_filename = "data/examples/README.txt"
    test_entry = {"url": test_url, "length": test_length, "filename": test_filename}
    pos_exprs = ["url==%s" % test_url,
                 "url!=http://foo",
                 "url=*/files/",
                 "filename!*/files/",
                 "filename^*data/",
                 "filename$*.txt",
                 "length>250623",
                 "length>=250624",
                 "length<250625",
                 "length<=250624"]
    neg_exprs = ["url!=%s" % test_url,
                 "url==http://foo",
                 "url=*/fils/",
                 "filename!*/examples/",
                 "filename^*dat/",
                 "filename$*.tx",
                 "length>250624",
                 "length>=250625",
                 "length<250624",
                 "length<=250623",
                 "length<=-"]
    bad_exprs = ["url*=http://foo",
                 "url=http://foo"]
    try:
        for expr in pos_exprs:
            result = filter_dict(expr, test_entry)
            self.assertTrue(result, msg % expr)
        for expr in neg_exprs:
            result = filter_dict(expr, test_entry)
            self.assertFalse(result, msg % expr)
        for expr in bad_exprs:
            self.assertRaises(ValueError, filter_dict, expr, test_entry)
    except Exception as e:
        self.fail(get_typed_exception(e))
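# Operator summary inferred from the positive/negative cases above (illustrative note,
# not part of the test suite). A filter expression has the form "<key><op><value>":
#   ==  equals                 !=  not equals
#   =*  contains               !*  does not contain
#   ^*  starts with            $*  ends with
#   >, >=, <, <=               numeric comparisons
# Unrecognized operators (e.g. "url*=..." or a bare "url=...") raise ValueError.
# Example with a hypothetical entry:
#   filter_dict("length>=1024", {"url": "http://example.com/f.txt",
#                                "length": 2048,
#                                "filename": "data/f.txt"})   # -> True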
def create_rfm_from_file(args):
    if not (args.md5_col or args.sha1_col or args.sha256_col or args.sha512_col):
        raise ValueError("At least one checksum algorithm column mapping must be specified.")

    with open(args.output_file, 'w') as rfm_file, open(args.input_file, 'r') as input_file:
        rfm = list()
        if args.input_format != 'json':
            dialect = Sniffer().sniff(input_file.read(4096))
            input_file.seek(0)
            rows = DictReader(input_file, dialect=dialect)
        else:
            rows = json.load(input_file)

        for row in rows:
            if not filter_dict(args.filter, row):
                continue
            rfm_entry = dict()
            rfm_entry["url"] = row[args.url_col]
            rfm_entry["length"] = int(row[args.length_col])
            rfm_entry["filename"] = urlsplit(row[args.filename_col]).path.lstrip("/")
            if args.md5_col:
                rfm_entry["md5"] = row[args.md5_col]
                rfm_entry["md5_base64"] = encode_hex_to_base64(rfm_entry["md5"])
            if args.sha1_col:
                rfm_entry["sha1"] = row[args.sha1_col]
                rfm_entry["sha1_base64"] = encode_hex_to_base64(rfm_entry["sha1"])
            if args.sha256_col:
                rfm_entry["sha256"] = row[args.sha256_col]
                rfm_entry["sha256_base64"] = encode_hex_to_base64(rfm_entry["sha256"])
            if args.sha512_col:
                rfm_entry["sha512"] = row[args.sha512_col]
                rfm_entry["sha512_base64"] = encode_hex_to_base64(rfm_entry["sha512"])
            rfm.append(rfm_entry)

        entries = deduplicate_rfm_entries(rfm)
        logger.info("Writing %d entries to remote file manifest" % len(entries))
        rfm_file.write(json.dumps(entries, sort_keys=True, indent=2))

    logger.info("Successfully created remote file manifest: %s" % args.output_file)
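# Illustrative sample (hypothetical values, not produced by the module): one input row
# and the remote file manifest entry create_rfm_from_file() would derive from it,
# assuming args.url_col and args.filename_col both map to the "url" column,
# args.length_col maps to "size", and args.md5_col maps to "md5".
#
# input row:
#   {"url": "http://example.com/files/data/README.txt",
#    "size": "250624",
#    "md5": "9d2cf8a26c6fbe16a2d42885394f1025"}
#
# resulting manifest entry:
#   {"url": "http://example.com/files/data/README.txt",
#    "length": 250624,
#    "filename": "files/data/README.txt",
#    "md5": "9d2cf8a26c6fbe16a2d42885394f1025",
#    "md5_base64": "<base64 encoding of the hex digest above>"}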
def create_rfm_from_filesystem(args):
    with open(args.output_file, 'w') as rfm_file:
        rfm = list()
        if not os.path.isdir(args.input_path):
            raise ValueError("The following path does not exist or is not a directory: [%s]" %
                             args.input_path)
        for dirpath, dirnames, filenames in os.walk(args.input_path):
            subdirs_count = len(dirnames)
            if subdirs_count:
                logger.info("%s subdirectories found in input directory %s %s" %
                            (subdirs_count, args.input_path, dirnames))
            filenames.sort()
            for fn in filenames:
                rfm_entry = dict()
                input_file = os.path.join(dirpath, fn)
                logger.debug("Processing input file %s" % input_file)
                input_rel_path = input_file.replace(args.input_path, '')
                filepath = args.base_payload_path if args.base_payload_path else ""
                filepath = "".join([filepath, input_rel_path])
                rfm_entry["filename"] = filepath.replace("\\", "/").lstrip("/")
                rfm_entry["url"] = url_format(args.url_formatter,
                                              base_url=args.base_url,
                                              filepath=input_rel_path.replace("\\", "/").lstrip("/"),
                                              filename=fn)
                rfm_entry["length"] = os.path.getsize(input_file)
                rfm_entry.update(compute_file_hashes(input_file, args.checksum))
                if not filter_dict(args.filter, rfm_entry):
                    continue
                if args.streaming_json:
                    rfm_file.writelines(''.join([json.dumps(rfm_entry, sort_keys=True), '\n']))
                else:
                    rfm.append(rfm_entry)

        if not args.streaming_json:
            rfm_file.write(json.dumps(rfm, sort_keys=True, indent=2))

    logger.info("Successfully created remote file manifest: %s" % args.output_file)
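# Illustrative invocation sketch (not part of the module): building the args namespace
# programmatically instead of through the CLI parser. Every attribute read by
# create_rfm_from_filesystem() above is supplied; the paths, base URL, checksum list,
# and the "append-path" formatter value are assumptions made for this example only.
def example_create_rfm_from_filesystem():
    from argparse import Namespace
    example_args = Namespace(
        input_path="/data/study-001",             # directory tree to walk
        output_file="remote-file-manifest.json",  # manifest destination
        base_payload_path=None,                   # optional prefix for "filename" values
        base_url="https://example.org/files",     # passed to url_format() as base_url
        url_formatter="append-path",              # assumed formatter name
        checksum=["md5", "sha256"],               # algorithms for compute_file_hashes()
        filter=None,                              # no filter expression applied
        streaming_json=False)                     # write one JSON array, not JSON lines
    create_rfm_from_filesystem(example_args)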
def fetch_bag_files(bag, keychain_file, force=False, callback=None, config=DEFAULT_CONFIG, filter_expr=None):
    success = True
    auth = read_keychain(keychain_file)
    resolvers = config.get(ID_RESOLVER_TAG, DEFAULT_ID_RESOLVERS) if config else DEFAULT_ID_RESOLVERS

    current = 0
    total = 0 if not callback else len(set(bag.files_to_be_fetched()))
    start = datetime.datetime.now()
    for entry in map(FetchEntry._make, bag.fetch_entries()):
        if filter_expr:
            if not filter_dict(filter_expr, entry._asdict()):
                continue
        output_path = os.path.normpath(os.path.join(bag.path, entry.filename))
        local_size = os.path.getsize(output_path) if os.path.exists(output_path) else None
        try:
            remote_size = int(entry.length)
        except ValueError:
            remote_size = None
        missing = True
        if local_size is not None:
            if local_size == remote_size or remote_size is None:
                missing = False

        if not force and not missing:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("Not fetching already present file: %s" % output_path)
        else:
            # Record a failed fetch without letting a later successful fetch reset the flag.
            if not fetch_file(entry.url, entry.length, output_path, auth, resolvers=resolvers):
                success = False

        if callback:
            current += 1
            if not callback(current, total):
                logger.warning("Fetch cancelled by user...")
                break

    elapsed = datetime.datetime.now() - start
    logger.info("Fetch complete. Elapsed time: %s" % elapsed)
    cleanup_transports()

    return success
def create_rfm_from_url_list(args):
    keychain_file = args.keychain_file if args.keychain_file else DEFAULT_KEYCHAIN_FILE
    auth = read_keychain(keychain_file)
    with open(args.output_file, 'w') as rfm_file, open(args.input_file, 'r') as input_file:
        rfm = list()
        for url in input_file.readlines():
            rfm_entry = dict()
            url = url.strip()
            logger.debug("Processing input URL %s" % url)
            try:
                headers = head_for_headers(url, auth, raise_for_status=True)
            except Exception as e:
                logging.warning("HEAD request failed for URL [%s]: %s" % (url, gte(e)))
                continue
            logger.debug("Result headers: %s" % headers)

            length = headers.get("Content-Length")
            content_type = headers.get("Content-Type")
            content_disposition = headers.get("Content-Disposition")

            md5_header = args.md5_header if args.md5_header else "Content-MD5"
            md5 = headers.get(md5_header)
            md5 = get_checksum_from_string_list("md5", md5)
            if md5 and not args.disable_hash_decode_base64:
                rfm_entry["md5_base64"] = md5
                md5 = decode_base64_to_hex(md5)
            rfm_entry["md5"] = md5

            sha256_header = args.sha256_header if args.sha256_header else "Content-SHA256"
            sha256 = headers.get(sha256_header)
            sha256 = get_checksum_from_string_list("sha256", sha256)
            if sha256 and not args.disable_hash_decode_base64:
                rfm_entry["sha256_base64"] = sha256
                sha256 = decode_base64_to_hex(sha256)
            rfm_entry["sha256"] = sha256

            # if content length or both hash values are missing, there is a problem
            if not length:
                logging.warning("Could not determine Content-Length for %s" % url)
            if not (md5 or sha256):
                logging.warning("Could not locate an MD5 or SHA256 hash for %s" % url)

            # try to construct filename using content_disposition, if available, else fallback to the URL path fragment
            filepath = urlsplit(url).path
            filename = os.path.basename(filepath).split(":")[0] if not content_disposition else \
                parse_content_disposition(content_disposition)
            subdir = args.base_payload_path if args.base_payload_path else ""
            output_path = ''.join([subdir, os.path.dirname(filepath), "/", filename])

            rfm_entry['url'] = url
            rfm_entry['length'] = length
            rfm_entry['filename'] = output_path.lstrip("/")
            if content_type:
                rfm_entry["content_type"] = content_type
            if not filter_dict(args.filter, rfm_entry):
                continue
            if args.streaming_json:
                rfm_file.writelines(''.join([json.dumps(rfm_entry, sort_keys=True), '\n']))
            else:
                rfm.append(rfm_entry)

        if not args.streaming_json:
            rfm_file.write(json.dumps(deduplicate_rfm_entries(rfm), sort_keys=True, indent=2))

    logger.info("Successfully created remote file manifest: %s" % args.output_file)