def __init__(self, url_prefix=None, name_field=None, md5_field=None, acl_field=None, size_field=None,
             location_field=None):
    """
    If url_prefix is given, it will be prepended to file names to get the original URL;
    otherwise, name_field is assumed to contain complete URLs

    :param url_prefix: URL prefix to prepend to all file names
    :param name_field: field name used to store file name
    :param md5_field: field name used to store original MD5
    :param acl_field: field name used to store ACL
    :param size_field: field name used to store original file size
    :param location_field: field name used to store file location
    """
    super().__init__(name_field=name_field, md5_field=md5_field, size_field=size_field, acl_field=acl_field,
                     location_field=location_field)
    if isinstance(url_prefix, str) and url_prefix:
        self.url_prefix = removeTrailingSlash(url_prefix)
    else:
        self.url_prefix = None
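# Hedged usage sketch -- the owning class name is not shown in this section, so
# `UrlPrefixAdapter` and the field values below are assumptions:
#
#   adapter = UrlPrefixAdapter(
#       url_prefix='https://example.org/files/',  # trailing slash will be removed
#       name_field='file_name',
#       md5_field='md5sum',
#       size_field='file_size',
#   )
#   assert adapter.url_prefix == 'https://example.org/files'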
def __init__(self, bucket_name, prefix, adapter):
    """
    Copy files from URLs or local files to an S3 bucket

    :param bucket_name: string type, destination S3 bucket name
    :param prefix: string type, key prefix inside the destination bucket
    :param adapter: any object that has all attributes/methods listed in adapter_attrs
    """
    if not bucket_name:
        raise ValueError('Empty destination bucket name')
    self.bucket_name = bucket_name
    self.bucket = S3Bucket(self.bucket_name)
    if prefix and isinstance(prefix, str):
        self.prefix = removeTrailingSlash(prefix)
    else:
        raise ValueError(f'Invalid prefix: "{prefix}"')

    # Verify adapter has all functions needed
    for attr in self.adapter_attrs:
        if not hasattr(adapter, attr):
            raise TypeError(f'Adapter doesn\'t have "{attr}" attribute/method')
    self.adapter = adapter

    self.log = get_logger('Copier')
    self.files_exist_at_dest = 0
    self.files_copied = 0
    self.files_not_found = set()
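# Hedged usage sketch: `adapter_attrs` is defined on the class elsewhere; any
# object exposing those attributes/methods is accepted. The names below are
# placeholders:
#
#   copier = Copier('dest-bucket', 'release-1/files/', adapter)
#   assert copier.prefix == 'release-1/files'  # trailing slash removed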
def __init__(self, file_name):
    self.log = get_logger('Configuration')
    # Read the configuration file
    with open(file_name) as config_file:
        self.data = json.load(config_file)
    # Read the region and domain
    self.region = self.data['region']
    self.domain = self.data['domain']
    # Read the list of arm objects
    self.arms = []
    for obj in self.data['arms']:
        self.arms.append(Arm(obj))
    self.cipher_key = self.data['cipher_key']
    self.use_prod = self.data['useProd']
    # Get the secret name
    self.secret_name = self.data['secretName']
    # Get the Okta authorization URL
    self.okta_auth_url = self.data["oktaAuthUrl"]
    # Get the Match Treatment Arm API base URL
    self.match_base_url = removeTrailingSlash(self.data['matchBaseUrl'])
    # Get the CTDC API URL
    self.api_url = self.data['API_URL']
    if not self.use_prod:
        self.log.info('Using Match UAT Environment')
    else:
        self.log.info('Using Match Production Environment')
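# The Configuration class above reads exactly the keys below; a hypothetical
# minimal config file (all values are placeholders):
#
#   {
#       "region": "us-east-1",
#       "domain": "example.org",
#       "arms": [],
#       "cipher_key": "<cipher-key>",
#       "useProd": false,
#       "secretName": "<secret-name>",
#       "oktaAuthUrl": "https://example.okta.com/oauth2/default/v1/token",
#       "matchBaseUrl": "https://match.example.org/api/v1",
#       "API_URL": "https://ctdc.example.org/v1/graphql"
#   }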
def main(args):
    log = get_logger('Raw file processor - main')
    config = BentoConfig(args.config_file)

    if not args.queue:
        log.error('Please specify queue name with -q/--queue argument')
        sys.exit(1)

    uri = args.uri if args.uri else "bolt://localhost:7687"
    uri = removeTrailingSlash(uri)

    password = args.password
    if not password:
        if config.PSWD_ENV not in os.environ:
            log.error(
                'Password not specified! Please specify password with -p or --password argument, or set {} env var'.format(
                    config.PSWD_ENV))
            sys.exit(1)
        else:
            password = os.environ[config.PSWD_ENV]
    user = args.user if args.user else 'neo4j'

    if not args.schema:
        log.error('Please specify schema file(s) with -s or --schema argument')
        sys.exit(1)
    for schema_file in args.schema:
        if not os.path.isfile(schema_file):
            log.error('{} is not a file'.format(schema_file))
            sys.exit(1)

    if not args.bucket:
        log.error('Please specify output S3 bucket for final manifest(s) using -b/--bucket argument')
        sys.exit(1)
    if not args.s3_folder:
        log.error('Please specify output S3 folder for final manifest(s) using -f/--s3-folder argument')
        sys.exit(1)

    driver = None
    try:
        props = Props(args.prop_file)
        schema = ICDC_Schema(args.schema, props)
        driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
        processor = FileLoader(args.queue, driver, schema, config, args.bucket, args.s3_folder, args.dry_run)
        processor.listen()
    except neo4j.ServiceUnavailable as err:
        log.exception(err)
        log.critical("Can't connect to Neo4j server at: \"{}\"".format(uri))
    except KeyboardInterrupt:
        log.info("\nBye!")
        sys.exit()
    finally:
        if driver:
            driver.close()
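# Hedged invocation sketch -- the flags match the checks above, but the script
# name and positional-argument order are assumptions:
#
#   python file_processor.py config.yml -q raw-file-queue -s schema.yml \
#       -b output-bucket -f manifests -p <password>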
def test_remove_trailing_slash(self):
    self.assertEqual('abc', removeTrailingSlash('abc/'))
    self.assertEqual('abc', removeTrailingSlash('abc'))
    self.assertEqual('abc', removeTrailingSlash('abc//'))
    self.assertEqual('bolt://12.34.56.78', removeTrailingSlash('bolt://12.34.56.78'))
    self.assertEqual('bolt://12.34.56.78', removeTrailingSlash('bolt://12.34.56.78/'))
    self.assertEqual('bolt://12.34.56.78', removeTrailingSlash('bolt://12.34.56.78//'))
    self.assertEqual('bolt://12.34.56.78', removeTrailingSlash('bolt://12.34.56.78////'))
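# removeTrailingSlash itself is defined elsewhere; a minimal sketch consistent
# with the assertions above (strip any run of trailing slashes):
import re

def removeTrailingSlash(uri):
    # 'abc//' -> 'abc', 'bolt://12.34.56.78////' -> 'bolt://12.34.56.78'
    return re.sub(r'/+$', '', uri)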
def __init__(self, mode, adapter_module=None, adapter_class=None, adapter_params=None, domain=None, bucket=None,
             prefix=None, pre_manifest=None, first=1, count=-1, job_queue=None, result_queue=None, retry=3,
             overwrite=False, dryrun=False, verify_md5=False):
    """
    :param mode: loading mode, must be one of Config.valid_modes
    :param bucket: string type, destination S3 bucket name
    :param pre_manifest: string type, holds path to pre-manifest
    :param first: first file of files to process, file 1 is in line 2 of pre-manifest
    :param count: number of files to process, -1 means all
    :param adapter_module/adapter_class/adapter_params: define an adapter object that has all
        methods/properties listed in adapter_attrs
    """
    if mode not in Config.valid_modes:
        raise ValueError(f'Invalid loading mode: {mode}')
    self.mode = mode

    if mode != SOLO_MODE:
        if not job_queue:
            raise ValueError(f'Job queue name is required in {self.mode} mode!')
        self.job_queue_name = job_queue
        self.job_queue = Queue(job_queue)
        if not result_queue:
            raise ValueError(f'Result queue name is required in {self.mode} mode!')
        self.result_queue_name = result_queue
        self.result_queue = Queue(result_queue)

    if self.mode != SLAVE_MODE:
        if not bucket:
            raise ValueError('Empty destination bucket name')
        self.bucket_name = bucket

        if prefix and isinstance(prefix, str):
            self.prefix = removeTrailingSlash(prefix)
        else:
            raise ValueError(f'Invalid prefix: "{prefix}"')

        if not pre_manifest or not os.path.isfile(pre_manifest):
            raise ValueError(f'Pre-manifest: "{pre_manifest}" doesn\'t exist')
        self.pre_manifest = pre_manifest

        if not domain:
            raise ValueError('Empty domain!')
        self.domain = domain

        self.adapter_config = {
            self.ADAPTER_PARAMS: adapter_params,
            self.ADAPTER_CLASS: adapter_class,
            self.ADAPTER_MODULE: adapter_module
        }
        self._init_adapter(adapter_module, adapter_class, adapter_params)
    else:
        self.adapter = None
        self.adapter_config = {}

    self.copier = None

    if first <= 0 or count == 0:
        raise ValueError(f'Invalid first ({first}) or count ({count})')
    self.skip = first - 1
    self.count = count

    if not isinstance(retry, int) or retry <= 0:
        raise ValueError(f'Invalid retry value: {retry}')
    self.retry = retry

    if not isinstance(overwrite, bool):
        raise TypeError(f'Invalid overwrite value: {overwrite}')
    self.overwrite = overwrite

    if not isinstance(dryrun, bool):
        raise TypeError(f'Invalid dryrun value: {dryrun}')
    self.dryrun = dryrun
    self.verify_md5 = verify_md5

    self.log = get_logger('FileLoader')
    # Statistics
    self.files_processed = 0
    self.files_skipped = 0
    self.files_failed = 0
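# _init_adapter is not shown in this section; a hedged, standalone sketch of
# the dynamic import it presumably performs (module and class names come
# straight from adapter_config):
import importlib

def init_adapter(adapter_module, adapter_class, adapter_params):
    # Import the configured module, look up the adapter class by name, and
    # instantiate it with the configured keyword parameters (if any)
    module = importlib.import_module(adapter_module)
    cls = getattr(module, adapter_class)
    return cls(**(adapter_params or {}))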
def main():
    parser = argparse.ArgumentParser(description='Script to validate file copying')
    parser.add_argument('-sp', '--src-path', help='Source S3 bucket name and optional path')
    parser.add_argument('-db', '--dest-bucket', help='Destination S3 bucket name')
    parser.add_argument('-pf', '--previous-file', type=argparse.FileType('r'),
                        help='Previous output CSV file of this script')
    args = parser.parse_args()

    start_time = timer()
    fieldnames = [
        'src_bucket',
        'dest_bucket',
        'file_name',
        'file_size',
        'result',
        'reason'
    ]
    s3 = boto3.client('s3')

    # Revalidate a previous validation file
    if args.previous_file:
        log.info(f'Previous validation file: {args.previous_file.name}')
        reader = csv.DictReader(args.previous_file)
        file_list = []
        for obj in reader:
            src_bucket = obj['src_bucket']
            dest_bucket = obj['dest_bucket']
            if obj['result'] == SUCCEEDED:
                if not obj['reason'].endswith(PREVIOUSE_VALIDATED):
                    obj['reason'] += PREVIOUSE_VALIDATED
                file_list.append(obj)
            else:
                file = s3.head_object(Bucket=src_bucket, Key=obj['file_name'])
                file['Size'] = file['ContentLength']
                file['Key'] = obj['file_name']
                file_list.append(file)
    else:
        if not args.src_path or not args.dest_bucket:
            log.error('Source S3 path and Destination S3 bucket are required!')
            return
        source_path = removeTrailingSlash(args.src_path)
        dest_bucket = removeTrailingSlash(args.dest_bucket)

        src_bucket, s3_path = split_s3_path(source_path)
        log.info(f"Source bucket: {src_bucket}")
        log.info(f"Dest bucket: {dest_bucket}")
        log.info(f"Prefix: {s3_path}")

        file_list = list_files(s3, src_bucket, s3_path)

    num_files = len(file_list)
    log.info(f"There are {num_files} files to compare")
    os.makedirs(tmp_folder, exist_ok=True)
    output_file = f'{tmp_folder}/copy-file-validation-{get_time_stamp()}.csv'
    with open(output_file, 'w') as of:
        writer = csv.DictWriter(of, fieldnames=fieldnames)
        writer.writeheader()
        counter = 0
        succeeded = 0
        total_size = 0
        for file in file_list:
            counter += 1
            # These files have been successfully validated in a previous run
            if 'result' in file:
                writer.writerow(file)
                file_size = int(file['file_size'])
                total_size += file_size
                log.info(f"Validating file {counter}/{num_files} ({format_bytes(file_size)}): {file['file_name']}")
                log.info('Validated in previous run')
                continue
            file_size = file['Size']
            total_size += file_size
            try:
                log.info(f'Validating file {counter}/{num_files} ({format_bytes(file_size)}): {file["Key"]}')
                result, message = validate_file(s3, file, src_bucket, dest_bucket)
            except Exception as e:
                log.exception(e)
                log.error(f'Validating file: {file["Key"]} failed! See errors above.')
                result = FAILED
                message = e
            if result == SUCCEEDED:
                log.info(f"{result}: {message}")
                succeeded += 1
            else:
                log.error(f"{result}: {message}")
            log.info(f"Total verified file size: {format_bytes(total_size)}")
            writer.writerow({
                'src_bucket': src_bucket,
                'dest_bucket': dest_bucket,
                'file_name': file['Key'],
                'file_size': file_size,
                'result': result,
                'reason': message
            })

    end_time = timer()
    log.info(f"Comparing finished! Total files validated: {counter}, total file size: {format_bytes(total_size)}")
    log.info(f"Comparing succeeded: {succeeded} out of {num_files} files")
    log.info(f"Running time: {end_time - start_time:.2f} seconds")
    log.info(f"Output file is at: {output_file}")
    log.info(f"Log file is at: {get_log_file()}")
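# split_s3_path is defined elsewhere; a plausible sketch, assuming the input is
# '<bucket>[/<prefix>]' with no 's3://' scheme:
def split_s3_path(path):
    # 'my-bucket/data/raw' -> ('my-bucket', 'data/raw'); 'my-bucket' -> ('my-bucket', '')
    bucket, _, key = path.partition('/')
    return bucket, key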
def set_prefix(self, raw_prefix):
    prefix = removeTrailingSlash(raw_prefix)
    if prefix != self.prefix:
        self.prefix = prefix
def process_arguments(args, log):
    config_file = None
    if args.config_file:
        config_file = args.config_file
    config = BentoConfig(config_file)

    # Required Fields
    if args.dataset:
        config.dataset = args.dataset
    if not config.dataset:
        log.error('No dataset specified! Please specify a dataset in config file or with CLI argument --dataset')
        sys.exit(1)
    if not config.s3_folder and not os.path.isdir(config.dataset):
        log.error('{} is not a directory!'.format(config.dataset))
        sys.exit(1)

    if args.prop_file:
        config.prop_file = args.prop_file
    if not config.prop_file:
        log.error('No properties file specified! '
                  + 'Please specify a properties file in config file or with CLI argument --prop-file')
        sys.exit(1)

    if args.schema:
        config.schema_files = args.schema
    if not config.schema_files:
        log.error('No schema file specified! '
                  + 'Please specify at least one schema file in config file or with CLI argument --schema')
        sys.exit(1)

    if config.PSWD_ENV in os.environ and not config.neo4j_password:
        config.neo4j_password = os.environ[config.PSWD_ENV]
    if args.password:
        config.neo4j_password = args.password
    if not config.neo4j_password:
        log.error('Password not specified! Please specify password with -p or --password argument,'
                  + ' or set {} env var'.format(config.PSWD_ENV))
        sys.exit(1)

    # Conditionally Required Fields
    if args.split_transactions:
        config.split_transactions = args.split_transactions
    if args.no_backup:
        config.no_backup = args.no_backup
    if args.backup_folder:
        config.backup_folder = args.backup_folder
    if config.split_transactions and config.no_backup:
        log.error('--split-transaction and --no-backup cannot both be enabled, a backup is required when running'
                  ' in split transactions mode')
        sys.exit(1)
    if not config.backup_folder and not config.no_backup:
        log.error('Backup folder not specified! A backup folder is required unless the --no-backup argument is used')
        sys.exit(1)

    if args.s3_folder:
        config.s3_folder = args.s3_folder
    if config.s3_folder:
        if not os.path.exists(config.dataset):
            os.makedirs(config.dataset)
        else:
            exist_files = glob.glob('{}/*.txt'.format(config.dataset))
            if len(exist_files) > 0:
                log.error('Folder: "{}" is not empty, please empty it first'.format(config.dataset))
                sys.exit(1)

        if args.bucket:
            config.s3_bucket = args.bucket
        if not config.s3_bucket:
            log.error('Please specify S3 bucket name with -b/--bucket argument!')
            sys.exit(1)
        bucket = S3Bucket(config.s3_bucket)
        if not os.path.isdir(config.dataset):
            log.error('{} is not a directory!'.format(config.dataset))
            sys.exit(1)
        log.info(f'Loading data from s3://{config.s3_bucket}/{config.s3_folder}')
        if not bucket.download_files_in_folder(config.s3_folder, config.dataset):
            log.error('Download files from S3 bucket "{}" failed!'.format(config.s3_bucket))
            sys.exit(1)

    # Optional Fields
    if args.uri:
        config.neo4j_uri = args.uri
    if not config.neo4j_uri:
        config.neo4j_uri = 'bolt://localhost:7687'
    config.neo4j_uri = removeTrailingSlash(config.neo4j_uri)
    log.info(f"Loading into Neo4j at: {config.neo4j_uri}")

    if args.user:
        config.neo4j_user = args.user
    if not config.neo4j_user:
        config.neo4j_user = 'neo4j'

    if args.wipe_db:
        config.wipe_db = args.wipe_db
    if args.yes:
        config.yes = args.yes
    if args.dry_run:
        config.dry_run = args.dry_run
    if args.cheat_mode:
        config.cheat_mode = args.cheat_mode
    if args.mode:
        config.loading_mode = args.mode
    if not config.loading_mode:
        config.loading_mode = "UPSERT_MODE"
    if args.max_violations:
        config.max_violations = int(args.max_violations)
    if not config.max_violations:
        config.max_violations = 10

    return config
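# Precedence implemented above, highest first: CLI argument > config file value
# > environment variable (password only) > built-in default. For example, the
# Neo4j URI resolves as:
#
#   args.uri set        -> config.neo4j_uri = args.uri
#   config file only    -> config.neo4j_uri kept as-is
#   neither             -> 'bolt://localhost:7687'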