Example #1
    def __init__(self,
                 url_prefix=None,
                 name_field=None,
                 md5_field=None,
                 acl_field=None,
                 size_field=None,
                 location_field=None):
        """
        If url_prefix is a non-empty string, it is normalized (trailing
        slashes stripped) and later prepended to file names to get the
        original URL; otherwise name_field is assumed to contain complete
        URLs already.

        :param url_prefix: URL prefix to prepend to all file names
        :param name_field: field name used to store file name
        :param md5_field: field name used to store original MD5
        :param acl_field: field name used to store ACL
        :param size_field: field name used to store original file size
        :param location_field: field name used to store file location
        """
        super().__init__(name_field=name_field,
                         md5_field=md5_field,
                         size_field=size_field,
                         acl_field=acl_field,
                         location_field=location_field)
        # Anything other than a non-empty string (None, '', non-str) → no prefix
        self.url_prefix = None
        if url_prefix and isinstance(url_prefix, str):
            self.url_prefix = removeTrailingSlash(url_prefix)
Example #2
    def __init__(self, bucket_name, prefix, adapter):
        """
        Copy file from URL or local file to S3 bucket

        :param bucket_name: destination S3 bucket name (string)
        :param prefix: destination key prefix (non-empty string, required)
        :param adapter: object providing every attribute/method listed in
            self.adapter_attrs
        :raises ValueError: on empty bucket name or invalid prefix
        :raises TypeError: when the adapter is missing a required attribute
        """
        if not bucket_name:
            raise ValueError('Empty destination bucket name')
        self.bucket_name = bucket_name
        self.bucket = S3Bucket(self.bucket_name)

        if not (prefix and isinstance(prefix, str)):
            raise ValueError(f'Invalid prefix: "{prefix}"')
        self.prefix = removeTrailingSlash(prefix)

        # Verify adapter has all functions needed; fail on the first one missing
        for required in self.adapter_attrs:
            if not hasattr(adapter, required):
                raise TypeError(f'Adapter doesn\'t have "{required}" attribute/method')
        self.adapter = adapter

        self.log = get_logger('Copier')
        # Copy statistics
        self.files_exist_at_dest = 0
        self.files_copied = 0
        self.files_not_found = set()
Example #3
    def __init__(self, file_name):
        """
        Load configuration from a JSON file and log which Match
        environment (UAT vs Production) will be used.

        :param file_name: path to the JSON configuration file
        :raises KeyError: if a required key is missing from the file
        """
        self.log = get_logger('Configuration')
        # Read the Configuration File
        with open(file_name) as config_file:
            self.data = json.load(config_file)

        # Read the region and domain
        self.region = self.data['region']
        self.domain = self.data['domain']

        # Build Arm objects from the raw 'arms' entries
        self.arms = [Arm(obj) for obj in self.data['arms']]

        self.cipher_key = self.data['cipher_key']
        # Whether to target the Match production environment
        self.use_prod = self.data['useProd']

        # Get the Secret Name UAT
        self.secret_name = self.data['secretName']
        # Get Okta UAT Authorization URL
        self.okta_auth_url = self.data["oktaAuthUrl"]
        # Get the Match UAT Treatment Arm Api URL (normalized, no trailing slash)
        self.match_base_url = removeTrailingSlash(self.data['matchBaseUrl'])

        # Get CTDC API URL
        self.api_url = self.data['API_URL']

        # Fixed: avoid '== False' comparison (PEP 8); any falsy value
        # (False, null/None, 0) now selects the UAT environment
        if not self.use_prod:
            self.log.info('Using Match UAT Environment')
        else:
            self.log.info('Using Match Production Environment')
Example #4
def main(args):
    """Validate CLI arguments, then start the raw-file processor listening on the queue."""
    log = get_logger('Raw file processor - main')
    config = BentoConfig(args.config_file)

    if not args.queue:
        log.error('Please specify queue name with -q/--queue argument')
        sys.exit(1)

    # Default to a local Neo4j bolt endpoint when no URI was given
    uri = removeTrailingSlash(args.uri or "bolt://localhost:7687")

    # Password: CLI argument wins, otherwise fall back to the env var
    password = args.password
    if not password:
        if config.PSWD_ENV in os.environ:
            password = os.environ[config.PSWD_ENV]
        else:
            log.error(
                'Password not specified! Please specify password with -p or --password argument, or set {} env var'.format( config.PSWD_ENV))
            sys.exit(1)
    user = args.user or 'neo4j'

    if not args.schema:
        log.error('Please specify schema file(s) with -s or --schema argument')
        sys.exit(1)

    for schema_file in args.schema:
        if not os.path.isfile(schema_file):
            log.error('{} is not a file'.format(schema_file))
            sys.exit(1)

    if not args.bucket:
        log.error('Please specify output S3 bucket for final manifest(s) using -b/--bucket argument')
        sys.exit(1)

    if not args.s3_folder:
        log.error('Please specify output S3 folder for final manifest(s) using -f/--s3-folder argument')
        sys.exit(1)

    driver = None
    try:
        props = Props(args.prop_file)
        schema = ICDC_Schema(args.schema, props)
        driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
        processor = FileLoader(args.queue, driver, schema, config, args.bucket, args.s3_folder, args.dry_run)
        processor.listen()

    except neo4j.ServiceUnavailable as err:
        log.exception(err)
        log.critical("Can't connect to Neo4j server at: \"{}\"".format(uri))

    except KeyboardInterrupt:
        log.info("\nBye!")
        sys.exit()

    finally:
        # Always release the Neo4j driver if it was created
        if driver:
            driver.close()
Example #5
 def test_remove_traling_slash(self):
     """Trailing slashes (one or many) are stripped; slash-free input is unchanged."""
     cases = [
         ('abc', 'abc/'),
         ('abc', 'abc'),
         ('abc', 'abc//'),
         ('bolt://12.34.56.78', 'bolt://12.34.56.78'),
         ('bolt://12.34.56.78', 'bolt://12.34.56.78/'),
         ('bolt://12.34.56.78', 'bolt://12.34.56.78//'),
         ('bolt://12.34.56.78', 'bolt://12.34.56.78////'),
     ]
     for expected, raw in cases:
         self.assertEqual(expected, removeTrailingSlash(raw))
Example #6
    def __init__(self,
                 mode,
                 adapter_module=None,
                 adapter_class=None,
                 adapter_params=None,
                 domain=None,
                 bucket=None,
                 prefix=None,
                 pre_manifest=None,
                 first=1,
                 count=-1,
                 job_queue=None,
                 result_queue=None,
                 retry=3,
                 overwrite=False,
                 dryrun=False,
                 verify_md5=False):
        """
        :param mode: loading mode, must be one of Config.valid_modes
        :param adapter_module: module containing the adapter class
        :param adapter_class: adapter class name; an adapter is any object
            that has the methods/properties defined in adapter_attrs
        :param adapter_params: parameters passed to the adapter constructor
        :param domain: data domain, required unless running in slave mode
        :param bucket: destination S3 bucket name (string)
        :param prefix: destination key prefix (non-empty string)
        :param pre_manifest: string type, holds path to pre-manifest
        :param first: first file of files to process, file 1 is in line 2 of pre-manifest
        :param count: number of files to process (-1 means all)
        :param job_queue: job queue name, required outside solo mode
        :param result_queue: result queue name, required outside solo mode
        :param retry: number of retries, must be a positive integer
        :param overwrite: overwrite files already at destination (bool)
        :param dryrun: don't actually copy anything (bool)
        :param verify_md5: whether to verify MD5 of copied files
        :raises ValueError: on invalid mode/prefix/manifest/first/count/retry
        :raises TypeError: when overwrite or dryrun is not a bool
        """
        if mode not in Config.valid_modes:
            raise ValueError(f'Invalid loading mode: {mode}')
        self.mode = mode

        # Queues are only needed when cooperating with other processes
        if mode != SOLO_MODE:
            if not job_queue:
                raise ValueError(
                    f'Job queue name is required in {self.mode} mode!')
            self.job_queue_name = job_queue
            self.job_queue = Queue(job_queue)
            if not result_queue:
                raise ValueError(
                    f'Result queue name is required in {self.mode} mode!')
            self.result_queue_name = result_queue
            self.result_queue = Queue(result_queue)

        if self.mode != SLAVE_MODE:
            if not bucket:
                raise ValueError('Empty destination bucket name')
            self.bucket_name = bucket

            if prefix and isinstance(prefix, str):
                self.prefix = removeTrailingSlash(prefix)
            else:
                raise ValueError(f'Invalid prefix: "{prefix}"')

            if not pre_manifest or not os.path.isfile(pre_manifest):
                # Fixed typo in error message: "dosen't" -> "doesn't"
                raise ValueError(
                    f'Pre-manifest: "{pre_manifest}" doesn\'t exist')
            self.pre_manifest = pre_manifest

            if not domain:
                raise ValueError('Empty domain!')
            self.domain = domain

            self.adapter_config = {
                self.ADAPTER_PARAMS: adapter_params,
                self.ADAPTER_CLASS: adapter_class,
                self.ADAPTER_MODULE: adapter_module
            }
            self._init_adapter(adapter_module, adapter_class, adapter_params)
        else:
            # Slave mode receives jobs from the queue; no adapter needed
            self.adapter = None
            self.adapter_config = {}

        self.copier = None

        if not first > 0 or count == 0:
            raise ValueError(f'Invalid first ({first}) or count ({count})')
        self.skip = first - 1
        self.count = count

        # Fixed: original condition was 'not isinstance(retry, int) and retry > 0',
        # which could never reject a non-positive int (e.g. retry=0 or retry=-1
        # passed validation); retry must be a positive integer
        if not isinstance(retry, int) or retry <= 0:
            raise ValueError(f'Invalid retry value: {retry}')
        self.retry = retry
        if not isinstance(overwrite, bool):
            raise TypeError(f'Invalid overwrite value: {overwrite}')
        self.overwrite = overwrite
        if not isinstance(dryrun, bool):
            raise TypeError(f'Invalid dryrun value: {dryrun}')
        self.dryrun = dryrun
        self.verify_md5 = verify_md5

        self.log = get_logger('FileLoader')

        # Statistics
        self.files_processed = 0
        self.files_skipped = 0
        self.files_failed = 0
Example #7
def main():
    """
    Validate files copied between S3 buckets and write a per-file CSV report.

    Two modes: with -pf/--previous-file, re-validate a prior report (rows
    that already SUCCEEDED are carried over; failed rows are re-checked);
    otherwise list everything under the source path and validate each file
    against the destination bucket.
    """
    parser = argparse.ArgumentParser(
        description='Script to validate file copying')
    parser.add_argument('-sp',
                        '--src-path',
                        help='Source S3 bucket name and optional path')
    parser.add_argument('-db',
                        '--dest-bucket',
                        help='Destination S3 bucket name')
    parser.add_argument('-pf',
                        '--previous-file',
                        type=argparse.FileType('r'),
                        help='Previous output CSV file of this script')
    args = parser.parse_args()
    start_time = timer()
    # Columns of the output CSV report
    fieldnames = [
        'src_bucket', 'dest_bucket', 'file_name', 'file_size', 'result',
        'reason'
    ]
    s3 = boto3.client('s3')

    # Revalidate a previous validation file
    if args.previous_file:
        log.info(f'Previous validation file: {args.previous_file.name}')
        reader = csv.DictReader(args.previous_file)
        file_list = []

        # NOTE(review): src_bucket/dest_bucket end up holding the values from
        # the LAST row read; they are reused when writing re-checked rows
        # below — confirm all rows of a previous report share one bucket pair
        for obj in reader:
            src_bucket = obj['src_bucket']
            dest_bucket = obj['dest_bucket']
            if obj['result'] == SUCCEEDED:
                # Already validated: keep the row as-is, tagging the reason
                if not obj['reason'].endswith(PREVIOUSE_VALIDATED):
                    obj['reason'] += PREVIOUSE_VALIDATED
                file_list.append(obj)
            else:
                # Failed last time: fetch fresh metadata so it is re-checked below
                file = s3.head_object(Bucket=src_bucket, Key=obj['file_name'])
                file['Size'] = file['ContentLength']
                file['Key'] = obj['file_name']
                file_list.append(file)

    else:
        if not args.src_path or not args.dest_bucket:
            log.error('Source S3 path and Destination S3 bucket are required!')
            return
        source_path = removeTrailingSlash(args.src_path)
        dest_bucket = removeTrailingSlash(args.dest_bucket)
        src_bucket, s3_path = split_s3_path(source_path)

        log.info(f"Source bucket: {src_bucket}")
        log.info(f"Dest   bucket: {dest_bucket}")
        log.info(f"Prefix: {s3_path}")

        file_list = list_files(s3, src_bucket, s3_path)

    num_files = len(file_list)
    log.info(f"There are {num_files} files to compare")

    os.makedirs(tmp_folder, exist_ok=True)
    output_file = f'{tmp_folder}/copy-file-validation-{get_time_stamp()}.csv'
    with open(output_file, 'w') as of:
        writer = csv.DictWriter(of, fieldnames=fieldnames)
        writer.writeheader()

        counter = 0
        succeeded = 0
        total_size = 0
        for file in file_list:
            counter += 1

            # These files has been successfully validated last time
            # (rows carried over from a previous report have a 'result' key)
            if 'result' in file:
                writer.writerow(file)
                file_size = int(file['file_size'])
                total_size += file_size
                log.info(
                    f"Valiating file {counter}/{num_files} ({format_bytes(file_size)}): {file['file_name']}"
                )
                log.info('Validated in previous run')
                continue

            file_size = file['Size']
            total_size += file_size
            try:
                log.info(
                    f'Valiating file {counter}/{num_files} ({format_bytes(file_size)}): {file["Key"]}'
                )
                result, message = validate_file(s3, file, src_bucket,
                                                dest_bucket)
            except Exception as e:
                # Record the failure in the report instead of aborting the run
                log.exception(e)
                log.error(
                    f'Valiating file: {file["Key"]} failed! See errors above.')
                result = FAILED
                message = e

            if result == SUCCEEDED:
                log.info(f"{result}: {message}")
                succeeded += 1
            else:
                log.error(f"{result}: {message}")
            log.info(f"Total Verified file size: {format_bytes(total_size)}")
            writer.writerow({
                'src_bucket': src_bucket,
                'dest_bucket': dest_bucket,
                'file_name': file['Key'],
                'file_size': file_size,
                'result': result,
                'reason': message
            })

        end_time = timer()
        log.info(
            f"Comparing finished! Total files validated: {counter}, total file size: {format_bytes(total_size)}"
        )
        log.info(f"Comparing succeeded: {succeeded} out of {num_files} files")
        log.info(f"Running time: {end_time - start_time:.2f} seconds")
        log.info(f"Output file is at: {output_file}")
        log.info(f"Log file is at: {get_log_file()}")
Example #8
 def set_prefix(self, raw_prefix):
     """Normalize raw_prefix (strip trailing slashes) and store it if it changed."""
     normalized = removeTrailingSlash(raw_prefix)
     if self.prefix != normalized:
         self.prefix = normalized
Example #9
def process_arguments(args, log):
    """
    Merge CLI arguments into a BentoConfig and validate the result.

    CLI arguments take precedence over config-file values. Exits the
    process (sys.exit(1)) with a logged error on any missing required
    field or invalid combination.

    :param args: parsed argparse namespace
    :param log: logger used for error/info messages
    :return: the populated BentoConfig object
    """
    config_file = None
    if args.config_file:
        config_file = args.config_file
    config = BentoConfig(config_file)

    # Required Fields
    if args.dataset:
        config.dataset = args.dataset
    if not config.dataset:
        log.error(
            'No dataset specified! Please specify a dataset in config file or with CLI argument --dataset'
        )
        sys.exit(1)
    # When loading from S3, the dataset dir is created later instead
    if not config.s3_folder and not os.path.isdir(config.dataset):
        log.error('{} is not a directory!'.format(config.dataset))
        sys.exit(1)

    if args.prop_file:
        config.prop_file = args.prop_file
    if not config.prop_file:
        log.error(
            'No properties file specified! ' +
            'Please specify a properties file in config file or with CLI argument --prop-file'
        )
        sys.exit(1)

    if args.schema:
        config.schema_files = args.schema
    if not config.schema_files:
        log.error(
            'No schema file specified! ' +
            'Please specify at least one schema file in config file or with CLI argument --schema'
        )
        sys.exit(1)

    # Password precedence: env var fills a missing config value,
    # then the CLI argument overrides both
    if config.PSWD_ENV in os.environ and not config.neo4j_password:
        config.neo4j_password = os.environ[config.PSWD_ENV]
    if args.password:
        config.neo4j_password = args.password
    if not config.neo4j_password:
        log.error(
            'Password not specified! Please specify password with -p or --password argument,'
            + ' or set {} env var'.format(config.PSWD_ENV))
        sys.exit(1)

    # Conditionally Required Fields
    if args.split_transactions:
        config.split_transactions = args.split_transactions
    if args.no_backup:
        config.no_backup = args.no_backup
    if args.backup_folder:
        config.backup_folder = args.backup_folder
    # Split-transactions mode requires a backup, so the two flags conflict
    if config.split_transactions and config.no_backup:
        log.error(
            '--split-transaction and --no-backup cannot both be enabled, a backup is required when running'
            ' in split transactions mode')
        sys.exit(1)
    if not config.backup_folder and not config.no_backup:
        log.error(
            'Backup folder not specified! A backup folder is required unless the --no-backup argument is used'
        )
        sys.exit(1)

    if args.s3_folder:
        config.s3_folder = args.s3_folder
    if config.s3_folder:
        # Dataset dir must be empty (of .txt files) before downloading from S3
        if not os.path.exists(config.dataset):
            os.makedirs(config.dataset)
        else:
            exist_files = glob.glob('{}/*.txt'.format(config.dataset))
            if len(exist_files) > 0:
                log.error(
                    'Folder: "{}" is not empty, please empty it first'.format(
                        config.dataset))
                sys.exit(1)

        if args.bucket:
            config.s3_bucket = args.bucket
        if not config.s3_bucket:
            log.error(
                'Please specify S3 bucket name with -b/--bucket argument!')
            sys.exit(1)
        bucket = S3Bucket(config.s3_bucket)
        if not os.path.isdir(config.dataset):
            log.error('{} is not a directory!'.format(config.dataset))
            sys.exit(1)
        log.info(
            f'Loading data from s3://{config.s3_bucket}/{config.s3_folder}')
        if not bucket.download_files_in_folder(config.s3_folder,
                                               config.dataset):
            log.error('Download files from S3 bucket "{}" failed!'.format(
                config.s3_bucket))
            sys.exit(1)

    # Optional Fields
    if args.uri:
        config.neo4j_uri = args.uri
    if not config.neo4j_uri:
        # Default to a local Neo4j bolt endpoint
        config.neo4j_uri = 'bolt://localhost:7687'
    config.neo4j_uri = removeTrailingSlash(config.neo4j_uri)
    log.info(f"Loading into Neo4j at: {config.neo4j_uri}")

    if args.user:
        config.neo4j_user = args.user
    if not config.neo4j_user:
        config.neo4j_user = '******'

    if args.wipe_db:
        config.wipe_db = args.wipe_db

    if args.yes:
        config.yes = args.yes

    if args.dry_run:
        config.dry_run = args.dry_run

    if args.cheat_mode:
        config.cheat_mode = args.cheat_mode

    if args.mode:
        config.loading_mode = args.mode
    if not config.loading_mode:
        config.loading_mode = "UPSERT_MODE"

    # NOTE(review): an explicit --max-violations 0 is falsy and gets
    # overwritten by the default of 10 — confirm that is intended
    if args.max_violations:
        config.max_violations = int(args.max_violations)
    if not config.max_violations:
        config.max_violations = 10

    return config