def __init__(self, file_name):
    """Load a JSON configuration file and expose its settings as attributes.

    :param file_name: path to the JSON configuration file
    """
    self.log = get_logger('Configuration')
    # Read the configuration file
    with open(file_name) as config_file:
        self.data = json.load(config_file)
    # Region / domain settings
    self.region = self.data['region']
    self.domain = self.data['domain']
    # Build Arm objects from the raw arm definitions
    self.arms = [Arm(obj) for obj in self.data['arms']]
    self.cipher_key = self.data['cipher_key']
    self.use_prod = self.data['useProd']
    # Secret / authentication settings
    self.secret_name = self.data['secretName']
    self.okta_auth_url = self.data["oktaAuthUrl"]
    # Match Treatment Arm API URL, normalized without a trailing slash
    self.match_base_url = removeTrailingSlash(self.data['matchBaseUrl'])
    # CTDC API URL
    self.api_url = self.data['API_URL']
    # BUGFIX: replaced "== False" comparison with truthiness test
    # (assumes useProd is a JSON boolean — TODO confirm against config files)
    if not self.use_prod:
        self.log.info('Using Match UAT Environment')
    else:
        self.log.info('Using Match Production Environment')
def __init__(self, bucket_name, prefix, adapter):
    """Copy file from URL or local file to S3 bucket.

    BUGFIX: docstring previously opened with four quotes (\"\"\"\") — corrected.

    :param bucket_name: destination S3 bucket name (string)
    :param prefix: S3 key prefix under which files are copied (non-empty string)
    :param adapter: any object providing all attributes listed in self.adapter_attrs
    :raises ValueError: for an empty bucket name or invalid prefix
    :raises TypeError: when the adapter is missing a required attribute
    """
    if not bucket_name:
        raise ValueError('Empty destination bucket name')
    self.bucket_name = bucket_name
    self.bucket = S3Bucket(self.bucket_name)
    # Normalize the prefix; reject anything that isn't a non-empty string
    if prefix and isinstance(prefix, str):
        self.prefix = removeTrailingSlash(prefix)
    else:
        raise ValueError(f'Invalid prefix: "{prefix}"')
    # Verify adapter has all functions needed
    for attr in self.adapter_attrs:
        if not hasattr(adapter, attr):
            raise TypeError(f'Adapter doesn\'t have "{attr}" attribute/method')
    self.adapter = adapter
    self.log = get_logger('Copier')
    # Copy statistics
    self.files_exist_at_dest = 0
    self.files_copied = 0
    self.files_not_found = set()
def __init__(self, schema):
    """Create a VisitCreator bound to a validated ICDC schema.

    :param schema: an ICDC_Schema instance describing nodes and relationships
    :raises Exception: if schema is falsy or of the wrong type
    """
    if not isinstance(schema, ICDC_Schema) or not schema:
        raise Exception('Invalid ICDC_Schema object')
    self.schema = schema
    self.log = get_logger('VisitCreator')
    # Counters for created graph objects
    self.nodes_created = 0
    self.relationships_created = 0
    # Per-type creation statistics
    self.nodes_stat = {}
    self.relationships_stat = {}
def main(args):
    """Validate CLI arguments, connect to Neo4j and start the raw file processor.

    :param args: parsed command-line arguments (queue, uri, password, user,
                 schema, bucket, s3_folder, config_file, prop_file, dry_run)
    """
    log = get_logger('Raw file processor - main')
    config = BentoConfig(args.config_file)
    # Queue name is mandatory
    if not args.queue:
        log.error('Please specify queue name with -q/--queue argument')
        sys.exit(1)
    # Default to a local Neo4j instance when no URI given
    uri = args.uri if args.uri else "bolt://localhost:7687"
    uri = removeTrailingSlash(uri)
    password = args.password
    if not password:
        # Fall back to the password environment variable when -p is not given
        if config.PSWD_ENV not in os.environ:
            log.error(
                'Password not specified! Please specify password with -p or --password argument, or set {} env var'.format(
                    config.PSWD_ENV))
            sys.exit(1)
        else:
            password = os.environ[config.PSWD_ENV]
    user = args.user if args.user else 'neo4j'
    if not args.schema:
        log.error('Please specify schema file(s) with -s or --schema argument')
        sys.exit(1)
    # Every schema path must point to an existing file
    for schema_file in args.schema:
        if not os.path.isfile(schema_file):
            log.error('{} is not a file'.format(schema_file))
            sys.exit(1)
    if not args.bucket:
        log.error('Please specify output S3 bucket for final manifest(s) using -b/--bucket argument')
        sys.exit(1)
    if not args.s3_folder:
        log.error('Please specify output S3 folder for final manifest(s) using -f/--s3-folder argument')
        sys.exit(1)
    driver = None
    try:
        props = Props(args.prop_file)
        schema = ICDC_Schema(args.schema, props)
        driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
        # Blocks listening on the queue until interrupted
        processor = FileLoader(args.queue, driver, schema, config, args.bucket, args.s3_folder, args.dry_run)
        processor.listen()
    except neo4j.ServiceUnavailable as err:
        log.exception(err)
        log.critical("Can't connect to Neo4j server at: \"{}\"".format(uri))
    except KeyboardInterrupt:
        log.info("\nBye!")
        sys.exit()
    finally:
        # Always release the Neo4j driver if it was created
        if driver:
            driver.close()
def __init__(self, yaml_files, props):
    """Build the ICDC schema from one or more YAML model files.

    :param yaml_files: list of YAML schema file paths
    :param props: a Props instance carrying property configuration
    :raises Exception: if the file list is empty or any file doesn't exist
    """
    assert isinstance(props, Props)
    self.props = props
    self.rel_prop_delimiter = props.rel_prop_delimiter

    if not yaml_files:
        # BUGFIX: removed unreachable sys.exit(1) that followed this raise
        raise Exception('File list is empty, couldn\'t initialize ICDC_Schema object!')
    for data_file in yaml_files:
        if not os.path.isfile(data_file):
            raise Exception('File "{}" doesn\'t exist'.format(data_file))

    self.log = get_logger('ICDC Schema')
    self.org_schema = {}
    # Merge all YAML files into one schema dict; later files override earlier keys
    for aFile in yaml_files:
        try:
            self.log.info('Reading schema file: {} ...'.format(aFile))
            # BUGFIX: dropped redundant os.path.isfile() re-check; the files
            # were already validated above, and open() errors are caught below
            with open(aFile) as schema_file:
                schema = yaml.safe_load(schema_file)
                if schema:
                    self.org_schema.update(schema)
        except Exception as e:
            self.log.exception(e)

    self.nodes = {}
    self.relationships = {}
    self.relationship_props = {}
    self.num_relationship = 0

    self.log.debug("-------------processing nodes-----------------")
    if NODES not in self.org_schema:
        self.log.error('Can\'t load any nodes!')
        sys.exit(1)
    elif PROP_DEFINITIONS not in self.org_schema:
        self.log.error('Can\'t load any properties!')
        sys.exit(1)
    for key, value in self.org_schema[NODES].items():
        # Assume all keys start with '_' are not regular nodes
        if not key.startswith('_'):
            self.process_node(key, value)

    self.log.debug("-------------processing edges-----------------")
    if RELATIONSHIPS in self.org_schema:
        for key, value in self.org_schema[RELATIONSHIPS].items():
            # Assume all keys start with '_' are not regular nodes
            if not key.startswith('_'):
                self.process_node(key, value, True)
                self.num_relationship += self.process_edges(key, value)
def __init__(self, config_file, args, config_file_arg='config_file'):
    """Load a YAML config file ('Config' section) and override with CLI args.

    :param config_file: path to the YAML configuration file
    :param args: parsed command-line arguments used to override config values
    :param config_file_arg: name of the args attribute holding the config path
    :raises ValueError: if config_file is empty or not a file
    """
    self.log = get_logger('Bento Config')
    if not config_file:
        # BUGFIX: removed spurious f-string prefix on a placeholder-less literal
        raise ValueError('Empty config file name')
    if not os.path.isfile(config_file):
        raise ValueError(f'"{config_file}" is not a file!')
    self.config_file_arg = config_file_arg
    with open(config_file) as c_file:
        self.data = yaml.safe_load(c_file)['Config']
        # An empty 'Config' section parses to None; normalize to empty dict
        if self.data is None:
            self.data = {}
    self._override(args)
def setUp(self):
    """Prepare a Neo4j driver, schema, loader and the test fixture file list."""
    neo4j_uri = 'bolt://localhost:7687'
    neo4j_user = '******'
    neo4j_password = os.environ['NEO_PASSWORD']
    self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
    self.data_folder = 'data/COTC007B'
    # Build the schema from the model files plus the shared props config
    self.schema = ICDC_Schema(
        ['data/icdc-model.yml', 'data/icdc-model-props.yml'],
        Props('../config/props-icdc.yml'))
    self.log = get_logger('Test Loader')
    self.loader = DataLoader(self.driver, self.schema)
    # Dataset files exercised by the tests
    self.file_list = [
        "data/Dataset/COP-program.txt",
        "data/Dataset/NCATS-COP01-case.txt",
        "data/Dataset/NCATS-COP01-diagnosis.txt",
        "data/Dataset/NCATS-COP01_cohort_file.txt",
        "data/Dataset/NCATS-COP01_study_file.txt"
    ]
def main():
    """Generate the CTDC SBG manifest from a Neo4j database."""
    parser = argparse.ArgumentParser(description='Generate CTDC SBG manifest')
    parser.add_argument('-i', '--uri', help='Neo4j uri like bolt://12.34.56.78:7687')
    parser.add_argument('-u', '--user', help='Neo4j user', default='neo4j')
    # BUGFIX: os.environ[PSWD_ENV] raised KeyError while *building* the parser
    # whenever the env var was unset — even if -p was supplied. .get() defers
    # the problem to connection time instead of crashing argument parsing.
    parser.add_argument('-p', '--password', help='Neo4j password',
                        default=os.environ.get(PSWD_ENV))
    args = parser.parse_args()
    log = get_logger('CTDC_SBG_Manifest')
    with GraphDatabase.driver(args.uri, auth=(args.user, args.password)) as driver:
        with driver.session() as session:
            # Read-only transaction; manifest generation commits nothing
            tx = session.begin_transaction()
            generate(tx, log)
def __init__(self, file_name):
    """Read loader properties from a YAML file ('Properties' section).

    :param file_name: path to the properties YAML file
    :raises Exception: when the file can't be opened or parses to nothing
    """
    self.log = get_logger('Props')
    # Guard clause: fail fast when the file is missing or unnamed
    if not file_name or not os.path.isfile(file_name):
        msg = f'Can NOT open file: "{file_name}"'
        self.log.error(msg)
        raise Exception(msg)
    with open(file_name) as prop_file:
        props = yaml.safe_load(prop_file)['Properties']
    if not props:
        msg = 'Can\'t read property file!'
        self.log.error(msg)
        raise Exception(msg)
    # Each setting falls back to a sensible default when absent
    self.plurals = props.get('plurals', {})
    self.type_mapping = props.get('type_mapping', {})
    self.id_fields = props.get('id_fields', {})
    self.visit_date_in_nodes = props.get('visit_date_in_nodes', {})
    self.domain = props.get('domain', 'Unknown.domain.nci.nih.gov')
    self.rel_prop_delimiter = props.get('rel_prop_delimiter', '$')
    self.indexes = props.get('indexes', [])
    self.save_parent_id = props.get('save_parent_id', [])
def __init__(self, queue_name, driver, schema, config, manifest_bucket, manifest_folder, dry_run=False):
    """Initialize the file loader that consumes messages from an SQS queue.

    :param queue_name: SQS queue to listen on
    :param driver: a neo4j.Driver instance
    :param schema: an ICDC_Schema instance
    :param config: a BentoConfig instance
    :param manifest_bucket: S3 bucket for final manifest(s)
    :param manifest_folder: S3 folder for final manifest(s)
    :param dry_run: when True, skip actual loading work
    """
    # Validate all collaborators up front (same check order as before)
    if not isinstance(config, BentoConfig):
        raise TypeError('config object has wrong type!')
    if not isinstance(driver, neo4j.Driver):
        raise Exception('Neo4j driver is invalid!')
    if not isinstance(schema, ICDC_Schema):
        raise Exception('Scheme is invalid!')
    if not manifest_bucket:
        raise Exception('Manifest bucket is invalid!')
    if not manifest_folder:
        raise Exception('Manifest folder is invalid')
    self.config = config
    self.log = get_logger('File Loader')
    self.queue_name = queue_name
    self.s3_client = boto3.client('s3')
    self.driver = driver
    self.schema = schema
    self.manifest_bucket = manifest_bucket
    self.manifest_folder = manifest_folder
    self.dry_run = dry_run
def __init__(self, driver, schema, plugins=None):
    """Create a DataLoader bound to a Neo4j driver and ICDC schema.

    :param driver: Neo4j driver used for loading
    :param schema: an ICDC_Schema instance
    :param plugins: optional list of plugin objects implementing the plugin
                    protocol (create_node, should_run, nodes_stat,
                    relationships_stat, nodes_created, relationships_created)
    :raises Exception: for an invalid schema
    :raises ValueError: for a plugin missing a required attribute
    """
    if not schema or not isinstance(schema, ICDC_Schema):
        raise Exception('Invalid ICDC_Schema object')
    self.log = get_logger('Data Loader')
    self.driver = driver
    self.schema = schema
    self.rel_prop_delimiter = self.schema.rel_prop_delimiter
    # BUGFIX: mutable default argument ([]) replaced with a None sentinel
    if plugins is None:
        plugins = []
    # Every plugin must implement the full plugin protocol
    required_attrs = ('create_node', 'should_run', 'nodes_stat',
                      'relationships_stat', 'nodes_created',
                      'relationships_created')
    for plugin in plugins:
        for attr in required_attrs:
            if not hasattr(plugin, attr):
                raise ValueError('Invalid Plugin!')
    self.plugins = plugins
def __init__(self, mode, adapter_module=None, adapter_class=None, adapter_params=None,
             domain=None, bucket=None, prefix=None, pre_manifest=None, first=1,
             count=-1, job_queue=None, result_queue=None, retry=3, overwrite=False,
             dryrun=False, verify_md5=False):
    """Configuration for the file copier.

    :param mode: loading mode; must be one of Config.valid_modes
    :param bucket: destination S3 bucket name (string)
    :param pre_manifest: path to pre-manifest; file 1 is in line 2 of pre-manifest
    :param first: first file of files to process
    :param count: number of files to process (-1 means all)
    :param job_queue: SQS job queue name (required unless SOLO mode)
    :param result_queue: SQS result queue name (required unless SOLO mode)
    :param retry: positive int, number of copy attempts
    :param overwrite: bool, overwrite files already at destination
    :param dryrun: bool, don't actually copy anything
    :param verify_md5: verify MD5 of copied files
    """
    if mode not in Config.valid_modes:
        raise ValueError(f'Invalid loading mode: {mode}')
    self.mode = mode

    if mode != SOLO_MODE:
        # Distributed modes need both a job queue and a result queue
        if not job_queue:
            raise ValueError(f'Job queue name is required in {self.mode} mode!')
        self.job_queue_name = job_queue
        self.job_queue = Queue(job_queue)
        if not result_queue:
            raise ValueError(f'Result queue name is required in {self.mode} mode!')
        self.result_queue_name = result_queue
        self.result_queue = Queue(result_queue)

    if self.mode != SLAVE_MODE:
        # Master/solo modes perform the actual copy and need full settings
        if not bucket:
            raise ValueError('Empty destination bucket name')
        self.bucket_name = bucket
        if prefix and isinstance(prefix, str):
            self.prefix = removeTrailingSlash(prefix)
        else:
            raise ValueError(f'Invalid prefix: "{prefix}"')
        if not pre_manifest or not os.path.isfile(pre_manifest):
            # BUGFIX: corrected typo "dosen't" in the error message
            raise ValueError(f'Pre-manifest: "{pre_manifest}" doesn\'t exist')
        self.pre_manifest = pre_manifest
        if not domain:
            raise ValueError('Empty domain!')
        self.domain = domain
        self.adapter_config = {
            self.ADAPTER_PARAMS: adapter_params,
            self.ADAPTER_CLASS: adapter_class,
            self.ADAPTER_MODULE: adapter_module
        }
        self._init_adapter(adapter_module, adapter_class, adapter_params)
    else:
        self.adapter = None
        self.adapter_config = {}

    self.copier = None

    if not first > 0 or count == 0:
        raise ValueError(f'Invalid first ({first}) or count ({count})')
    self.skip = first - 1
    self.count = count

    # BUGFIX: the original condition "not isinstance(retry, int) and retry > 0"
    # could never reject zero/negative ints (the isinstance test short-circuited
    # it); require a positive int explicitly
    if not isinstance(retry, int) or retry <= 0:
        raise ValueError(f'Invalid retry value: {retry}')
    self.retry = retry
    if not isinstance(overwrite, bool):
        raise TypeError(f'Invalid overwrite value: {overwrite}')
    self.overwrite = overwrite
    if not isinstance(dryrun, bool):
        raise TypeError(f'Invalid dryrun value: {dryrun}')
    self.dryrun = dryrun
    self.verify_md5 = verify_md5

    self.log = get_logger('FileLoader')
    # Statistics
    self.files_processed = 0
    self.files_skipped = 0
    self.files_failed = 0
def __init__(self, config_file):
    """Load loader configuration from a YAML file ('Config' section).

    When config_file is None, every setting defaults to None (or empty) so
    attribute access remains safe.

    :param config_file: path to a YAML config file, or None
    :raises Exception: when config_file is given but is not a readable file
    """
    self.log = get_logger('Bento Config')
    self.PSWD_ENV = 'NEO_PASSWORD'
    if config_file is None:
        # File-Loader related
        self.temp_folder = None
        self.queue_long_pull_time = None
        self.visibility_timeout = None
        self.indexd_guid_prefix = None
        self.indexd_manifest_ext = None
        self.rel_prop_delimiter = None
        # BUGFIX: slack_url and plugins were only assigned in the else branch,
        # so BentoConfig(None) raised AttributeError on these attributes
        self.slack_url = None
        self.plugins = []
        # Data-Loader Related
        self.backup_folder = None
        self.neo4j_uri = None
        self.neo4j_user = None
        self.neo4j_password = None
        self.schema_files = None
        self.prop_file = None
        self.cheat_mode = None
        self.dry_run = None
        self.wipe_db = None
        self.no_backup = None
        self.yes = None
        self.max_violations = None
        self.s3_bucket = None
        self.s3_folder = None
        self.loading_mode = None
        self.dataset = None
        self.no_parents = None
        self.split_transactions = None
    else:
        if os.path.isfile(config_file):
            with open(config_file) as c_file:
                config = yaml.safe_load(c_file)['Config']

            # BUGFIX: default the optional-section settings so the attributes
            # always exist even when 'sqs'/'indexd'/'neo4j' keys are absent
            self.queue_long_pull_time = None
            self.visibility_timeout = None
            self.indexd_guid_prefix = None
            self.indexd_manifest_ext = None
            self.neo4j_uri = None
            self.neo4j_user = None
            self.neo4j_password = None

            #################################
            # Folders
            self.temp_folder = config.get('temp_folder')
            if self.temp_folder:
                self._create_folder(self.temp_folder)
            self.backup_folder = config.get('backup_folder')
            if self.backup_folder:
                self._create_folder(self.backup_folder)

            #################################
            # File-loader related
            if 'sqs' in config:
                sqs = config['sqs']
                self.queue_long_pull_time = sqs.get('long_pull_time')
                self.visibility_timeout = sqs.get('visibility_timeout')
            if 'indexd' in config:
                indexd = config['indexd']
                self.indexd_guid_prefix = indexd.get('GUID_prefix')
                self.indexd_manifest_ext = indexd.get('ext')
                # Normalize the manifest extension to always start with a dot
                if self.indexd_manifest_ext and not self.indexd_manifest_ext.startswith('.'):
                    self.indexd_manifest_ext = '.' + self.indexd_manifest_ext
            self.slack_url = config.get('url')

            #################################
            # Data-loader related
            self.rel_prop_delimiter = config.get('rel_prop_delimiter')
            if 'neo4j' in config:
                neo4j = config['neo4j']
                self.neo4j_uri = neo4j.get('uri')
                self.neo4j_user = neo4j.get('user')
                self.neo4j_password = neo4j.get('password')
            self.plugins = []
            if 'plugins' in config:
                for plugin in config.get('plugins', []) or []:
                    self.plugins.append(PluginConfig(plugin))
            self.schema_files = config.get('schema')
            self.prop_file = config.get('prop_file')
            self.cheat_mode = config.get('cheat_mode')
            self.dry_run = config.get('dry_run')
            self.wipe_db = config.get('wipe_db')
            self.no_backup = config.get('no_backup')
            self.yes = config.get('no_confirmation')
            self.max_violations = config.get('max_violations', 10)
            self.s3_bucket = config.get('s3_bucket')
            self.s3_folder = config.get('s3_folder')
            self.loading_mode = config.get('loading_mode', 'UPSERT_MODE')
            self.dataset = config.get('dataset')
            self.no_parents = config.get('no_parents')
            self.split_transactions = config.get('split_transactions')
        else:
            msg = f'Can NOT open configuration file "{config_file}"!'
            self.log.error(msg)
            raise Exception(msg)
# This script can be used to verify or replace UUIDs generated by file-copier
# UUID should be generated by using project's domain name, node type of 'file'
# and file's file_location (new) or MD5 (legacy used before UBC01 and UBC02) as signature
import argparse
import csv
import os

from bento.common.utils import LOG_PREFIX, APP_NAME, get_logger, get_uuid, get_time_stamp, removeTrailingSlash, \
    get_log_file, format_bytes

if LOG_PREFIX not in os.environ:
    os.environ[LOG_PREFIX] = 'UUID_util'

os.environ[APP_NAME] = 'UUID_util'
log = get_logger('UUID_util')


def get_new_manifest_name(manifest):
    """Return the path "<name>_corrected<ext>" next to the original manifest."""
    folder = os.path.dirname(manifest)
    # BUGFIX: split('.') raised ValueError for base names with zero or more
    # than one dot; splitext keeps everything before the LAST dot and also
    # works for extension-less names
    org_name, ext = os.path.splitext(os.path.basename(manifest))
    new_name = f"{org_name}_corrected{ext}"
    return os.path.join(folder, new_name)


def process_file(file_obj, signature_column, uuid_column, domain, indexd_mode):
    """Read a tab-separated manifest and process its UUIDs.

    NOTE(review): this chunk appears truncated — `data` and `reader` are set
    up but the processing loop is not visible in this view; confirm against
    the full file.
    """
    file_name = file_obj.name
    log.info(f"Processing {file_name}")
    data = []
    reader = csv.DictReader(file_obj, delimiter='\t')
def __init__(self, api_url):
    """Create a metadata validator for the given API endpoint.

    :param api_url: base URL of the metadata API (string)
    :raises TypeError: if api_url is not a string
    """
    # BUGFIX: assert is stripped under "python -O"; validate explicitly
    if not isinstance(api_url, str):
        raise TypeError(f'api_url must be a string, got {type(api_url).__name__}')
    self.api_url = api_url
    self.log = get_logger('Metadata_Validator')
# If ETags don't match, it will download both files then calculate and compare MD5s
import argparse
import csv
import os
from timeit import default_timer as timer

import boto3
from botocore.exceptions import ClientError

from bento.common.utils import LOG_PREFIX, APP_NAME, get_md5, get_logger, get_time_stamp, removeTrailingSlash, \
    get_log_file, format_bytes

if LOG_PREFIX not in os.environ:
    os.environ[LOG_PREFIX] = 'File_copy_validator'

os.environ[APP_NAME] = 'File_copy_validator'
log = get_logger('Validator')

SUCCEEDED = 'Succeeded'
FAILED = 'Failed'
tmp_folder = 'tmp'
PREVIOUSE_VALIDATED = ': in previous validation'


# Input like s3://some/path(/)
def split_s3_path(s3_path):
    """Split an "s3://bucket/key" path into a (bucket, key) pair."""
    bucket, _, key = s3_path.replace("s3://", "").partition("/")
    return bucket, key
def setUp(self):
    """Prepare Neo4j driver, schema and loader plus the COTC007B/NCATS fixture lists."""
    uri = 'bolt://localhost:7687'
    user = '******'
    password = os.environ['NEO_PASSWORD']
    self.driver = GraphDatabase.driver(uri, auth = (user, password))
    self.data_folder = 'data/COTC007B'
    props = Props('../config/props-icdc.yml')
    self.schema = ICDC_Schema(['data/icdc-model.yml', 'data/icdc-model-props.yml'], props)
    self.log = get_logger('Test Loader')
    self.loader = DataLoader(self.driver, self.schema)
    # Full dataset, including duplicate vital_signs rows
    self.file_list = [
        "data/Dataset/COP-program.txt",
        "data/Dataset/COTC007B-case.txt",
        "data/Dataset/COTC007B-cohort.txt",
        "data/Dataset/COTC007B-cycle.txt",
        "data/Dataset/COTC007B-demographic.txt",
        "data/Dataset/COTC007B-diagnostic.txt",
        "data/Dataset/COTC007B-enrollment.txt",
        "data/Dataset/COTC007B-extent_of_disease.txt",
        "data/Dataset/COTC007B-physical_exam.txt",
        "data/Dataset/COTC007B-principal_investigator.txt",
        "data/Dataset/COTC007B-prior_surgery.txt",
        "data/Dataset/COTC007B-study.txt",
        "data/Dataset/COTC007B-study_arm.txt",
        "data/Dataset/COTC007B-vital_signs.txt",
        "data/Dataset/NCATS-COP01-blood_samples.txt",
        "data/Dataset/NCATS-COP01-case.txt",
        "data/Dataset/NCATS-COP01-demographic.txt",
        "data/Dataset/NCATS-COP01-diagnosis.txt",
        "data/Dataset/NCATS-COP01-enrollment.txt",
        "data/Dataset/NCATS-COP01-normal_samples.txt",
        "data/Dataset/NCATS-COP01-tumor_samples.txt",
        "data/Dataset/NCATS-COP01_20170228-GSL-079A-PE-Breen-NCATS-MEL-Rep1-Lane3.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep1-Lane1.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep1-Lane2.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep2-Lane1.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep3-Lane1.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep2-Lane2.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep2-Lane3.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep3-Lane2.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep3-Lane3.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_cohort_file.txt",
        "data/Dataset/NCATS-COP01_path_report_file_neo4j.txt",
        "data/Dataset/NCATS-COP01_study_file.txt"
    ]
    # Same dataset but with the de-duplicated vital_signs file
    self.file_list_unique = [
        "data/Dataset/COP-program.txt",
        "data/Dataset/COTC007B-case.txt",
        "data/Dataset/COTC007B-cohort.txt",
        "data/Dataset/COTC007B-cycle.txt",
        "data/Dataset/COTC007B-demographic.txt",
        "data/Dataset/COTC007B-diagnostic.txt",
        "data/Dataset/COTC007B-enrollment.txt",
        "data/Dataset/COTC007B-extent_of_disease.txt",
        "data/Dataset/COTC007B-physical_exam.txt",
        "data/Dataset/COTC007B-principal_investigator.txt",
        "data/Dataset/COTC007B-prior_surgery.txt",
        "data/Dataset/COTC007B-study.txt",
        "data/Dataset/COTC007B-study_arm.txt",
        "data/Dataset/COTC007B-vital_signs_unique.txt",
        "data/Dataset/NCATS-COP01-blood_samples.txt",
        "data/Dataset/NCATS-COP01-case.txt",
        "data/Dataset/NCATS-COP01-demographic.txt",
        "data/Dataset/NCATS-COP01-diagnosis.txt",
        "data/Dataset/NCATS-COP01-enrollment.txt",
        "data/Dataset/NCATS-COP01-normal_samples.txt",
        "data/Dataset/NCATS-COP01-tumor_samples.txt",
        "data/Dataset/NCATS-COP01_20170228-GSL-079A-PE-Breen-NCATS-MEL-Rep1-Lane3.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep1-Lane1.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep1-Lane2.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep2-Lane1.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep3-Lane1.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep2-Lane2.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep2-Lane3.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep3-Lane2.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep3-Lane3.tar-file_neo4j.txt",
        "data/Dataset/NCATS-COP01_cohort_file.txt",
        "data/Dataset/NCATS-COP01_path_report_file_neo4j.txt",
        "data/Dataset/NCATS-COP01_study_file.txt"
    ]
def main():
    """Load dataset files into Neo4j according to the parsed configuration."""
    log = get_logger('Loader')
    log_file = get_log_file()
    config = process_arguments(parse_arguments(), log)
    if not check_schema_files(config.schema_files, log):
        return
    driver = None
    restore_cmd = ''
    try:
        txt_files = glob.glob('{}/*.txt'.format(config.dataset))
        tsv_files = glob.glob('{}/*.tsv'.format(config.dataset))
        file_list = txt_files + tsv_files
        if file_list:
            # Destructive operations require explicit confirmation unless
            # no_confirmation was given
            if config.wipe_db and not config.yes:
                if not confirm_deletion('Wipe out entire Neo4j database before loading?'):
                    sys.exit()
            if config.loading_mode == DELETE_MODE and not config.yes:
                if not confirm_deletion('Delete all nodes and child nodes from data file?'):
                    sys.exit()
            # Prefer a dataset-local props file when one exists
            prop_path = os.path.join(config.dataset, config.prop_file)
            if os.path.isfile(prop_path):
                props = Props(prop_path)
            else:
                props = Props(config.prop_file)
            schema = ICDC_Schema(config.schema_files, props)
            if not config.dry_run:
                driver = GraphDatabase.driver(
                    config.neo4j_uri,
                    auth=(config.neo4j_user, config.neo4j_password),
                    encrypted=False)
            plugins = []
            if len(config.plugins) > 0:
                for plugin_config in config.plugins:
                    plugins.append(prepare_plugin(plugin_config, schema))
            loader = DataLoader(driver, schema, plugins)
            loader.load(file_list, config.cheat_mode, config.dry_run,
                        config.loading_mode, config.wipe_db, config.max_violations,
                        split=config.split_transactions, no_backup=config.no_backup,
                        neo4j_uri=config.neo4j_uri, backup_folder=config.backup_folder)
            # BUGFIX: removed the duplicated driver.close()/restore_cmd logging
            # that ran here on success — the finally block below already does
            # both, so the original closed the driver twice and logged the
            # restore command twice
        else:
            log.info('No files to load.')
    except ServiceUnavailable:
        log.critical("Neo4j service not available at: \"{}\"".format(config.neo4j_uri))
        return
    except AuthError:
        log.error("Wrong Neo4j username or password!")
        return
    except KeyboardInterrupt:
        log.critical("User stopped the loading!")
        return
    finally:
        # Clean up, report the restore command, and ship the log to S3
        if driver:
            driver.close()
        if restore_cmd:
            log.info(restore_cmd)
        if config.s3_bucket and config.s3_folder:
            result = upload_log_file(config.s3_bucket, f'{config.s3_folder}/logs', log_file)
            if result:
                log.info(f'Uploading log file {log_file} succeeded!')
            else:
                log.error(f'Uploading log file {log_file} failed!')
def __init__(self, file_name):
    """Create a manifest validator for the given manifest file.

    :param file_name: path to an existing manifest file
    :raises ValueError: if file_name is not an existing file
    """
    # BUGFIX: assert is stripped under "python -O"; validate explicitly
    if not os.path.isfile(file_name):
        raise ValueError(f'"{file_name}" is not a file!')
    self.file_name = file_name
    self.log = get_logger('Manifest_Validator')
    # Cache of S3 bucket objects keyed by bucket name
    self.s3_buckets = {}