Ejemplo n.º 1
0
    def __init__(self, file_name):
        """Read the JSON configuration file and expose its settings as attributes.

        :param file_name: path of the JSON configuration file
        :raises KeyError: if a required key is missing (assumes the file holds a JSON object)
        """
        self.log = get_logger('Configuration')

        # Parse the whole configuration file up front
        with open(file_name) as json_file:
            self.data = json.load(json_file)
        settings = self.data

        # Region and domain
        self.region = settings['region']
        self.domain = settings['domain']

        # Wrap every raw arm entry in an Arm object
        self.arms = [Arm(arm_entry) for arm_entry in settings['arms']]

        # Cipher key and the flag that selects the environment
        self.cipher_key = settings['cipher_key']
        self.use_prod = settings['useProd']

        # Secret name, Okta authorization URL and Match base URL
        self.secret_name = settings['secretName']
        self.okta_auth_url = settings["oktaAuthUrl"]
        self.match_base_url = removeTrailingSlash(settings['matchBaseUrl'])

        # CTDC API URL
        self.api_url = settings['API_URL']

        # Report which Match environment was selected
        if self.use_prod == False:
            environment_msg = 'Using Match UAT Environment'
        else:
            environment_msg = 'Using Match Production Environment'
        self.log.info(environment_msg)
Ejemplo n.º 2
0
    def __init__(self, bucket_name, prefix, adapter):
        """
        Copy file from URL or local file to S3 bucket

        :param bucket_name: string type, name of the destination bucket
        :param prefix: non-empty string, stored after removeTrailingSlash() is applied
        :param adapter: object that has every attribute/method listed in self.adapter_attrs
        :raises ValueError: if bucket_name is empty or prefix is not a non-empty string
        :raises TypeError: if adapter lacks one of the required attributes/methods
        """
        if not bucket_name:
            raise ValueError('Empty destination bucket name')
        self.bucket_name = bucket_name
        self.bucket = S3Bucket(self.bucket_name)

        if not (prefix and isinstance(prefix, str)):
            raise ValueError(f'Invalid prefix: "{prefix}"')
        self.prefix = removeTrailingSlash(prefix)

        # Reject adapters that miss any required attribute/method
        for required in self.adapter_attrs:
            if hasattr(adapter, required):
                continue
            raise TypeError(f'Adapter doesn\'t have "{required}" attribute/method')
        self.adapter = adapter

        self.log = get_logger('Copier')

        # Statistics, all start out empty
        self.files_exist_at_dest = 0
        self.files_copied = 0
        self.files_not_found = set()
Ejemplo n.º 3
0
 def __init__(self, schema):
     """Bind the creator to a schema and reset its statistics.

     :param schema: ICDC_Schema object
     :raises Exception: if schema is falsy or is not an ICDC_Schema
     """
     if not (schema and isinstance(schema, ICDC_Schema)):
         raise Exception('Invalid ICDC_Schema object')
     self.log = get_logger('VisitCreator')
     self.schema = schema

     # Counters of what has been created so far
     self.nodes_created = 0
     self.relationships_created = 0

     # Per-item statistics, filled in elsewhere
     self.nodes_stat = {}
     self.relationships_stat = {}
Ejemplo n.º 4
0
def main(args):
    """Validate the parsed command line arguments, then start the file loader.

    Exits with status 1 when the queue, password, schema files, bucket or
    S3 folder is missing or invalid. The Neo4j driver, if it was created,
    is closed on the way out.

    :param args: parsed command line arguments
    """
    log = get_logger('Raw file processor - main')
    config = BentoConfig(args.config_file)

    def abort(message):
        # Log the problem and stop the process with a failure status
        log.error(message)
        sys.exit(1)

    if not args.queue:
        abort('Please specify queue name with -q/--queue argument')

    uri = removeTrailingSlash(args.uri or "bolt://localhost:7687")

    # Command line password wins, environment variable is the fallback
    password = args.password
    if not password:
        if config.PSWD_ENV in os.environ:
            password = os.environ[config.PSWD_ENV]
        else:
            abort(
                f'Password not specified! Please specify password with -p or --password argument, or set {config.PSWD_ENV} env var')
    user = args.user or 'neo4j'

    if not args.schema:
        abort('Please specify schema file(s) with -s or --schema argument')

    for schema_file in args.schema:
        if not os.path.isfile(schema_file):
            abort(f'{schema_file} is not a file')

    if not args.bucket:
        abort('Please specify output S3 bucket for final manifest(s) using -b/--bucket argument')

    if not args.s3_folder:
        abort('Please specify output S3 folder for final manifest(s) using -f/--s3-folder argument')

    driver = None
    try:
        schema = ICDC_Schema(args.schema, Props(args.prop_file))
        driver = neo4j.GraphDatabase.driver(uri, auth=(user, password))
        processor = FileLoader(args.queue, driver, schema, config, args.bucket, args.s3_folder, args.dry_run)
        processor.listen()

    except neo4j.ServiceUnavailable as err:
        log.exception(err)
        log.critical(f'Can\'t connect to Neo4j server at: "{uri}"')

    except KeyboardInterrupt:
        log.info("\nBye!")
        sys.exit()

    finally:
        if driver:
            driver.close()
Ejemplo n.º 5
0
    def __init__(self, yaml_files, props):
        """
        Load YAML schema files and build node and relationship definitions

        The process exits with status 1 if the merged schema has no NODES or
        no PROP_DEFINITIONS section.

        :param yaml_files: non-empty list of paths, each must be an existing file
        :param props: Props object, supplies rel_prop_delimiter
        :raises Exception: if yaml_files is empty or one of the paths is not a file
        """
        assert isinstance(props, Props)
        self.props = props
        self.rel_prop_delimiter = props.rel_prop_delimiter

        if not yaml_files:
            raise Exception('File list is empty, couldn\'t initialize ICDC_Schema object!')
        for data_file in yaml_files:
            if not os.path.isfile(data_file):
                raise Exception('File "{}" doesn\'t exist'.format(data_file))

        self.log = get_logger('ICDC Schema')
        self.org_schema = {}
        for schema_path in yaml_files:
            try:
                self.log.info('Reading schema file: {} ...'.format(schema_path))
                # Every path was verified above, so a file that can't be
                # opened now is reported by the handler below instead of
                # being skipped silently
                with open(schema_path) as schema_file:
                    schema = yaml.safe_load(schema_file)
                if schema:
                    self.org_schema.update(schema)
            except Exception as e:
                # Best effort: log the failure and keep reading the other files
                self.log.exception(e)

        self.nodes = {}
        self.relationships = {}
        self.relationship_props = {}
        self.num_relationship = 0

        self.log.debug("-------------processing nodes-----------------")
        if NODES not in self.org_schema:
            self.log.error('Can\'t load any nodes!')
            sys.exit(1)

        if PROP_DEFINITIONS not in self.org_schema:
            self.log.error('Can\'t load any properties!')
            sys.exit(1)

        for key, value in self.org_schema[NODES].items():
            # Assume all keys start with '_' are not regular nodes
            if not key.startswith('_'):
                self.process_node(key, value)

        self.log.debug("-------------processing edges-----------------")
        if RELATIONSHIPS in self.org_schema:
            for key, value in self.org_schema[RELATIONSHIPS].items():
                # Assume all keys start with '_' are not regular relationships
                if not key.startswith('_'):
                    self.process_node(key, value, True)
                    self.num_relationship += self.process_edges(key, value)
Ejemplo n.º 6
0
    def __init__(self, config_file, args, config_file_arg='config_file'):
        """
        Load the 'Config' section of a YAML file, then apply overrides from args

        :param config_file: path of the YAML configuration file
        :param args: passed to self._override() after the file is loaded
        :param config_file_arg: stored as self.config_file_arg
        :raises ValueError: if config_file is empty or is not a file
        :raises KeyError: if a non-empty file has no 'Config' key
        """
        self.log = get_logger('Bento Config')
        if not config_file:
            raise ValueError('Empty config file name')
        if not os.path.isfile(config_file):
            raise ValueError(f'"{config_file}" is not a file!')

        self.config_file_arg = config_file_arg

        with open(config_file) as c_file:
            document = yaml.safe_load(c_file)

        # An empty YAML file parses to None; treat it like an empty 'Config'
        # section instead of failing on the subscript below
        if document is None:
            self.data = {}
        else:
            self.data = document['Config']
            if self.data is None:
                self.data = {}

        self._override(args)
Ejemplo n.º 7
0
    def setUp(self):
        """Create the Neo4j driver, schema, loader and data file list used by the tests."""
        # Connection settings, the password is read from NEO_PASSWORD
        neo4j_uri = 'bolt://localhost:7687'
        neo4j_user = '******'
        neo4j_password = os.environ['NEO_PASSWORD']
        self.driver = GraphDatabase.driver(neo4j_uri,
                                           auth=(neo4j_user, neo4j_password))

        self.data_folder = 'data/COTC007B'

        # Schema is built from the model files plus the ICDC properties
        icdc_props = Props('../config/props-icdc.yml')
        model_files = ['data/icdc-model.yml', 'data/icdc-model-props.yml']
        self.schema = ICDC_Schema(model_files, icdc_props)

        self.log = get_logger('Test Loader')
        self.loader = DataLoader(self.driver, self.schema)

        # Data files available to the tests
        self.file_list = [
            "data/Dataset/COP-program.txt",
            "data/Dataset/NCATS-COP01-case.txt",
            "data/Dataset/NCATS-COP01-diagnosis.txt",
            "data/Dataset/NCATS-COP01_cohort_file.txt",
            "data/Dataset/NCATS-COP01_study_file.txt",
        ]
Ejemplo n.º 8
0
def main():
    """Parse the command line, connect to Neo4j and generate the manifest.

    The password comes from -p/--password, or from the environment variable
    named by PSWD_ENV when the option is not given. If neither supplies a
    password the parser reports a usage error and exits.
    """
    parser = argparse.ArgumentParser(description='Generate CTDC SBG manifest')
    parser.add_argument('-i',
                        '--uri',
                        help='Neo4j uri like bolt://12.34.56.78:7687')
    parser.add_argument('-u', '--user', help='Neo4j user', default='neo4j')
    # Use .get() for the default: indexing os.environ raised KeyError while
    # the parser was being built, even when -p was given on the command line
    parser.add_argument('-p',
                        '--password',
                        help='Neo4j password',
                        default=os.environ.get(PSWD_ENV))
    args = parser.parse_args()

    if args.password is None:
        parser.error(
            'Neo4j password is required: use -p/--password or set the {} env var'
            .format(PSWD_ENV))

    log = get_logger('CTDC_SBG_Manifest')

    with GraphDatabase.driver(args.uri,
                              auth=(args.user, args.password)) as driver:
        with driver.session() as session:
            # NOTE(review): the transaction is never committed or closed
            # here; presumably generate() only reads - confirm
            tx = session.begin_transaction()
            generate(tx, log)
Ejemplo n.º 9
0
 def __init__(self, file_name):
     """Load the 'Properties' section of a YAML property file.

     :param file_name: path of the YAML property file
     :raises Exception: if file_name is empty or not a file, or if the
         'Properties' section is empty
     """
     self.log = get_logger('Props')

     # Refuse a missing file right away
     if not (file_name and os.path.isfile(file_name)):
         msg = f'Can NOT open file: "{file_name}"'
         self.log.error(msg)
         raise Exception(msg)

     with open(file_name) as prop_file:
         props = yaml.safe_load(prop_file)['Properties']

     if not props:
         msg = 'Can\'t read property file!'
         self.log.error(msg)
         raise Exception(msg)

     # Dictionary type settings, empty when absent
     self.plurals = props.get('plurals', {})
     self.type_mapping = props.get('type_mapping', {})
     self.id_fields = props.get('id_fields', {})
     self.visit_date_in_nodes = props.get('visit_date_in_nodes', {})

     # String type settings with their fallback values
     self.domain = props.get('domain', 'Unknown.domain.nci.nih.gov')
     self.rel_prop_delimiter = props.get('rel_prop_delimiter', '$')

     # List type settings, empty when absent
     self.indexes = props.get('indexes', [])
     self.save_parent_id = props.get('save_parent_id', [])
Ejemplo n.º 10
0
    def __init__(self,
                 queue_name,
                 driver,
                 schema,
                 config,
                 manifest_bucket,
                 manifest_folder,
                 dry_run=False):
        """
        Validate and store everything the file loader needs

        :param queue_name: stored as self.queue_name
        :param driver: neo4j.Driver object
        :param schema: ICDC_Schema object
        :param config: BentoConfig object
        :param manifest_bucket: non-empty value, stored as self.manifest_bucket
        :param manifest_folder: non-empty value, stored as self.manifest_folder
        :param dry_run: stored as self.dry_run
        :raises TypeError: if config is not a BentoConfig
        :raises Exception: if driver, schema, manifest_bucket or manifest_folder is invalid
        """
        if not isinstance(config, BentoConfig):
            raise TypeError('config object has wrong type!')
        self.config = config

        self.log = get_logger('File Loader')
        self.queue_name = queue_name
        self.s3_client = boto3.client('s3')

        # Check the remaining arguments in order, first failure wins
        if not isinstance(driver, neo4j.Driver):
            raise Exception('Neo4j driver is invalid!')
        if not isinstance(schema, ICDC_Schema):
            raise Exception('Scheme is invalid!')
        if not manifest_bucket:
            raise Exception('Manifest bucket is invalid!')
        if not manifest_folder:
            raise Exception('Manifest folder is invalid')

        self.driver = driver
        self.schema = schema
        self.manifest_bucket = manifest_bucket
        self.manifest_folder = manifest_folder
        self.dry_run = dry_run
Ejemplo n.º 11
0
    def __init__(self, driver, schema, plugins=None):
        """
        Store the driver, schema and plugins used for loading

        :param driver: stored as self.driver, not validated here
        :param schema: ICDC_Schema object
        :param plugins: optional list of plugin objects; each must have the
            attributes create_node, should_run, nodes_stat, relationships_stat,
            nodes_created and relationships_created
        :raises Exception: if schema is falsy or is not an ICDC_Schema
        :raises ValueError: if a plugin lacks one of the required attributes
        """
        if not schema or not isinstance(schema, ICDC_Schema):
            raise Exception('Invalid ICDC_Schema object')
        self.log = get_logger('Data Loader')
        self.driver = driver
        self.schema = schema
        self.rel_prop_delimiter = self.schema.rel_prop_delimiter

        # A fresh list per instance: a mutable default argument would be
        # shared by every loader created without plugins
        if plugins is None:
            plugins = []

        required_attrs = ('create_node', 'should_run', 'nodes_stat',
                          'relationships_stat', 'nodes_created',
                          'relationships_created')
        if plugins:
            for plugin in plugins:
                if not all(hasattr(plugin, attr) for attr in required_attrs):
                    raise ValueError('Invalid Plugin!')
        self.plugins = plugins
Ejemplo n.º 12
0
    def __init__(self,
                 mode,
                 adapter_module=None,
                 adapter_class=None,
                 adapter_params=None,
                 domain=None,
                 bucket=None,
                 prefix=None,
                 pre_manifest=None,
                 first=1,
                 count=-1,
                 job_queue=None,
                 result_queue=None,
                 retry=3,
                 overwrite=False,
                 dryrun=False,
                 verify_md5=False):
        """
        Validate and store the settings of a file loading run

        :param mode: loading mode, must be one of Config.valid_modes
        :param adapter_module: passed to self._init_adapter(), unused in SLAVE_MODE
        :param adapter_class: passed to self._init_adapter(), unused in SLAVE_MODE
        :param adapter_params: passed to self._init_adapter(), unused in SLAVE_MODE
        :param domain: required unless mode is SLAVE_MODE
        :param bucket: string type, required unless mode is SLAVE_MODE
        :param prefix: non-empty string, required unless mode is SLAVE_MODE
        :param pre_manifest: string type, holds path to pre-manifest, required unless mode is SLAVE_MODE
        :param first: first file of files to process, file 1 is in line 2 of pre-manifest
        :param count: number of files to process, must not be 0
        :param job_queue: job queue name, required unless mode is SOLO_MODE
        :param result_queue: result queue name, required unless mode is SOLO_MODE
        :param retry: positive integer
        :param overwrite: bool type
        :param dryrun: bool type
        :param verify_md5: stored as self.verify_md5
        :raises ValueError: if mode, queue names, bucket, prefix, pre_manifest,
            domain, first, count or retry is invalid
        :raises TypeError: if overwrite or dryrun is not a bool
        """
        if mode not in Config.valid_modes:
            raise ValueError(f'Invalid loading mode: {mode}')
        self.mode = mode

        if mode != SOLO_MODE:
            if not job_queue:
                raise ValueError(
                    f'Job queue name is required in {self.mode} mode!')
            self.job_queue_name = job_queue
            self.job_queue = Queue(job_queue)
            if not result_queue:
                raise ValueError(
                    f'Result queue name is required in {self.mode} mode!')
            self.result_queue_name = result_queue
            self.result_queue = Queue(result_queue)

        if self.mode != SLAVE_MODE:
            if not bucket:
                raise ValueError('Empty destination bucket name')
            self.bucket_name = bucket

            if prefix and isinstance(prefix, str):
                self.prefix = removeTrailingSlash(prefix)
            else:
                raise ValueError(f'Invalid prefix: "{prefix}"')

            if not pre_manifest or not os.path.isfile(pre_manifest):
                raise ValueError(
                    f'Pre-manifest: "{pre_manifest}" dosen\'t exist')
            self.pre_manifest = pre_manifest

            if not domain:
                raise ValueError('Empty domain!')
            self.domain = domain

            self.adapter_config = {
                self.ADAPTER_PARAMS: adapter_params,
                self.ADAPTER_CLASS: adapter_class,
                self.ADAPTER_MODULE: adapter_module
            }
            self._init_adapter(adapter_module, adapter_class, adapter_params)
        else:
            self.adapter = None
            self.adapter_config = {}

        self.copier = None

        if not first > 0 or count == 0:
            raise ValueError(f'Invalid first ({first}) or count ({count})')
        self.skip = first - 1
        self.count = count

        # retry must be BOTH an int and positive. The previous condition
        # ("not isinstance(...) and retry > 0") let 0 and negative values
        # through and raised TypeError on non-numeric values
        if not (isinstance(retry, int) and retry > 0):
            raise ValueError(f'Invalid retry value: {retry}')
        self.retry = retry
        if not isinstance(overwrite, bool):
            raise TypeError(f'Invalid overwrite value: {overwrite}')
        self.overwrite = overwrite
        if not isinstance(dryrun, bool):
            raise TypeError(f'Invalid dryrun value: {dryrun}')
        self.dryrun = dryrun
        self.verify_md5 = verify_md5

        self.log = get_logger('FileLoader')

        # Statistics
        self.files_processed = 0
        self.files_skipped = 0
        self.files_failed = 0
Ejemplo n.º 13
0
    def __init__(self, config_file):
        """
        Load settings from the 'Config' section of a YAML file

        Every attribute is first given a default, so it exists whether or not
        a config file is given and whether or not the optional 'sqs',
        'indexd', 'neo4j' and 'plugins' sections are present in it.

        :param config_file: path of the YAML configuration file, or None to
            keep the defaults
        :raises Exception: if config_file is given but is not a file
        """
        self.log = get_logger('Bento Config')
        self.PSWD_ENV = 'NEO_PASSWORD'

        # File-Loader related defaults
        self.temp_folder = None
        self.queue_long_pull_time = None
        self.visibility_timeout = None
        self.indexd_guid_prefix = None
        self.indexd_manifest_ext = None
        self.rel_prop_delimiter = None
        self.slack_url = None

        # Data-Loader related defaults
        self.backup_folder = None
        self.neo4j_uri = None
        self.neo4j_user = None
        self.neo4j_password = None
        self.plugins = []
        self.schema_files = None
        self.prop_file = None
        self.cheat_mode = None
        self.dry_run = None
        self.wipe_db = None
        self.no_backup = None
        self.yes = None
        self.max_violations = None
        self.s3_bucket = None
        self.s3_folder = None
        self.loading_mode = None
        self.dataset = None
        self.no_parents = None
        self.split_transactions = None

        if config_file is None:
            return

        if not os.path.isfile(config_file):
            msg = f'Can NOT open configuration file "{config_file}"!'
            self.log.error(msg)
            raise Exception(msg)

        with open(config_file) as c_file:
            config = yaml.safe_load(c_file)['Config']

        #################################
        # Folders
        self.temp_folder = config.get('temp_folder')
        if self.temp_folder:
            self._create_folder(self.temp_folder)

        self.backup_folder = config.get('backup_folder')
        if self.backup_folder:
            self._create_folder(self.backup_folder)

        #################################
        # File-loader related
        # A missing or empty section leaves its attributes at None
        sqs_section = config.get('sqs') or {}
        self.queue_long_pull_time = sqs_section.get('long_pull_time')
        self.visibility_timeout = sqs_section.get('visibility_timeout')

        indexd_section = config.get('indexd') or {}
        self.indexd_guid_prefix = indexd_section.get('GUID_prefix')
        self.indexd_manifest_ext = indexd_section.get('ext')
        # Make sure the extension starts with a dot
        if self.indexd_manifest_ext and not self.indexd_manifest_ext.startswith('.'):
            self.indexd_manifest_ext = '.' + self.indexd_manifest_ext
        self.slack_url = config.get('url')

        #################################
        # Data-loader related
        self.rel_prop_delimiter = config.get('rel_prop_delimiter')

        # Named *_section so the local doesn't hide a neo4j module import
        neo4j_section = config.get('neo4j') or {}
        self.neo4j_uri = neo4j_section.get('uri')
        self.neo4j_user = neo4j_section.get('user')
        self.neo4j_password = neo4j_section.get('password')

        self.plugins = [PluginConfig(plugin)
                        for plugin in config.get('plugins') or []]

        self.schema_files = config.get('schema')
        self.prop_file = config.get('prop_file')
        self.cheat_mode = config.get('cheat_mode')
        self.dry_run = config.get('dry_run')
        self.wipe_db = config.get('wipe_db')
        self.no_backup = config.get('no_backup')
        self.yes = config.get('no_confirmation')
        self.max_violations = config.get('max_violations', 10)
        self.s3_bucket = config.get('s3_bucket')
        self.s3_folder = config.get('s3_folder')
        self.loading_mode = config.get('loading_mode', 'UPSERT_MODE')
        self.dataset = config.get('dataset')
        self.no_parents = config.get('no_parents')
        self.split_transactions = config.get('split_transactions')
Ejemplo n.º 14
0
# This script can be used to verify or replace UUIDs generated by file-copier
# UUID should be generated by using project's domain name, node type of 'file' and file's file_location (new) or MD5 (legacy used before UBC01 and UBC02) as signature

import argparse
import csv
import os

from bento.common.utils import LOG_PREFIX, APP_NAME, get_logger, get_uuid, get_time_stamp, removeTrailingSlash, \
    get_log_file, format_bytes

# Keep a LOG_PREFIX that is already present in the environment, otherwise use the tool name
os.environ.setdefault(LOG_PREFIX, 'UUID_util')

# APP_NAME is always set to the tool name
os.environ[APP_NAME] = 'UUID_util'
log = get_logger('UUID_util')


def get_new_manifest_name(manifest):
    """Return the path of the corrected copy of a manifest.

    "_corrected" is inserted between the file name and its last extension
    and the folder is kept, e.g. "data/files.tsv" becomes
    "data/files_corrected.tsv".

    :param manifest: path of the original manifest
    :return: path of the corrected manifest, in the same folder
    """
    folder, file_name = os.path.split(manifest)
    # splitext() splits at the last dot only, so names with several dots or
    # with no extension at all no longer fail on tuple unpacking
    org_name, ext = os.path.splitext(file_name)
    return os.path.join(folder, f"{org_name}_corrected{ext}")


def process_file(file_obj, signature_column, uuid_column, domain, indexd_mode):
    file_name = file_obj.name
    log.info(f"Processing {file_name}")
    data = []

    reader = csv.DictReader(file_obj, delimiter='\t')
Ejemplo n.º 15
0
 def __init__(self, api_url):
     """Remember the API URL and create the validator's logger.

     :param api_url: string type, stored as self.api_url
     """
     assert isinstance(api_url, str)
     self.log = get_logger('Metadata_Validator')
     self.api_url = api_url
Ejemplo n.º 16
0
# If ETags don't match, it will download both files then calculate and compare MD5s
import argparse
import csv
import os
import boto3
from botocore.exceptions import ClientError
from timeit import default_timer as timer

from bento.common.utils import LOG_PREFIX, APP_NAME, get_md5, get_logger, get_time_stamp, removeTrailingSlash, \
                               get_log_file, format_bytes

# Keep a LOG_PREFIX that is already present in the environment, otherwise use the tool name
os.environ.setdefault(LOG_PREFIX, 'File_copy_validator')

# APP_NAME is always set to the tool name
os.environ[APP_NAME] = 'File_copy_validator'
log = get_logger('Validator')

# Module level string constants
# NOTE(review): presumably validation result labels and a temp folder name - confirm against their uses
SUCCEEDED, FAILED = 'Succeeded', 'Failed'
tmp_folder = 'tmp'
PREVIOUSE_VALIDATED = ': in previous validation'


# Input like s3://some/path(/)
def split_s3_path(s3_path):
    """Split an S3 path into bucket name and key.

    Every "s3://" occurrence is removed first; the text before the first
    "/" is the bucket and everything after it is the key.

    :param s3_path: string like s3://bucket/some/key
    :return: tuple (bucket, key), key is "" when the path has no "/"
    """
    without_scheme = s3_path.replace("s3://", "")
    bucket, _, key = without_scheme.partition("/")
    return bucket, key

Ejemplo n.º 17
0
    def setUp(self):
        """Create the Neo4j driver, schema, loader and the two data file lists used by the tests."""
        # Connection settings, the password is read from NEO_PASSWORD
        neo4j_uri = 'bolt://localhost:7687'
        neo4j_user = '******'
        neo4j_password = os.environ['NEO_PASSWORD']
        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

        self.data_folder = 'data/COTC007B'

        # Schema is built from the model files plus the ICDC properties
        icdc_props = Props('../config/props-icdc.yml')
        model_files = ['data/icdc-model.yml', 'data/icdc-model-props.yml']
        self.schema = ICDC_Schema(model_files, icdc_props)

        self.log = get_logger('Test Loader')
        self.loader = DataLoader(self.driver, self.schema)

        # Full data set
        self.file_list = [
            "data/Dataset/COP-program.txt",
            "data/Dataset/COTC007B-case.txt",
            "data/Dataset/COTC007B-cohort.txt",
            "data/Dataset/COTC007B-cycle.txt",
            "data/Dataset/COTC007B-demographic.txt",
            "data/Dataset/COTC007B-diagnostic.txt",
            "data/Dataset/COTC007B-enrollment.txt",
            "data/Dataset/COTC007B-extent_of_disease.txt",
            "data/Dataset/COTC007B-physical_exam.txt",
            "data/Dataset/COTC007B-principal_investigator.txt",
            "data/Dataset/COTC007B-prior_surgery.txt",
            "data/Dataset/COTC007B-study.txt",
            "data/Dataset/COTC007B-study_arm.txt",
            "data/Dataset/COTC007B-vital_signs.txt",
            "data/Dataset/NCATS-COP01-blood_samples.txt",
            "data/Dataset/NCATS-COP01-case.txt",
            "data/Dataset/NCATS-COP01-demographic.txt",
            "data/Dataset/NCATS-COP01-diagnosis.txt",
            "data/Dataset/NCATS-COP01-enrollment.txt",
            "data/Dataset/NCATS-COP01-normal_samples.txt",
            "data/Dataset/NCATS-COP01-tumor_samples.txt",
            "data/Dataset/NCATS-COP01_20170228-GSL-079A-PE-Breen-NCATS-MEL-Rep1-Lane3.tar-file_neo4j.txt",
            "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep1-Lane1.tar-file_neo4j.txt",
            "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep1-Lane2.tar-file_neo4j.txt",
            "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep2-Lane1.tar-file_neo4j.txt",
            "data/Dataset/NCATS-COP01_GSL-076A-Breen-NCATS-MEL-Rep3-Lane1.tar-file_neo4j.txt",
            "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep2-Lane2.tar-file_neo4j.txt",
            "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep2-Lane3.tar-file_neo4j.txt",
            "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep3-Lane2.tar-file_neo4j.txt",
            "data/Dataset/NCATS-COP01_GSL-079A-Breen-NCATS-MEL-Rep3-Lane3.tar-file_neo4j.txt",
            "data/Dataset/NCATS-COP01_cohort_file.txt",
            "data/Dataset/NCATS-COP01_path_report_file_neo4j.txt",
            "data/Dataset/NCATS-COP01_study_file.txt",
        ]

        # Same files in the same order, except that the vital signs file is
        # replaced by its "_unique" variant
        vital_signs = "data/Dataset/COTC007B-vital_signs.txt"
        vital_signs_unique = "data/Dataset/COTC007B-vital_signs_unique.txt"
        self.file_list_unique = [
            vital_signs_unique if data_file == vital_signs else data_file
            for data_file in self.file_list
        ]
Ejemplo n.º 18
0
def main():
    """Load every .txt and .tsv file of the dataset folder, then upload the log file.

    Returns early, without uploading the log file, when the schema files
    don't pass check_schema_files(), when Neo4j is unavailable, when
    authentication fails or when the user interrupts the loading. The log
    file is uploaded only when both config.s3_bucket and config.s3_folder
    are set.
    """
    log = get_logger('Loader')
    log_file = get_log_file()
    config = process_arguments(parse_arguments(), log)

    if not check_schema_files(config.schema_files, log):
        return

    driver = None
    try:
        txt_files = glob.glob('{}/*.txt'.format(config.dataset))
        tsv_files = glob.glob('{}/*.tsv'.format(config.dataset))
        file_list = txt_files + tsv_files
        if file_list:
            # Destructive operations need a confirmation unless it was
            # disabled in the configuration
            if config.wipe_db and not config.yes:
                if not confirm_deletion(
                        'Wipe out entire Neo4j database before loading?'):
                    sys.exit()

            if config.loading_mode == DELETE_MODE and not config.yes:
                if not confirm_deletion(
                        'Delete all nodes and child nodes from data file?'):
                    sys.exit()

            # A property file inside the dataset folder takes precedence
            prop_path = os.path.join(config.dataset, config.prop_file)
            if os.path.isfile(prop_path):
                props = Props(prop_path)
            else:
                props = Props(config.prop_file)
            schema = ICDC_Schema(config.schema_files, props)

            # No database connection is opened for a dry run
            if not config.dry_run:
                driver = GraphDatabase.driver(config.neo4j_uri,
                                              auth=(config.neo4j_user,
                                                    config.neo4j_password),
                                              encrypted=False)

            plugins = [prepare_plugin(plugin_config, schema)
                       for plugin_config in config.plugins]
            loader = DataLoader(driver, schema, plugins)

            loader.load(file_list,
                        config.cheat_mode,
                        config.dry_run,
                        config.loading_mode,
                        config.wipe_db,
                        config.max_violations,
                        split=config.split_transactions,
                        no_backup=config.no_backup,
                        neo4j_uri=config.neo4j_uri,
                        backup_folder=config.backup_folder)
        else:
            log.info('No files to load.')

    except ServiceUnavailable:
        log.critical("Neo4j service not available at: \"{}\"".format(
            config.neo4j_uri))
        return
    except AuthError:
        log.error("Wrong Neo4j username or password!")
        return
    except KeyboardInterrupt:
        log.critical("User stopped the loading!")
        return
    finally:
        # The only place where the driver is closed; it runs on success,
        # on every handled error and on sys.exit()
        if driver:
            driver.close()

    if config.s3_bucket and config.s3_folder:
        result = upload_log_file(config.s3_bucket, f'{config.s3_folder}/logs',
                                 log_file)
        if result:
            log.info(f'Uploading log file {log_file} succeeded!')
        else:
            log.error(f'Uploading log file {log_file} failed!')
Ejemplo n.º 19
0
 def __init__(self, file_name):
     """Remember the file to validate and create the validator's logger.

     :param file_name: path of an existing file, stored as self.file_name
     """
     assert os.path.isfile(file_name)
     self.log = get_logger('Manifest_Validator')
     self.file_name = file_name
     # Starts out empty, filled in elsewhere
     self.s3_buckets = {}