Example 1
import pandas as pd

from coconnect import tools


def flatten(rules):
    data = tools.load_json(rules)
    objects = data['cdm']
    for destination_table,rule_set in objects.items():
        if len(rule_set) < 2: continue
        df = pd.DataFrame.from_records(rule_set).T

        df = df.loc['condition_concept_id'].apply(pd.Series)

        # merge the per-rule term_mapping dictionaries into a single dictionary
        def merge(s):
            if s.name == 'term_mapping':
                return {k: v for a in s for k, v in a.items()}

        print (df.groupby('source_field').agg(merge))
            
        # stop after the first multi-rule destination table
        break
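A minimal usage sketch, assuming a rules file with the structure this function reads (a top-level 'cdm' block whose rule sets carry 'source_field' and 'term_mapping' entries); the file name is hypothetical:

# hypothetical call: flatten the term mappings of a rules file
flatten("rules.json")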
Example 2
import json

from coconnect import tools


def print_json(rules,list_fields,list_tables):
    data = tools.load_json(rules)
    if list_fields or list_tables:
        data = tools.get_mapped_fields_from_rules(data)
        if list_tables:
            data = list(data.keys())

    print (json.dumps(data,indent=6))
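A usage sketch (the file name is hypothetical): dump the whole rules document, or only what get_mapped_fields_from_rules returns for it:

# hypothetical calls to the command body above
print_json("rules.json", list_fields=False, list_tables=False)  # full rules document
print_json("rules.json", list_fields=True, list_tables=False)   # fields used by the rules, per input dataset
print_json("rules.json", list_fields=False, list_tables=True)   # just the input dataset names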
Example 3
from coconnect import tools


def make_class(ctx,name,rules,register):
    data = tools.load_json(rules)
    if name is None:
        name = data['metadata']['dataset']

    fname = tools.extract.make_class(data,name)
    if register:
        ctx.invoke(register_class,pyconfig=fname)
    return fname
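A sketch of calling it directly; since register=False the click context is never used, so None can be passed (the file name is hypothetical):

# hypothetical: generate a python class file from the rules without registering it
fname = make_class(None, None, "rules.json", register=False)
print(fname)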
Example 4
from coconnect import tools


def dag(rules,orientation):
    data = tools.load_json(rules)
    if 'cdm' in data:
        data = data['cdm']
    tools.make_dag(data,orientation=orientation,render=True)
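A usage sketch; the file name and orientation value are assumptions and are passed straight through to tools.make_dag:

# hypothetical: render the mapping rules as a DAG
dag("rules.json", orientation="LR")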
Example 5
import argparse
import os

from coconnect.cdm import CommonDataModel, get_cdm_class
from coconnect.tools import apply_rules, load_csv, load_json


def main():
    parser = argparse.ArgumentParser(
        description='ETL-CDM: transform a dataset into a CommonDataModel.')
    parser.add_argument('--rules',
                        dest='rules',
                        required=True,
                        help='input .json file')
    parser.add_argument('--out-dir',
                        '-o',
                        dest='out_dir',
                        required=True,
                        help='name of the output folder')
    parser.add_argument('--inputs',
                        '-i',
                        dest='inputs',
                        required=True,
                        nargs="+",
                        help='input csv files')
    parser.add_argument("-nc",
                        "--number-of-rows-per-chunk",
                        dest='number_of_rows_per_chunk',
                        default=None,
                        type=int,
                        help="choose to chunk running the data into nrows")
    parser.add_argument("-np",
                        "--number-of-rows-to-process",
                        dest='number_of_rows_to_process',
                        default=None,
                        type=int,
                        help="the total number of rows to process")
    parser.add_argument(
        "--use-profiler",
        dest='use_profiler',
        action='store_true',
        help="turn on saving statistics for profiling CPU and memory usage")

    #get the CLI arguments
    args = parser.parse_args()

    #load the rules json file
    config = load_json(args.rules)

    # load the csv inputs, given a map between the name of each .csv file
    #    and its full path
    # by also passing the rules to load_csv, only the columns needed by the rules
    #    will be loaded
    # pass extra arguments if the user has specified chunking
    #    or has asked to process only a limited number of rows
    inputs = load_csv({os.path.basename(x): x
                       for x in args.inputs},
                      rules=args.rules,
                      chunksize=args.number_of_rows_per_chunk,
                      nrows=args.number_of_rows_to_process)
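    # e.g. --inputs /data/demo.csv (hypothetical path) builds the map
    #      {"demo.csv": "/data/demo.csv"} before it is handed to load_csv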

    name = config['metadata']['dataset']

    #build an object to store the cdm
    cdm = CommonDataModel(name=name,
                          inputs=inputs,
                          output_folder=args.out_dir,
                          use_profiler=args.use_profiler)
    #CDM needs to also track the number of rows to chunk
    # - note: should check if this is still needed/used at all
    cdm.set_chunk_size(args.number_of_rows_per_chunk)

    #loop over the cdm object types defined in the configuration
    #e.g. person, measurement, etc.
    for destination_table, rules_set in config['cdm'].items():
        #loop over each object instance in the rule set
        #for example, condition_occurrence may have multiple rules
        #for multiple condition_occurrences e.g. Headache, Fever ..
        for i, rules in enumerate(rules_set):
            #make a new object for the cdm object
            #Example:
            # destination_table : person
            # get_cdm_class returns <Person>
            # obj : Person()
            obj = get_cdm_class(destination_table)()
            #set the name of the object
            obj.set_name(f"{destination_table}_{i}")

            #call the apply_rules function to set up how to modify the inputs
            #based on the rules
            obj.rules = rules
            #Build a lambda function that will get executed during run time
            #and will be able to apply these rules to the inputs that are loaded
            #(this is useful when chunking)
            obj.define = lambda self: apply_rules(self)

            #register this object with the CDM model, so it can be processed
            cdm.add(obj)

    cdm.process()
    print('Finished Producing', cdm.keys())
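Since main() reads sys.argv itself, a minimal sketch of driving it from python (all file and folder names are hypothetical):

import sys

# hypothetical invocation of the argparse entry point defined above
sys.argv = [
    "etl-cdm",
    "--rules", "rules.json",
    "--out-dir", "output_data",
    "--inputs", "demo.csv",
    "--number-of-rows-per-chunk", "100000",
]
main()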
Example 6
import glob
import os

import pandas as pd

import coconnect
from coconnect import tools


def run(ctx,rules,inputs,format_level,
        output_folder,output_database,
        csv_separator,use_profiler,log_file,
        no_mask_person_id,indexing_conf,
        person_id_map,max_rules,
        objects,tables,
        dont_automatically_fill_missing_columns,
        number_of_rows_per_chunk,
        number_of_rows_to_process):
    """
    Perform OMOP Mapping given a json file and a series of input files

    INPUTS should be a space separated list of individual input files or directories (which contain .csv files)
    """

    if output_folder is None:
        output_folder = f'{os.getcwd()}{os.path.sep}output_data{os.path.sep}'

    if log_file == 'auto':
        log_file = f"{output_folder}{os.path.sep}logs{os.path.sep}coconnect.log"
        coconnect.params['log_file'] = log_file
    elif log_file == 'none':
        pass
    else:
        coconnect.params['log_file'] = log_file
        
    #load the rules json file (a dict may also be passed in directly)
    if isinstance(rules, dict):
        config = rules
    else:
        config = tools.load_json(rules)

    if tables:
        tables = list(set(tables))
        config = coconnect.tools.filter_rules_by_destination_tables(config,tables)

    if objects:
        objects = list(set(objects))
        config = coconnect.tools.filter_rules_by_object_names(config,objects)
        
    if max_rules:
        i = 0
        n = max_rules
        new = {}
        for destination_table,rule_set in config['cdm'].items():
            if destination_table == 'person':
                new[destination_table] = rule_set
            else:
                for name,_rules in rule_set.items():
                    if i>=n:
                        break
                    if destination_table not in new:
                        new[destination_table] = {}
                    new[destination_table][name] = _rules
                    i+=1
            
        config['cdm'] = new

    name = config['metadata']['dataset']

    if indexing_conf is not None:
        if isinstance(indexing_conf,dict):
            pass
        elif indexing_conf.endswith(".json") and os.path.exists(indexing_conf):
            indexing_conf = tools.load_json(indexing_conf)
        elif indexing_conf.endswith(".csv") and os.path.exists(indexing_conf):
            try:
                indexing_conf = pd.read_csv(indexing_conf,header=None,index_col=0)[1].to_dict()
            except pd.errors.EmptyDataError:
                indexing_conf = None
                
    
    #automatically calculate the ideal chunksize
    if number_of_rows_per_chunk == 'auto':
        #get the fields that are going to be used/loaded
        used_fields = tools.get_mapped_fields_from_rules(config)
        #calculate the number of fields that are to be used per dataset
        n_used_fields = [ len(sublist) for sublist in used_fields.values() ]
        #find what's the largest number of fields loaded by any dataset
        max_n_used_fields = max(n_used_fields)
        #get the number of files used
        n_files = len(n_used_fields)
        
        # If there is one dataset and one column being used, the max loaded to memory
        #   is 2 million rows (this is fairly arbitrary)
        #   it is an approximation assuming the data in the values is relatively small
        #   this should keep the memory usage down
        # When more fields and more files are loaded, reduce the number of rows per chunk
        max_n_rows = 2e6
        number_of_rows_per_chunk = int(max_n_rows/(max_n_used_fields*n_files))
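        # Worked example (hypothetical numbers): with 3 input files where the
        # busiest file uses 10 mapped fields, this gives
        # int(2e6 / (10 * 3)) = 66666 rows per chunk, keeping roughly
        # 2 million values in memory at any one time.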
    else:
        try:
            number_of_rows_per_chunk = int(number_of_rows_per_chunk)
        except ValueError:
            raise ValueError(f"number_of_rows_per_chunk must be an Integer or 'auto', you inputted '{number_of_rows_per_chunk}'")
        
        #turn off chunking if 0 or negative chunksizes are given
        if number_of_rows_per_chunk <= 0 :
            number_of_rows_per_chunk = None
    
    #expand wildcard inputs: if a pattern doesn't exist locally,
    #look it up in the package's bundled data directory
    if any('*' in x for x in inputs):
        data_dir = os.path.dirname(coconnect.__file__)
        data_dir = f'{data_dir}{os.path.sep}data{os.path.sep}'

        new_inputs = []
        for i,x in enumerate(inputs):
            if not os.path.exists(x):
                new_inputs.extend(glob.glob(f"{data_dir}{os.path.sep}{x}"))
            else:
                new_inputs.append(x)
        inputs = new_inputs

    inputs = list(inputs)
    
    #iterate over a copy because the loop modifies the list
    for x in list(inputs):
        if os.path.isdir(x):
            inputs.remove(x)
            inputs.extend(glob.glob(f'{x}{os.path.sep}*.csv'))
        
    #convert the list into a map between the filename and the full path
    inputs = {
        os.path.basename(x):x
        for x in inputs
    }
    
        
    inputs = tools.load_csv(inputs,
                            rules=rules,
                            chunksize=number_of_rows_per_chunk,
                            nrows=number_of_rows_to_process)

    #build an object to store the cdm
    cdm = coconnect.cdm.CommonDataModel(name=name,
                                        inputs=inputs,
                                        format_level=format_level,
                                        do_mask_person_id=not no_mask_person_id,
                                        indexing_conf=indexing_conf,
                                        person_id_map=person_id_map,
                                        output_folder=output_folder,
                                        output_database=output_database,
                                        automatically_fill_missing_columns=not dont_automatically_fill_missing_columns,
                                        use_profiler=use_profiler)
    
    #allow the csv separator to be changed
    #the default is tab (\t) separation
    if csv_separator is not None:
        cdm.set_csv_separator(csv_separator)
    
    #loop over the cdm object types defined in the configuration
    #e.g. person, measurement, etc.
    for destination_table,rules_set in config['cdm'].items():
        #loop over each object instance in the rule set
        #for example, condition_occurrence may have multiple rules
        #for multiple condition_occurrences e.g. Headache, Fever ..
        for name,rules in rules_set.items():
            #make a new object for the cdm object
            #Example:
            # destination_table : person
            # get_cdm_class returns <Person>
            # obj : Person()
            obj = coconnect.cdm.get_cdm_class(destination_table)()
            obj.set_format_level(cdm.format_level)
            #set the name of the object
            obj.set_name(name)
            
            #Build a lambda function that will get executed during run time
            #and will be able to apply these rules to the inputs that are loaded
            #(this is useful when chunking)
            obj.define = lambda x,rules=rules : tools.apply_rules(x,rules,inputs=cdm.inputs)
            
            #register this object with the CDM model, so it can be processed
            cdm.add(obj)

    cdm.process()
    cdm.close()
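As handled above, indexing_conf may be a dict, a .json file, or a two-column headerless .csv; a sketch of the csv round trip (the file name, keys and values are all placeholders):

import pandas as pd

# hypothetical two-column, headerless indexing file
pd.DataFrame([["person_id", 1000], ["observation_id", 5000]]).to_csv(
    "indexing.csv", header=False, index=False)
# parsed back exactly as run() does: {'person_id': 1000, 'observation_id': 5000}
print(pd.read_csv("indexing.csv", header=None, index_col=0)[1].to_dict())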