def flatten(rules):
    data = tools.load_json(rules)
    objects = data['cdm']
    for destination_table, rule_set in objects.items():
        #only look at tables that have more than one rule set
        if len(rule_set) < 2:
            continue

        #build a dataframe where each column is one rule set
        df = pd.DataFrame.from_records(rule_set).T
        #pick out the condition_concept_id rules and expand them into columns
        df = df.loc['condition_concept_id'].apply(pd.Series)

        def merge(s):
            #combine the dictionaries from all rule sets into a single lookup
            return {k: v for a in s for k, v in a.items()}

        print(df.groupby('source_field').agg(merge))
        #only inspect the first qualifying table
        exit(0)
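# Illustrative sketch (not used by the CLI): flatten() assumes the rules json has a
# 'cdm' block keyed by destination table, with one entry per named object instance.
# The file, field and concept id values below are made up for demonstration only.
def _example_flatten_rules():
    return {
        "cdm": {
            "condition_occurrence": {
                "Headache": {
                    "condition_concept_id": {
                        "source_table": "demo.csv",
                        "source_field": "symptom",
                        "term_mapping": {"Headache": 378253}
                    }
                },
                "Fever": {
                    "condition_concept_id": {
                        "source_table": "demo.csv",
                        "source_field": "symptom",
                        "term_mapping": {"Fever": 437663}
                    }
                }
            }
        }
    }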
def print_json(rules, list_fields, list_tables):
    data = tools.load_json(rules)
    if list_fields or list_tables:
        data = tools.get_mapped_fields_from_rules(data)
    if list_tables:
        data = list(data.keys())
    print(json.dumps(data, indent=6))
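# Usage sketch, assuming a local 'rules.json':
#   print_json('rules.json', list_fields=False, list_tables=False)  # dump the whole rules file
#   print_json('rules.json', list_fields=True,  list_tables=False)  # show only the fields the rules actually use
#   print_json('rules.json', list_fields=False, list_tables=True)   # show only the keys of that mapped-field lookup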
def make_class(ctx, name, rules, register):
    data = tools.load_json(rules)
    if name is None:
        name = data['metadata']['dataset']
    fname = tools.extract.make_class(data, name)
    if register:
        ctx.invoke(register_class, pyconfig=fname)
    return fname
def dag(rules, orientation):
    data = tools.load_json(rules)
    if 'cdm' in data:
        data = data['cdm']
    tools.make_dag(data, orientation=orientation, render=True)
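# Usage sketch: render the mapping rules as a graph, e.g. dag('rules.json', orientation='TB').
# The orientation value is passed straight through to tools.make_dag, so the valid choices
# depend on that helper ('TB' here is an assumed graphviz-style rank direction, not a
# documented option).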
def main():
    parser = argparse.ArgumentParser(
        description='ETL-CDM: transform a dataset into a CommonDataModel.')
    parser.add_argument('--rules', dest='rules',
                        required=True,
                        help='input .json file')
    parser.add_argument('--out-dir', '-o', dest='out_dir',
                        required=True,
                        help='name of the output folder')
    parser.add_argument('--inputs', '-i', dest='inputs',
                        required=True, nargs="+",
                        help='input csv files')
    parser.add_argument("-nc", "--number-of-rows-per-chunk",
                        dest='number_of_rows_per_chunk',
                        default=None, type=int,
                        help="choose to chunk running the data into nrows")
    parser.add_argument("-np", "--number-of-rows-to-process",
                        dest='number_of_rows_to_process',
                        default=None, type=int,
                        help="the total number of rows to process")
    parser.add_argument("--use-profiler",
                        dest='use_profiler',
                        action='store_true',
                        help="turn on saving statistics for profiling CPU and memory usage")

    #get the CLI arguments
    args = parser.parse_args()

    #load the rules json file
    config = load_json(args.rules)

    #load the csv inputs, given a map between the name of each .csv file and its full path
    #by also passing the rules to load_csv, only the columns that are needed (used by the rules)
    #will be loaded
    #pass extra arguments if the user has specified chunking
    #or has specified only processing a limited number of rows
    inputs = load_csv({os.path.basename(x): x for x in args.inputs},
                      rules=args.rules,
                      chunksize=args.number_of_rows_per_chunk,
                      nrows=args.number_of_rows_to_process)

    name = config['metadata']['dataset']

    #build an object to store the cdm
    cdm = CommonDataModel(name=name,
                          inputs=inputs,
                          output_folder=args.out_dir,
                          use_profiler=args.use_profiler)

    #the CDM also needs to track the number of rows per chunk
    #note: should check if this is still needed/used at all
    cdm.set_chunk_size(args.number_of_rows_per_chunk)

    #loop over the cdm object types defined in the configuration
    #e.g. person, measurement etc.
    for destination_table, rules_set in config['cdm'].items():
        #loop over each object instance in the rule set
        #for example, condition_occurrence may have multiple rules
        #for multiple condition_occurrences e.g. Headache, Fever ...
        for i, rules in enumerate(rules_set):
            #make a new instance of the cdm object
            #Example:
            #   destination_table : person
            #   get_cdm_class returns <Person>
            #   obj : Person()
            obj = get_cdm_class(destination_table)()
            #set the name of the object
            obj.set_name(f"{destination_table}_{i}")
            #attach the rules that describe how to modify the inputs
            obj.rules = rules
            #Build a lambda function that will get executed at run time
            #and will apply these rules to the inputs that are loaded
            #(this is useful when chunking)
            obj.define = lambda self: apply_rules(self)
            #register this object with the CDM model, so it can be processed
            cdm.add(obj)

    cdm.process()
    print('Finished Producing', cdm.keys())
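# Example invocation of main() from the command line (illustrative only - the actual
# entry-point name depends on how this module is installed; the flags match the
# argparse definitions above):
#   <etl-cdm-script> --rules rules.json --out-dir output_data/ -i data/demo.csv -nc 100000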
def run(ctx, rules, inputs, format_level,
        output_folder, output_database,
        csv_separator, use_profiler, log_file,
        no_mask_person_id, indexing_conf,
        person_id_map, max_rules,
        objects, tables,
        dont_automatically_fill_missing_columns,
        number_of_rows_per_chunk,
        number_of_rows_to_process):
    """
    Perform OMOP Mapping given a json rules file and a series of input files

    INPUTS should be a space separated list of individual input files or
    directories (which contain .csv files)
    """

    if output_folder is None:
        output_folder = f'{os.getcwd()}{os.path.sep}output_data{os.path.sep}'

    #configure the log file location
    if log_file == 'auto':
        log_file = f"{output_folder}{os.path.sep}logs{os.path.sep}coconnect.log"
        coconnect.params['log_file'] = log_file
    elif log_file == 'none':
        pass
    else:
        coconnect.params['log_file'] = log_file

    #load the rules, either from an already-loaded dict or from a json file
    if isinstance(rules, dict):
        config = rules
    else:
        config = tools.load_json(rules)

    #optionally filter the rules by destination table and/or object name
    if tables:
        tables = list(set(tables))
        config = coconnect.tools.filter_rules_by_destination_tables(config, tables)
    if objects:
        objects = list(set(objects))
        config = coconnect.tools.filter_rules_by_object_names(config, objects)

    #optionally cap the number of (non-person) rules that are processed
    if max_rules:
        i = 0
        n = max_rules
        new = {}
        for destination_table, rule_set in config['cdm'].items():
            if destination_table == 'person':
                new[destination_table] = rule_set
            else:
                for name, _rules in rule_set.items():
                    if i >= n:
                        break
                    if destination_table not in new:
                        new[destination_table] = {}
                    new[destination_table][name] = _rules
                    i += 1
        config['cdm'] = new

    name = config['metadata']['dataset']

    #the indexing configuration can be a dict, a .json file or a two-column .csv file
    if indexing_conf is not None:
        if isinstance(indexing_conf, dict):
            pass
        elif indexing_conf.endswith(".json") and os.path.exists(indexing_conf):
            indexing_conf = tools.load_json(indexing_conf)
        elif indexing_conf.endswith(".csv") and os.path.exists(indexing_conf):
            try:
                indexing_conf = pd.read_csv(indexing_conf, header=None, index_col=0)[1].to_dict()
            except pd.errors.EmptyDataError:
                indexing_conf = None

    #automatically calculate an appropriate chunksize
    if number_of_rows_per_chunk == 'auto':
        #get the fields that are going to be used/loaded
        used_fields = tools.get_mapped_fields_from_rules(config)
        #calculate the number of fields that are to be used per dataset
        n_used_fields = [len(sublist) for sublist in used_fields.values()]
        #find the largest number of fields loaded by any dataset
        max_n_used_fields = max(n_used_fields)
        #get the number of files used
        n_files = len(n_used_fields)
        # If there is one dataset and one column being used, at most 2 million rows
        # are loaded into memory (this is fairly arbitrary). It is an approximation
        # assuming the values themselves are relatively small, and should keep the
        # memory usage down. As more fields and more files are loaded, the number
        # of rows per chunk is reduced.
        max_n_rows = 2e6
        number_of_rows_per_chunk = int(max_n_rows / (max_n_used_fields * n_files))
    else:
        try:
            number_of_rows_per_chunk = int(number_of_rows_per_chunk)
        except ValueError:
            raise ValueError(f"number_of_rows_per_chunk must be an integer or 'auto', you inputted '{number_of_rows_per_chunk}'")
        #turn off chunking if 0 or negative chunksizes are given
        if number_of_rows_per_chunk <= 0:
            number_of_rows_per_chunk = None

    #if wildcards are used, also check the package data folder for matching files
    if any('*' in x for x in inputs):
        data_dir = os.path.dirname(coconnect.__file__)
        data_dir = f'{data_dir}{os.path.sep}data{os.path.sep}'

        new_inputs = []
        for i, x in enumerate(inputs):
            if not os.path.exists(x):
                new_inputs.extend(glob.glob(f"{data_dir}{os.path.sep}{x}"))
            else:
                new_inputs.append(x)
        inputs = new_inputs

    #expand any input directories into the .csv files they contain
    #(iterate over a copy because the list is modified in place)
    inputs = list(inputs)
    for x in list(inputs):
        if os.path.isdir(x):
            inputs.remove(x)
            inputs.extend(glob.glob(f'{x}{os.path.sep}*.csv'))

    #convert the list into a map between the filename and the full path
    inputs = {os.path.basename(x): x for x in inputs}
    inputs = tools.load_csv(inputs,
                            rules=rules,
                            chunksize=number_of_rows_per_chunk,
                            nrows=number_of_rows_to_process)

    #build an object to store the cdm
    cdm = coconnect.cdm.CommonDataModel(name=name,
                                        inputs=inputs,
                                        format_level=format_level,
                                        do_mask_person_id=not no_mask_person_id,
                                        indexing_conf=indexing_conf,
                                        person_id_map=person_id_map,
                                        output_folder=output_folder,
                                        output_database=output_database,
                                        automatically_fill_missing_columns=not dont_automatically_fill_missing_columns,
                                        use_profiler=use_profiler)

    #allow the csv separator to be changed
    #the default is tab (\t) separation
    if csv_separator is not None:
        cdm.set_csv_separator(csv_separator)

    #loop over the cdm object types defined in the configuration
    #e.g. person, measurement etc.
    for destination_table, rules_set in config['cdm'].items():
        #loop over each object instance in the rule set
        #for example, condition_occurrence may have multiple rules
        #for multiple condition_occurrences e.g. Headache, Fever ...
        for name, rules in rules_set.items():
            #make a new instance of the cdm object
            #Example:
            #   destination_table : person
            #   get_cdm_class returns <Person>
            #   obj : Person()
            obj = coconnect.cdm.get_cdm_class(destination_table)()
            obj.set_format_level(cdm.format_level)
            #set the name of the object
            obj.set_name(name)
            #Build a lambda function that will get executed at run time
            #and will apply these rules to the inputs that are loaded
            #(this is useful when chunking)
            obj.define = lambda x, rules=rules: tools.apply_rules(x, rules, inputs=cdm.inputs)
            #register this object with the CDM model, so it can be processed
            cdm.add(obj)

    cdm.process()
    cdm.close()
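# Worked example of the 'auto' chunksize rule in run() above (a sketch using the same
# arbitrary 2e6 cap): with 3 input files, where the file using the most columns loads
# 5 of them, each chunk is capped at int(2e6 / (5 * 3)) = 133333 rows.
def _example_auto_chunksize(max_n_used_fields=5, n_files=3):
    max_n_rows = 2e6
    return int(max_n_rows / (max_n_used_fields * n_files))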