def generate_controlled_metadata(row, groups):
    """
    row - row from input pandas.DataFrame object to convert to metadata
    groups - list of dictionaries pairing value fields with their unit fields
    """
    metadata = {}
    # use the shared fields
    for col, val in row.iteritems():
        col = upload_key_format(col)
        ss_validator = SAMP_SERV_CONFIG['validators'].get(col, None)
        if ss_validator:
            if not pd.isnull(row[col]):
                idx = check_value_in_list(
                    col,
                    [upload_key_format(g['value']) for g in groups],
                    return_idx=True)
                try:
                    val = float(row[col])
                except (ValueError, TypeError):
                    val = row[col]
                mtd = {"value": val}
                if idx is not None:
                    mtd, _ = parse_grouped_data(row, groups[idx])
                # verify against validator
                missing_fields = _find_missing_fields(mtd, ss_validator)
                for field, default in missing_fields.items():
                    mtd[field] = default
                metadata[col] = mtd
    return metadata
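
# A minimal usage sketch for generate_controlled_metadata. It assumes the
# sample service config (SAMP_SERV_CONFIG) has a validator registered for
# "latitude"; the column names and values here are illustrative only.
def _example_generate_controlled_metadata():
    df = pd.DataFrame([{'latitude': '47.6', 'description': 'creek bed'}])
    row = df.iloc[0]
    # no unit/value groups in this sketch; grouped fields would normally
    # come from create_groups() below.
    metadata = generate_controlled_metadata(row, groups=[])
    # expected shape: {'latitude': {'value': 47.6, ...validator defaults...}}
    # ('description' is skipped because it has no registered validator)
    return metadata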
def format_input_file(df, columns_to_input_names, aliases, header_row_index):
    errors = []
    # change columns to upload format
    columns_to_input_names = {}
    for col_idx, col_name in enumerate(df.columns):
        try:
            renamed = upload_key_format(col_name)
            if renamed in columns_to_input_names:
                raise SampleContentError((
                    f"Duplicate column \"{renamed}\". \"{col_name}\" would overwrite a "
                    f"different column \"{columns_to_input_names[renamed]}\". "
                    "Rename your columns to be unique alphanumerically, ignoring "
                    "whitespace and case."
                ), key=col_name)
            columns_to_input_names[renamed] = col_name
        except SampleContentError as e:
            e.column = col_idx
            errors.append(e)
    df = df.rename(columns={
        columns_to_input_names[col]: col for col in columns_to_input_names
    })
    df.replace({n: None for n in NOOP_VALS}, inplace=True)
    map_aliases = {}
    for key, key_aliases in aliases.items():
        key = upload_key_format(key)
        for alias_key in key_aliases:
            alias_key = upload_key_format(alias_key)
            # check if alias_key is among the columns
            if alias_key in df.columns:
                # if the canonical key already exists, keep it and skip the alias
                if key in df.columns:
                    continue
                map_aliases[alias_key] = key
                if alias_key in columns_to_input_names:
                    val = columns_to_input_names.pop(alias_key)
                    columns_to_input_names[key] = val
    if map_aliases:
        df = df.rename(columns=map_aliases)
    return df, columns_to_input_names, errors
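
# A small sketch of the alias handling in format_input_file above (the
# variant taking (df, columns_to_input_names, aliases, header_row_index)).
# The alias table is illustrative, not the real configuration.
def _example_format_input_file_aliases():
    df = pd.DataFrame([{'lat': 1.0, 'longitude': 2.0}])
    aliases = {'latitude': ['lat'], 'longitude': ['lon', 'long']}
    df, columns_to_input_names, errors = format_input_file(
        df, {}, aliases, header_row_index=0)
    # 'lat' is renamed to 'latitude'; 'longitude' is untouched because the
    # canonical key is already a column.
    return df, columns_to_input_names, errors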
def create_groups(col_config):
    groups = list()
    for col, rules in col_config.items():
        transformations = rules.get('transformations')
        if transformations:
            first_trans = transformations[0]
            transform = first_trans.get('transform')
            if transform == 'unit_measurement':
                parameters = first_trans.get('parameters')
                value = parameters[0]
                unit_key = parameters[1]
                unit_rules = col_config[unit_key]
                unit_transformations = unit_rules.get('transformations')
                if not unit_transformations:
                    unit_keys = [unit_key]
                    unit_aliases = unit_rules.get('aliases', [])
                    if unit_aliases:
                        unit_keys += unit_aliases
                    unit_keys = list(set(
                        [upload_key_format(unit) for unit in unit_keys]))
                else:
                    first_trans = unit_transformations[0]
                    parameters = first_trans.get('parameters', [col])
                    unit_keys = [upload_key_format(parameters[0])]
                for unit in unit_keys:
                    groups.append({'units': unit, 'value': value})
            elif transform == 'unit_measurement_fixed':
                parameters = first_trans.get('parameters')
                value = parameters[0]
                unit = 'str:{}'.format(parameters[1])
                groups.append({'units': unit, 'value': value})
    return groups
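
# Hedged example of the column-config shape create_groups expects. The keys
# below ("depth", "depth_unit", "age") are made up for illustration and are
# not from the real SAMP_SERV_CONFIG; upload_key_format is assumed to
# lowercase and underscore-join keys.
def _example_create_groups():
    col_config = {
        'depth': {
            'transformations': [{
                'transform': 'unit_measurement',
                'parameters': ['depth', 'depth_unit'],
            }],
        },
        'depth_unit': {
            'aliases': ['depth units'],
        },
        'age': {
            'transformations': [{
                'transform': 'unit_measurement_fixed',
                'parameters': ['age', 'years'],
            }],
        },
    }
    groups = create_groups(col_config)
    # expected (order aside):
    #   {'units': 'depth_unit', 'value': 'depth'}
    #   {'units': 'depth_units', 'value': 'depth'}
    #   {'units': 'str:years', 'value': 'age'}
    return groups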
def validate_params(params):
    if not params.get('sample_file'):
        raise ValueError(f"sample_file argument required in params: {params}")
    if not params.get('workspace_name'):
        raise ValueError(
            f"workspace_name argument required in params: {params}")
    sample_file = params.get('sample_file')
    if not os.path.isfile(sample_file):
        # try prepending '/staging/' to the file path, then check again
        if os.path.isfile(os.path.join('/staging', sample_file)):
            sample_file = os.path.join('/staging', sample_file)
        else:
            raise ValueError(f"input file {sample_file} does not exist.")
    if params.get('name_field'):
        try:
            upload_key_format(params.get('name_field'))
        except SampleContentError as e:
            raise ValueError(f"Invalid ID field in params: {e.message}")
    return sample_file
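
# Usage sketch for validate_params; the params dict mirrors the keys the
# function actually checks, and the file path is hypothetical.
def _example_validate_params():
    params = {
        'sample_file': 'samples.csv',      # falls back to /staging/samples.csv
        'workspace_name': 'my_workspace',
        'name_field': 'Sample Name',       # optional; checked via upload_key_format
    }
    # raises ValueError if the file is missing or required params are absent;
    # otherwise returns the resolved path to the sample file.
    return validate_params(params)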
def generate_user_metadata(row, cols, groups, unit_rules):
    """
    row - row from input pandas.DataFrame object to convert to metadata
    cols - columns of input pandas.DataFrame to convert to metadata
    groups - list of dictionaries to group in same metadata field
    unit_rules - list of regexes that capture the units associated with
        all fields. The first entry is used before the second, which is
        used before the third, and so on.
        NOTE: an empty list is valid input and results in no unit fields
        captured from regex.
    """
    # first we iterate through the groups
    metadata, used_cols = handle_groups_metadata(row, cols, groups)
    cols = list(set(cols) - used_cols)
    for col in cols:
        if not pd.isnull(row[col]):
            # if there are column unit rules
            units = None
            if unit_rules:
                for unit_rule in unit_rules:
                    result = re.search(unit_rule, col)
                    if result:
                        # we assume the regex has capturing parentheses.
                        match = result.group(1)
                        units = match
                        # use only the first match.
                        break
            # try to assign the value as a float if possible
            try:
                val = float(row[col])
            except (ValueError, TypeError):
                val = row[col]
            metadata[upload_key_format(col)] = {"value": val}
            if units:
                metadata[upload_key_format(col)]["units"] = units
    return metadata
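
# Sketch of how unit_rules apply in generate_user_metadata: each regex needs
# one capturing group for the unit, and only the first matching rule is used.
# Assumes handle_groups_metadata returns ({}, set()) when groups is empty;
# the column names are illustrative, and the exact metadata keys depend on
# upload_key_format.
def _example_generate_user_metadata():
    df = pd.DataFrame([{'temperature (celsius)': '12.5', 'notes': 'clear day'}])
    row = df.iloc[0]
    metadata = generate_user_metadata(
        row, cols=list(df.columns), groups=[],
        unit_rules=[r'\((.+)\)'])  # captures "celsius" from the header
    # expected shape:
    #   {'temperature...': {'value': 12.5, 'units': 'celsius'},
    #    'notes': {'value': 'clear day'}}
    return metadata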
def import_samples_from_file(params, sample_url, workspace_url, username,
                             token, column_groups, date_columns,
                             column_unit_regex, input_sample_set,
                             header_row_index, aliases):
    """
    import samples from '.csv' or '.xls' files in SESAR format
    """
    # verify inputs
    sample_file = validate_params(params)
    ws_name = params.get('workspace_name')
    df = load_file(sample_file, header_row_index, date_columns)
    df, columns_to_input_names, errors = format_input_file(
        df, {}, aliases, header_row_index)
    first_sample_idx = header_row_index + 1
    # TODO: Make sure to check all possible name fields, even when not parameterized
    if params.get('name_field'):
        name_field = upload_key_format(params.get('name_field'))
        if name_field not in list(df.columns):
            raise ValueError(
                f"The expected name field column \"{name_field}\" could not be found. "
                "Adjust your parameters or input such that the following are correct:\n"
                f"- File Format: {params.get('file_format')} (the format to which your sample data conforms)\n"
                f"- ID Field: {params.get('name_field', 'name')} (the header of the column containing your names)\n"
                f"- Headers Row: {params.get('header_row_index')} (the row # where column headers are located in your spreadsheet)"
            )
        # here we rename whatever the name field was/is to "name"
        columns_to_input_names["name"] = columns_to_input_names.pop(name_field)
        df.rename(columns={name_field: "name"}, inplace=True)
    if not errors:
        if params['file_format'].lower() in ['sesar', 'enigma']:
            if 'material' in df.columns:
                df.rename(columns={
                    "material": params['file_format'].lower() + ":material"
                }, inplace=True)
                val = columns_to_input_names.pop("material")
                columns_to_input_names[
                    params['file_format'].lower() + ":material"] = val
        if params['file_format'].lower() == "kbase":
            if 'material' in df.columns:
                df.rename(columns={"material": "sesar:material"}, inplace=True)
                val = columns_to_input_names.pop("material")
                columns_to_input_names["sesar:material"] = val
    acls = {
        "read": [],
        "write": [],
        "admin": [],
        "public_read": -1  # set to false (<0)
    }
    if params.get('share_within_workspace'):
        # query workspace for user permissions.
        acls = get_workspace_user_perms(workspace_url,
                                        params.get('workspace_id'), token,
                                        username, acls)
    groups = SAMP_SERV_CONFIG['validators']
    samples, existing_samples, produce_errors = _produce_samples(
        df, column_groups, column_unit_regex, sample_url, token,
        input_sample_set['samples'], columns_to_input_names,
        first_sample_idx)
    errors += produce_errors
    if params.get('prevalidate') and not errors:
        error_detail = validate_samples([s['sample'] for s in samples],
                                        sample_url, token)
        errors += [
            SampleContentError(e['message'],
                               sample_name=e['sample_name'],
                               node=e['node'],
                               key=e['key']) for e in error_detail
        ]
    if errors:
        saved_samples = []
        # Fill in missing location information for SampleContentError(s)
        err_col_keys = {}
        err_key_indices = {}
        for col_idx, col_name in enumerate(df.columns):
            err_col_keys[col_idx] = col_name
            err_key_indices[col_name] = col_idx
            if col_name in columns_to_input_names and \
                    columns_to_input_names[col_name] != col_name:
                err_key_indices[columns_to_input_names[col_name]] = col_idx
        err_row_sample_names = {}
        err_sample_name_indices = {}
        for relative_row_idx, row in df.iterrows():
            row_pos = first_sample_idx + relative_row_idx
            sample_name = row.get('name')
            err_sample_name_indices[sample_name] = row_pos
            err_row_sample_names[row_pos] = sample_name
        for e in errors:
            if e.column is not None and e.key is None and e.column in err_col_keys:
                e.key = err_col_keys[e.column]
            if e.column is None and e.key is not None and e.key in err_key_indices:
                e.column = err_key_indices[e.key]
            if e.row is not None and e.sample_name is None and e.row in err_row_sample_names:
                e.sample_name = err_row_sample_names[e.row]
            if e.row is None and e.sample_name is not None and e.sample_name in err_sample_name_indices:
                e.row = err_sample_name_indices[e.sample_name]
    else:
        saved_samples = _save_samples(samples, acls, sample_url, token)
        saved_samples += existing_samples
    return {
        "samples": saved_samples,
        "description": params.get('description')
    }, errors
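
# End-to-end sketch for the import entry point defined directly above (the
# variant that takes an explicit sample_url and aliases). All URLs, the
# token, and the params values are placeholders.
def _example_import_samples_from_file():
    params = {
        'sample_file': 'samples.csv',   # resolved against /staging/ if needed
        'workspace_name': 'my_workspace',
        'workspace_id': 12345,
        'file_format': 'sesar',
        'header_row_index': 1,
        'prevalidate': 1,
    }
    sample_set, errors = import_samples_from_file(
        params,
        sample_url='https://ci.kbase.us/services/sampleservice',  # placeholder
        workspace_url='https://ci.kbase.us/services/ws',          # placeholder
        username='someuser',
        token='REDACTED',
        column_groups=[],
        date_columns=[],
        column_unit_regex=[],
        input_sample_set={'samples': []},
        header_row_index=1,
        aliases={})
    return sample_set, errors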
def sample_set_to_output(sample_set, sample_url, token, output_file,
                         output_file_format):
    """"""
    def add_to_output(o, key_metadata, val):
        if key_metadata in o:
            o[key_metadata] += [
                "" for _ in range(
                    len(o['kbase_sample_id']) - 1 - len(o[key_metadata]))
            ] + [val]
        else:
            o[key_metadata] = [
                "" for _ in range(len(o['kbase_sample_id']) - 1)
            ] + [val]
        return o

    if output_file_format == "SESAR":
        groups = SESAR_mappings['groups']
    output = {"kbase_sample_id": [], "sample name": []}
    for samp_id in sample_set['samples']:
        sample = get_sample(samp_id, sample_url, token)
        output['kbase_sample_id'].append(sample['id'])
        output['sample name'].append(sample['name'])
        used_headers = set(['kbase_sample_id', 'name', 'sample name'])
        for node in sample['node_tree']:
            # get 'source_meta' information
            source_meta = node.get('source_meta', [])
            source_meta_key = {m['key']: m['skey'] for m in source_meta}
            for key_metadata in node['meta_controlled']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_controlled'][
                            key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(key_metadata, [
                                upload_key_format(g['value']) for g in groups
                            ], return_idx=True)
                            if idx is not None and not groups[idx][
                                    'units'].startswith('str:'):
                                output = add_to_output(
                                    output, groups[idx]['units'], val)
                                used_headers.add(groups[idx]['units'])
            for key_metadata in node['meta_user']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_user'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(key_metadata, [
                                upload_key_format(g['value']) for g in groups
                            ], return_idx=True)
                            if idx is not None and not groups[idx][
                                    'units'].startswith('str:'):
                                output = add_to_output(
                                    output, groups[idx]['units'], val)
    # pad all columns to the same length
    for key in output:
        output[key] += [
            "" for _ in range(
                len(output['kbase_sample_id']) - len(output[key]))
        ]
    df = pd.DataFrame.from_dict(output)

    def line_prepender(filename, line):
        with open(filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    df.to_csv(output_file, index=False)
    if output_file_format == "SESAR":
        line_prepender(output_file,
                       "Object Type:,Individual Sample,User Code:,")
def sample_set_to_output(sample_set, sample_url, token, output_file,
                         output_file_format):
    """"""
    def add_to_output(o, key_metadata, val):
        if key_metadata in o:
            o[key_metadata] += [
                "" for _ in range(
                    len(o['kbase_sample_id']) - 1 - len(o[key_metadata]))
            ] + [val]
        else:
            o[key_metadata] = [
                "" for _ in range(len(o['kbase_sample_id']) - 1)
            ] + [val]
        return o

    if output_file_format.lower() == "sesar":
        groups = SESAR_mappings['groups']
    else:
        raise ValueError("Only the SESAR file format is supported for export")
    output = {"kbase_sample_id": [], "name": []}
    for samp_id in sample_set['samples']:
        sample = get_sample(samp_id, sample_url, token)
        output['kbase_sample_id'].append(sample['id'])
        # we need to check if there is another match in there.
        sample_name = sample['name']
        output['name'].append(sample_name)
        used_headers = set(['kbase_sample_id', 'name'])
        for node_idx, node in enumerate(sample['node_tree']):
            # check whether node 'id' and sample 'name' differ
            if node['id'] != sample_name:
                output = add_to_output(output, f"alt_id_{node_idx}",
                                       node['id'])
            # get 'source_meta' information
            source_meta = node.get('source_meta', [])
            source_meta_key = {m['key']: m['skey'] for m in source_meta}
            for key_metadata in node['meta_controlled']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_controlled'][
                            key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(key_metadata, [
                                upload_key_format(g['value']) for g in groups
                            ], return_idx=True)
                            if idx is not None and not groups[idx][
                                    'units'].startswith('str:'):
                                output = add_to_output(
                                    output, groups[idx]['units'], val)
                                used_headers.add(groups[idx]['units'])
            for key_metadata in node['meta_user']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_user'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(key_metadata, [
                                upload_key_format(g['value']) for g in groups
                            ], return_idx=True)
                            if idx is not None and not groups[idx][
                                    'units'].startswith('str:'):
                                output = add_to_output(
                                    output, groups[idx]['units'], val)
    # add any missing lines to the end.
    for key in output:
        output[key] += [
            "" for _ in range(
                len(output['kbase_sample_id']) - len(output[key]))
        ]
    df = pd.DataFrame.from_dict(output)

    def line_prepender(filename, line):
        with open(filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    df.to_csv(output_file, index=False)
    if output_file_format.lower() == "sesar":
        line_prepender(output_file,
                       "Object Type:,Individual Sample,User Code:,")
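
# Export sketch for sample_set_to_output: writes a SESAR-style CSV with the
# "Object Type" header line prepended. The shape of the sample reference
# passed to get_sample is assumed here, and the URL/token are placeholders.
def _example_sample_set_to_output():
    sample_set = {'samples': [{'id': 'hypothetical-sample-id', 'version': 1}]}
    sample_set_to_output(
        sample_set,
        sample_url='https://ci.kbase.us/services/sampleservice',  # placeholder
        token='REDACTED',
        output_file='exported_samples.csv',
        output_file_format='SESAR')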
def import_samples_from_file(params, sw_url, workspace_url, username, token,
                             column_mapping, column_groups, date_columns,
                             column_unit_regex, input_sample_set,
                             header_row_index):
    """
    import samples from '.csv' or '.xls' files in SESAR format
    """
    # verify inputs
    sample_file = validate_params(params)
    ws_name = params.get('workspace_name')
    df = load_file(sample_file, header_row_index, date_columns)
    errors = []
    first_sample_idx = header_row_index + 1
    # change columns to upload format
    columns_to_input_names = {}
    for col_idx, col_name in enumerate(df.columns):
        try:
            renamed = upload_key_format(col_name)
            if renamed in columns_to_input_names:
                raise SampleContentError((
                    f"Duplicate column \"{renamed}\". \"{col_name}\" would overwrite a "
                    f"different column \"{columns_to_input_names[renamed]}\". "
                    "Rename your columns to be unique alphanumerically, ignoring "
                    "whitespace and case."
                ), key=col_name)
            columns_to_input_names[renamed] = col_name
        except SampleContentError as e:
            e.column = col_idx
            errors.append(e)
    df = df.rename(columns={
        columns_to_input_names[col]: col for col in columns_to_input_names
    })
    df.replace({n: None for n in NOOP_VALS}, inplace=True)
    # TODO: Make sure to check all possible ID fields, even when not parameterized
    if params.get('id_field'):
        id_field = upload_key_format(params.get('id_field'))
        if id_field not in list(df.columns):
            raise ValueError(
                f"The expected ID field column \"{id_field}\" could not be found. "
                "Adjust your parameters or input such that the following are correct:\n"
                f"- File Format: {params.get('file_format')} (the format to which your sample data conforms)\n"
                f"- ID Field: {params.get('id_field', 'id')} (the header of the column containing your IDs)\n"
                f"- Headers Row: {params.get('header_row_index')} (the row # where column headers are located in your spreadsheet)"
            )
        # here we rename whatever the id field was/is to "id"
        columns_to_input_names["id"] = columns_to_input_names.pop(id_field)
        df.rename(columns={id_field: "id"}, inplace=True)
        # remove "id" rename field from column mapping if it exists
        if column_mapping:
            column_mapping = {
                key: val for key, val in column_mapping.items() if val != "id"
            }
    if not errors:
        if column_mapping:
            df = df.rename(columns=column_mapping)
            # redundant, even harmful if things get out of sync
            # verify_columns(df)
            for key in column_mapping:
                if key in columns_to_input_names:
                    val = columns_to_input_names.pop(key)
                    columns_to_input_names[column_mapping[key]] = val
        if params['file_format'].upper() in ['SESAR', 'ENIGMA']:
            if 'material' in df.columns:
                df.rename(columns={
                    "material": params['file_format'].upper() + ":material"
                }, inplace=True)
                val = columns_to_input_names.pop("material")
                columns_to_input_names[
                    params['file_format'].upper() + ":material"] = val
        if params['file_format'].upper() == "KBASE":
            if 'material' in df.columns:
                df.rename(columns={"material": "SESAR:material"}, inplace=True)
                val = columns_to_input_names.pop("material")
                columns_to_input_names["SESAR:material"] = val
    acls = {
        "read": [],
        "write": [],
        "admin": [],
        "public_read": -1  # set to false (<0)
    }
    if params.get('share_within_workspace'):
        # query workspace for user permissions.
        acls = get_workspace_user_perms(workspace_url,
                                        params.get('workspace_id'), token,
                                        username, acls)
    groups = SAMP_SERV_CONFIG['validators']
    cols = list(set(df.columns) - set(REGULATED_COLS))
    sample_url = get_sample_service_url(sw_url)
    samples, existing_samples, produce_errors = _produce_samples(
        df, cols, column_groups, column_unit_regex, sample_url, token,
        input_sample_set['samples'], columns_to_input_names,
        first_sample_idx)
    errors += produce_errors
    if params.get('prevalidate') and not errors:
        error_detail = validate_samples([s['sample'] for s in samples],
                                        sample_url, token)
        errors += [
            SampleContentError(e['message'],
                               sample_name=e['sample_name'],
                               node=e['node'],
                               key=e['key']) for e in error_detail
        ]
    if errors:
        saved_samples = []
        # Fill in missing location information for SampleContentError(s)
        err_col_keys = {}
        err_key_indices = {}
        for col_idx, col_name in enumerate(df.columns):
            err_col_keys[col_idx] = col_name
            err_key_indices[col_name] = col_idx
            if col_name in columns_to_input_names and \
                    columns_to_input_names[col_name] != col_name:
                err_key_indices[columns_to_input_names[col_name]] = col_idx
        err_row_sample_names = {}
        err_sample_name_indices = {}
        for relative_row_idx, row in df.iterrows():
            row_pos = first_sample_idx + relative_row_idx
            sample_name = row.get('id')
            err_sample_name_indices[sample_name] = row_pos
            err_row_sample_names[row_pos] = sample_name
        for e in errors:
            if e.column is not None and e.key is None and e.column in err_col_keys:
                e.key = err_col_keys[e.column]
            if e.column is None and e.key is not None and e.key in err_key_indices:
                e.column = err_key_indices[e.key]
            if e.row is not None and e.sample_name is None and e.row in err_row_sample_names:
                e.sample_name = err_row_sample_names[e.row]
            if e.row is None and e.sample_name is not None and e.sample_name in err_sample_name_indices:
                e.row = err_sample_name_indices[e.sample_name]
    else:
        saved_samples = _save_samples(samples, acls, sample_url, token)
        saved_samples += existing_samples
    return {
        "samples": saved_samples,
        "description": params.get('description')
    }, errors
def format_input_file(df, params, columns_to_input_names, aliases):
    # change columns to upload format
    columns_to_input_names = {}
    # set column names into 'upload_key_format'.
    for col_idx, col_name in enumerate(df.columns):
        renamed = upload_key_format(col_name)
        if renamed not in columns_to_input_names:
            columns_to_input_names[renamed] = col_name
        else:
            warnings.warn(
                SampleContentWarning((
                    f"Duplicate column \"{renamed}\". \"{col_name}\" would overwrite a "
                    f"different column \"{columns_to_input_names[renamed]}\". "
                    "Rename your columns to be unique alphanumerically, ignoring "
                    "whitespace and case."
                ), key=col_name, column=col_idx, severity='error'))
    df = df.rename(columns={
        columns_to_input_names[col]: col for col in columns_to_input_names
    })
    df.replace({n: None for n in NOOP_VALS}, inplace=True)
    # TODO: Make sure to check all possible name fields, even when not parameterized
    if params.get('name_field'):
        name_field = upload_key_format(params.get('name_field'))
        if name_field not in list(df.columns):
            raise ValueError(
                f"The expected name field column \"{name_field}\" could not be found. "
                "Adjust your parameters or input such that the following are correct:\n"
                f"- File Format: {params.get('file_format')} (the format to which your sample data conforms)\n"
                f"- ID Field: {params.get('name_field', 'name')} (the header of the column containing your names)\n"
                f"- Headers Row: {params.get('header_row_index')} (the row # where column headers are located in your spreadsheet)"
            )
        # here we rename whatever the name field was/is to "name"
        columns_to_input_names["name"] = columns_to_input_names.pop(name_field)
        df.rename(columns={name_field: "name"}, inplace=True)
    map_aliases = {}
    for key, key_aliases in aliases.items():
        key = upload_key_format(key)
        for alias_key in key_aliases:
            alias_key = upload_key_format(alias_key)
            # check if alias_key is among the columns
            if alias_key in df.columns:
                # if the canonical key already exists, keep it and skip the alias
                if key in df.columns:
                    continue
                map_aliases[alias_key] = key
                if alias_key in columns_to_input_names:
                    val = columns_to_input_names.pop(alias_key)
                    columns_to_input_names[key] = val
    if map_aliases:
        df = df.rename(columns=map_aliases)
    file_format = params.get('file_format').lower()
    prefix_map = {}
    for col in df.columns:
        if ':' in col:
            continue
        # get prefixed versions of the field.
        fields = NON_PREFIX_TO_PREFIX.get(col, [])
        target_field = None
        for field in fields:
            # choose the prefix that matches the input file format, if it exists.
            if field.split(":")[0] == file_format:
                target_field = field
                break
            else:
                if col in CORE_FIELDS:
                    target_field = col
                else:
                    target_field = field
        if target_field:
            prefix_map[col] = target_field
            if col in columns_to_input_names:
                val = columns_to_input_names.pop(col)
                columns_to_input_names[target_field] = val
    if prefix_map:
        df = df.rename(columns=prefix_map)
    return df, columns_to_input_names
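
# Sketch of the prefix resolution at the end of this format_input_file
# variant: un-prefixed columns are mapped onto their prefixed forms via
# NON_PREFIX_TO_PREFIX, preferring the prefix matching params['file_format'].
# The mapping entry shown in the comment is illustrative, not the real table.
def _example_prefix_mapping():
    # hypothetical table entry:
    #   NON_PREFIX_TO_PREFIX = {'material': ['sesar:material', 'enigma:material']}
    df = pd.DataFrame([{'material': 'basalt', 'name': 's1'}])
    params = {'file_format': 'sesar'}
    df, columns_to_input_names = format_input_file(df, params, {}, aliases={})
    # with file_format 'sesar', 'material' would be renamed to 'sesar:material'
    return df, columns_to_input_names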
def import_samples_from_file(params, sw_url, workspace_url, username, token,
                             column_mapping, column_groups, date_columns,
                             column_unit_regex, input_sample_set,
                             header_row_index):
    """
    import samples from '.csv' or '.xls' files in SESAR format
    """
    # verify inputs
    sample_file = validate_params(params)
    ws_name = params.get('workspace_name')
    df = load_file(sample_file, header_row_index, date_columns)
    # change columns to upload format
    # TODO: make sure separate columns are not being renamed to the same thing
    columns_to_input_names = {upload_key_format(c): c for c in df.columns}
    df = df.rename(columns={c: upload_key_format(c) for c in df.columns})
    df.replace({n: None for n in NOOP_VALS}, inplace=True)
    if params.get('id_field'):
        id_field = upload_key_format(params['id_field'])
        if id_field in list(df.columns):
            # here we rename whatever the id field was/is to "id"
            columns_to_input_names["id"] = columns_to_input_names.pop(id_field)
            df.rename(columns={id_field: "id"}, inplace=True)
            # remove "id" rename field from column mapping if it exists
            if column_mapping:
                column_mapping = {
                    key: val for key, val in column_mapping.items()
                    if val != "id"
                }
        else:
            raise ValueError(
                f"'{params['id_field']}' is not a column field in the input file."
            )
    else:
        print("No id_field argument present in params, proceeding with defaults.")
    if column_mapping:
        df = df.rename(columns=column_mapping)
        # redundant, even harmful if things get out of sync
        # verify_columns(df)
        for key in column_mapping:
            if key in columns_to_input_names:
                val = columns_to_input_names.pop(key)
                columns_to_input_names[column_mapping[key]] = val
    if params['file_format'].upper() in ['SESAR', 'ENIGMA']:
        if 'material' in df.columns:
            df.rename(columns={
                "material": params['file_format'].upper() + ":material"
            }, inplace=True)
            val = columns_to_input_names.pop("material")
            columns_to_input_names[
                params['file_format'].upper() + ":material"] = val
    if params['file_format'].upper() == "KBASE":
        if 'material' in df.columns:
            df.rename(columns={"material": "SESAR:material"}, inplace=True)
            val = columns_to_input_names.pop("material")
            columns_to_input_names["SESAR:material"] = val
    acls = {
        "read": [],
        "write": [],
        "admin": [],
        "public_read": -1  # set to false (<0)
    }
    if params.get('share_within_workspace'):
        # query workspace for user permissions.
        acls = get_workspace_user_perms(workspace_url,
                                        params.get('workspace_id'), token,
                                        username, acls)
    groups = SAMP_SERV_CONFIG['validators']
    cols = list(set(df.columns) - set(REGULATED_COLS))
    sample_url = get_sample_service_url(sw_url)
    samples, existing_samples = _produce_samples(df, cols, column_groups,
                                                 column_unit_regex,
                                                 sample_url, token,
                                                 input_sample_set['samples'],
                                                 columns_to_input_names)
    errors = {}
    if params.get('prevalidate'):
        errors = validate_samples([s['sample'] for s in samples], sample_url,
                                  token)
    if errors:
        saved_samples = []
    else:
        saved_samples = _save_samples(samples, acls, sample_url, token)
        saved_samples += existing_samples
    return {
        "samples": saved_samples,
        "description": params.get('description')
    }, errors