def _get_existing_sample(name, kbase_sample_id):
    prev_sample = None
    if kbase_sample_id:
        prev_sample = get_sample({"id": kbase_sample_id}, sample_url, token)
        if name in existing_sample_names and prev_sample['name'] == name:
            # now we check if the sample 'id' and 'name' are the same
            if existing_sample_names[name]['id'] != prev_sample['id']:
                raise SampleContentError(
                    f"'kbase_sample_id' and input sample set have different IDs for sample with name \"{name}\"",
                    key="id",
                    sample_name=name)
        elif name in existing_sample_names and name != prev_sample['name']:
            # not sure if this is an error case
            raise SampleContentError(
                f"Cannot rename existing sample from {prev_sample['name']} to {name}",
                key="id",
                sample_name=name)
    elif name in existing_sample_names:
        prev_sample = get_sample(existing_sample_names[name], sample_url, token)
    return prev_sample
def _get_existing_sample(name, kbase_sample_id):
    prev_sample = None
    if kbase_sample_id:
        prev_sample = get_sample({"id": kbase_sample_id}, sample_url, token)
        if name in existing_sample_names and prev_sample['name'] == name:
            # now we check if the sample 'id' and 'name' are the same
            if existing_sample_names[name]['id'] != prev_sample['id']:
                raise SampleContentWarning(
                    f"'kbase_sample_id' and input sample set have different IDs for sample with name \"{name}\"",
                    key="id",
                    sample_name=name)
        elif name in existing_sample_names and name != prev_sample['name']:
            # not sure if this is an error case
            raise SampleContentWarning(
                f"Cannot rename existing sample from {prev_sample['name']} to {name}",
                key="id",
                sample_name=name)
    elif name in existing_sample_names:
        existing_sample = copy.deepcopy(existing_sample_names[name])
        # drop the version pinned in the sample set entry so the latest version of the sample is fetched
        existing_sample.pop('version', None)
        prev_sample = get_sample(existing_sample, sample_url, token)
    return prev_sample
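# Illustrative sketch of the version handling above (the id below is a made-up
# placeholder, not a real sample id): a sample set entry pins a specific version,
# so dropping 'version' before calling get_sample asks the sample service for the
# latest version of that sample rather than the pinned one.
entry = {"id": "0001-fake-sample-id", "name": "s1", "version": 1}
latest_lookup = dict(entry)
latest_lookup.pop('version', None)  # {'id': '0001-fake-sample-id', 'name': 's1'}
# prev_sample = get_sample(latest_lookup, sample_url, token)  # would fetch the newest version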
def compare_sample_sets(self, sample_set, sample_set_2):
    sample_set_2 = {sam['name']: sam for sam in sample_set_2['samples']}
    for samp in sample_set['samples']:
        self.assertTrue(sample_set_2.get(samp['name']))
        sample = get_sample(samp, self.sample_url, self.ctx['token'])
        sample2 = get_sample(sample_set_2[samp['name']], self.sample_url, self.ctx['token'])
        self.compare_sample(sample, sample2, check_id=True, check_version=True)
def _verify_samples(self, sample_set, compare_path):
    with open(compare_path) as f:
        compare = json.load(f)
    for it, samp in enumerate(sample_set['samples']):
        sample = get_sample(samp, self.sample_url, self.token)
        self._compare_sample(sample, compare[it])
def verify_samples(self, sample_set, compare_path):
    if self.update_test_files:
        samples = []
        for samp in sample_set['samples']:
            sample = get_sample(samp, self.sample_url, self.ctx['token'])
            samples.append(sample)
        with open(compare_path, 'w') as f:
            json.dump(samples, f)
    else:
        with open(compare_path) as f:
            compare = json.load(f)
        for it, samp in enumerate(sample_set['samples']):
            sample = get_sample(samp, self.sample_url, self.ctx['token'])
            self.compare_sample(sample, compare[it])
def _verify_samples(self, sample_set, compare_path):
    with open(compare_path) as f:
        compare = json.load(f)
    for it, samp in enumerate(sample_set['samples']):
        sample = get_sample(samp, self.sample_url, self.ctx['token'])
        self._compare_sample(sample, compare[it])
def test_NCBI_sample_importer(self):
    ncbi_sample_ids = ['SAMN03166112', 'SAMN04383980', 'SAMN04492225']
    # test list sample id input
    params = {
        'external_ids': ncbi_sample_ids,
        'workspace_name': self.wsName,
        'workspace_id': self.wsID,
        'description': "test sample set from IGSNs",
        'set_name': 'test_sample_set_igsn'
    }
    ret = self.serviceImpl.import_samples_from_NCBI(self.ctx, params)[0]
    samples_info = ret['sample_set']['samples']
    assert len(samples_info) == len(ncbi_sample_ids)
    samples = [
        get_sample(sample_info, self.sample_url, self.ctx['token'])
        for sample_info in samples_info
    ]
    sample_ids = [sample['node_tree'][0]['id'] for sample in samples]
    assert set(sample_ids) == set(ncbi_sample_ids)
    sample_names = [sample['name'] for sample in samples]
    assert set(ncbi_sample_ids) == set(sample_names)
    expected_sample_descriptions = ['Seawater-16', 'SAMN04383980', 'c1-1']
    sample_descriptions = [
        sample['node_tree'][0]['meta_controlled']['description']['value']
        for sample in samples
    ]
    assert set(expected_sample_descriptions) == set(sample_descriptions)

    # test multiple sample ids input in str format
    params = {
        'external_ids': ', '.join(ncbi_sample_ids),
        'workspace_name': self.wsName,
        'workspace_id': self.wsID,
        'description': "test sample set from IGSNs",
        'set_name': 'test_sample_set_igsn'
    }
    ret = self.serviceImpl.import_samples_from_NCBI(self.ctx, params)[0]
    samples_info = ret['sample_set']['samples']
    assert len(samples_info) == len(ncbi_sample_ids)
    samples = [
        get_sample(sample_info, self.sample_url, self.ctx['token'])
        for sample_info in samples_info
    ]
    sample_ids = [sample['node_tree'][0]['id'] for sample in samples]
    assert set(sample_ids) == set(ncbi_sample_ids)
    sample_names = [sample['name'] for sample in samples]
    assert set(ncbi_sample_ids) == set(sample_names)
    expected_sample_descriptions = ['Seawater-16', 'SAMN04383980', 'c1-1']
    sample_descriptions = [
        sample['node_tree'][0]['meta_controlled']['description']['value']
        for sample in samples
    ]
    assert set(expected_sample_descriptions) == set(sample_descriptions)

    # test single sample id input
    ncbi_sample_ids = ncbi_sample_ids[0]
    params = {
        'external_ids': ncbi_sample_ids,
        'workspace_name': self.wsName,
        'workspace_id': self.wsID,
        'description': "test sample set from IGSNs",
        'set_name': 'test_sample_set_igsn'
    }
    ret = self.serviceImpl.import_samples_from_NCBI(self.ctx, params)[0]
    samples_info = ret['sample_set']['samples']
    assert len(samples_info) == 1
    sample = get_sample(samples_info[0], self.sample_url, self.ctx['token'])
    sample_id = sample['node_tree'][0]['id']
    assert sample_id == ncbi_sample_ids
    expected_sample_description = 'Seawater-16'
    sample_name = sample['name']
    assert sample_name == ncbi_sample_ids
    sample_description = sample['node_tree'][0]['meta_controlled']['description']['value']
    assert expected_sample_description == sample_description
def test_IGSN_sample_importer(self):
    igsns = ['IEAWH0001', 'GEE0000O4', 'ODP000002']
    # test list igsns input
    params = {
        'external_ids': igsns,
        'workspace_name': self.wsName,
        'workspace_id': self.wsID,
        'description': "test sample set from IGSNs",
        'set_name': 'test_sample_set_igsn'
    }
    ret = self.serviceImpl.import_samples_from_IGSN(self.ctx, params)[0]
    samples_info = ret['sample_set']['samples']
    assert len(samples_info) == len(igsns)
    samples = [
        get_sample(sample_info, self.sample_url, self.ctx['token'])
        for sample_info in samples_info
    ]
    sample_igsns = [
        sample['node_tree'][0]['meta_controlled']['igsn']['value']
        for sample in samples
    ]
    assert set(sample_igsns) == set(igsns)
    expected_sample_names = ['PB-Low-5', 'ww163e', 'Core 1-1*-1M']
    sample_names = [sample['name'] for sample in samples]
    assert set(expected_sample_names) == set(sample_names)

    # test string igsns input with multiple IGSNs
    params = {
        'external_ids': ', '.join(igsns),
        'workspace_name': self.wsName,
        'workspace_id': self.wsID,
        'description': "test sample set from IGSNs",
        'set_name': 'test_sample_set_igsn'
    }
    ret = self.serviceImpl.import_samples_from_IGSN(self.ctx, params)[0]
    samples_info = ret['sample_set']['samples']
    assert len(samples_info) == len(igsns)
    samples = [
        get_sample(sample_info, self.sample_url, self.ctx['token'])
        for sample_info in samples_info
    ]
    sample_igsns = [
        sample['node_tree'][0]['meta_controlled']['igsn']['value']
        for sample in samples
    ]
    assert set(sample_igsns) == set(igsns)
    expected_sample_names = ['PB-Low-5', 'ww163e', 'Core 1-1*-1M']
    sample_names = [sample['name'] for sample in samples]
    assert set(expected_sample_names) == set(sample_names)

    # test string igsns input with single IGSN
    igsns = igsns[0]
    params = {
        'external_ids': igsns,
        'workspace_name': self.wsName,
        'workspace_id': self.wsID,
        'description': "test sample set from IGSNs",
        'set_name': 'test_sample_set_igsn'
    }
    ret = self.serviceImpl.import_samples_from_IGSN(self.ctx, params)[0]
    samples_info = ret['sample_set']['samples']
    assert len(samples_info) == 1
    sample = get_sample(samples_info[0], self.sample_url, self.ctx['token'])
    sample_igsn = sample['node_tree'][0]['meta_controlled']['igsn']['value']
    assert sample_igsn == igsns
    expected_sample_names = 'PB-Low-5'
    sample_name = sample['name']
    assert expected_sample_names == sample_name
def test_ENIGMA_file(self):
    sample_file = os.path.join(self.curr_dir, 'data', 'fake_samples_ENIGMA.xlsx')
    params = {
        'workspace_name': self.wsName,
        'workspace_id': self.wsID,
        'sample_file': sample_file,
        'file_format': "enigma",
        'header_row_index': 2,
        'set_name': "test_sample_set_2",
        'description': "this is a test sample set.",
        'output_format': "csv",
        'prevalidate': 1,
    }
    ret = self.serviceImpl.import_samples(self.ctx, params)[0]
    sample_set = ret['sample_set']
    sample_set_ref = ret['sample_set_ref']
    compare_path = os.path.join(self.curr_dir, 'data', 'fake_samples_ENIGMA.json')
    self._verify_samples(sample_set, compare_path)

    # next we test if the update functionality is working
    # make a copy of the file in scratch
    os.mkdir(os.path.join(self.scratch, 'temporary_data'))
    enigma_copy = os.path.join(self.scratch, 'temporary_data', os.path.basename(sample_file))
    # now alter the file in a couple of places
    alter = {0: ("Jamboree", 'user data'), 2: ("latitude", 30)}
    df = pd.read_excel(sample_file, header=1)
    for idx in alter:
        df.at[idx, alter[idx][0]] = alter[idx][1]
    # now write the dataframe to the new file location
    df.to_excel(enigma_copy, index=False)
    params = {
        'sample_set_ref': sample_set_ref,
        'sample_file': enigma_copy,
        'workspace_name': self.wsName,
        'workspace_id': self.wsID,
        'file_format': "enigma",
        'header_row_index': 1,
        'description': "this is a copy of a test sample set.",
        'incl_input_in_output': 1,
        'share_within_workspace': 1,
    }
    sample_set_2 = self.serviceImpl.import_samples(self.ctx, params)[0]['sample_set']
    ss2 = {s['name']: s for s in sample_set_2['samples']}
    # check that s1 and s3 were updated and s2 was not.
    for samp1 in sample_set['samples']:
        # get sample by name
        name = samp1['name']
        sample1 = get_sample(samp1, self.sample_url, self.ctx['token'])
        sample2 = get_sample(ss2.get(samp1['name']), self.sample_url, self.ctx['token'])
        if name == 's2':
            self._compare_sample(sample1, sample2)
        else:
            try:
                assert sample2['version'] > sample1['version']
                assert sample2['id'] == sample1['id']
                node2 = sample2['node_tree'][0]
                node1 = sample1['node_tree'][0]
                if name == 's1':
                    assert node2['meta_user']['jamboree']['value'] == 'user data'
                elif name == 's3':
                    assert node2['meta_controlled']['latitude']['value'] == 30
            except Exception as err:
                raise ValueError(
                    f"could not compare samples:\n{json.dumps(sample1)}\n{json.dumps(sample2)}"
                ) from err
def sample_set_to_output(sample_set, sample_url, token, output_file, output_file_format):
    """Write the samples in `sample_set` to `output_file` as CSV, using the SESAR
    column mappings when the SESAR output format is requested."""

    def add_to_output(o, key_metadata, val):
        # pad the column with empty strings so it stays aligned with the current
        # 'kbase_sample_id' row, then append the new value
        if key_metadata in o:
            o[key_metadata] += [
                "" for _ in range(len(o['kbase_sample_id']) - 1 - len(o[key_metadata]))
            ] + [val]
        else:
            o[key_metadata] = [
                "" for _ in range(len(o['kbase_sample_id']) - 1)
            ] + [val]
        return o

    if output_file_format == "SESAR":
        groups = SESAR_mappings['groups']
    output = {"kbase_sample_id": [], "sample name": []}
    for samp_id in sample_set['samples']:
        sample = get_sample(samp_id, sample_url, token)
        output['kbase_sample_id'].append(sample['id'])
        output['sample name'].append(sample['name'])
        used_headers = set(['kbase_sample_id', 'name', 'sample name'])
        for node in sample['node_tree']:
            # get 'source_meta' information
            source_meta = node.get('source_meta', [])
            source_meta_key = {m['key']: m['skey'] for m in source_meta}
            for key_metadata in node['meta_controlled']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_controlled'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(
                                key_metadata,
                                [upload_key_format(g['value']) for g in groups],
                                return_idx=True)
                            if idx is not None and not groups[idx]['units'].startswith('str:'):
                                output = add_to_output(output, groups[idx]['units'], val)
                                used_headers.add(groups[idx]['units'])
            for key_metadata in node['meta_user']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_user'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(
                                key_metadata,
                                [upload_key_format(g['value']) for g in groups],
                                return_idx=True)
                            if idx is not None and not groups[idx]['units'].startswith('str:'):
                                output = add_to_output(output, groups[idx]['units'], val)
    for key in output:
        output[key] += [
            "" for _ in range(len(output['kbase_sample_id']) - len(output[key]))
        ]
    df = pd.DataFrame.from_dict(output)

    def line_prepender(filename, line):
        with open(filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    df.to_csv(output_file, index=False)
    if output_file_format == "SESAR":
        line_prepender(output_file, "Object Type:,Individual Sample,User Code:,")
def sample_set_to_output(sample_set, sample_url, token, output_file, output_file_format):
    """Write the samples in `sample_set` to `output_file` as CSV; currently only the
    SESAR output format is supported."""

    def add_to_output(o, key_metadata, val):
        # pad the column with empty strings so it stays aligned with the current
        # 'kbase_sample_id' row, then append the new value
        if key_metadata in o:
            o[key_metadata] += [
                "" for _ in range(len(o['kbase_sample_id']) - 1 - len(o[key_metadata]))
            ] + [val]
        else:
            o[key_metadata] = [
                "" for _ in range(len(o['kbase_sample_id']) - 1)
            ] + [val]
        return o

    if output_file_format.lower() == "sesar":
        groups = SESAR_mappings['groups']
    else:
        raise ValueError("Only the SESAR file format is supported for export")
    output = {"kbase_sample_id": [], "name": []}
    for samp_id in sample_set['samples']:
        sample = get_sample(samp_id, sample_url, token)
        output['kbase_sample_id'].append(sample['id'])
        # we need to check if there is another match in there.
        sample_name = sample['name']
        output['name'].append(sample_name)
        used_headers = set(['kbase_sample_id', 'name'])
        for node_idx, node in enumerate(sample['node_tree']):
            # check if node 'id' and sample 'name' are not the same
            if node['id'] != sample_name:
                output = add_to_output(output, f"alt_id_{node_idx}", node['id'])
            # get 'source_meta' information
            source_meta = node.get('source_meta', [])
            source_meta_key = {m['key']: m['skey'] for m in source_meta}
            for key_metadata in node['meta_controlled']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_controlled'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(
                                key_metadata,
                                [upload_key_format(g['value']) for g in groups],
                                return_idx=True)
                            if idx is not None and not groups[idx]['units'].startswith('str:'):
                                output = add_to_output(output, groups[idx]['units'], val)
                                used_headers.add(groups[idx]['units'])
            for key_metadata in node['meta_user']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_user'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(
                                key_metadata,
                                [upload_key_format(g['value']) for g in groups],
                                return_idx=True)
                            if idx is not None and not groups[idx]['units'].startswith('str:'):
                                output = add_to_output(output, groups[idx]['units'], val)
    # add any missing lines to the end.
    for key in output:
        output[key] += [
            "" for _ in range(len(output['kbase_sample_id']) - len(output[key]))
        ]
    df = pd.DataFrame.from_dict(output)

    def line_prepender(filename, line):
        with open(filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    df.to_csv(output_file, index=False)
    if output_file_format.lower() == "sesar":
        line_prepender(output_file, "Object Type:,Individual Sample,User Code:,")
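# Worked example of the add_to_output padding used above (standalone, with
# illustrative values, not data from the repo): after three samples have been
# processed, 'kbase_sample_id' has 3 entries while 'latitude' has 1, so appending
# a new latitude pads with len(kbase_sample_id) - 1 - len(latitude) = 1 blank,
# keeping the new value aligned with the third sample's row.
output = {"kbase_sample_id": ["id1", "id2", "id3"], "latitude": ["10"]}
pad = ["" for _ in range(len(output['kbase_sample_id']) - 1 - len(output['latitude']))]
output['latitude'] += pad + ["30"]
assert output['latitude'] == ["10", "", "30"]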
def produce_samples(
    df,
    cols,
    column_groups,
    column_unit_regex,
    sample_url,
    token,
    existing_samples,
    columns_to_input_names,
    acls
):
    """Create or update samples from the rows of `df`, returning sample set entries
    ({'id', 'name', 'version'}) for every sample, including unchanged existing ones."""
    samples = []
    existing_sample_names = {sample['name']: sample for sample in existing_samples}
    for idx, row in df.iterrows():
        if row.get('id'):
            # first we check if a 'kbase_sample_id' column is specified
            kbase_sample_id = None
            if row.get('kbase_sample_id'):
                kbase_sample_id = str(row.pop('kbase_sample_id'))
                if 'kbase_sample_id' in cols:
                    cols.pop(cols.index('kbase_sample_id'))
            # use the 'name' field as the sample name; if it is missing, fall back to the 'id'
            if row.get('name'):
                name = str(row['name'])
            else:
                name = str(row['id'])
            if row.get('parent_id'):
                parent = str(row.pop('parent_id'))
                if 'parent_id' in cols:
                    cols.pop(cols.index('parent_id'))
            if 'id' in cols:
                cols.pop(cols.index('id'))
            if 'name' in cols:
                cols.pop(cols.index('name'))
            controlled_metadata = generate_controlled_metadata(
                row,
                column_groups
            )
            user_metadata = generate_user_metadata(
                row,
                cols,
                column_groups,
                column_unit_regex
            )
            source_meta = generate_source_meta(
                row,
                controlled_metadata.keys(),
                columns_to_input_names
            )
            sample = {
                'node_tree': [{
                    "id": str(row['id']),
                    "parent": None,
                    "type": "BioReplicate",
                    "meta_controlled": controlled_metadata,
                    "meta_user": user_metadata,
                    'source_meta': source_meta
                }],
                'name': name,
            }
            prev_sample = None
            if kbase_sample_id:
                prev_sample = get_sample({"id": kbase_sample_id}, sample_url, token)
                if name in existing_sample_names and prev_sample['name'] == name:
                    # now we check if the sample 'id' and 'name' are the same
                    if existing_sample_names[name]['id'] != prev_sample['id']:
                        raise ValueError(
                            f"'kbase_sample_id' and input sample set have different IDs for sample: {name}")
                elif name in existing_sample_names and name != prev_sample['name']:
                    # not sure if this is an error case
                    raise ValueError(
                        f"Cannot rename existing sample from {prev_sample['name']} to {name}")
            elif name in existing_sample_names:
                prev_sample = get_sample(existing_sample_names[name], sample_url, token)
            if compare_samples(sample, prev_sample):
                if sample.get('name') not in existing_sample_names:
                    existing_sample_names[sample['name']] = prev_sample
                continue
            elif name in existing_sample_names:
                existing_sample_names.pop(name)
            sample_id, sample_ver = save_sample(sample, sample_url, token, previous_version=prev_sample)
            samples.append({
                "id": sample_id,
                "name": name,
                "version": sample_ver
            })
            # check input for any reason to update access control list
            # should have a "write", "read", "admin" entry
            writer = row.get('write')
            reader = row.get('read')
            admin = row.get('admin')
            if writer or reader or admin:
                # guard against missing entries so a partially filled ACL row does not fail
                acls["read"] += [r for r in (reader or [])]
                acls["write"] += [w for w in (writer or [])]
                acls["admin"] += [a for a in (admin or [])]
            if len(acls["read"]) > 0 or len(acls['write']) > 0 or len(acls['admin']) > 0:
                resp = update_acls(sample_url, sample_id, acls, token)
        else:
            raise RuntimeError(f"{row.get('id')} evaluates as false - {row.keys()}")
    # add the missing samples from existing_sample_names
    samples += [existing_sample_names[key] for key in existing_sample_names]
    return samples