def _get_existing_sample(name, kbase_sample_id):
        prev_sample = None
        if kbase_sample_id:
            prev_sample = get_sample({"id": kbase_sample_id}, sample_url,
                                     token)

            if name in existing_sample_names and prev_sample['name'] == name:
                # the names match, so the stored id must match as well
                if existing_sample_names[name]['id'] != prev_sample['id']:
                    raise SampleContentError(
                        f"'kbase_sample_id' and input sample set have different IDs for sample with name \"{name}\"",
                        key="id",
                        sample_name=name)
            elif name in existing_sample_names and name != prev_sample['name']:
                # not sure if this is an error case
                raise SampleContentError(
                    f"Cannot rename existing sample from {prev_sample['name']} to {name}",
                    key="id",
                    sample_name=name)
        elif name in existing_sample_names:
            prev_sample = get_sample(existing_sample_names[name], sample_url,
                                     token)

        return prev_sample
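Example #2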
    def _get_existing_sample(name, kbase_sample_id):
        prev_sample = None
        if kbase_sample_id:
            prev_sample = get_sample({"id": kbase_sample_id}, sample_url,
                                     token)

            if name in existing_sample_names and prev_sample['name'] == name:
                # the names match, so the stored id must match as well
                if existing_sample_names[name]['id'] != prev_sample['id']:
                    raise SampleContentWarning(
                        f"'kbase_sample_id' and input sample set have different IDs for sample with name \"{name}\"",
                        key="id",
                        sample_name=name)
            elif name in existing_sample_names and name != prev_sample['name']:
                # not sure if this is an error case
                raise SampleContentWarning(
                    f"Cannot rename existing sample from {prev_sample['name']} to {name}",
                    key="id",
                    sample_name=name)
        elif name in existing_sample_names:
            existing_sample = copy.deepcopy(existing_sample_names[name])
            # remove version of samples from sample set in order to get the latest version of sample
            existing_sample.pop('version', None)
            prev_sample = get_sample(existing_sample, sample_url, token)

        return prev_sample
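
The two variants of _get_existing_sample above are nested helpers that close over sample_url, token, and existing_sample_names from the enclosing import routine. A minimal sketch of how those names might be bound before the helper runs; the endpoint, token, and sample record below are placeholders, not real service values:

# Hypothetical setup for the closure variables used by the helper above.
sample_url = "https://ci.kbase.us/services/sampleservice"  # placeholder endpoint
token = "xxxxxxxx"                                         # placeholder auth token
# maps each sample name in the current sample set to its saved record
existing_sample_names = {
    "s1": {"id": "a1b2c3d4", "version": 1},
}

# fetch the latest saved version of 's1' (no kbase_sample_id supplied)
prev_sample = _get_existing_sample("s1", kbase_sample_id=None)
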
Example #3
    def compare_sample_sets(self, sample_set, sample_set_2):
        sample_set_2 = {sam['name']: sam for sam in sample_set_2['samples']}
        for it, samp in enumerate(sample_set['samples']):
            self.assertTrue(sample_set_2.get(samp['name']))
            sample = get_sample(samp, self.sample_url, self.ctx['token'])
            sample2 = get_sample(sample_set_2[samp['name']], self.sample_url, self.ctx['token'])
            # print('gen sample', sample)
            # print('jsn sample', sample2)

            self.compare_sample(sample, sample2, check_id=True, check_version=True)
Example #4
    def _verify_samples(self, sample_set, compare_path):
        with open(compare_path) as f:
            compare = json.load(f)

        for it, samp in enumerate(sample_set['samples']):
            sample = get_sample(samp, self.sample_url, self.token)
            self._compare_sample(sample, compare[it])
Example #5
 def verify_samples(self, sample_set, compare_path):
     if self.update_test_files:
         samples = []
         for it, samp in enumerate(sample_set['samples']):
             sample = get_sample(samp, self.sample_url, self.ctx['token'])
             samples.append(sample)
         with open(compare_path, 'w') as f:
             json.dump(samples, f)
     else:
         with open(compare_path) as f:
             compare = json.load(f)
         # print('[')
         for it, samp in enumerate(sample_set['samples']):
             sample = get_sample(samp, self.sample_url, self.ctx['token'])
             # print(json.dumps(sample), ',')
             self.compare_sample(sample, compare[it])
Example #6
 def _verify_samples(self, sample_set, compare_path):
     with open(compare_path) as f:
         compare = json.load(f)
     # print('[')
     for it, samp in enumerate(sample_set['samples']):
         sample = get_sample(samp, self.sample_url, self.ctx['token'])
         # print(json.dumps(sample), ',')
         self._compare_sample(sample, compare[it])
Example #7
    def test_NCBI_sample_importer(self):
        ncbi_sample_ids = ['SAMN03166112', 'SAMN04383980', 'SAMN04492225']

        # test list sample id input
        params = {
            'external_ids': ncbi_sample_ids,
            'workspace_name': self.wsName,
            'workspace_id': self.wsID,
            'description': "test sample set from IGSNs",
            'set_name': 'test_sample_set_igsn'
        }
        ret = self.serviceImpl.import_samples_from_NCBI(self.ctx, params)[0]
        samples_info = ret['sample_set']['samples']

        assert len(samples_info) == len(ncbi_sample_ids)

        samples = [
            get_sample(sample_info, self.sample_url, self.ctx['token'])
            for sample_info in samples_info
        ]

        sample_ids = [sample['node_tree'][0]['id'] for sample in samples]
        assert set(sample_ids) == set(ncbi_sample_ids)

        sample_names = [sample['name'] for sample in samples]
        assert set(ncbi_sample_ids) == set(sample_names)

        expected_sample_descriptions = ['Seawater-16', 'SAMN04383980', 'c1-1']
        sample_descriptions = [
            sample['node_tree'][0]['meta_controlled']['description']['value']
            for sample in samples
        ]
        assert set(expected_sample_descriptions) == set(sample_descriptions)

        # test multiple sample ids input in str format
        params = {
            'external_ids': ', '.join(ncbi_sample_ids),
            'workspace_name': self.wsName,
            'workspace_id': self.wsID,
            'description': "test sample set from IGSNs",
            'set_name': 'test_sample_set_igsn'
        }
        ret = self.serviceImpl.import_samples_from_NCBI(self.ctx, params)[0]
        samples_info = ret['sample_set']['samples']

        assert len(samples_info) == len(ncbi_sample_ids)

        samples = [
            get_sample(sample_info, self.sample_url, self.ctx['token'])
            for sample_info in samples_info
        ]

        sample_ids = [sample['node_tree'][0]['id'] for sample in samples]
        assert set(sample_ids) == set(ncbi_sample_ids)

        sample_names = [sample['name'] for sample in samples]
        assert set(ncbi_sample_ids) == set(sample_names)

        expected_sample_descriptions = ['Seawater-16', 'SAMN04383980', 'c1-1']
        sample_descriptions = [
            sample['node_tree'][0]['meta_controlled']['description']['value']
            for sample in samples
        ]
        assert set(expected_sample_descriptions) == set(sample_descriptions)

        # test single sample id input
        ncbi_sample_ids = ncbi_sample_ids[0]
        params = {
            'external_ids': ncbi_sample_ids,
            'workspace_name': self.wsName,
            'workspace_id': self.wsID,
            'description': "test sample set from IGSNs",
            'set_name': 'test_sample_set_igsn'
        }
        ret = self.serviceImpl.import_samples_from_NCBI(self.ctx, params)[0]
        samples_info = ret['sample_set']['samples']

        assert len(samples_info) == 1

        sample = get_sample(samples_info[0], self.sample_url,
                            self.ctx['token'])

        sample_id = sample['node_tree'][0]['id']
        assert sample_id == ncbi_sample_ids

        expected_sample_description = 'Seawater-16'
        sample_name = sample['name']
        assert sample_name == ncbi_sample_ids

        sample_description = sample['node_tree'][0]['meta_controlled'][
            'description']['value']
        assert expected_sample_description == sample_description
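
The assertions above index into the document returned by get_sample. A sketch of the shape they assume, with a placeholder KBase id and the other values mirroring the first NCBI sample in this test:

# Assumed sample document shape; the 'id' is a placeholder.
sample = {
    "id": "kbase-sample-uuid",
    "name": "SAMN03166112",
    "version": 1,
    "node_tree": [{
        "id": "SAMN03166112",
        "type": "BioReplicate",
        "meta_controlled": {
            "description": {"value": "Seawater-16"},
        },
        "meta_user": {},
    }],
}

assert sample["node_tree"][0]["meta_controlled"]["description"]["value"] == "Seawater-16"
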
Example #8
    def test_IGSN_sample_importer(self):
        igsns = ['IEAWH0001', 'GEE0000O4', 'ODP000002']

        # test list igsns input
        params = {
            'external_ids': igsns,
            'workspace_name': self.wsName,
            'workspace_id': self.wsID,
            'description': "test sample set from IGSNs",
            'set_name': 'test_sample_set_igsn'
        }
        ret = self.serviceImpl.import_samples_from_IGSN(self.ctx, params)[0]
        samples_info = ret['sample_set']['samples']

        assert len(samples_info) == len(igsns)

        samples = [
            get_sample(sample_info, self.sample_url, self.ctx['token'])
            for sample_info in samples_info
        ]

        sample_igsns = [
            sample['node_tree'][0]['meta_controlled']['igsn']['value']
            for sample in samples
        ]
        assert set(sample_igsns) == set(igsns)

        expected_sample_names = ['PB-Low-5', 'ww163e', 'Core 1-1*-1M']
        sample_names = [sample['name'] for sample in samples]
        assert set(expected_sample_names) == set(sample_names)

        # test string igsns input with multiple IGSNs
        params = {
            'external_ids': ', '.join(igsns),
            'workspace_name': self.wsName,
            'workspace_id': self.wsID,
            'description': "test sample set from IGSNs",
            'set_name': 'test_sample_set_igsn'
        }
        ret = self.serviceImpl.import_samples_from_IGSN(self.ctx, params)[0]
        samples_info = ret['sample_set']['samples']

        assert len(samples_info) == len(igsns)

        samples = [
            get_sample(sample_info, self.sample_url, self.ctx['token'])
            for sample_info in samples_info
        ]

        sample_igsns = [
            sample['node_tree'][0]['meta_controlled']['igsn']['value']
            for sample in samples
        ]
        assert set(sample_igsns) == set(igsns)

        expected_sample_names = ['PB-Low-5', 'ww163e', 'Core 1-1*-1M']
        sample_names = [sample['name'] for sample in samples]
        assert set(expected_sample_names) == set(sample_names)

        # test string igsns input with single IGSN
        igsns = igsns[0]
        params = {
            'external_ids': igsns,
            'workspace_name': self.wsName,
            'workspace_id': self.wsID,
            'description': "test sample set from IGSNs",
            'set_name': 'test_sample_set_igsn'
        }
        ret = self.serviceImpl.import_samples_from_IGSN(self.ctx, params)[0]
        samples_info = ret['sample_set']['samples']

        assert len(samples_info) == 1

        sample = get_sample(samples_info[0], self.sample_url,
                            self.ctx['token'])

        sample_igsn = sample['node_tree'][0]['meta_controlled']['igsn'][
            'value']
        assert sample_igsn == igsns

        expected_sample_name = 'PB-Low-5'
        sample_name = sample['name']
        assert expected_sample_name == sample_name
Example #9
 def test_ENIGMA_file(self):
     """Import an ENIGMA-format Excel file, then verify updates from an edited copy."""
     sample_file = os.path.join(self.curr_dir, 'data',
                                'fake_samples_ENIGMA.xlsx')
     params = {
         'workspace_name': self.wsName,
         'workspace_id': self.wsID,
         'sample_file': sample_file,
         'file_format': "enigma",
         'header_row_index': 2,
         'set_name': "test_sample_set_2",
         'description': "this is a test sample set.",
         'output_format': "csv",
         'prevalidate': 1,
     }
     ret = self.serviceImpl.import_samples(self.ctx, params)[0]
     sample_set = ret['sample_set']
     sample_set_ref = ret['sample_set_ref']
     compare_path = os.path.join(self.curr_dir, 'data',
                                 'fake_samples_ENIGMA.json')
     self._verify_samples(sample_set, compare_path)
     # next we test if the update functionality is working
     # make copy of file in scratch
     os.mkdir(os.path.join(self.scratch, 'temporary_data'))
     enigma_copy = os.path.join(self.scratch, 'temporary_data',
                                os.path.basename(sample_file))
     # now alter the file in a couple places
     alter = {0: ("Jamboree", 'user data'), 2: ("latitude", 30)}
     df = pd.read_excel(sample_file, header=1)
     for idx in alter:
         df.at[idx, alter[idx][0]] = alter[idx][1]
     # now write dataframe to new file location
     df.to_excel(enigma_copy, index=False)
     params = {
         'sample_set_ref': sample_set_ref,
         'sample_file': enigma_copy,
         'workspace_name': self.wsName,
         'workspace_id': self.wsID,
         'file_format': "enigma",
         'header_row_index': 1,
         'description': "this is a copy of a test sample set.",
         'incl_input_in_output': 1,
         'share_within_workspace': 1,
     }
     sample_set_2 = self.serviceImpl.import_samples(self.ctx,
                                                    params)[0]['sample_set']
     ss2 = {s['name']: s for s in sample_set_2['samples']}
     # check that s1 and s3 were updated and s2 was not.
     for it, samp1 in enumerate(sample_set['samples']):
         # get sample by name
         name = samp1['name']
         sample1 = get_sample(samp1, self.sample_url, self.ctx['token'])
         sample2 = get_sample(ss2.get(samp1['name']), self.sample_url,
                              self.ctx['token'])
         if name == 's2':
             self._compare_sample(sample1, sample2)
         else:
             try:
                 assert sample2['version'] > sample1['version']
                 assert sample2['id'] == sample1['id']
                 node2 = sample2['node_tree'][0]
                 node1 = sample1['node_tree'][0]
                 if name == 's1':
                     assert node2['meta_user']['jamboree'][
                         'value'] == 'user data'
                 elif name == 's3':
                     assert node2['meta_controlled']['latitude'][
                         'value'] == 30
             except (AssertionError, KeyError) as err:
                 raise ValueError(
                     f"could not compare samples:\n{json.dumps(sample1)}\n{json.dumps(sample2)}"
                 ) from err
Example #10
def sample_set_to_output(sample_set, sample_url, token, output_file,
                         output_file_format):
    """"""
    def add_to_output(o, key_metadata, val):
        if key_metadata in o:
            o[key_metadata] += [
                "" for _ in range(
                    len(o['kbase_sample_id']) - 1 - len(o[key_metadata]))
            ] + [val]
        else:
            o[key_metadata] = [
                "" for _ in range(len(o['kbase_sample_id']) - 1)
            ] + [val]
        return o

    if output_file_format == "SESAR":
        groups = SESAR_mappings['groups']

    output = {"kbase_sample_id": [], "sample name": []}
    for samp_id in sample_set['samples']:
        sample = get_sample(samp_id, sample_url, token)
        output['kbase_sample_id'].append(sample['id'])
        output['sample name'].append(sample['name'])
        used_headers = set(['kbase_sample_id', 'name', 'sample name'])
        for node in sample['node_tree']:
            # get 'source_meta' information
            source_meta = node.get('source_meta', [])
            source_meta_key = {m['key']: m['skey'] for m in source_meta}
            for key_metadata in node['meta_controlled']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_controlled'][
                            key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(key_metadata, [
                                upload_key_format(g['value']) for g in groups
                            ],
                                                      return_idx=True)
                            if idx is not None and not groups[idx][
                                    'units'].startswith('str:'):
                                output = add_to_output(output,
                                                       groups[idx]['units'],
                                                       val)
                                used_headers.add(groups[idx]['units'])

            for key_metadata in node['meta_user']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_user'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(key_metadata, [
                                upload_key_format(g['value']) for g in groups
                            ],
                                                      return_idx=True)
                            if idx is not None and not groups[idx][
                                    'units'].startswith('str:'):
                                output = add_to_output(output,
                                                       groups[idx]['units'],
                                                       val)

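    # pad every column out to one entry per sample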
    for key in output:
        output[key] += [
            ""
            for _ in range(len(output['kbase_sample_id']) - len(output[key]))
        ]

    df = pd.DataFrame.from_dict(output)

    def line_prepender(filename, line):
        with open(filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    df.to_csv(output_file, index=False)

    if output_file_format == "SESAR":
        line_prepender(output_file,
                       "Object Type:,Individual Sample,User Code:,")
def sample_set_to_output(sample_set, sample_url, token, output_file,
                         output_file_format):
    """"""
    def add_to_output(o, key_metadata, val):
        if key_metadata in o:
            o[key_metadata] += [
                "" for _ in range(
                    len(o['kbase_sample_id']) - 1 - len(o[key_metadata]))
            ] + [val]
        else:
            o[key_metadata] = [
                "" for _ in range(len(o['kbase_sample_id']) - 1)
            ] + [val]
        return o

    if output_file_format.lower() == "sesar":
        groups = SESAR_mappings['groups']
    else:
        raise ValueError(f"SESAR only file format supported for export")

    output = {"kbase_sample_id": [], "name": []}
    for samp_id in sample_set['samples']:
        sample = get_sample(samp_id, sample_url, token)
        output['kbase_sample_id'].append(sample['id'])
        # node ids that differ from the sample name are exported as alt ids below
        sample_name = sample['name']

        output['name'].append(sample_name)
        used_headers = set(['kbase_sample_id', 'name'])
        for node_idx, node in enumerate(sample['node_tree']):
            # check if node 'id' and sample 'name' are not the same
            if node['id'] != sample_name:
                output = add_to_output(output, f"alt_id_{node_idx}",
                                       node['id'])
            # get 'source_meta' information
            source_meta = node.get('source_meta', [])
            source_meta_key = {m['key']: m['skey'] for m in source_meta}
            for key_metadata in node['meta_controlled']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_controlled'][
                            key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(key_metadata, [
                                upload_key_format(g['value']) for g in groups
                            ],
                                                      return_idx=True)
                            if idx is not None and not groups[idx][
                                    'units'].startswith('str:'):
                                output = add_to_output(output,
                                                       groups[idx]['units'],
                                                       val)
                                used_headers.add(groups[idx]['units'])

            for key_metadata in node['meta_user']:
                # get original input key
                upload_key = source_meta_key.get(key_metadata, key_metadata)
                if upload_key not in used_headers:
                    for key, val in node['meta_user'][key_metadata].items():
                        if key == 'value':
                            output = add_to_output(output, upload_key, val)
                            used_headers.add(upload_key)
                        if key == 'units':
                            idx = check_value_in_list(key_metadata, [
                                upload_key_format(g['value']) for g in groups
                            ],
                                                      return_idx=True)
                            if idx is not None and not groups[idx][
                                    'units'].startswith('str:'):
                                output = add_to_output(output,
                                                       groups[idx]['units'],
                                                       val)

    # add any missing lines to the end.
    for key in output:
        output[key] += [
            ""
            for _ in range(len(output['kbase_sample_id']) - len(output[key]))
        ]

    df = pd.DataFrame.from_dict(output)

    def line_prepender(filename, line):
        with open(filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(line.rstrip('\r\n') + '\n' + content)

    df.to_csv(output_file, index=False)

    if output_file_format.lower() == "sesar":
        line_prepender(output_file,
                       "Object Type:,Individual Sample,User Code:,")
Example #12
def produce_samples(
    df,
    cols,
    column_groups,
    column_unit_regex,
    sample_url,
    token,
    existing_samples,
    columns_to_input_names,
    acls
):
    """"""
    samples = []
    existing_sample_names = {sample['name']: sample for sample in existing_samples}

    for idx, row in df.iterrows():
        if row.get('id'):
            # first we check if a 'kbase_sample_id' column is specified
            kbase_sample_id = None
            if row.get('kbase_sample_id'):
                kbase_sample_id = str(row.pop('kbase_sample_id'))
                if 'kbase_sample_id' in cols:
                    cols.remove('kbase_sample_id')
            # use the 'name' field if provided, otherwise fall back to 'id'
            if row.get('name'):
                name = str(row['name'])
            else:
                name = str(row['id'])
            # pop 'parent_id' so it is not treated as sample metadata
            if row.get('parent_id'):
                parent = str(row.pop('parent_id'))
                if 'parent_id' in cols:
                    cols.remove('parent_id')
            if 'id' in cols:
                cols.remove('id')
            if 'name' in cols:
                cols.remove('name')

            controlled_metadata = generate_controlled_metadata(
                row,
                column_groups
            )
            user_metadata = generate_user_metadata(
                row,
                cols,
                column_groups,
                column_unit_regex
            )
            source_meta = generate_source_meta(
                row,
                controlled_metadata.keys(),
                columns_to_input_names
            )

            sample = {
                'node_tree': [{
                    "id": str(row['id']),
                    "parent": None,
                    "type": "BioReplicate",
                    "meta_controlled": controlled_metadata,
                    "meta_user": user_metadata,
                    'source_meta': source_meta
                }],
                'name': name,
            }
            prev_sample = None
            if kbase_sample_id:
                prev_sample = get_sample({"id": kbase_sample_id}, sample_url, token)
                if name in existing_sample_names and prev_sample['name'] == name:
                    # the names match, so the stored id must match as well
                    if existing_sample_names[name]['id'] != prev_sample['id']:
                        raise ValueError(f"'kbase_sample_id' and input sample set have different IDs for sample: {name}")
                elif name in existing_sample_names and name != prev_sample['name']:
                    # not sure if this is an error case
                    raise ValueError(f"Cannot rename existing sample from {prev_sample['name']} to {name}")
            elif name in existing_sample_names:
                prev_sample = get_sample(existing_sample_names[name], sample_url, token)
            if compare_samples(sample, prev_sample):
                if sample.get('name') not in existing_sample_names:
                    existing_sample_names[sample['name']] = prev_sample
                continue
            elif name in existing_sample_names:
                existing_sample_names.pop(name)

            sample_id, sample_ver = save_sample(sample, sample_url, token, previous_version=prev_sample)

            samples.append({
                "id": sample_id,
                "name": name,
                "version": sample_ver
            })
            # check input for any reason to update access control list
            # should have a "write", "read", "admin" entry
            writer = row.get('write')
            reader = row.get('read')
            admin = row.get('admin')
            # each of these may be absent; only extend the ACLs that are present
            if reader:
                acls["read"] += list(reader)
            if writer:
                acls["write"] += list(writer)
            if admin:
                acls["admin"] += list(admin)
            if len(acls["read"]) > 0 or len(acls['write']) > 0 or len(acls['admin']) > 0:
                resp = update_acls(sample_url, sample_id, acls, token)
        else:
            raise RuntimeError(f"row has no usable 'id' value - columns: {list(row.keys())}")
    # add the missing samples from existing_sample_names
    samples += [existing_sample_names[key] for key in existing_sample_names]
    return samples
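
A hypothetical call to produce_samples. The DataFrame row, mapping arguments, and service values are placeholders; in the real import flow column_groups and column_unit_regex come from the file-format mappings:

import pandas as pd

# one sample row with an id, a name, and a single metadata column
df = pd.DataFrame([{"id": "s1", "name": "s1", "latitude": 30}])

samples = produce_samples(
    df,
    cols=["latitude"],     # metadata columns remaining after id/name handling
    column_groups=[],      # placeholder; normally built from the format mappings
    column_unit_regex=[],  # placeholder; regexes that split units out of headers
    sample_url="https://ci.kbase.us/services/sampleservice",  # placeholder
    token="xxxxxxxx",      # placeholder
    existing_samples=[],   # no previously saved samples to update
    columns_to_input_names={},
    acls={"read": [], "write": [], "admin": []},
)
# each returned entry has the form {"id": ..., "name": "s1", "version": ...}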