def _produce_samples(df, column_groups, column_unit_regex, sample_url, token,
                     existing_samples, columns_to_input_names,
                     first_sample_idx):
    """Turn the rows of ``df`` into sample records that still need saving.

    Every row is converted into a single-node sample (node type
    ``BioReplicate``) whose node id and sample name are the row's ``name``
    cell. The candidate is then compared, via ``compare_samples``, with the
    stored sample fetched through ``get_sample`` (looked up by the row's
    ``kbase_sample_id`` if it has one, otherwise by name in
    ``existing_samples``). Rows for which ``compare_samples`` returns a truthy
    value are skipped; every other row is queued in the returned ``samples`` list.

    Args:
        df: pandas DataFrame, one sample per row. Must contain every column
            in ``REQUIRED_COLS``.
        column_groups: forwarded to ``generate_controlled_metadata`` and
            ``generate_user_metadata``.
        column_unit_regex: forwarded to ``generate_user_metadata``.
        sample_url: forwarded to ``get_sample``.
        token: forwarded to ``get_sample``.
        existing_samples: iterable of dicts read through their ``'name'`` and
            ``'id'`` keys; they are indexed by name.
        columns_to_input_names: forwarded to ``generate_source_meta``.
        first_sample_idx: offset added to a row's index label when recording
            the row number on a collected error.

    Returns:
        A 3-tuple ``(samples, remaining_existing, errors)``:

        * ``samples`` - list of dicts with keys ``sample``, ``prev_sample``,
          ``name``, ``write``, ``read`` and ``admin`` for rows to be saved.
        * ``remaining_existing`` - the existing samples still tracked by name
          after processing: entries whose row was queued are removed, and
          stored samples found equal to a row are added if not yet present.
        * ``errors`` - the ``SampleContentError`` instances raised while
          processing individual rows, each with ``row`` set.

    Raises:
        ValueError: if ``df`` lacks a column listed in ``REQUIRED_COLS``.
    """
    if not REQUIRED_COLS.issubset(df.columns):
        # The original message was missing the opening quote of the first alias.
        raise ValueError(
            'Required "name" column missing from input. Use "name" or '
            'an alias ("sample name", "sample id", "samplename", "sampleid")')

    samples = []
    errors = []
    existing_sample_names = {
        sample['name']: sample
        for sample in existing_samples
    }
    # Columns eligible for user metadata. Shared by all rows, so a column
    # dropped while handling one row stays dropped for the following rows.
    cols = list(set(df.columns) - set(REQUIRED_COLS))

    def _drop_column(column):
        """Remove ``column`` from ``cols`` if it is still listed."""
        if column in cols:
            cols.remove(column)

    def _get_existing_sample(name, kbase_sample_id):
        """Fetch the stored sample matching this row, or None if there is none."""
        if kbase_sample_id:
            prev_sample = get_sample({"id": kbase_sample_id}, sample_url,
                                     token)
            if name in existing_sample_names:
                if prev_sample['name'] != name:
                    # not sure if this is an error case
                    raise SampleContentError(
                        f"Cannot rename existing sample from {prev_sample['name']} to {name}",
                        key="id",
                        sample_name=name)
                # Names agree, so the ids must agree as well.
                if existing_sample_names[name]['id'] != prev_sample['id']:
                    raise SampleContentError(
                        f"'kbase_sample_id' and input sample set have different ID's for sample with name \"{name}\"",
                        key="id",
                        sample_name=name)
            return prev_sample
        if name in existing_sample_names:
            return get_sample(existing_sample_names[name], sample_url, token)
        return None

    for relative_row_idx, row in df.iterrows():
        try:
            # only required field is 'name'
            raw_name = row.get('name')
            if not raw_name:
                raise SampleContentError(
                    f"Bad sample name \"{raw_name}\". evaluates as false",
                    key='name',
                    sample_name=raw_name)
            name = str(row.pop('name'))
            _drop_column('name')

            # A 'kbase_sample_id' cell, when filled in, selects the stored
            # sample directly and is kept out of the metadata.
            kbase_sample_id = None
            if row.get('kbase_sample_id'):
                kbase_sample_id = str(row.pop('kbase_sample_id'))
                _drop_column('kbase_sample_id')
            if row.get('parent_id'):
                # The value itself is unused (the node's parent is always
                # None); it is only removed so it does not become metadata.
                row.pop('parent_id')
                _drop_column('parent_id')

            controlled_metadata = generate_controlled_metadata(
                row, column_groups)
            user_metadata = generate_user_metadata(row, cols, column_groups,
                                                   column_unit_regex)
            source_meta = generate_source_meta(row, controlled_metadata.keys(),
                                               columns_to_input_names)

            sample = {
                'node_tree': [{
                    "id": name,
                    "parent": None,
                    "type": "BioReplicate",
                    "meta_controlled": controlled_metadata,
                    "meta_user": user_metadata,
                    'source_meta': source_meta
                }],
                'name': name,
            }
            # get existing sample (if exists)
            prev_sample = _get_existing_sample(name, kbase_sample_id)

            if compare_samples(sample, prev_sample):
                # Nothing to save: keep the stored sample in the result set.
                existing_sample_names.setdefault(name, prev_sample)
                continue
            # The row supersedes the tracked entry with the same name.
            existing_sample_names.pop(name, None)
            # "save_sample_for_later"
            samples.append({
                'sample': sample,
                'prev_sample': prev_sample,
                'name': name,
                'write': row.get('write'),
                'read': row.get('read'),
                'admin': row.get('admin')
            })
        except SampleContentError as e:
            e.row = first_sample_idx + relative_row_idx
            errors.append(e)
    # add the missing samples from existing_sample_names
    return samples, list(existing_sample_names.values()), errors
Exemple #2
0
def _produce_samples(df, cols, column_groups, column_unit_regex, sample_url,
                     token, existing_samples, columns_to_input_names):
    """Collect the rows of ``df`` whose sample differs from the stored one.

    Each row yields a single-node ``BioReplicate`` sample whose node id is the
    row's ``id`` cell and whose name is the ``name`` cell (falling back to
    ``id`` when ``name`` is falsy). Rows for which ``compare_samples`` returns
    a truthy value against the stored sample are skipped; the rest are
    returned for saving.

    Note that ``cols`` is modified in place: ``id``, ``name`` and, when a row
    carries a truthy value for them, ``kbase_sample_id`` and ``parent_id`` are
    removed from it.

    Args:
        df: pandas DataFrame, one sample per row.
        cols: list of column names handed to ``generate_user_metadata``.
        column_groups: forwarded to the metadata generators.
        column_unit_regex: forwarded to ``generate_user_metadata``.
        sample_url: forwarded to ``get_sample``.
        token: forwarded to ``get_sample``.
        existing_samples: iterable of dicts read through ``'name'``/``'id'``.
        columns_to_input_names: forwarded to ``generate_source_meta``.

    Returns:
        A 2-tuple ``(samples, remaining_existing)``: the queued sample
        records and the existing samples still tracked by name.

    Raises:
        RuntimeError: if a row's ``id`` cell is falsy.
        ValueError: if a row's ``kbase_sample_id`` conflicts with the entry
            of the same name in ``existing_samples``.
    """
    known_by_name = {entry['name']: entry for entry in existing_samples}
    pending = []

    def _forget_column(column):
        # Drop the column from the caller's list, if it is still there.
        if column in cols:
            cols.remove(column)

    def _lookup_stored(name, kbase_sample_id):
        # Resolve the stored counterpart of a row, validating explicit ids.
        if kbase_sample_id:
            stored = get_sample({"id": kbase_sample_id}, sample_url, token)
            if name in known_by_name:
                if stored['name'] != name:
                    # not sure if this is an error case
                    raise ValueError(
                        f"Cannot rename existing sample from {stored['name']} to {name}"
                    )
                if known_by_name[name]['id'] != stored['id']:
                    raise ValueError(
                        f"'kbase_sample_id' and input sample set have different ID's for sample with name \"{name}\""
                    )
            return stored
        if name in known_by_name:
            return get_sample(known_by_name[name], sample_url, token)
        return None

    for _, row in df.iterrows():
        if not row.get('id'):
            raise RuntimeError(
                f"{row.get('id')} evaluates as false - {row.keys()}")

        # An explicit 'kbase_sample_id' is consumed here so that it does not
        # end up in the metadata.
        kbase_sample_id = None
        if row.get('kbase_sample_id'):
            kbase_sample_id = str(row.pop('kbase_sample_id'))
            _forget_column('kbase_sample_id')

        # Prefer the 'name' cell; reuse the id when no name is given.
        name = str(row['name'] if row.get('name') else row['id'])

        if row.get('parent_id'):
            row.pop('parent_id')
            _forget_column('parent_id')
        _forget_column('id')
        _forget_column('name')

        controlled_metadata = generate_controlled_metadata(row, column_groups)
        user_metadata = generate_user_metadata(row, cols, column_groups,
                                               column_unit_regex)
        source_meta = generate_source_meta(row, controlled_metadata.keys(),
                                           columns_to_input_names)

        node = {
            "id": str(row['id']),
            "parent": None,
            "type": "BioReplicate",
            "meta_controlled": controlled_metadata,
            "meta_user": user_metadata,
            'source_meta': source_meta
        }
        sample = {'node_tree': [node], 'name': name}

        stored = _lookup_stored(name, kbase_sample_id)

        if compare_samples(sample, stored):
            # Unchanged: make sure the stored sample stays in the result.
            known_by_name.setdefault(name, stored)
            continue
        # Changed or new: the queued record replaces any tracked entry.
        known_by_name.pop(name, None)
        pending.append({
            'sample': sample,
            'prev_sample': stored,
            'name': name,
            'write': row.get('write'),
            'read': row.get('read'),
            'admin': row.get('admin')
        })

    return pending, list(known_by_name.values())
def _produce_samples(callback_url, df, column_groups, column_unit_regex,
                     sample_url, token, existing_samples,
                     columns_to_input_names, keep_existing_samples):
    """Collect the rows of ``df`` whose sample differs from the stored one.

    Each row is run through ``FieldTransformer.field_transformations`` and
    turned into a single-node ``BioReplicate`` sample named after the row's
    ``name`` cell. Rows for which ``compare_samples`` returns a truthy value
    against the stored sample are skipped; the others are queued for saving.
    Row-level problems are not raised: each ``SampleContentWarning`` is tagged
    with the row's index label and emitted through ``warnings.warn``. A
    further warning is emitted for every user-metadata key of a queued sample
    that is not also a controlled-metadata key of the same node.

    Args:
        callback_url: passed to the ``FieldTransformer`` constructor.
        df: pandas DataFrame, one sample per row; must contain every column
            in ``REQUIRED_COLS``.
        column_groups: forwarded to the metadata generators.
        column_unit_regex: forwarded to ``generate_user_metadata``.
        sample_url: forwarded to ``get_sample``.
        token: forwarded to ``get_sample``.
        existing_samples: iterable of dicts read through ``'name'``/``'id'``.
        columns_to_input_names: forwarded to ``generate_source_meta``.
        keep_existing_samples: when falsy, existing samples whose name did
            not occur in ``df`` are dropped from the second return value.

    Returns:
        A 2-tuple ``(samples, remaining_existing)``: the queued sample
        records and the existing samples still tracked by name.

    Raises:
        ValueError: if ``df`` lacks a column listed in ``REQUIRED_COLS``.
    """
    known_by_name = {entry['name']: entry for entry in existing_samples}
    pending = []

    if not REQUIRED_COLS.issubset(df.columns):
        raise ValueError(
            f'Required "name" column missing from input. Use "name" or '
            f'an alias ("sample name", "sample id", "samplename", "sampleid") '
            f'Existing fields ({df.columns})')

    def _lookup_stored(name, kbase_sample_id):
        # Resolve the stored counterpart of a row, validating explicit ids.
        if kbase_sample_id:
            stored = get_sample({"id": kbase_sample_id}, sample_url, token)
            if name in known_by_name:
                if stored['name'] != name:
                    # not sure if this is an error case
                    raise SampleContentWarning(
                        f"Cannot rename existing sample from {stored['name']} to {name}",
                        key="id",
                        sample_name=name)
                if known_by_name[name]['id'] != stored['id']:
                    raise SampleContentWarning(
                        f"'kbase_sample_id' and input sample set have different ID's for sample with name \"{name}\"",
                        key="id",
                        sample_name=name)
            return stored
        if name in known_by_name:
            # Work on a copy with the version removed so that the lookup
            # returns the latest version of the sample.
            reference = copy.deepcopy(known_by_name[name])
            reference.pop('version', None)
            return get_sample(reference, sample_url, token)
        return None

    field_transformer = FieldTransformer(callback_url)

    # Shared across rows: a column dropped for one row stays dropped.
    cols = list(set(df.columns) - set(REQUIRED_COLS))

    def _forget_column(column):
        # Remove the column from the metadata candidates, if still listed.
        if column in cols:
            cols.remove(column)

    seen_names = []
    for row_num, row in df.iterrows():
        try:
            # 'name' is the only mandatory cell.
            raw_name = row.get('name')
            if not raw_name:
                raise SampleContentWarning(
                    f"Bad sample name \"{raw_name}\". Cell content evaluates as false",
                    key='name',
                    sample_name=raw_name)
            name = str(row.pop('name'))
            _forget_column('name')

            # Optional explicit reference to a stored sample.
            kbase_sample_id = None
            if row.get('kbase_sample_id'):
                kbase_sample_id = str(row.pop('kbase_sample_id'))
                _forget_column('kbase_sample_id')
            if row.get('parent_id'):
                row.pop('parent_id')
                _forget_column('parent_id')

            # Apply the field transformations to the row's data.
            row = field_transformer.field_transformations(row, cols)

            controlled_metadata, controlled_cols = generate_controlled_metadata(
                row, column_groups)
            # Controlled columns are excluded from the user metadata.
            user_cols = list(set(cols) - set(controlled_cols))
            user_metadata = generate_user_metadata(row, user_cols,
                                                   column_groups,
                                                   column_unit_regex)
            source_meta = generate_source_meta(row, controlled_metadata.keys(),
                                               columns_to_input_names)

            node = {
                "id": name,
                "parent": None,
                "type": "BioReplicate",
                "meta_controlled": controlled_metadata,
                "meta_user": user_metadata,
                'source_meta': source_meta
            }
            sample = {'node_tree': [node], 'name': name}
            seen_names.append(name)

            stored = _lookup_stored(name, kbase_sample_id)

            if compare_samples(sample, stored):
                # Unchanged: make sure the stored sample stays in the result.
                known_by_name.setdefault(name, stored)
                continue
            # Changed or new: the queued record replaces any tracked entry.
            known_by_name.pop(name, None)
            pending.append({
                'sample': sample,
                'prev_sample': stored,
                'name': name,
                'write': row.get('write'),
                'read': row.get('read'),
                'admin': row.get('admin')
            })
        except SampleContentWarning as problem:
            problem.row = row_num
            warnings.warn(problem)

    if not keep_existing_samples:
        # Forget samples of the input sample set that the file did not mention.
        for stale in set(known_by_name.keys()) - set(seen_names):
            del known_by_name[stale]

    # User-metadata keys without a controlled counterpart, over all queued samples.
    user_keys = set()
    for record in pending:
        for tree_node in record['sample']['node_tree']:
            ukeys = set(tree_node['meta_user'].keys())
            ckeys = set(tree_node['meta_controlled'].keys())
            user_keys.update(ukeys - ckeys)
    for key in user_keys:
        warnings.warn(
            SampleContentWarning(
                f"\"{key}\" is a user-defined column. It is of unknown type, will not be automatically validated, and may not be interoperable with other samples during analysis.",
                key=key,
                severity='warning'))

    return pending, list(known_by_name.values())
Exemple #4
0
def produce_samples(
    df,
    cols,
    column_groups,
    column_unit_regex,
    sample_url,
    token,
    existing_samples,
    columns_to_input_names,
    acls
):
    """Save every row of ``df`` whose sample differs from the stored one.

    Each row yields a single-node ``BioReplicate`` sample whose node id is the
    row's ``id`` cell and whose name is the ``name`` cell (falling back to
    ``id`` when ``name`` is falsy). Rows for which ``compare_samples`` returns
    a truthy value against the stored sample are skipped. All other rows are
    saved with ``save_sample`` and, if any ACL entries are present,
    ``update_acls`` is called for the saved sample.

    Two arguments are modified in place:

    * ``cols`` - ``id``, ``name`` and, when a row carries a truthy value for
      them, ``kbase_sample_id`` and ``parent_id`` are removed.
    * ``acls`` - the ``read``/``write``/``admin`` lists are extended with the
      row's entries, so they accumulate over the rows of one call.

    Args:
        df: pandas DataFrame, one sample per row.
        cols: list of column names handed to ``generate_user_metadata``.
        column_groups: forwarded to the metadata generators.
        column_unit_regex: forwarded to ``generate_user_metadata``.
        sample_url: forwarded to ``get_sample``, ``save_sample`` and
            ``update_acls``.
        token: forwarded to the same three calls.
        existing_samples: iterable of dicts read through ``'name'``/``'id'``.
        columns_to_input_names: forwarded to ``generate_source_meta``.
        acls: dict with list values under ``"read"``, ``"write"`` and
            ``"admin"``.

    Returns:
        A list with one ``{"id", "name", "version"}`` dict per saved sample,
        followed by the existing samples still tracked by name.

    Raises:
        RuntimeError: if a row's ``id`` cell is falsy.
        ValueError: if a row's ``kbase_sample_id`` conflicts with the entry
            of the same name in ``existing_samples``.
    """
    def _drop_column(column):
        """Remove ``column`` from ``cols`` if it is still listed."""
        if column in cols:
            cols.remove(column)

    def _acl_entries(value):
        """Return the ACL entries of one cell as a list; missing cells give []."""
        # row.get() yields None for an absent column and pandas uses NaN for
        # an empty cell; iterating either one raised TypeError before.
        if value is None or (isinstance(value, float) and value != value):
            return []
        # NOTE(review): a plain string is still split into characters, as in
        # the original code - confirm the cell type the caller provides.
        return list(value)

    samples = []
    existing_sample_names = {sample['name']: sample for sample in existing_samples}

    for _, row in df.iterrows():
        if not row.get('id'):
            raise RuntimeError(f"{row.get('id')} evaluates as false - {row.keys()}")

        # first we check if a 'kbase_sample_id' column is specified
        kbase_sample_id = None
        if row.get('kbase_sample_id'):
            kbase_sample_id = str(row.pop('kbase_sample_id'))
            _drop_column('kbase_sample_id')
        # Prefer the 'name' cell; reuse the id when no name is given.
        name = str(row['name'] if row.get('name') else row['id'])
        if row.get('parent_id'):
            # The value itself is unused (the node's parent is always None);
            # it is only removed so it does not become metadata.
            row.pop('parent_id')
            _drop_column('parent_id')
        _drop_column('id')
        _drop_column('name')

        controlled_metadata = generate_controlled_metadata(row, column_groups)
        user_metadata = generate_user_metadata(
            row,
            cols,
            column_groups,
            column_unit_regex
        )
        source_meta = generate_source_meta(
            row,
            controlled_metadata.keys(),
            columns_to_input_names
        )

        sample = {
            'node_tree': [{
                "id": str(row['id']),
                "parent": None,
                "type": "BioReplicate",
                "meta_controlled": controlled_metadata,
                "meta_user": user_metadata,
                'source_meta': source_meta
            }],
            'name': name,
        }

        prev_sample = None
        if kbase_sample_id:
            prev_sample = get_sample({"id": kbase_sample_id}, sample_url, token)
            if name in existing_sample_names:
                if prev_sample['name'] != name:
                    # not sure if this is an error case
                    raise ValueError(f"Cannot rename existing sample from {prev_sample['name']} to {name}")
                # Names agree, so the ids must agree as well.
                if existing_sample_names[name]['id'] != prev_sample['id']:
                    raise ValueError(f"'kbase_sample_id' and input sample set have different ID's for sample: {name}")
        elif name in existing_sample_names:
            prev_sample = get_sample(existing_sample_names[name], sample_url, token)

        if compare_samples(sample, prev_sample):
            # Nothing to save: keep the stored sample in the result.
            existing_sample_names.setdefault(name, prev_sample)
            continue
        # The saved record replaces any tracked entry with the same name.
        existing_sample_names.pop(name, None)

        sample_id, sample_ver = save_sample(sample, sample_url, token, previous_version=prev_sample)

        samples.append({
            "id": sample_id,
            "name": name,
            "version": sample_ver
        })
        # check input for any reason to update access control list
        # should have a "write", "read", "admin" entry
        for role in ("read", "write", "admin"):
            entries = _acl_entries(row.get(role))
            if entries:
                acls[role] += entries
        if len(acls["read"]) > 0 or len(acls['write']) > 0 or len(acls['admin']) > 0:
            update_acls(sample_url, sample_id, acls, token)

    # add the missing samples from existing_sample_names
    samples.extend(existing_sample_names.values())
    return samples