Example #1
    def test_KBASE_format(self):
        # test default sample server
        sample_file = os.path.join(self.test_dir, 'example_data',
                                   'ncbi_sample_example.csv')
        params = {
            'workspace_name': 'workspace_name',
            'sample_file': sample_file,
            'file_format': "KBASE",
            'id_field': 'id',
            'prevalidate': 1,
        }
        header_row_index = 0

        mappings = {
            'enigma': ENIGMA_mappings,
            'sesar': SESAR_mappings,
            'kbase': {}
        }
        sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file(
            params, self.sample_url, self.workspace_url, self.callback_url,
            self.username, self.token,
            mappings[str(params.get('file_format')).lower()].get('groups', []),
            mappings[str(params.get('file_format')).lower()].get(
                'date_columns',
                []), mappings[str(params.get('file_format')).lower()].get(
                    'column_unit_regex',
                    []), {"samples": []}, header_row_index,
            aliases.get(params.get('file_format').lower(), {}))

        samples = sample_set['samples']
        self.assertEqual(len(samples), 2)
        expected_sample_name = ['SAMN03166112', 'SAMN04383980']
        self.assertCountEqual([sample['name'] for sample in samples],
                              expected_sample_name)
        self.assertEqual(has_unignored_errors, False)
Example #2
    def test_import_SESAR_format(self):
        # test default sample server
        sample_file = os.path.join(self.test_dir, 'data', 'fake_samples.tsv')

        params = {
            'workspace_name': 'workspace_name',
            'sample_file': sample_file,
            'file_format': "sesar",
            'name_field': 'test name field',
            'prevalidate': 1
        }
        header_row_index = 1

        mappings = {
            'enigma': ENIGMA_mappings,
            'sesar': SESAR_mappings,
            'kbase': {}
        }
        sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file(
            params, self.sample_url, self.workspace_url, self.callback_url,
            self.username, self.token,
            mappings[str(params.get('file_format')).lower()].get('groups', []),
            mappings[str(params.get('file_format')).lower()].get(
                'date_columns',
                []), mappings[str(params.get('file_format')).lower()].get(
                    'column_unit_regex',
                    []), {"samples": []}, header_row_index,
            aliases.get(params.get('file_format').lower(), {}))

        samples = sample_set['samples']
        self.assertEqual(len(samples), 3)
        expected_sample_name = ['s1', 's2', 's3']
        self.assertCountEqual([sample['name'] for sample in samples],
                              expected_sample_name)
        self.assertEqual(has_unignored_errors, False)

        compare_path = os.path.join(self.test_dir, "data", "fake_samples.json")
        self._verify_samples(sample_set, compare_path)
    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "id_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long, parameter
           "prevalidate" of Long, parameter "incl_input_in_output" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print(f"Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        # Check if we have an existing Sample Set as input
        # if so, download
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    "A sample set name is required when creating a new SampleSet object."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
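        # header_row_index from the UI appears to be 1-based; convert it to the
        # 0-based index expected by import_samples_from_file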
        if params.get('header_row_index'):
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = 0
            if params.get('file_format') == "SESAR":
                header_row_index = 1

        username = ctx['user_id']

        if params.get('file_format') == 'ENIGMA':
            # ENIGMA_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in ENIGMA_mappings['basic_columns']}
            # )
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], ENIGMA_mappings['column_mapping'],
                ENIGMA_mappings.get('groups',
                                    []), ENIGMA_mappings['date_columns'],
                ENIGMA_mappings.get('column_unit_regex',
                                    []), sample_set, header_row_index)
        elif params.get('file_format') == 'SESAR':
            # SESAR_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in SESAR_mappings['basic_columns']}
            # )
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], SESAR_mappings['column_mapping'],
                SESAR_mappings.get('groups',
                                   []), SESAR_mappings['date_columns'],
                SESAR_mappings.get('column_unit_regex',
                                   []), sample_set, header_row_index)
        elif params.get('file_format') == 'KBASE':
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], {}, [], [], [], sample_set, header_row_index)
        else:
            raise ValueError(
                "Only SESAR, ENIGMA, and KBASE formats are currently supported for importing samples. "
                f"File of format {params.get('file_format')} not supported.")

        file_links = []
        sample_set_ref = None
        html_link = None

        if errors:
            # create UI to display the errors clearly
            html_link = _error_ui(errors, self.scratch)
        else:
            # only save object if there are no errors
            obj_info = self.dfu.save_objects({
                'id':
                save_ws_id,
                'objects': [{
                    "name": set_name,
                    "type": "KBaseSets.SampleSet",
                    "data": sample_set
                }]
            })[0]

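            # workspace object_info tuple: index 6 is the workspace id, 0 the
            # object id, and 4 the version; join them into a "ws/obj/ver" reference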
            sample_set_ref = '/'.join(
                [str(obj_info[6]),
                 str(obj_info[0]),
                 str(obj_info[4])])
            sample_file_name = os.path.basename(
                params['sample_file']).split('.')[0] + '_OTU'

            # -- Format outputs below --
            # if output file format specified, add one to output
            if params.get('output_format') in ['csv', 'xls']:
                otu_path = sample_set_to_OTU_sheet(sample_set,
                                                   sample_file_name,
                                                   self.scratch, params)
                file_links.append({
                    'path':
                    otu_path,
                    'name':
                    os.path.basename(otu_path),
                    'label':
                    "OTU template file",
                    'description':
                    "file with each column containing the assigned sample_id and sample "
                    "name of each saved sample. Intended for uploading OTU data."
                })

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path":
                sample_file_copy,
                "name":
                os.path.basename(sample_file_copy),
                "label":
                "Input Sample file",
                "description":
                "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_data = {
            'report_object_name':
            "SampleSet_import_report_" + str(uuid.uuid4()),
            'workspace_name': params['workspace_name']
        }
        if file_links:
            report_data['file_links'] = file_links
        if sample_set_ref:
            report_data[
                'message'] = f"SampleSet object named \"{set_name}\" imported."
            report_data['objects_created'] = [{'ref': sample_set_ref}]

        if html_link:
            report_data['html_links'] = [{
                'path':
                html_link,
                'name':
                'index.html',
                'description':
                'Sample Set Import Error UI'
            }]
            report_data['direct_html_link_index'] = 0
        report_info = report_client.create_extended_report(report_data)
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref,
            'errors': errors
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #4
    def test_ENIGMA_format(self):
        # test default sample server
        ori_sample_file = os.path.join(self.test_dir, 'data',
                                       'fake_samples_ENIGMA.xlsx')
        sample_file = os.path.join(self.test_dir, 'data',
                                   'updated_fake_samples_ENIGMA.xlsx')
        shutil.copy2(ori_sample_file, sample_file)

        params = {
            'workspace_name': 'workspace_name',
            'sample_file': sample_file,
            'file_format': "ENIGMA",
            'prevalidate': 1,
            'keep_existing_samples': 1
        }

        header_row_index = 1

        mappings = {
            'enigma': ENIGMA_mappings,
            'sesar': SESAR_mappings,
            'kbase': {}
        }
        sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file(
            params, self.sample_url, self.workspace_url, self.callback_url,
            self.username, self.token,
            mappings[str(params.get('file_format')).lower()].get('groups', []),
            mappings[str(params.get('file_format')).lower()].get(
                'date_columns',
                []), mappings[str(params.get('file_format')).lower()].get(
                    'column_unit_regex',
                    []), {"samples": []}, header_row_index,
            aliases.get(params.get('file_format').lower(), {}))

        samples = sample_set['samples']
        self.assertEqual(len(samples), 3)
        expected_sample_name = ['s1', 's2', 's3']
        self.assertCountEqual([sample['name'] for sample in samples],
                              expected_sample_name)
        self.assertEqual(has_unignored_errors, False)

        ori_compare_path = os.path.join(self.test_dir, "data",
                                        "fake_samples_ENIGMA.json")
        compare_path = os.path.join(self.test_dir, "data",
                                    "updated_fake_samples_ENIGMA.json")
        shutil.copy2(ori_compare_path, compare_path)

        self._verify_samples(sample_set, compare_path)

        # test updating samples

        # test updating samples with same sample file
        expected_error = "No sample is produced from the input file.\nThe input sample set has identical information to the input file"
        with self.assertRaisesRegex(ValueError, expected_error):
            import_samples_from_file(
                params, self.sample_url, self.workspace_url, self.callback_url,
                self.username, self.token, mappings[str(
                    params.get('file_format')).lower()].get('groups', []),
                mappings[str(params.get('file_format')).lower()].get(
                    'date_columns',
                    []), mappings[str(params.get('file_format')).lower()].get(
                        'column_unit_regex', []), sample_set, header_row_index,
                aliases.get(params.get('file_format').lower(), {}))

        # test updating a single value
        wb = load_workbook(sample_file)
        ws = wb.active
        new_latitude = 66.6
        self.assertNotEqual(new_latitude, ws['E3'].value)
        ws['E3'].value = new_latitude  # update latitude value to 66.6 for S1
        wb.save(sample_file)

        sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file(
            params, self.sample_url, self.workspace_url, self.callback_url,
            self.username, self.token,
            mappings[str(params.get('file_format')).lower()].get('groups', []),
            mappings[str(params.get('file_format')).lower()].get(
                'date_columns',
                []), mappings[str(params.get('file_format')).lower()].get(
                    'column_unit_regex', []), sample_set, header_row_index,
            aliases.get(params.get('file_format').lower(), {}))

        samples = sample_set['samples']
        self.assertEqual(len(samples), 3)
        expected_sample_name = ['s1', 's2', 's3']
        self.assertCountEqual([sample['name'] for sample in samples],
                              expected_sample_name)
        self.assertEqual(has_unignored_errors, False)

        with open(compare_path) as f:
            data = json.load(f)

        data[0]['version'] += 1  # sample version should be bumped
        data[0]['node_tree'][0]['meta_controlled']['latitude'][
            'value'] = new_latitude

        with open(compare_path, 'w') as json_file:
            json.dump(data, json_file)

        self._verify_samples(sample_set, compare_path)

        # test adding a column
        wb = load_workbook(sample_file)
        ws = wb.active
        new_column = '?size?'  # add a user metadata column
        ws['I2'].value = new_column
        size_load = 10
        for cell in ws['I3':'I5']:
            cell[0].value = size_load
        wb.save(sample_file)

        sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file(
            params, self.sample_url, self.workspace_url, self.callback_url,
            self.username, self.token,
            mappings[str(params.get('file_format')).lower()].get('groups', []),
            mappings[str(params.get('file_format')).lower()].get(
                'date_columns',
                []), mappings[str(params.get('file_format')).lower()].get(
                    'column_unit_regex', []), sample_set, header_row_index,
            aliases.get(params.get('file_format').lower(), {}))

        samples = sample_set['samples']
        self.assertEqual(len(samples), 3)
        expected_sample_name = ['s1', 's2', 's3']
        self.assertCountEqual([sample['name'] for sample in samples],
                              expected_sample_name)
        self.assertEqual(has_unignored_errors, False)

        with open(compare_path) as f:
            data = json.load(f)

        for sample in data:
            sample['version'] += 1  # sample version should be bumped
            sample['node_tree'][0]['meta_user'][new_column] = {
                'value': size_load
            }  # a new user meta data should be added

        with open(compare_path, 'w') as json_file:
            json.dump(data, json_file)
        self._verify_samples(sample_set, compare_path)

        # test adding a new sample (row)
        wb = load_workbook(sample_file)
        ws = wb.active
        for cell in ws[5]:
            ws[cell.column_letter +
               '6'] = cell.value  # copy s3 (line 5) to the next line
        new_sample = 's4'
        ws['A6'].value = new_sample  # update the sample id for the new row
        wb.save(sample_file)

        sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file(
            params, self.sample_url, self.workspace_url, self.callback_url,
            self.username, self.token,
            mappings[str(params.get('file_format')).lower()].get('groups', []),
            mappings[str(params.get('file_format')).lower()].get(
                'date_columns',
                []), mappings[str(params.get('file_format')).lower()].get(
                    'column_unit_regex', []), sample_set, header_row_index,
            aliases.get(params.get('file_format').lower(), {}))

        samples = sample_set['samples']
        self.assertEqual(len(samples), 4)
        expected_sample_name = ['s1', 's2', 's3', new_sample]
        self.assertCountEqual([sample['name'] for sample in samples],
                              expected_sample_name)
        self.assertEqual(has_unignored_errors, False)

        with open(compare_path) as f:
            data = json.load(f)

        data.insert(0, copy.deepcopy(
            data[-1]))  # copy S3 data and update sample ID and version
        data[0]['name'] = new_sample
        data[0]['node_tree'][0]['id'] = new_sample
        data[0]['version'] = 1

        with open(compare_path, 'w') as json_file:
            json.dump(data, json_file)

        self._verify_samples(sample_set, compare_path)

        # test keep_existing_samples
        wb = load_workbook(sample_file)
        ws = wb.active
        for cell in ws[6]:
            cell.value = None  # remove s4 (line 6)
        wb.save(sample_file)

        # since we are keeping all existing samples, there should be no changes to the existing sample set
        with self.assertRaisesRegex(ValueError, expected_error):
            import_samples_from_file(
                params, self.sample_url, self.workspace_url, self.callback_url,
                self.username, self.token, mappings[str(
                    params.get('file_format')).lower()].get('groups', []),
                mappings[str(params.get('file_format')).lower()].get(
                    'date_columns',
                    []), mappings[str(params.get('file_format')).lower()].get(
                        'column_unit_regex', []), sample_set, header_row_index,
                aliases.get(params.get('file_format').lower(), {}))

        # test removing a new sample (row)
        params['keep_existing_samples'] = False

        sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file(
            params, self.sample_url, self.workspace_url, self.callback_url,
            self.username, self.token,
            mappings[str(params.get('file_format')).lower()].get('groups', []),
            mappings[str(params.get('file_format')).lower()].get(
                'date_columns',
                []), mappings[str(params.get('file_format')).lower()].get(
                    'column_unit_regex', []), sample_set, header_row_index,
            aliases.get(params.get('file_format').lower(), {}))

        updated_samples = sample_set['samples']
        self.assertEqual(len(updated_samples), 3)
        expected_sample_name = ['s1', 's2', 's3']
        self.assertCountEqual([sample['name'] for sample in updated_samples],
                              expected_sample_name)
        self.assertEqual(has_unignored_errors, False)
    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "name_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long, parameter
           "prevalidate" of Long, parameter "incl_input_in_output" of Long,
           parameter "ignore_warnings" of Long, parameter
           "keep_existing_samples" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print(f"Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        # Check if we have an existing Sample Set as input
        # if so, download
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            if params.get('set_name'):
                set_name = params.get('set_name')
            else:
                set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    "A sample set name is required when creating a new SampleSet object."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
        if params.get('header_row_index'):
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = find_header_row(params.get('sample_file'),
                                               params.get('file_format'))

        username = ctx['user_id']

        if str(params.get('file_format')).lower() not in [
                'enigma', 'sesar', 'kbase'
        ]:
            raise ValueError(
                f"Only SESAR, ENIGMA, and KBase formats are currently supported for importing samples. "
                f"File of format {params.get('file_format')} not supported.")
        mappings = {
            'enigma': ENIGMA_mappings,
            'sesar': SESAR_mappings,
            'kbase': {}
        }

        sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file(
            params, self.sample_url, self.workspace_url, self.callback_url,
            username, ctx['token'],
            mappings[str(params.get('file_format')).lower()].get('groups', []),
            mappings[str(params.get('file_format')).lower()].get(
                'date_columns',
                []), mappings[str(params.get('file_format')).lower()].get(
                    'column_unit_regex', []), sample_set, header_row_index,
            aliases.get(params.get('file_format').lower(), {}))

        file_links = []
        new_data_links = []
        sample_set_ref = None

        # create UI to display the errors clearly
        html_link = _error_ui(errors, sample_data_json, has_unignored_errors,
                              self.scratch)

        if not has_unignored_errors:
            # only save object if there are no errors
            obj_info = self.dfu.save_objects({
                'id':
                save_ws_id,
                'objects': [{
                    "name": set_name,
                    "type": "KBaseSets.SampleSet",
                    "data": sample_set
                }]
            })[0]

            sample_set_ref = '/'.join(
                [str(obj_info[6]),
                 str(obj_info[0]),
                 str(obj_info[4])])
            sample_file_name = os.path.basename(
                params['sample_file']).split('.')[0] + '_OTU'

            # create a data link between each sample and the sampleset
            ss = SampleService(self.sample_url)
            for idx, sample_info in enumerate(sample_set['samples']):
                sample_id = sample_info['id']
                version = sample_info['version']
                sample = ss.get_sample({
                    'id': sample_id,
                    'version': version,
                })
                ret = ss.create_data_link(
                    dict(
                        upa=sample_set_ref,
                        id=sample_id,
                        dataid='samples/{}'.format(idx),
                        version=version,
                        node=sample['node_tree'][0]['id'],
                        update=1,
                    ))
                new_data_links.append(ret)

            # -- Format outputs below --
            # if output file format specified, add one to output
            if params.get('output_format') in ['csv', 'xls']:
                otu_path = sample_set_to_OTU_sheet(sample_set,
                                                   sample_file_name,
                                                   self.scratch, params)
                file_links.append({
                    'path':
                    otu_path,
                    'name':
                    os.path.basename(otu_path),
                    'label':
                    "OTU template file",
                    'description':
                    "file with each column containing the assigned sample_id and sample "
                    "name of each saved sample. Intended for uploading OTU data."
                })

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"Input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path":
                sample_file_copy,
                "name":
                os.path.basename(sample_file_copy),
                "label":
                "Input Sample file",
                "description":
                "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_data = {
            'report_object_name':
            "SampleSet_import_report_" + str(uuid.uuid4()),
            'workspace_name': params['workspace_name']
        }
        if file_links:
            report_data['file_links'] = file_links
        if sample_set_ref:
            report_data[
                'message'] = f"SampleSet object named \"{set_name}\" imported."
            report_data['objects_created'] = [{'ref': sample_set_ref}]

        if html_link:
            report_data['html_links'] = [{
                'path':
                html_link,
                'name':
                'index.html',
                'description':
                'HTML Report for Sample Uploader'
            }]
            report_data['direct_html_link_index'] = 0
        report_info = report_client.create_extended_report(report_data)
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref,
            'errors': errors,
            'links': new_data_links
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]