def test_load_data_from_spreadsheet_bad_input(self):
        """Test load_data_from_spreadsheet with bad input files.

        Every malformed spreadsheet is checked inside its own subTest, so a
        failure names the offending file and the remaining files still run.
        """
        filenames = [
            "load_data_from_spreadsheet_bad_column_names.xlsx",
            "load_data_from_spreadsheet_wrong_field_number.xlsx",
        ]

        for filename in filenames:
            # Build the path outside assertRaises so that only the loader
            # call itself can satisfy the expected-exception check.
            xlsx = os.path.join(data_dir, filename)
            with self.subTest(filename=filename):
                with self.assertRaises(Exception):
                    spreadsheet_helper.load_data_from_spreadsheet(xlsx)
Example #2
0
    def test_load_data_from_spreadsheet_bad_input(self):
        '''Test load_data_from_spreadsheet with bad input files.

        Every malformed spreadsheet is checked inside its own subTest, so a
        failure names the offending file and the remaining files still run.
        '''
        filenames = [
            'load_data_from_spreadsheet_bad_column_names.xlsx',
            'load_data_from_spreadsheet_wrong_field_number.xlsx',
        ]

        for filename in filenames:
            # Build the path outside assertRaises so that only the loader
            # call itself can raise the expected spreadsheet_helper.Error.
            xlsx = os.path.join(data_dir, filename)
            with self.subTest(filename=filename):
                with self.assertRaises(spreadsheet_helper.Error):
                    spreadsheet_helper.load_data_from_spreadsheet(xlsx)
    def run(self):
        """Run every spreadsheet check and write the results to self.outfile.

        Rows are loaded from self.spreasheet_xlsx and passed through each
        SpreadsheetValidator check in a fixed order. Everything the checks
        return is collected into one list and printed, one entry per
        print call, to self.outfile.
        """
        rows = spreadsheet_helper.load_data_from_spreadsheet(
            self.spreasheet_xlsx)
        checks = SpreadsheetValidator

        problems = checks._check_no_blank_values(rows)
        problems.extend(checks._check_uniqueness_of_values(rows))
        problems.extend(
            checks._check_global_file_and_md5_column_intersection(rows))
        problems.extend(
            checks._check_files_exist_and_md5(
                rows, self.data_root_dir, self.md5_threads))

        # Column name -> bounds handed to _check_integers, in check order.
        integer_columns = (
            ("isolate_number", {"min_value": 1}),
            ("sequence_replicate_number", {"min_value": 1}),
            ("submit_to_ena", {"min_value": 0, "max_value": 1}),
            ("ena_on_hold", {"min_value": 0, "max_value": 1}),
        )
        for column, bounds in integer_columns:
            problems.extend(checks._check_integers(rows, column, **bounds))

        problems.extend(checks._check_instrument_model(rows))

        with open(self.outfile, "w") as out_handle:
            for message in problems:
                print(message, file=out_handle)
    def _import_reads_and_update_db(self):
        """Import the spreadsheet rows into the database and write a jobs TSV.

        Steps, in order:
          1. Load self.xlsx_file and validate it with
             SpreadsheetImporter._validate_data.
          2. Add each row to the database with add_one_seqrep and write one
             tab-separated line per row (after a header line) to
             self.jobs_outfile.
          3. Call SpreadsheetImporter._archive_spreadsheet, pass the jobs
             file to utils.rsync_and_md5 with the destination
             "<archived spreadsheet>.import_jobs.tsv", then commit and close
             the database.
          4. Back the database up when self.db_backup_dir is not None.

        Raises:
            Exception: if validation reports errors, or if the jobs file
                cannot be opened for writing.
            AssertionError: if a reads file does not exist, or the jobs
                backup file already exists.
        """
        database = db.Db(self.db_ini_file)
        data = spreadsheet_helper.load_data_from_spreadsheet(self.xlsx_file)
        xlsx_dir = os.path.dirname(self.xlsx_file)
        data_errors = SpreadsheetImporter._validate_data(
            database, data, self.dropbox_dir)

        if data_errors:
            raise Exception("Error(s) importing spreadsheet:\n" +
                            "\n".join(data_errors))

        # Only I/O failures are translated; KeyboardInterrupt and SystemExit
        # now propagate untouched, and the original cause stays chained.
        try:
            f_out = open(self.jobs_outfile, "w")
        except OSError as err:
            raise Exception('Error opening file "' + self.jobs_outfile +
                            '". Cannot continue') from err

        # The with-block closes the jobs file even if a row fails part-way.
        with f_out:
            print(
                "seqrep_id",
                "sample_id",
                "isolate_id",
                "sequence_replicate_number",
                "reads1",
                "reads2",
                "reads1_md5",
                "reads2_md5",
                sep="\t",
                file=f_out,
            )

            for data_dict in data:
                # Reads file names are resolved relative to the spreadsheet.
                reads1 = os.path.join(xlsx_dir, data_dict["reads_file_1"])
                reads2 = os.path.join(xlsx_dir, data_dict["reads_file_2"])
                # NOTE: asserts are skipped when Python runs with -O.
                assert os.path.exists(reads1) and os.path.exists(reads2)
                seqrep_id, isolate_id, sample_id = database.add_one_seqrep(
                    data_dict)
                print(
                    seqrep_id,
                    sample_id,
                    isolate_id,
                    data_dict["sequence_replicate_number"],
                    reads1,
                    reads2,
                    data_dict["reads_file_1_md5"],
                    data_dict["reads_file_2_md5"],
                    sep="\t",
                    file=f_out,
                )

        xlsx_backup_file = SpreadsheetImporter._archive_spreadsheet(
            self.xlsx_file, self.xlsx_archive_dir)
        jobs_backup_file = xlsx_backup_file + ".import_jobs.tsv"
        assert not os.path.exists(jobs_backup_file)
        utils.rsync_and_md5(self.jobs_outfile, jobs_backup_file)
        database.commit_and_close()

        if self.db_backup_dir is not None:
            database.backup(self.db_backup_dir)
    def test_load_data_from_spreadsheet_tsv(self):
        """Check that the example tsv file loads into the expected row dicts."""
        # Show the complete diff if the comparison fails.
        self.maxDiff = None

        first_row = {
            "subject_id": "p1",
            "site_id": "s1",
            "lab_id": "l1",
            "isolate_number": "42",
            "sequence_replicate_number": "43",
            "submission_date": datetime.date(2017, 12, 25),
            "reads_file_1": "reads_1_1.fq",
            "reads_file_1_md5": "abcdefghijklmnopqrstuvwyx123456",
            "reads_file_2": "reads_1_2.fq",
            "reads_file_2_md5": "abcdefghijklmnopqrstuvwyx123457",
            "dataset_name": "g1",
            "instrument_model": "Illumina HiSeq 2000",
            "ena_center_name": "Center 1",
            "submit_to_ena": "0",
            "ena_on_hold": "0",
            "ena_run_accession": "ERR123456",
            "ena_sample_accession": "ERS123456",
        }
        # The second row has no md5 values and no ENA accessions.
        second_row = {
            "subject_id": "p2",
            "site_id": "s2",
            "lab_id": "l2",
            "isolate_number": "44",
            "sequence_replicate_number": "45",
            "submission_date": datetime.date(2017, 12, 26),
            "reads_file_1": "reads_2_1.fq",
            "reads_file_1_md5": None,
            "reads_file_2": "reads_2_2.fq",
            "reads_file_2_md5": None,
            "dataset_name": "g2",
            "instrument_model": "Illumina HiSeq 2000",
            "ena_center_name": "Center 1",
            "submit_to_ena": "1",
            "ena_on_hold": "1",
            "ena_run_accession": None,
            "ena_sample_accession": None,
        }
        expected_rows = [first_row, second_row]

        tsv_path = os.path.join(data_dir, "load_data_from_spreadsheet.tsv")
        loaded_rows = spreadsheet_helper.load_data_from_spreadsheet(tsv_path)
        self.assertEqual(expected_rows, loaded_rows)
Example #6
0
    def test_load_data_from_spreadsheet_tsv(self):
        '''Loading the example tsv file gives the expected list of dicts'''
        tsv_file = os.path.join(data_dir, 'load_data_from_spreadsheet.tsv')

        wanted = [
            dict(
                subject_id='p1',
                site_id='s1',
                lab_id='l1',
                isolate_number='42',
                sequence_replicate_number='43',
                submission_date=datetime.date(2017, 12, 25),
                reads_file_1='reads_1_1.fq',
                reads_file_1_md5='abcdefghijklmnopqrstuvwyx123456',
                reads_file_2='reads_1_2.fq',
                reads_file_2_md5='abcdefghijklmnopqrstuvwyx123457',
                dataset_name='g1',
                instrument_model='Illumina HiSeq 2000',
                ena_center_name='Center 1',
                submit_to_ena='0',
                ena_on_hold='0',
                ena_run_accession='ERR123456',
                ena_sample_accession='ERS123456',
            ),
            # Second row: md5 values and ENA accessions are absent (None).
            dict(
                subject_id='p2',
                site_id='s2',
                lab_id='l2',
                isolate_number='44',
                sequence_replicate_number='45',
                submission_date=datetime.date(2017, 12, 26),
                reads_file_1='reads_2_1.fq',
                reads_file_1_md5=None,
                reads_file_2='reads_2_2.fq',
                reads_file_2_md5=None,
                dataset_name='g2',
                instrument_model='Illumina HiSeq 2000',
                ena_center_name='Center 1',
                submit_to_ena='1',
                ena_on_hold='1',
                ena_run_accession=None,
                ena_sample_accession=None,
            ),
        ]

        # Do not truncate the diff when the comparison fails.
        self.maxDiff = None
        loaded = spreadsheet_helper.load_data_from_spreadsheet(tsv_file)
        self.assertEqual(wanted, loaded)