def test_load_data_from_spreadsheet_bad_input(self):
    """Test load_data_from_spreadsheet with bad input files.

    Each listed spreadsheet is malformed (bad column names, or rows with
    the wrong number of fields) and loading it must raise.
    """
    filenames = [
        "load_data_from_spreadsheet_bad_column_names.xlsx",
        "load_data_from_spreadsheet_wrong_field_number.xlsx",
    ]
    for filename in filenames:
        # BUG FIX: build the path *outside* the assertRaises block, so an
        # unrelated error while constructing the path cannot satisfy the
        # assertion and hide a regression in load_data_from_spreadsheet.
        xlsx = os.path.join(data_dir, filename)
        with self.assertRaises(Exception):
            spreadsheet_helper.load_data_from_spreadsheet(xlsx)
def test_load_data_from_spreadsheet_bad_input(self):
    '''Test load_data_from_spreadsheet with bad input files'''
    bad_files = (
        'load_data_from_spreadsheet_bad_column_names.xlsx',
        'load_data_from_spreadsheet_wrong_field_number.xlsx',
    )
    # Every malformed spreadsheet must make the loader raise the
    # module's own Error type.
    for bad_file in bad_files:
        with self.assertRaises(spreadsheet_helper.Error):
            xlsx = os.path.join(data_dir, bad_file)
            spreadsheet_helper.load_data_from_spreadsheet(xlsx)
def run(self):
    """Run every validation check over the spreadsheet and write all
    error lines (one per line) to self.outfile."""
    # NOTE(review): 'spreasheet_xlsx' looks like a typo for
    # 'spreadsheet_xlsx', but the attribute is presumably set with this
    # exact name elsewhere (e.g. __init__) — left unchanged.
    all_data = spreadsheet_helper.load_data_from_spreadsheet(
        self.spreasheet_xlsx)

    errors = []
    errors += SpreadsheetValidator._check_no_blank_values(all_data)
    errors += SpreadsheetValidator._check_uniqueness_of_values(all_data)
    errors += SpreadsheetValidator.\
        _check_global_file_and_md5_column_intersection(all_data)
    errors += SpreadsheetValidator._check_files_exist_and_md5(
        all_data, self.data_root_dir, self.md5_threads)
    # Integer columns: counts must be >= 1, flags must be 0 or 1.
    errors += SpreadsheetValidator._check_integers(
        all_data, "isolate_number", min_value=1)
    errors += SpreadsheetValidator._check_integers(
        all_data, "sequence_replicate_number", min_value=1)
    errors += SpreadsheetValidator._check_integers(
        all_data, "submit_to_ena", min_value=0, max_value=1)
    errors += SpreadsheetValidator._check_integers(
        all_data, "ena_on_hold", min_value=0, max_value=1)
    errors += SpreadsheetValidator._check_instrument_model(all_data)

    with open(self.outfile, "w") as f:
        for line in errors:
            print(line, file=f)
def _import_reads_and_update_db(self):
    """Import the reads listed in the spreadsheet and update the database.

    Loads the spreadsheet, validates it against the database (raising
    Exception listing every problem if validation fails), writes one
    tab-separated jobs line per sequence replicate to self.jobs_outfile,
    archives the spreadsheet plus a copy of the jobs file, then commits
    the database changes and optionally backs the database up.

    Raises:
        Exception: if validation fails, or self.jobs_outfile cannot be
            opened for writing.
    """
    database = db.Db(self.db_ini_file)
    data = spreadsheet_helper.load_data_from_spreadsheet(self.xlsx_file)
    # Reads paths in the spreadsheet are relative to the spreadsheet itself.
    xlsx_dir = os.path.dirname(self.xlsx_file)
    data_errors = SpreadsheetImporter._validate_data(
        database, data, self.dropbox_dir)
    if len(data_errors) > 0:
        raise Exception(
            "Error(s) importing spreadsheet:\n" + "\n".join(data_errors))

    # BUG FIX: was a bare 'except:' (which also swallows KeyboardInterrupt/
    # SystemExit) — catch only OSError from open(). The handle is then
    # managed by a 'with' block so it is closed even if a later write
    # raises (previously it leaked on any error before f_out.close()).
    try:
        f_out = open(self.jobs_outfile, "w")
    except OSError:
        raise Exception(
            'Error opening file "' + self.jobs_outfile
            + '". Cannot continue')

    with f_out:
        print(
            "seqrep_id",
            "sample_id",
            "isolate_id",
            "sequence_replicate_number",
            "reads1",
            "reads2",
            "reads1_md5",
            "reads2_md5",
            sep="\t",
            file=f_out,
        )
        for data_dict in data:
            reads1 = os.path.join(xlsx_dir, data_dict["reads_file_1"])
            reads2 = os.path.join(xlsx_dir, data_dict["reads_file_2"])
            # NOTE(review): 'assert' is stripped under -O; kept as-is so
            # the raised type (AssertionError) does not change for callers.
            assert os.path.exists(reads1) and os.path.exists(reads2)
            seqrep_id, isolate_id, sample_id = database.add_one_seqrep(
                data_dict)
            print(
                seqrep_id,
                sample_id,
                isolate_id,
                data_dict["sequence_replicate_number"],
                reads1,
                reads2,
                data_dict["reads_file_1_md5"],
                data_dict["reads_file_2_md5"],
                sep="\t",
                file=f_out,
            )

    xlsx_backup_file = SpreadsheetImporter._archive_spreadsheet(
        self.xlsx_file, self.xlsx_archive_dir)
    jobs_backup_file = xlsx_backup_file + ".import_jobs.tsv"
    assert not os.path.exists(jobs_backup_file)
    utils.rsync_and_md5(self.jobs_outfile, jobs_backup_file)
    database.commit_and_close()
    if self.db_backup_dir is not None:
        database.backup(self.db_backup_dir)
def test_load_data_from_spreadsheet_tsv(self):
    """test load_data_from_spreadsheet tsv file"""
    self.maxDiff = None
    infile = os.path.join(data_dir, "load_data_from_spreadsheet.tsv")
    # One expected dict per data row of the TSV; empty optional cells
    # load as None, everything else as strings except submission_date.
    row1 = {
        "subject_id": "p1",
        "site_id": "s1",
        "lab_id": "l1",
        "isolate_number": "42",
        "sequence_replicate_number": "43",
        "submission_date": datetime.date(2017, 12, 25),
        "reads_file_1": "reads_1_1.fq",
        "reads_file_1_md5": "abcdefghijklmnopqrstuvwyx123456",
        "reads_file_2": "reads_1_2.fq",
        "reads_file_2_md5": "abcdefghijklmnopqrstuvwyx123457",
        "dataset_name": "g1",
        "instrument_model": "Illumina HiSeq 2000",
        "ena_center_name": "Center 1",
        "submit_to_ena": "0",
        "ena_on_hold": "0",
        "ena_run_accession": "ERR123456",
        "ena_sample_accession": "ERS123456",
    }
    row2 = {
        "subject_id": "p2",
        "site_id": "s2",
        "lab_id": "l2",
        "isolate_number": "44",
        "sequence_replicate_number": "45",
        "submission_date": datetime.date(2017, 12, 26),
        "reads_file_1": "reads_2_1.fq",
        "reads_file_1_md5": None,
        "reads_file_2": "reads_2_2.fq",
        "reads_file_2_md5": None,
        "dataset_name": "g2",
        "instrument_model": "Illumina HiSeq 2000",
        "ena_center_name": "Center 1",
        "submit_to_ena": "1",
        "ena_on_hold": "1",
        "ena_run_accession": None,
        "ena_sample_accession": None,
    }
    got = spreadsheet_helper.load_data_from_spreadsheet(infile)
    self.assertEqual([row1, row2], got)
def test_load_data_from_spreadsheet_tsv(self):
    '''test load_data_from_spreadsheet tsv file'''
    # Fields identical across both rows, merged into each expected dict.
    shared = {
        'instrument_model': 'Illumina HiSeq 2000',
        'ena_center_name': 'Center 1',
    }
    expected = [
        {
            **shared,
            'subject_id': 'p1',
            'site_id': 's1',
            'lab_id': 'l1',
            'isolate_number': '42',
            'sequence_replicate_number': '43',
            'submission_date': datetime.date(2017, 12, 25),
            'reads_file_1': 'reads_1_1.fq',
            'reads_file_1_md5': 'abcdefghijklmnopqrstuvwyx123456',
            'reads_file_2': 'reads_1_2.fq',
            'reads_file_2_md5': 'abcdefghijklmnopqrstuvwyx123457',
            'dataset_name': 'g1',
            'submit_to_ena': '0',
            'ena_on_hold': '0',
            'ena_run_accession': 'ERR123456',
            'ena_sample_accession': 'ERS123456',
        },
        {
            **shared,
            'subject_id': 'p2',
            'site_id': 's2',
            'lab_id': 'l2',
            'isolate_number': '44',
            'sequence_replicate_number': '45',
            'submission_date': datetime.date(2017, 12, 26),
            'reads_file_1': 'reads_2_1.fq',
            'reads_file_1_md5': None,
            'reads_file_2': 'reads_2_2.fq',
            'reads_file_2_md5': None,
            'dataset_name': 'g2',
            'submit_to_ena': '1',
            'ena_on_hold': '1',
            'ena_run_accession': None,
            'ena_sample_accession': None,
        },
    ]
    filename = os.path.join(data_dir, 'load_data_from_spreadsheet.tsv')
    got = spreadsheet_helper.load_data_from_spreadsheet(filename)
    self.maxDiff = None
    self.assertEqual(expected, got)