def test_run(self):
    """test run

    Checks that run() raises while the importer's lock file exists, and
    that it runs without raising once the lock file has been removed.
    """
    archive_dir = "tmp.test_spreadsheet_importer_run.archive"
    tsv_file = "tmp.test_spreadsheet_importer_run.out.tsv"
    # need to copy the dropbox directory, because run() will move the
    # xlsx file
    original_dropbox_dir = os.path.join(data_dir, "run.dropbox")
    dropbox_dir = "tmp.test_spreadsheet_importer_run.out.dropbox"

    try:
        os.mkdir(archive_dir)
        shutil.copytree(original_dropbox_dir, dropbox_dir)
        xlsx_file = os.path.join(dropbox_dir, "import.xlsx")
        importer = spreadsheet_importer.SpreadsheetImporter(
            dropbox_dir, xlsx_file, db_ini_file, archive_dir, tsv_file
        )

        # test lock file stops it running
        utils.make_empty_file(importer.lock_file)
        with self.assertRaises(Exception):
            importer.run()
        os.unlink(importer.lock_file)

        # we'll just run it - the details are checked in
        # test_import_reads_and_update_db
        importer.run()

        # The output tsv file must exist after a successful run; make that
        # an explicit check rather than relying on cleanup to fail.
        self.assertTrue(os.path.exists(tsv_file))
    finally:
        # Always remove the scratch files, even when the test fails, so
        # leftovers cannot make os.mkdir/copytree fail on the next run.
        for tmp_dir in (archive_dir, dropbox_dir):
            shutil.rmtree(tmp_dir, ignore_errors=True)
        if os.path.exists(tsv_file):
            os.unlink(tsv_file)
def run(options):
    """Create a SpreadsheetImporter from the given options and run it.

    Reads dropbox_dir, xlsx_file, db_config_file, xls_archive_dir,
    jobs_outfile and db_backup_dir from ``options``.
    """
    importer_class = spreadsheet_importer.SpreadsheetImporter
    positional_args = (
        options.dropbox_dir,
        options.xlsx_file,
        options.db_config_file,
        options.xls_archive_dir,
        options.jobs_outfile,
    )
    backup_dir = options.db_backup_dir
    importer_class(*positional_args, db_backup_dir=backup_dir).run()
def test_import_reads_and_update_db(self):
    """test _import_reads_and_update_db

    Runs the import on a scratch copy of the test dropbox directory and
    checks that the xlsx file and import jobs file were archived, the
    database was backed up, the tsv file has the expected lines, and the
    Sample and Seqrep tables contain the expected rows.
    """
    # Show full diffs for every comparison below, including the tsv lines.
    self.maxDiff = None

    archive_dir = "tmp.test_spreadsheet_importer_run.archive"
    db_backup_dir = "tmp.test_spreadsheet_importer_run.db_backup"
    tsv_file = "tmp.test_spreadsheet_importer_run.out.tsv"
    # need to copy the dropbox directory, because run() will move the
    # xlsx file
    original_dropbox_dir = os.path.join(data_dir, "run.dropbox")
    dropbox_dir = "tmp.test_spreadsheet_importer_run.out.dropbox"

    try:
        os.mkdir(archive_dir)
        os.mkdir(db_backup_dir)
        shutil.copytree(original_dropbox_dir, dropbox_dir)
        xlsx_file = os.path.join(dropbox_dir, "import.xlsx")
        # Must be read before the import, which moves the xlsx file.
        date = utils.date_string_from_file_mtime(xlsx_file)
        importer = spreadsheet_importer.SpreadsheetImporter(
            dropbox_dir,
            xlsx_file,
            db_ini_file,
            archive_dir,
            tsv_file,
            db_backup_dir=db_backup_dir,
        )
        importer._import_reads_and_update_db()

        # check xlsx file and import jobs file got archived
        xlsx_archive_file = os.path.join(archive_dir, date, "import.xlsx")
        self.assertTrue(os.path.exists(xlsx_archive_file))
        self.assertTrue(os.path.exists(xlsx_archive_file + ".import_jobs.tsv"))
        self.assertFalse(os.path.exists(xlsx_file))

        # check database got backed up
        backup_files = os.listdir(db_backup_dir)
        self.assertEqual(1, len(backup_files))

        # check tsv file is correct
        reads_prefix = os.path.abspath(os.path.join(dropbox_dir, "reads"))
        expected_tsv_lines = [
            "\t".join(
                [
                    "seqrep_id",
                    "sample_id",
                    "isolate_id",
                    "sequence_replicate_number",
                    "reads1",
                    "reads2",
                    "reads1_md5",
                    "reads2_md5",
                ]
            ),
            "\t".join(
                [
                    "1",
                    "1",
                    "1",
                    "43",
                    reads_prefix + ".1_1.fq",
                    reads_prefix + ".1_2.fq",
                    "abcdefghijklmnopqrstuvwyx123456",
                    "abcdefghijklmnopqrstuvwyx123457",
                ]
            ),
            "\t".join(
                [
                    "2",
                    "2",
                    "2",
                    "45",
                    reads_prefix + ".2_1.fq",
                    reads_prefix + ".2_2.fq",
                    "a73817805eb1d44ca88eb5cb794c7de7",
                    "d468360c689d482b227256d887a05996",
                ]
            ),
        ]
        with open(tsv_file) as f:
            got_tsv_lines = [line.rstrip() for line in f]
        self.assertEqual(expected_tsv_lines, got_tsv_lines)

        # check database is correct
        got_sample = self.db.get_rows_from_table("Sample", order_by="sample_id")
        expected_sample = [
            {
                "sample_id": 1,
                "subject_id": "p1",
                "site_id": "s1",
                "sample_id_from_lab": "l1",
                "dataset_name": "g1",
                "ena_center_name": "Center 1",
                "ena_sample_accession": "ERS123456",
                "ena_study_accession": None,
            },
            {
                "sample_id": 2,
                "subject_id": "p2",
                "site_id": "s2",
                "sample_id_from_lab": "l2",
                "dataset_name": "g2",
                "ena_center_name": "Center 1",
                "ena_sample_accession": None,
                "ena_study_accession": None,
            },
        ]
        self.assertEqual(expected_sample, got_sample)

        got_seqrep = self.db.get_rows_from_table("Seqrep", order_by="seqrep_id")
        expected_seqrep = [
            {
                "seqrep_id": 1,
                "isolate_id": 1,
                "sequence_replicate_number": 43,
                "original_reads_file_1_md5": "abcdefghijklmnopqrstuvwyx123456",
                "original_reads_file_2_md5": "abcdefghijklmnopqrstuvwyx123457",
                "remove_contam_reads_file_1_md5": None,
                "remove_contam_reads_file_2_md5": None,
                "withdrawn": 0,
                "import_status": 0,
                "submission_date": datetime.date(2017, 12, 25),
                "instrument_model": "Illumina HiSeq 2000",
                "submit_to_ena": 0,
                "ena_run_accession": "ERR123456",
                "ena_on_hold": 0,
            },
            {
                "seqrep_id": 2,
                "isolate_id": 2,
                "sequence_replicate_number": 45,
                "original_reads_file_1_md5": "a73817805eb1d44ca88eb5cb794c7de7",
                "original_reads_file_2_md5": "d468360c689d482b227256d887a05996",
                "remove_contam_reads_file_1_md5": None,
                "remove_contam_reads_file_2_md5": None,
                "withdrawn": 0,
                "import_status": 0,
                "submission_date": datetime.date(2017, 12, 26),
                "instrument_model": "Illumina HiSeq 2000",
                "submit_to_ena": 1,
                "ena_run_accession": None,
                "ena_on_hold": 1,
            },
        ]
        self.assertEqual(expected_seqrep, got_seqrep)
    finally:
        # Always remove the scratch files, even when an assertion fails, so
        # leftovers cannot make os.mkdir/copytree fail on the next run.
        for tmp_dir in (archive_dir, db_backup_dir, dropbox_dir):
            shutil.rmtree(tmp_dir, ignore_errors=True)
        if os.path.exists(tsv_file):
            os.unlink(tsv_file)
def test_import_reads_and_update_db(self):
    '''test _import_reads_and_update_db

    Runs the import on a scratch copy of the test dropbox directory and
    checks that the xlsx file and import jobs file were archived, the
    database was backed up, the tsv file has the expected lines, and the
    Sample and Seqrep tables contain the expected rows.
    '''
    # Show full diffs for every comparison below, including the tsv lines.
    self.maxDiff = None

    archive_dir = 'tmp.test_spreadsheet_importer_run.archive'
    db_backup_dir = 'tmp.test_spreadsheet_importer_run.db_backup'
    tsv_file = 'tmp.test_spreadsheet_importer_run.out.tsv'
    # need to copy the dropbox directory, because run() will move the
    # xlsx file
    original_dropbox_dir = os.path.join(data_dir, 'run.dropbox')
    dropbox_dir = 'tmp.test_spreadsheet_importer_run.out.dropbox'

    try:
        os.mkdir(archive_dir)
        os.mkdir(db_backup_dir)
        shutil.copytree(original_dropbox_dir, dropbox_dir)
        xlsx_file = os.path.join(dropbox_dir, 'import.xlsx')
        # Must be read before the import, which moves the xlsx file.
        date = utils.date_string_from_file_mtime(xlsx_file)
        importer = spreadsheet_importer.SpreadsheetImporter(
            dropbox_dir,
            xlsx_file,
            db_ini_file,
            archive_dir,
            tsv_file,
            db_backup_dir=db_backup_dir)
        importer._import_reads_and_update_db()

        # check xlsx file and import jobs file got archived
        xlsx_archive_file = os.path.join(archive_dir, date, 'import.xlsx')
        self.assertTrue(os.path.exists(xlsx_archive_file))
        self.assertTrue(os.path.exists(xlsx_archive_file + '.import_jobs.tsv'))
        self.assertFalse(os.path.exists(xlsx_file))

        # check database got backed up
        backup_files = os.listdir(db_backup_dir)
        self.assertEqual(1, len(backup_files))

        # check tsv file is correct
        reads_prefix = os.path.abspath(os.path.join(dropbox_dir, 'reads'))
        expected_tsv_lines = [
            '\t'.join([
                'seqrep_id', 'sample_id', 'isolate_id',
                'sequence_replicate_number', 'reads1', 'reads2',
                'reads1_md5', 'reads2_md5'
            ]),
            '\t'.join([
                '1', '1', '1', '43',
                reads_prefix + '.1_1.fq',
                reads_prefix + '.1_2.fq',
                'abcdefghijklmnopqrstuvwyx123456',
                'abcdefghijklmnopqrstuvwyx123457'
            ]),
            '\t'.join([
                '2', '2', '2', '45',
                reads_prefix + '.2_1.fq',
                reads_prefix + '.2_2.fq',
                'a73817805eb1d44ca88eb5cb794c7de7',
                'd468360c689d482b227256d887a05996'
            ]),
        ]
        with open(tsv_file) as f:
            got_tsv_lines = [line.rstrip() for line in f]
        self.assertEqual(expected_tsv_lines, got_tsv_lines)

        # check database is correct
        got_sample = self.db.get_rows_from_table('Sample', order_by='sample_id')
        expected_sample = [{
            'sample_id': 1,
            'subject_id': 'p1',
            'site_id': 's1',
            'sample_id_from_lab': 'l1',
            'dataset_name': 'g1',
            'ena_center_name': 'Center 1',
            'ena_sample_accession': 'ERS123456',
            'ena_study_accession': None,
        }, {
            'sample_id': 2,
            'subject_id': 'p2',
            'site_id': 's2',
            'sample_id_from_lab': 'l2',
            'dataset_name': 'g2',
            'ena_center_name': 'Center 1',
            'ena_sample_accession': None,
            'ena_study_accession': None,
        }]
        self.assertEqual(expected_sample, got_sample)

        got_seqrep = self.db.get_rows_from_table('Seqrep', order_by='seqrep_id')
        expected_seqrep = [{
            'seqrep_id': 1,
            'isolate_id': 1,
            'sequence_replicate_number': 43,
            'original_reads_file_1_md5': 'abcdefghijklmnopqrstuvwyx123456',
            'original_reads_file_2_md5': 'abcdefghijklmnopqrstuvwyx123457',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'withdrawn': 0,
            'import_status': 0,
            'submission_date': datetime.date(2017, 12, 25),
            'instrument_model': 'Illumina HiSeq 2000',
            'submit_to_ena': 0,
            'ena_run_accession': 'ERR123456',
            'ena_on_hold': 0,
        }, {
            'seqrep_id': 2,
            'isolate_id': 2,
            'sequence_replicate_number': 45,
            'original_reads_file_1_md5': 'a73817805eb1d44ca88eb5cb794c7de7',
            'original_reads_file_2_md5': 'd468360c689d482b227256d887a05996',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'withdrawn': 0,
            'import_status': 0,
            'submission_date': datetime.date(2017, 12, 26),
            'instrument_model': 'Illumina HiSeq 2000',
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 1,
        }]
        self.assertEqual(expected_seqrep, got_seqrep)
    finally:
        # Always remove the scratch files, even when an assertion fails, so
        # leftovers cannot make os.mkdir/copytree fail on the next run.
        for tmp_dir in (archive_dir, db_backup_dir, dropbox_dir):
            shutil.rmtree(tmp_dir, ignore_errors=True)
        if os.path.exists(tsv_file):
            os.unlink(tsv_file)