@classmethod
def _archive_spreadsheet(cls, xlsx_file, archive_dir):
    date_string = utils.date_string_from_file_mtime(xlsx_file)
    date_directory = os.path.join(archive_dir, date_string)
    if not os.path.exists(date_directory):
        try:
            os.mkdir(date_directory)
        except OSError:
            raise Exception("Error mkdir " + date_directory)

    xlsx_basename = os.path.basename(xlsx_file)
    existing_xlsx_files = set(os.listdir(date_directory))

    # this is unlikely to happen, but if name of archived xlsx
    # file already exists, append .N on the end (where N is smallest
    # int such that file does not already exist)
    if xlsx_basename in existing_xlsx_files:
        i = 1
        while xlsx_basename + "." + str(i) in existing_xlsx_files:
            i += 1
        xlsx_basename += "." + str(i)

    new_name = os.path.join(date_directory, xlsx_basename)
    utils.rsync_and_md5(xlsx_file, new_name)
    os.unlink(xlsx_file)
    return new_name
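# utils.rsync_and_md5() is used throughout these excerpts but its body is not
# shown. Based on how it is called (copy a file, return the MD5 hex digest of
# the copy, optionally accept a precomputed MD5 as a third argument), a minimal
# sketch might look like the following. This is an illustration of the contract
# implied by the tests below, not the codebase's actual implementation; it
# assumes rsync is on PATH.
import hashlib
import subprocess


def rsync_and_md5(old_name, new_name, md5sum=None):
    """Copy old_name to new_name with rsync; return the MD5 of the copy."""
    subprocess.check_call(["rsync", old_name, new_name])
    if md5sum is None:
        md5 = hashlib.md5()
        with open(new_name, "rb") as f:
            # read in 1 MiB chunks so large read files do not fill memory
            for chunk in iter(lambda: f.read(1 << 20), b""):
                md5.update(chunk)
        md5sum = md5.hexdigest()
    return md5sum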
def _tidy_files(self):
    shutil.rmtree(os.path.join(self.cortex_outdir, "tmp_filelists"))

    # note: without recursive=True, each "**" in glob.glob() matches a single
    # directory level, like "*", so this walks exactly two levels deep
    for filename in glob.glob(
        os.path.join(self.cortex_outdir, "binaries", "uncleaned", "**", "**")
    ):
        if not (filename.endswith("log") or filename.endswith(".covg")):
            os.unlink(filename)

    for filename in glob.glob(os.path.join(self.cortex_outdir, "calls", "**")):
        if os.path.isdir(filename):
            for filename2 in os.listdir(filename):
                if not filename2.endswith("log"):
                    os.unlink(os.path.join(filename, filename2))
        elif not (
            filename.endswith("log") or filename.endswith("callsets.genotyped")
        ):
            os.unlink(filename)

    for filename in glob.glob(os.path.join(self.cortex_outdir, "vcfs", "**")):
        if filename.endswith(".vcf"):
            tmp_vcf = filename + ".tmp"
            _replace_sample_name_in_vcf(filename, tmp_vcf, self.sample_name)
            utils.rsync_and_md5(tmp_vcf, filename)
            os.unlink(tmp_vcf)

        if not (
            (filename.endswith(".vcf") and "FINAL" in filename)
            or filename.endswith("log")
            or filename.endswith("aligned_branches")
        ):
            if os.path.isdir(filename):
                shutil.rmtree(filename)
            else:
                os.unlink(filename)
def _import_reads_and_update_db(self):
    database = db.Db(self.db_ini_file)
    data = spreadsheet_helper.load_data_from_spreadsheet(self.xlsx_file)
    xlsx_dir = os.path.dirname(self.xlsx_file)
    data_errors = SpreadsheetImporter._validate_data(database, data, self.dropbox_dir)
    if len(data_errors) > 0:
        raise Exception("Error(s) importing spreadsheet:\n" + "\n".join(data_errors))

    try:
        f_out = open(self.jobs_outfile, "w")
    except OSError:
        raise Exception(
            'Error opening file "' + self.jobs_outfile + '". Cannot continue'
        )

    print(
        "seqrep_id",
        "sample_id",
        "isolate_id",
        "sequence_replicate_number",
        "reads1",
        "reads2",
        "reads1_md5",
        "reads2_md5",
        sep="\t",
        file=f_out,
    )

    for data_dict in data:
        reads1 = os.path.join(xlsx_dir, data_dict["reads_file_1"])
        reads2 = os.path.join(xlsx_dir, data_dict["reads_file_2"])
        assert os.path.exists(reads1) and os.path.exists(reads2)
        seqrep_id, isolate_id, sample_id = database.add_one_seqrep(data_dict)
        print(
            seqrep_id,
            sample_id,
            isolate_id,
            data_dict["sequence_replicate_number"],
            reads1,
            reads2,
            data_dict["reads_file_1_md5"],
            data_dict["reads_file_2_md5"],
            sep="\t",
            file=f_out,
        )

    f_out.close()
    xlsx_backup_file = SpreadsheetImporter._archive_spreadsheet(
        self.xlsx_file, self.xlsx_archive_dir
    )
    jobs_backup_file = xlsx_backup_file + ".import_jobs.tsv"
    assert not os.path.exists(jobs_backup_file)
    utils.rsync_and_md5(self.jobs_outfile, jobs_backup_file)
    database.commit_and_close()
    if self.db_backup_dir is not None:
        database.backup(self.db_backup_dir)
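# For reference, the jobs file written above is a plain tab-separated table:
# one header line, then one line per sequence replicate. With hypothetical
# IDs, paths, and MD5s (values below are illustrative only), a run might
# produce:
#
#   seqrep_id  sample_id  isolate_id  sequence_replicate_number  reads1          reads2          reads1_md5  reads2_md5
#   1          3          2           1                          /in/r_1.fq.gz   /in/r_2.fq.gz   abc123...   def456...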
def add_remove_contam_metadata_tsv(self, infile):
    utils.rsync_and_md5(infile, self.remove_contam_metadata_tsv)
    data_by_group, sequence_is_contam = contam_remover.ContamRemover._load_metadata_file(
        self.remove_contam_metadata_tsv
    )
    names_in_contam = set(sequence_is_contam.keys())
    names_in_fasta = {}
    pyfastaq.tasks.lengths_from_fai(self.ref_fai, names_in_fasta)
    names_in_fasta = set(names_in_fasta.keys())
    if names_in_fasta != names_in_contam:
        raise Error("Mismatch in names from metadata tsv and fasta files")
@classmethod
def _copy_reads_file(cls, old_name, new_name, expected_md5):
    old_name_md5 = utils.md5(old_name)
    if old_name_md5 != expected_md5:
        raise Error(
            "MD5 given by submitter "
            + expected_md5
            + " does not match calculated MD5 "
            + old_name_md5
        )
    utils.rsync_and_md5(old_name, new_name, expected_md5)
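# Hypothetical usage of the guard above (class name, paths, and MD5 are
# illustrative, not from the source). The copy is refused unless the
# submitter's MD5 matches the file on disk, so a corrupted or swapped file
# never enters the pipeline; the verified MD5 is then passed through to
# rsync_and_md5 so it need not be recomputed after the copy.
#
#   md5 = utils.md5("/incoming/reads_1.fq.gz")
#   SomeImporter._copy_reads_file(
#       "/incoming/reads_1.fq.gz", "/pipeline/reads_1.fq.gz", md5
#   )  # raises Error if the file changed since the MD5 was supplied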
def test_rsync_and_md5(self): """test rsync_and_md5""" old_name = os.path.join(data_dir, "rsync_and_md5.txt") new_name = "tmp.test_rsync_and_md5" got = utils.rsync_and_md5(old_name, new_name) expected = "a00096ee7316167c6ceef09f43433667" self.assertEqual(expected, got) self.assertTrue(filecmp.cmp(old_name, new_name, shallow=False)) os.unlink(new_name)
def setup_files(self, species, panel_name, probes_fasta, var_to_res_json):
    try:
        os.mkdir(self.root_dir)
    except OSError:
        raise Exception("Error mkdir " + self.root_dir)

    self.metadata = {
        "species": species,
        "name": panel_name,
        "is_built_in": species in built_in_panels
        and panel_name in built_in_panels[species],
    }

    with open(self.json_file, "w") as f:
        print(json.dumps(self.metadata), file=f)

    if not self.metadata["is_built_in"]:
        utils.rsync_and_md5(probes_fasta, self.probes_fasta)
        utils.rsync_and_md5(var_to_res_json, self.var_to_res_json)
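# Hypothetical call to setup_files() and the metadata it writes (species and
# panel names below are illustrative). Because "is_built_in" is False for a
# species/panel pair not listed in built_in_panels, the probes fasta and the
# variant-to-resistance json are also copied into the panel directory:
#
#   panel.setup_files("tb", "my_custom_panel", "probes.fa", "var_to_res.json")
#   # self.json_file now contains:
#   # {"species": "tb", "name": "my_custom_panel", "is_built_in": false}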
def _test_run(self, original_reads1, original_reads2, expected_import_status):
    """test run"""
    pipeline_root = "tmp.read_pair_importer.run.root"
    if os.path.exists(pipeline_root):
        shutil.rmtree(pipeline_root)
    os.mkdir(pipeline_root)
    seqrep_id = 1
    sample = 3
    isolate = 2
    sequence_replicate_number = 42

    # copy the reads because the pipeline will delete them later
    reads1 = "tmp.read_pair_importer.reads1.fq"
    reads2 = "tmp.read_pair_importer.reads2.fq"
    md5_1 = utils.rsync_and_md5(original_reads1, reads1)
    md5_2 = utils.rsync_and_md5(original_reads2, reads2)

    # write an md5 file, to check it gets deleted later
    md5_file = reads1 + ".md5"
    utils.syscall("md5sum " + reads1 + " > " + md5_file)

    importer = read_pair_importer.ReadPairImporter(
        ini_file,
        pipeline_root,
        seqrep_id,
        isolate,
        sample,
        sequence_replicate_number,
        reads1,
        reads2,
        md5_1,
        md5_2,
    )

    # no row in Seqrep table
    with self.assertRaises(read_pair_importer.Error):
        importer.run()

    seqrep_row = {
        "seqrep_id": seqrep_id,
        "isolate_id": isolate,
        "sequence_replicate_number": sequence_replicate_number,
        "original_reads_file_1_md5": md5_1,
        "original_reads_file_2_md5": md5_2,
        "remove_contam_reads_file_1_md5": None,
        "remove_contam_reads_file_2_md5": None,
        "withdrawn": 0,
        "import_status": 0,
        "instrument_model": "Illumina HiSeq 2000",
        "submission_date": "20170101",
        "submit_to_ena": 0,
        "ena_run_accession": None,
        "ena_on_hold": 0,
    }
    self.db.add_row_to_table("Seqrep", seqrep_row)
    self.db.commit()

    # need to create a new object so the database changes get picked up.
    # Separate connections do not see changes made by each other.
    importer = read_pair_importer.ReadPairImporter(
        ini_file,
        pipeline_root,
        seqrep_id,
        isolate,
        sample,
        sequence_replicate_number,
        reads1,
        reads2,
        md5_1,
        md5_2,
    )

    # reads file doesn't exist
    importer.reads_file_1 = "oops"
    with self.assertRaises(read_pair_importer.Error):
        importer.run()
    importer.reads_file_1 = reads1

    # check lock file works
    iso_dir = isolate_dir.IsolateDir(pipeline_root, sample, isolate)
    iso_dir.make_essential_dirs()
    lock_file = os.path.join(iso_dir.reads_dir, "import_lock." + str(seqrep_id))
    utils.make_empty_file(lock_file)
    with self.assertRaises(read_pair_importer.Error):
        importer.run()
    os.unlink(lock_file)

    # should run ok
    where_query = " and ".join(
        [
            "sample_id=" + str(sample),
            "isolate_number=" + str(isolate),
            "sequence_replicate_number=" + str(sequence_replicate_number),
        ]
    )
    rows = self.db.get_rows_from_table("Seqrep", where="seqrep_id=" + str(seqrep_id))
    self.assertEqual(1, len(rows))
    self.assertEqual(0, rows[0]["import_status"])
    importer = read_pair_importer.ReadPairImporter(
        ini_file,
        pipeline_root,
        seqrep_id,
        isolate,
        sample,
        sequence_replicate_number,
        reads1,
        reads2,
        md5_1,
        md5_2,
    )
    importer.run()

    # reconnect so that we pick up the changes made by the previous line
    self.db.reconnect()
    reads_out_1 = iso_dir.reads_filename("original", sequence_replicate_number, 1)
    reads_out_2 = iso_dir.reads_filename("original", sequence_replicate_number, 2)
    rows = self.db.get_rows_from_table("Seqrep", where="seqrep_id=" + str(seqrep_id))
    self.assertEqual(1, len(rows))
    self.assertEqual(expected_import_status, rows[0]["import_status"])

    # Files either copied/deleted or not, depending on whether import succeeded
    if expected_import_status == 1:
        self.assertTrue(os.path.exists(reads_out_1))
        self.assertTrue(os.path.exists(reads_out_2))
        self.assertFalse(os.path.exists(reads1))
        self.assertFalse(os.path.exists(reads2))
        self.assertFalse(os.path.exists(md5_file))
    else:
        self.assertFalse(os.path.exists(reads_out_1))
        self.assertFalse(os.path.exists(reads_out_2))
        self.assertTrue(os.path.exists(reads1))
        self.assertTrue(os.path.exists(reads2))
        self.assertTrue(os.path.exists(md5_file))
        os.unlink(reads1)
        os.unlink(reads2)
        os.unlink(md5_file)

    shutil.rmtree(pipeline_root)