def test_reads_filename(self):
    """test reads_filename"""
    root = os.path.abspath(os.path.join("spam", "shrubbery"))
    sample_id = 123456
    isolate_id = 42
    iso_dir = isolate_dir.IsolateDir(root, sample_id, isolate_id)
    self.assertEqual(
        os.path.join(iso_dir.isolate_dir, "Reads", "reads.original.11.1.fq.gz"),
        iso_dir.reads_filename("original", 11, 1),
    )
    self.assertEqual(
        os.path.join(iso_dir.isolate_dir, "Reads", "reads.remove_contam.12.2.fq.gz"),
        iso_dir.reads_filename("remove_contam", 12, 2),
    )
    self.assertEqual(
        os.path.join(iso_dir.isolate_dir, "Reads", "reads.contam.11.1.fq.gz"),
        iso_dir.reads_filename("contam", 11, 1),
    )
    with self.assertRaises(Exception):
        iso_dir.reads_filename("original", 11, 42)
    with self.assertRaises(Exception):
        iso_dir.reads_filename("oops_wrong_type", 11, 1)
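# reads_filename is pinned down by the assertions above: the read type must be
# one of "original", "remove_contam" or "contam", and the pair index must be 1
# or 2, otherwise the call raises. A minimal sketch consistent with this test
# (hypothetical, not the real IsolateDir implementation):
import os

def _sketch_reads_filename(reads_dir, read_type, sequence_replicate, pair):
    if read_type not in {"original", "remove_contam", "contam"} or pair not in {1, 2}:
        raise Exception("Bad read_type or pair index")
    return os.path.join(
        reads_dir, "reads.{}.{}.{}.fq.gz".format(read_type, sequence_replicate, pair))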
def _submit_sample_objects(self, data_in):
    submitted_samples = {}  # sample id -> ena accession
    for row in data_in:
        if row["ena_sample_accession"] is not None:
            continue
        elif row["sample_id"] in submitted_samples:
            row["ena_sample_accession"] = submitted_samples[row["sample_id"]]
        else:
            assert row["ena_sample_accession"] is None
            iso_dir = isolate_dir.IsolateDir(
                self.pipeline_root, row["sample_id"], row["isolate_id"]
            )
            object_xml = iso_dir.xml_submission_file("sample")
            object_alias = "sample." + str(row["sample_id"])
            submit_alias = "submit." + object_alias
            center_name = DatasetSubmitter._ena_center_name_from_db_data(
                data_in, number_to_name_dict=self.centre_number_to_name
            )
            title = (
                row["subject_id"] + ". " + center_name + ". " + row["sample_id_from_lab"]
            )
            obj_creator = object_creator.ObjectCreator(
                self.ini_file,
                "sample",
                object_xml,
                object_alias,
                submit_alias,
                center_name,
                title,
                taxon_id=self.taxon_id,
                use_test_server=self.use_test_server,
                unit_test=self.unit_test,
                broker_name=self.broker_name,
            )
            obj_creator.run()
            if obj_creator.submission_receipt.successful:
                try:
                    sample_accession = obj_creator.submission_receipt.accessions[
                        "SAMPLE"
                    ]
                except:
                    sample_accession = "FAIL"
            else:
                sample_accession = "FAIL"
            row["ena_sample_accession"] = sample_accession
            self.db.update_row(
                "Sample",
                {"sample_id": row["sample_id"]},
                {"ena_sample_accession": sample_accession},
            )
            self.db.commit()
            submitted_samples[row["sample_id"]] = sample_accession
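# _submit_sample_objects above and _submit_experiment_objects below share one
# shape: skip rows that already have an accession, reuse an accession obtained
# earlier in this run, and otherwise submit and record either the accession or
# the sentinel "FAIL" in both the row and the database. A stripped-down sketch
# of that pattern (submit_fn is a stand-in for the ObjectCreator machinery):
def _submit_once_per_key(rows, key, accession_field, submit_fn):
    seen = {}  # key value -> accession obtained during this run
    for row in rows:
        if row[accession_field] is not None:
            continue  # already submitted in an earlier run
        if row[key] not in seen:
            # submit_fn returns an accession string, or None on failure
            seen[row[key]] = submit_fn(row) or "FAIL"
        row[accession_field] = seen[row[key]]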
def _submit_experiment_objects(self, data_in):
    submitted_isolates = {}  # isolate id -> ena accession
    for row in data_in:
        if (row['ena_experiment_accession'] is not None
                or row['ena_sample_accession'] == 'FAIL'):
            continue
        elif row['isolate_id'] in submitted_isolates:
            row['ena_experiment_accession'] = submitted_isolates[row['isolate_id']]
        else:
            assert row['ena_experiment_accession'] is None
            iso_dir = isolate_dir.IsolateDir(
                self.pipeline_root, row['sample_id'], row['isolate_id'])
            object_xml = iso_dir.xml_submission_file('experiment')
            object_alias = 'experiment.' + str(row['isolate_id'])
            submit_alias = 'submit.' + object_alias
            center_name = DatasetSubmitter._ena_center_name_from_db_data(
                data_in, number_to_name_dict=self.centre_number_to_name)
            title = (row['subject_id'] + '. ' + center_name + '. '
                     + row['sample_id_from_lab'] + '. '
                     + row['isolate_number_from_lab'])
            library_name = title
            obj_creator = object_creator.ObjectCreator(
                self.ini_file,
                'experiment',
                object_xml,
                object_alias,
                submit_alias,
                center_name,
                title,
                study_accession=row['ena_study_accession'],
                sample_accession=row['ena_sample_accession'],
                library_name=library_name,
                platform='ILLUMINA',
                instrument=row['instrument_model'],
                use_test_server=self.use_test_server,
                unit_test=self.unit_test,
                broker_name=self.broker_name,
            )
            obj_creator.run()
            if obj_creator.submission_receipt.successful:
                try:
                    experiment_accession = obj_creator.submission_receipt.accessions['EXPERIMENT']
                except:
                    experiment_accession = 'FAIL'
            else:
                experiment_accession = 'FAIL'
            row['ena_experiment_accession'] = experiment_accession
            self.db.update_row(
                'Isolate',
                {'isolate_id': row['isolate_id']},
                {'ena_experiment_accession': experiment_accession})
            self.db.commit()
            submitted_isolates[row['isolate_id']] = experiment_accession
def run(self):
    ReadPairImporter._check_database(
        self.db, self.seqrep_id, self.isolate_id, self.sequence_replicate_number)
    for filename in self.reads_file_1, self.reads_file_2:
        if not os.path.exists(filename):
            raise Error('Error! Reads file ' + filename + ' not found. Cannot continue.')
    iso_dir = isolate_dir.IsolateDir(
        self.pipeline_root_dir, self.sample_id, self.isolate_id)
    iso_dir.make_essential_dirs()
    lock_file = os.path.join(iso_dir.reads_dir, 'import_lock.' + str(self.seqrep_id))
    if os.path.exists(lock_file):
        raise Error('Error! Lock file ' + lock_file + ' found. Cannot continue.')
    utils.make_empty_file(lock_file)
    try:
        fqtools.validate([self.reads_file_1, self.reads_file_2])
        ReadPairImporter._copy_reads_file(
            self.reads_file_1,
            iso_dir.reads_filename('original', self.sequence_replicate_number, 1),
            self.reads_file_md5_1)
        ReadPairImporter._copy_reads_file(
            self.reads_file_2,
            iso_dir.reads_filename('original', self.sequence_replicate_number, 2),
            self.reads_file_md5_2)
        ReadPairImporter._update_database(
            self.db, self.seqrep_id, self.isolate_id,
            self.sequence_replicate_number, import_status=1)
        os.unlink(self.reads_file_1)
        os.unlink(self.reads_file_2)
        for filename in (self.reads_file_1 + '.md5', self.reads_file_2 + '.md5'):
            if os.path.exists(filename):
                os.unlink(filename)
        self.db.commit_and_close()
        os.unlink(lock_file)
    except:
        # Validation or copying failed. Record the failure (import_status=-1)
        # and tidy up the lock file; the original reads files are left in place.
        ReadPairImporter._update_database(
            self.db, self.seqrep_id, self.isolate_id,
            self.sequence_replicate_number, import_status=-1)
        self.db.commit_and_close()
        if os.path.exists(lock_file):
            os.unlink(lock_file)
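# run() guards the import with a check-then-create lock file, which leaves a
# small race window between os.path.exists() and utils.make_empty_file(). A
# sketch of an atomic alternative using O_CREAT | O_EXCL; illustrative only,
# not part of ReadPairImporter:
import os

def acquire_lock(lock_file):
    try:
        # creation fails atomically if the file already exists
        fd = os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except FileExistsError:
        raise Exception('Lock file ' + lock_file + ' found. Cannot continue.')
    os.close(fd)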
def test_contamination_counts_filename(self):
    """test contamination_counts_filename"""
    root = os.path.abspath(os.path.join("knights", "ni"))
    sample_id = 42
    isolate_id = 11
    iso_dir = isolate_dir.IsolateDir(root, sample_id, isolate_id)
    expected = os.path.join(
        iso_dir.isolate_dir, "Reads", "reads.remove_contam.1.counts.tsv")
    self.assertEqual(expected, iso_dir.contamination_counts_filename(1))
    expected = os.path.join(
        iso_dir.isolate_dir, "Reads", "reads.remove_contam.2.counts.tsv")
    self.assertEqual(expected, iso_dir.contamination_counts_filename(2))
def test_pipeline_dir(self):
    '''test pipeline_dir'''
    root = os.path.abspath(os.path.join('dave', 'lister'))
    sample_id = 42
    isolate_id = 1
    iso_dir = isolate_dir.IsolateDir(root, sample_id, isolate_id)
    self.assertEqual(
        os.path.join(iso_dir.isolate_dir, 'Pipelines', '2', 'name', '1.0.0'),
        iso_dir.pipeline_dir(2, 'name', '1.0.0'))
    self.assertEqual(
        os.path.join(iso_dir.isolate_dir, 'Pipelines', '2', 'name', '1.0.1.ref.42'),
        iso_dir.pipeline_dir(2, 'name', '1.0.1', reference_id=42))
def test_make_essential_dirs(self):
    '''test make_essential_dirs'''
    tmp_root_dir = os.path.abspath('tmp.isolate_dir.make_sample_dir_path')
    if os.path.exists(tmp_root_dir):
        shutil.rmtree(tmp_root_dir)
    sample_id = 12345678
    isolate_id = 42
    iso_dir = isolate_dir.IsolateDir(tmp_root_dir, sample_id, isolate_id)
    iso_dir.make_essential_dirs()
    self.assertTrue(os.path.exists(iso_dir.sample_dir))
    self.assertTrue(os.path.exists(iso_dir.isolate_dir))
    self.assertTrue(os.path.exists(iso_dir.reads_dir))
    self.assertTrue(os.path.exists(iso_dir.pipelines_dir))
    shutil.rmtree(tmp_root_dir)
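# make_essential_dirs is pinned down by the assertions above: after the call,
# the sample, isolate, Reads and Pipelines directories must all exist. A
# minimal sketch consistent with the test (hypothetical, not the real method):
import os

def _sketch_make_essential_dirs(iso_dir):
    for d in (iso_dir.sample_dir, iso_dir.isolate_dir,
              iso_dir.reads_dir, iso_dir.pipelines_dir):
        os.makedirs(d, exist_ok=True)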
def test_init(self):
    '''test __init__'''
    tmp_root_dir = os.path.abspath('tmp.isolate_dir.make_sample_dir_path')
    sample_id = 12345678
    isolate_id = 42
    iso_dir = isolate_dir.IsolateDir(tmp_root_dir, sample_id, isolate_id)
    expected_sample_dir = isolate_dir.IsolateDir._make_sample_dir_path(
        tmp_root_dir, sample_id)
    self.assertEqual(expected_sample_dir, iso_dir.sample_dir)
    expected_isolate_dir = os.path.join(expected_sample_dir, str(isolate_id))
    self.assertEqual(expected_isolate_dir, iso_dir.isolate_dir)
    expected_reads_dir = os.path.join(expected_isolate_dir, 'Reads')
    self.assertEqual(expected_reads_dir, iso_dir.reads_dir)
def test_xml_submission_file(self):
    '''test xml_submission_file'''
    root = os.path.abspath(os.path.join('papa', 'lazarou'))
    sample_id = 64738
    isolate_id = 42
    iso_dir = isolate_dir.IsolateDir(root, sample_id, isolate_id)
    self.assertEqual(
        os.path.join(iso_dir.sample_dir, 'ena_sample_submission.xml'),
        iso_dir.xml_submission_file('sample'))
    self.assertEqual(
        os.path.join(iso_dir.isolate_dir, 'ena_experiment_submission.xml'),
        iso_dir.xml_submission_file('experiment'))
    with self.assertRaises(isolate_dir.Error):
        iso_dir.xml_submission_file('run')
    self.assertEqual(
        os.path.join(iso_dir.reads_dir, 'reads.remove_contam.42.ena_run_submission.xml'),
        iso_dir.xml_submission_file('run', sequence_replicate=42))
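# The test above fixes where each ENA submission XML lives: sample XMLs in the
# sample directory, experiment XMLs in the isolate directory, and run XMLs in
# the reads directory, named per sequence replicate. A sketch consistent with
# those assertions (hypothetical; the real method raises isolate_dir.Error):
import os

def _sketch_xml_submission_file(iso_dir, object_type, sequence_replicate=None):
    if object_type == 'sample':
        return os.path.join(iso_dir.sample_dir, 'ena_sample_submission.xml')
    elif object_type == 'experiment':
        return os.path.join(iso_dir.isolate_dir, 'ena_experiment_submission.xml')
    elif object_type == 'run' and sequence_replicate is not None:
        return os.path.join(
            iso_dir.reads_dir,
            'reads.remove_contam.' + str(sequence_replicate) + '.ena_run_submission.xml')
    raise Exception('bad object_type, or run without sequence_replicate')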
def _test_run(self, original_reads1, original_reads2, expected_import_status):
    '''test run'''
    pipeline_root = 'tmp.read_pair_importer.run.root'
    if os.path.exists(pipeline_root):
        shutil.rmtree(pipeline_root)
    os.mkdir(pipeline_root)
    seqrep_id = 1
    sample = 3
    isolate = 2
    sequence_replicate_number = 42
    # copy the reads because the pipeline will delete them later
    reads1 = 'tmp.read_pair_importer.reads1.fq'
    reads2 = 'tmp.read_pair_importer.reads2.fq'
    md5_1 = utils.rsync_and_md5(original_reads1, reads1)
    md5_2 = utils.rsync_and_md5(original_reads2, reads2)
    # write an md5 file, to check it gets deleted later
    md5_file = reads1 + '.md5'
    utils.syscall('md5sum ' + reads1 + ' > ' + md5_file)
    importer = read_pair_importer.ReadPairImporter(
        ini_file, pipeline_root, seqrep_id, isolate, sample,
        sequence_replicate_number, reads1, reads2, md5_1, md5_2)
    # no row in Seqrep table
    with self.assertRaises(read_pair_importer.Error):
        importer.run()
    seqrep_row = {
        'seqrep_id': seqrep_id,
        'isolate_id': isolate,
        'sequence_replicate_number': sequence_replicate_number,
        'original_reads_file_1_md5': md5_1,
        'original_reads_file_2_md5': md5_2,
        'remove_contam_reads_file_1_md5': None,
        'remove_contam_reads_file_2_md5': None,
        'withdrawn': 0,
        'import_status': 0,
        'instrument_model': 'Illumina HiSeq 2000',
        'submission_date': '20170101',
        'submit_to_ena': 0,
        'ena_run_accession': None,
        'ena_on_hold': 0,
    }
    self.db.add_row_to_table('Seqrep', seqrep_row)
    self.db.commit()
    # need to create a new object so the database changes get picked up.
    # Separate connections do not see changes made by each other.
    importer = read_pair_importer.ReadPairImporter(
        ini_file, pipeline_root, seqrep_id, isolate, sample,
        sequence_replicate_number, reads1, reads2, md5_1, md5_2)
    # reads file doesn't exist
    importer.reads_file_1 = 'oops'
    with self.assertRaises(read_pair_importer.Error):
        importer.run()
    importer.reads_file_1 = reads1
    # check lock file works
    iso_dir = isolate_dir.IsolateDir(pipeline_root, sample, isolate)
    iso_dir.make_essential_dirs()
    lock_file = os.path.join(iso_dir.reads_dir, 'import_lock.' + str(seqrep_id))
    utils.make_empty_file(lock_file)
    with self.assertRaises(read_pair_importer.Error):
        importer.run()
    os.unlink(lock_file)
    # should run ok
    where_query = ' and '.join([
        'sample_id=' + str(sample),
        'isolate_number=' + str(isolate),
        'sequence_replicate_number=' + str(seqrep_id),
    ])
    rows = self.db.get_rows_from_table('Seqrep', where='seqrep_id=' + str(seqrep_id))
    self.assertEqual(1, len(rows))
    self.assertEqual(0, rows[0]['import_status'])
    importer = read_pair_importer.ReadPairImporter(
        ini_file, pipeline_root, seqrep_id, isolate, sample,
        sequence_replicate_number, reads1, reads2, md5_1, md5_2)
    importer.run()
    # reconnect so that we pick up the changes made by the previous line
    self.db.reconnect()
    reads_out_1 = iso_dir.reads_filename('original', sequence_replicate_number, 1)
    reads_out_2 = iso_dir.reads_filename('original', sequence_replicate_number, 2)
    rows = self.db.get_rows_from_table('Seqrep', where='seqrep_id=' + str(seqrep_id))
    self.assertEqual(1, len(rows))
    self.assertEqual(expected_import_status, rows[0]['import_status'])
    # Files either copied/deleted or not, depending on whether import succeeded
    if expected_import_status == 1:
        self.assertTrue(os.path.exists(reads_out_1))
        self.assertTrue(os.path.exists(reads_out_2))
        self.assertFalse(os.path.exists(reads1))
        self.assertFalse(os.path.exists(reads2))
        self.assertFalse(os.path.exists(md5_file))
    else:
        self.assertFalse(os.path.exists(reads_out_1))
        self.assertFalse(os.path.exists(reads_out_2))
        self.assertTrue(os.path.exists(reads1))
        self.assertTrue(os.path.exists(reads2))
        self.assertTrue(os.path.exists(md5_file))
        os.unlink(reads1)
        os.unlink(reads2)
        os.unlink(md5_file)
    shutil.rmtree(pipeline_root)
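# Why the new-object/reconnect dance above is needed: with MySQL InnoDB's
# default REPEATABLE READ isolation, a connection that is inside a transaction
# keeps reading its original snapshot, so rows committed on another connection
# stay invisible until its own transaction ends. Assuming Db wraps a single
# connection, a fresh connection (or an explicit commit/reconnect, as used
# above) is required before re-reading data written elsewhere.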
def _submit_runs(self, data_in):
    # Note: reads have to be in the dropbox before submitting the Run object.
    # Upload all the reads first, in parallel, then submit the runs.
    # Each element is (seqrep_id, full path on disk 1, dropbox name 1,
    #                  full path on disk 2, dropbox name 2)
    fq_pairs_to_upload = []
    for row in data_in:
        iso_dir = isolate_dir.IsolateDir(
            self.pipeline_root, row['sample_id'], row['isolate_id'])
        fq_pairs_to_upload.append((
            row['seqrep_id'],
            iso_dir.reads_filename('remove_contam', row['sequence_replicate_number'], 1),
            str(row['seqrep_id']) + '.1.' + row['remove_contam_reads_file_1_md5'] + '.fq.gz',
            iso_dir.reads_filename('remove_contam', row['sequence_replicate_number'], 2),
            str(row['seqrep_id']) + '.2.' + row['remove_contam_reads_file_2_md5'] + '.fq.gz',
        ))
    self.pool = multiprocessing.Pool(self.fq_upload_threads)
    upload_return_values = self.pool.starmap(
        _upload_fastq_file_pair,
        zip(fq_pairs_to_upload,
            itertools.repeat(self.ini_file),
            itertools.repeat(self.unit_test)))
    upload_success = {x[0]: x[1] for x in upload_return_values}
    fq_pairs_to_upload = {x[0]: x for x in fq_pairs_to_upload}
    # Fastqs are uploaded, now submit the xmls and update the database
    for row in data_in:
        assert row['seqrep_id'] in fq_pairs_to_upload
        assert row['seqrep_id'] in upload_success
        assert row['ena_run_accession'] is None
        assert row['ena_experiment_accession'] is not None
        iso_dir = isolate_dir.IsolateDir(
            self.pipeline_root, row['sample_id'], row['isolate_id'])
        object_xml = iso_dir.xml_submission_file(
            'run', sequence_replicate=row['sequence_replicate_number'])
        object_alias = 'run.' + str(row['isolate_id'])
        submit_alias = 'submit.' + object_alias
        center_name = DatasetSubmitter._ena_center_name_from_db_data(
            data_in, number_to_name_dict=self.centre_number_to_name)
        title = None  # not needed for a run
        obj_creator = object_creator.ObjectCreator(
            self.ini_file,
            'run',
            object_xml,
            object_alias,
            submit_alias,
            center_name,
            title,
            experiment_accession=row['ena_experiment_accession'],
            reads_1=fq_pairs_to_upload[row['seqrep_id']][2],
            md5_1=row['remove_contam_reads_file_1_md5'],
            reads_2=fq_pairs_to_upload[row['seqrep_id']][4],
            md5_2=row['remove_contam_reads_file_2_md5'],
            use_test_server=self.use_test_server,
            unit_test=self.unit_test,
            broker_name=self.broker_name,
        )
        obj_creator.run()
        if obj_creator.submission_receipt.successful:
            try:
                run_accession = obj_creator.submission_receipt.accessions['RUN']
            except:
                run_accession = 'FAIL'
        else:
            run_accession = 'FAIL'
        row['ena_run_accession'] = run_accession
        self.db.update_row(
            'Seqrep', {'seqrep_id': row['seqrep_id']},
            {'ena_run_accession': run_accession})
        self.db.commit()
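# The starmap call above implies the contract of _upload_fastq_file_pair: it
# takes one fq_pairs_to_upload tuple plus the ini file and unit_test flag, and
# returns (seqrep_id, success), since the results are reduced with
# {x[0]: x[1] for x in upload_return_values}. It must also be a module-level
# function so multiprocessing can pickle it by name. A hypothetical stub with
# that shape (upload() is a stand-in, not a real helper):
def _upload_fastq_file_pair(fq_pair, ini_file, unit_test):
    seqrep_id, path1, dropbox_name1, path2, dropbox_name2 = fq_pair
    success = True
    try:
        upload(path1, dropbox_name1, ini_file)
        upload(path2, dropbox_name2, ini_file)
    except Exception:
        success = False
    return seqrep_id, success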
def test_nextflow_qc_using_database(self):
    """test nextflow_qc using database"""
    tmp_data_dir = "tmp.nextflow_qc"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "qc.nf")
    work_dir = "tmp.nextflow_qc.work"
    dag_file = "nextflow.qc.dag.db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one of the samples is in group2 and should get ignored
        "--ref_id 1",
        "--references_root", os.path.abspath(references_root),
        "--pipeline_root", pipeline_root,
        "--db_config_file", db_ini_file,
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_pipeline_rows = database.get_rows_from_table("Pipeline")
    got_pipeline_rows.sort(key=itemgetter("seqrep_id"))
    expected_pipeline_rows = [
        {"isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": "0.0.1",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": clockwork_version,
         "pipeline_name": "qc", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 2, "seqrep_pool": None, "version": "0.0.1",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 2, "seqrep_pool": None, "version": clockwork_version,
         "pipeline_name": "qc", "status": 1, "reference_id": 1},
        {"isolate_id": 3, "seqrep_id": 3, "seqrep_pool": None, "version": "0.0.1",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 3, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version,
         "pipeline_name": "qc", "status": -1, "reference_id": 1},
        {"isolate_id": 4, "seqrep_id": 4, "seqrep_pool": None, "version": "0.0.1",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
    ]
    self.assertEqual(expected_pipeline_rows, got_pipeline_rows)
    # check QC stats added to database
    got_qc_rows = database.get_rows_from_table("QC")
    got_qc_rows.sort(key=itemgetter("seqrep_id"))
    expected_qc_rows = [
        {
            "seqrep_id": 1,
            "pipeline_version": clockwork_version,
            "fastqc1_adapter_content": "pass",
            "fastqc1_basic_statistics": "pass",
            "fastqc1_gc": 48.0,
            "fastqc1_kmer_content": "fail",
            "fastqc1_max_sequence_length": 75,
            "fastqc1_min_sequence_length": 75,
            "fastqc1_overrepresented_sequences": "fail",
            "fastqc1_per_base_n_content": "pass",
            "fastqc1_per_base_sequence_content": "fail",
            "fastqc1_per_base_sequence_quality": "pass",
            "fastqc1_per_sequence_gc_content": "fail",
            "fastqc1_per_sequence_quality_scores": "fail",
            "fastqc1_sequence_duplication_levels": "pass",
            "fastqc1_sequence_length_distribution": "pass",
            "fastqc1_sequences_flagged_as_poor_quality": 0,
            "fastqc1_total_sequences": 72,
            "fastqc2_adapter_content": "pass",
            "fastqc2_basic_statistics": "pass",
            "fastqc2_gc": 48.0,
            "fastqc2_kmer_content": "fail",
            "fastqc2_max_sequence_length": 75,
            "fastqc2_min_sequence_length": 75,
            "fastqc2_overrepresented_sequences": "fail",
            "fastqc2_per_base_n_content": "pass",
            "fastqc2_per_base_sequence_content": "fail",
            "fastqc2_per_base_sequence_quality": "pass",
            "fastqc2_per_sequence_gc_content": "fail",
            "fastqc2_per_sequence_quality_scores": "fail",
            "fastqc2_sequence_duplication_levels": "pass",
            "fastqc2_sequence_length_distribution": "pass",
            "fastqc2_sequences_flagged_as_poor_quality": 0,
            "fastqc2_total_sequences": 72,
            "samtools_average_quality": 40.0,
            "samtools_bases_mapped_cigar": 9900,
            "samtools_bases_trimmed": 0,
            "samtools_error_rate": 0.0,
            "samtools_insert_size_average": 199.6,
            "samtools_insert_size_standard_deviation": 1.0,
            "samtools_inward_oriented_pairs": 66,
            "samtools_outward_oriented_pairs": 0,
            "samtools_pairs_with_other_orientation": 0,
            "samtools_raw_total_sequences": 144,
            "samtools_reads_duplicated": 4,
            "samtools_reads_mapped": 132,
            "het_snp_het_calls": 0,
            "het_snp_positions": 983,
            "het_snp_total_snps": 0,
        },
        {
            "seqrep_id": 2,
            "pipeline_version": clockwork_version,
            "fastqc1_adapter_content": "pass",
            "fastqc1_basic_statistics": "pass",
            "fastqc1_gc": 48.0,
            "fastqc1_kmer_content": "fail",
            "fastqc1_max_sequence_length": 75,
            "fastqc1_min_sequence_length": 75,
            "fastqc1_overrepresented_sequences": "fail",
            "fastqc1_per_base_n_content": "pass",
            "fastqc1_per_base_sequence_content": "fail",
            "fastqc1_per_base_sequence_quality": "pass",
            "fastqc1_per_sequence_gc_content": "fail",
            "fastqc1_per_sequence_quality_scores": "fail",
            "fastqc1_sequence_duplication_levels": "pass",
            "fastqc1_sequence_length_distribution": "pass",
            "fastqc1_sequences_flagged_as_poor_quality": 0,
            "fastqc1_total_sequences": 72,
            "fastqc2_adapter_content": "pass",
            "fastqc2_basic_statistics": "pass",
            "fastqc2_gc": 49.0,
            "fastqc2_kmer_content": "fail",
            "fastqc2_max_sequence_length": 75,
            "fastqc2_min_sequence_length": 75,
            "fastqc2_overrepresented_sequences": "fail",
            "fastqc2_per_base_n_content": "pass",
            "fastqc2_per_base_sequence_content": "fail",
            "fastqc2_per_base_sequence_quality": "pass",
            "fastqc2_per_sequence_gc_content": "warn",
            "fastqc2_per_sequence_quality_scores": "fail",
            "fastqc2_sequence_duplication_levels": "pass",
            "fastqc2_sequence_length_distribution": "pass",
            "fastqc2_sequences_flagged_as_poor_quality": 0,
            "fastqc2_total_sequences": 72,
            "samtools_average_quality": 40.0,
            "samtools_bases_mapped_cigar": 9900,
            "samtools_bases_trimmed": 0,
            "samtools_error_rate": 0.0,
            "samtools_insert_size_average": 199.7,
            "samtools_insert_size_standard_deviation": 1.1,
            "samtools_inward_oriented_pairs": 66,
            "samtools_outward_oriented_pairs": 0,
            "samtools_pairs_with_other_orientation": 0,
            "samtools_raw_total_sequences": 144,
            "samtools_reads_duplicated": 0,
            "samtools_reads_mapped": 132,
            "het_snp_het_calls": 0,
            "het_snp_positions": 983,
            "het_snp_total_snps": 0,
        },
    ]
    self.assertEqual(expected_qc_rows, got_qc_rows)
    # check QC files got written. No need to check contents, as that is done
    # elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {"sample": 1, "isolate_id": 1, "seq_repl": 43},
        {"sample": 2, "isolate_id": 2, "seq_repl": 45},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict["sample"], id_dict["isolate_id"])
        qc_root_dir = iso_dir.pipeline_dir(id_dict["seq_repl"], "qc", clockwork_version)
        self.assertTrue(os.path.exists(qc_root_dir))
        for method in ["fastqc", "samtools_qc"]:
            this_qc_dir = os.path.join(qc_root_dir, method)
            self.assertTrue(os.path.exists(this_qc_dir))
            self.assertTrue(len(os.listdir(this_qc_dir)) >= 1)
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
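# The two mysql syscalls above expand to plain shell commands that rebuild the
# test database from a dump, along these lines (database name illustrative):
#
#     mysql --defaults-file=data/db.cnf -e "DROP DATABASE IF EXISTS clockwork_test; CREATE DATABASE clockwork_test"
#     mysql --defaults-file=data/db.cnf clockwork_test < data/mysql.dump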
def write_pipeline_data_to_file(self, outfile, pipeline_name,
                                pipeline_version=None, reference_id=None):
    where_fields = ['pipeline_name="' + pipeline_name + '"']
    pooling_seqreps = pipeline_name in {"mykrobe_predict", "variant_call"}
    seqreps_column = ("sequence_replicate_numbers" if pooling_seqreps
                      else "sequence_replicate_number")
    if pipeline_version is not None:
        where_fields.append('version="' + pipeline_version + '"')
    if reference_id is not None:
        where_fields.append("reference_id=" + str(reference_id))
    if self.dataset_name is not None:
        where_fields.append('dataset_name="' + self.dataset_name + '"')
    columns = [
        "pipeline_name",
        "version",
        "dataset_name",
        "reference_id",
        "site_id",
        "subject_id",
        "sample_id_from_lab",  # = lab_id in import spreadsheet
        "isolate_number_from_lab",  # = isolate_number in import spreadsheet
        seqreps_column,
        "pipeline_directory",
    ]
    if pipeline_name == "remove_contam":
        columns.extend([
            "remove_contam_reads_1",
            "remove_contam_reads_file_1_md5",
            "remove_contam_reads_2",
            "remove_contam_reads_file_2_md5",
        ])
    if self.include_internal_ids:
        columns.extend(["sample_id", "isolate_id", "seqrep_id"])
    if pooling_seqreps:
        pipeline_join = " join Pipeline on Pipeline.isolate_id = Isolate.isolate_id"
    else:
        pipeline_join = " join Pipeline on Pipeline.seqrep_id = Seqrep.seqrep_id"
    query = ("select * from " + mysql_seqrep_isolate_sample_join + " "
             + pipeline_join + " where " + " AND ".join(where_fields))
    rows = self.db.query_to_dict(query)
    rows.sort(key=itemgetter(
        "site_id",
        "subject_id",
        "sample_id_from_lab",
        "isolate_number_from_lab",
        "sequence_replicate_number",
    ))
    f = pyfastaq.utils.open_file_write(outfile)
    print(*columns, sep="\t", file=f)
    # The joining of tables means that if the pipeline pools samples, we could
    # get duplicate rows, eg for pool 1_2 we get a row for replicate 1 and a
    # row for replicate 2. Track what we've seen already and skip the duplicates.
    used_pools = set()
    for row in rows:
        iso_dir_obj = isolate_dir.IsolateDir(
            self.pipeline_root, row["sample_id"], row["isolate_id"])
        ref_id = None if pipeline_name == "qc" else row["reference_id"]
        if pooling_seqreps:
            row[seqreps_column] = row["seqrep_pool"]
            pool_tuple = (
                row["version"],
                row["isolate_id"],
                row["seqrep_pool"],
                row["reference_id"],
            )
            if pool_tuple in used_pools:
                continue
            used_pools.add(pool_tuple)
        row["pipeline_directory"] = iso_dir_obj.pipeline_dir(
            row[seqreps_column], pipeline_name, row["version"], reference_id=ref_id)
        if pipeline_name == "remove_contam":
            row["remove_contam_reads_1"] = iso_dir_obj.reads_filename(
                "remove_contam", row["sequence_replicate_number"], 1)
            row["remove_contam_reads_2"] = iso_dir_obj.reads_filename(
                "remove_contam", row["sequence_replicate_number"], 2)
        print(*[row[x] for x in columns], sep="\t", file=f)
    pyfastaq.utils.close(f)
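# A sketch of how this method might be driven, and what it emits: a
# tab-separated file whose header is the columns list above, one row per
# (possibly pooled) seqrep. The call is illustrative; reporter stands for an
# instance of the class this method belongs to:
#
#     reporter.write_pipeline_data_to_file(
#         "variant_call_report.tsv", "variant_call",
#         pipeline_version="1.0.0", reference_id=2)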
def test_nextflow_variant_call_using_database(self):
    '''test nextflow_variant_call using database'''
    tmp_data_dir = 'tmp.nextflow_variant_call_db_input.data'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall('mysql --defaults-file=' + mysql_config_file
                  + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
                  + '; CREATE DATABASE ' + db_config_data['db'] + '"')
    utils.syscall('mysql --defaults-file=' + mysql_config_file + ' '
                  + db_config_data['db'] + ' < ' + mysql_dump)
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'variant_call.nf')
    work_dir = 'tmp.nextflow_variant_call_db_input.work'
    dag_file = 'nextflow.variant_call.dag.db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair is from group 2 and should get ignored
        '--ref_id 2',
        '--references_root', os.path.abspath(references_root),
        '--pipeline_root', pipeline_root,
        '--db_config_file', db_ini_file,
        '--cortex_mem_height 17',
        '--testing',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.0.1',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.0.1',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': clockwork_version,
         'pipeline_name': 'variant_call', 'status': 1, 'reference_id': 2},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.0.1',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.0.1',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version,
         'pipeline_name': 'variant_call', 'status': 1, 'reference_id': 2},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': clockwork_version,
         'pipeline_name': 'variant_call', 'status': 1, 'reference_id': 2},
        {'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None, 'version': '0.0.1',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1', 'version': clockwork_version,
         'pipeline_name': 'variant_call', 'status': -1, 'reference_id': 2},
        {'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None, 'version': '0.0.1',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
    ]
    self.assertEqual(expected_rows, got_rows)
    # check VCF files etc got written. No need to check contents, trust the
    # tools. We're just checking nextflow runs OK here.
    ids = [
        {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2'},
        {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1'},
        {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2'},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id'])
        pipeline_dir = iso_dir.pipeline_dir(
            id_dict['seq_repl'], 'variant_call', clockwork_version, reference_id=2)
        expected_sample = '.'.join([
            str(id_dict[x]) for x in ['sample', 'isolate_id', 'seqrep_id', 'seq_repl']
        ])
        self._files_are_present_and_correct(pipeline_dir, expected_sample)
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_generic_pipeline(self):
    '''test nextflow generic pipeline using database'''
    tmp_data_dir = 'tmp.nextflow_generic_pipeline_db_input.data'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall('mysql --defaults-file=' + mysql_config_file
                  + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
                  + '; CREATE DATABASE ' + db_config_data['db'] + '"')
    utils.syscall('mysql --defaults-file=' + mysql_config_file + ' '
                  + db_config_data['db'] + ' < ' + mysql_dump)
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'generic_pipeline.nf')
    work_dir = 'tmp.nextflow_generic_pipeline.work'
    dag_file = 'nextflow.generic_pipeline.dag.pdf'
    pipeline_name = 'generic_pipeline'
    script = os.path.join(data_dir, 'script.pl')
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair is from group 2 and should get ignored
        '--pipeline_name', pipeline_name,
        '--pipeline_root', pipeline_root,
        '--script', script,
        '--db_config_file', db_ini_file,
        '--max_ram', '0.5',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.1.2',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.1.2',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': clockwork_version,
         'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.1.2',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.1.2',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version,
         'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': clockwork_version,
         'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
        {'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None, 'version': '0.1.2',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1', 'version': clockwork_version,
         'pipeline_name': pipeline_name, 'status': -1, 'reference_id': None},
        {'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None, 'version': '0.1.2',
         'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
    ]
    expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    self.assertEqual(expected_rows, got_rows)
    # check that the expected output file from script.pl
    # got made (except for the sample that is expected to fail)
    ids = [
        {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2'},
        {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1'},
        {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2'},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id'])
        pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], pipeline_name, clockwork_version)
        counts_file = os.path.join(pipeline_dir, 'count.txt')
        self.assertTrue(os.path.exists(counts_file))
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_fake_remove_contam(self):
    """test nextflow_fake_remove_contam"""
    tmp_data_dir = "tmp.nextflow_fake_remove_contam"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "fake_remove_contam.nf")
    work_dir = "tmp.nextflow_fake_remove_contam.work"
    dag_file = "nextflow.fake_remove_contam.dag.db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one read pair has group g2, so should get ignored
        "--pipeline_root", os.path.abspath(pipeline_root),
        "--db_config_file", db_ini_file,
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("seqrep_id"))
    expected_rows = [
        {"isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": clockwork_version,
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 0},
        {"isolate_id": 2, "seqrep_id": 2, "seqrep_pool": None, "version": clockwork_version,
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 0},
        {"isolate_id": 3, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version,
         "pipeline_name": "remove_contam", "status": -1, "reference_id": 0},
    ]
    self.assertEqual(expected_rows, got_rows)
    # check database Read_counts table updated
    got_rows = database.get_rows_from_table("Read_counts")
    got_rows.sort(key=itemgetter("seqrep_id"))
    expected_rows = [
        {"seqrep_id": 1, "original_total": 12, "contamination": 0,
         "not_contamination": 12, "unmapped": 0, "total_after_remove_contam": 12},
        {"seqrep_id": 2, "original_total": 26, "contamination": 0,
         "not_contamination": 26, "unmapped": 0, "total_after_remove_contam": 26},
    ]
    self.assertEqual(expected_rows, got_rows)
    # check FASTQ files got written. No need to check contents, as that is done
    # elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {"sample": 1, "isolate_id": 1, "seq_repl": 1},
        {"sample": 2, "isolate_id": 2, "seq_repl": 1},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"], id_dict["isolate_id"])
        for read_type in ("original", "remove_contam"):
            for i in (1, 2):
                self.assertTrue(
                    os.path.exists(
                        iso_dir.reads_filename(read_type, id_dict["seq_repl"], i)))
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_mykrobe_predict(self):
    """test nextflow_mykrobe using database"""
    tmp_data_dir = "tmp.nextflow_mykrobe_db_input.data"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "mykrobe_predict.nf")
    work_dir = "tmp.nextflow_mykrobe_db_input.work"
    dag_file = "nextflow.mykrobe.dag.db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one read pair is from group 2 and should get ignored
        "--ref_id 2",
        "--references_root", os.path.abspath(references_root),
        "--pipeline_root", pipeline_root,
        "--db_config_file", db_ini_file,
        "--testing",
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    # check database Pipeline table updated as expected.
    # The --testing option is set up so that the pooled
    # sample fails, hence it gets a status of -1.
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    expected_rows = [
        {"isolate_id": 1, "seqrep_id": None, "seqrep_pool": "1_2", "version": clockwork_version,
         "pipeline_name": "mykrobe_predict", "status": -1, "reference_id": 2},
        {"isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": "0.4.0",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 1, "seqrep_id": 2, "seqrep_pool": None, "version": "0.4.0",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version,
         "pipeline_name": "mykrobe_predict", "status": 1, "reference_id": 2},
        {"isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": "0.4.0",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": "0.4.0",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": clockwork_version,
         "pipeline_name": "mykrobe_predict", "status": 1, "reference_id": 2},
        {"isolate_id": 3, "seqrep_id": None, "seqrep_pool": "1", "version": clockwork_version,
         "pipeline_name": "mykrobe_predict", "status": 1, "reference_id": 2},
        {"isolate_id": 3, "seqrep_id": 5, "seqrep_pool": None, "version": "0.4.0",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 4, "seqrep_id": 6, "seqrep_pool": None, "version": "0.4.0",
         "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
    ]
    expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    self.assertEqual(expected_rows, got_rows)
    # check mykrobe output files etc got written. No need to check contents,
    # trust the tools. We're just checking nextflow runs OK here.
    ids = [
        {"sample": 1, "seqrep_id": "1_2", "isolate_id": 1, "seq_repl": "1_2",
         "sample_name": "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2"},
        {"sample": 2, "seqrep_id": 3, "isolate_id": 2, "seq_repl": "1",
         "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1"},
        {"sample": 2, "seqrep_id": 4, "isolate_id": 2, "seq_repl": "2",
         "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2"},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"], id_dict["isolate_id"])
        pipeline_dir = iso_dir.pipeline_dir(
            id_dict["seq_repl"],
            "mykrobe_predict",
            clockwork_version,
            reference_id=2,
        )
        self.assertTrue(os.path.exists(pipeline_dir))
        log = os.path.join(pipeline_dir, "log.txt")
        json_file = os.path.join(pipeline_dir, "out.json")
        if id_dict["sample_name"].endswith("1_2"):
            self.assertFalse(os.path.exists(log))
            self.assertFalse(os.path.exists(json_file))
        else:
            self.assertTrue(os.path.exists(log))
            self.assertTrue(os.path.exists(json_file))
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_variant_call_using_database(self): """test nextflow_variant_call using database""" tmp_data_dir = "tmp.nextflow_variant_call_db_input.data" if os.path.exists(tmp_data_dir): shutil.rmtree(tmp_data_dir) shutil.copytree(data_dir, tmp_data_dir) nextflow_helper.write_config_file() mysql_config_file = os.path.join(data_dir, "db.cnf") mysql_dump = os.path.join(data_dir, "mysql.dump") db_config_data = db_connection.DbConnection._parse_config_file( db_ini_file) utils.syscall("mysql --defaults-file=" + mysql_config_file + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] + "; CREATE DATABASE " + db_config_data["db"] + '"') utils.syscall("mysql --defaults-file=" + mysql_config_file + " " + db_config_data["db"] + " < " + mysql_dump) pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root") references_root = os.path.join(tmp_data_dir, "Pipeline_refs") nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "variant_call.nf") work_dir = "tmp.nextflow_variant_call_db_input.work" dag_file = "nextflow.variant_call.dag.db.pdf" try: os.unlink(dag_file) except: pass command = " ".join([ "nextflow run", "--dataset_name g1", # one read pair is from group 2 and should get ignored "--ref_id 2", "--references_root", os.path.abspath(references_root), "--pipeline_root", pipeline_root, "--db_config_file", db_ini_file, "--cortex_mem_height 17", "--testing", # Using truth ref is broken, and we nevr use it anyway, # so disable this for now #"--truth_ref", #os.path.join(tmp_data_dir, "truth_ref.fa"), "-with-dag", dag_file, "-c", nextflow_helper.config_file, "-w", work_dir, nextflow_file, ]) utils.syscall(command) os.unlink(nextflow_helper.config_file) shutil.rmtree(work_dir) # check database Pipeline table updated as expected database = db.Db(db_ini_file) got_rows = database.get_rows_from_table("Pipeline") got_rows.sort(key=itemgetter("isolate_id", "pipeline_name")) expected_rows = [ { "isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 1, "seqrep_id": 2, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 1, "seqrep_id": None, "seqrep_pool": "1_2", "version": clockwork_version, "pipeline_name": "variant_call", "status": 1, "reference_id": 2, }, { "isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "variant_call", "status": 1, "reference_id": 2, }, { "isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "variant_call", "status": 1, "reference_id": 2, }, { "isolate_id": 3, "seqrep_id": 5, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 3, "seqrep_id": None, "seqrep_pool": "1", "version": clockwork_version, "pipeline_name": "variant_call", "status": -1, "reference_id": 2, }, { "isolate_id": 4, "seqrep_id": 6, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, ] self.assertEqual(expected_rows, got_rows) # check VCF files etc got written. 
    # No need to check contents, trust the tools.
    # We're just checking nextflow runs OK here.
    ids = [
        {"sample": 1, "seqrep_id": "1_2", "isolate_id": 1, "seq_repl": "1_2",
         "sample_name": "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2"},
        {"sample": 2, "seqrep_id": 3, "isolate_id": 2, "seq_repl": "1",
         "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1"},
        {"sample": 2, "seqrep_id": 4, "isolate_id": 2, "seq_repl": "2",
         "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2"},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict["sample"], id_dict["isolate_id"]
        )
        pipeline_dir = iso_dir.pipeline_dir(
            id_dict["seq_repl"], "variant_call", clockwork_version, reference_id=2
        )
        self._files_are_present_and_correct(
            pipeline_dir, id_dict["sample_name"], expect_ref_check_files=False
        )

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
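
# Every nextflow test in this file repeats the same DROP/CREATE-plus-dump
# MySQL boilerplate. A minimal sketch of a shared helper built only from
# calls already used above (`utils.syscall` and
# `db_connection.DbConnection._parse_config_file`); the function name is
# hypothetical, not part of the original suite:
def reset_test_database(mysql_config_file, mysql_dump, db_ini_file):
    """Drop, recreate, and reload the test database from a dump file."""
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )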
def _test_run(self, original_reads1, original_reads2, expected_import_status):
    """test run"""
    pipeline_root = "tmp.read_pair_importer.run.root"
    if os.path.exists(pipeline_root):
        shutil.rmtree(pipeline_root)
    os.mkdir(pipeline_root)
    seqrep_id = 1
    sample = 3
    isolate = 2
    sequence_replicate_number = 42

    # copy the reads because the pipeline will delete them later
    reads1 = "tmp.read_pair_importer.reads1.fq"
    reads2 = "tmp.read_pair_importer.reads2.fq"
    md5_1 = utils.rsync_and_md5(original_reads1, reads1)
    md5_2 = utils.rsync_and_md5(original_reads2, reads2)
    # write an md5 file, to check it gets deleted later
    md5_file = reads1 + ".md5"
    utils.syscall("md5sum " + reads1 + " > " + md5_file)
    importer = read_pair_importer.ReadPairImporter(
        ini_file,
        pipeline_root,
        seqrep_id,
        isolate,
        sample,
        sequence_replicate_number,
        reads1,
        reads2,
        md5_1,
        md5_2,
    )

    # no row in Seqrep table yet, so run() should fail
    with self.assertRaises(read_pair_importer.Error):
        importer.run()

    seqrep_row = {
        "seqrep_id": seqrep_id,
        "isolate_id": isolate,
        "sequence_replicate_number": sequence_replicate_number,
        "original_reads_file_1_md5": md5_1,
        "original_reads_file_2_md5": md5_2,
        "remove_contam_reads_file_1_md5": None,
        "remove_contam_reads_file_2_md5": None,
        "withdrawn": 0,
        "import_status": 0,
        "instrument_model": "Illumina HiSeq 2000",
        "submission_date": "20170101",
        "submit_to_ena": 0,
        "ena_run_accession": None,
        "ena_on_hold": 0,
    }
    self.db.add_row_to_table("Seqrep", seqrep_row)
    self.db.commit()

    # need to create a new object so the database changes get picked up.
    # Separate connections do not see changes made by each other.
    importer = read_pair_importer.ReadPairImporter(
        ini_file,
        pipeline_root,
        seqrep_id,
        isolate,
        sample,
        sequence_replicate_number,
        reads1,
        reads2,
        md5_1,
        md5_2,
    )

    # reads file doesn't exist
    importer.reads_file_1 = "oops"
    with self.assertRaises(read_pair_importer.Error):
        importer.run()
    importer.reads_file_1 = reads1

    # check lock file works
    iso_dir = isolate_dir.IsolateDir(pipeline_root, sample, isolate)
    iso_dir.make_essential_dirs()
    lock_file = os.path.join(iso_dir.reads_dir, "import_lock." + str(seqrep_id))
    utils.make_empty_file(lock_file)
    with self.assertRaises(read_pair_importer.Error):
        importer.run()
    os.unlink(lock_file)

    # should run ok
    rows = self.db.get_rows_from_table("Seqrep", where="seqrep_id=" + str(seqrep_id))
    self.assertEqual(1, len(rows))
    self.assertEqual(0, rows[0]["import_status"])
    importer = read_pair_importer.ReadPairImporter(
        ini_file,
        pipeline_root,
        seqrep_id,
        isolate,
        sample,
        sequence_replicate_number,
        reads1,
        reads2,
        md5_1,
        md5_2,
    )
    importer.run()
    # reconnect so that we pick up the changes made by the previous line
    self.db.reconnect()
    reads_out_1 = iso_dir.reads_filename("original", sequence_replicate_number, 1)
    reads_out_2 = iso_dir.reads_filename("original", sequence_replicate_number, 2)
    rows = self.db.get_rows_from_table("Seqrep", where="seqrep_id=" + str(seqrep_id))
    self.assertEqual(1, len(rows))
    self.assertEqual(expected_import_status, rows[0]["import_status"])

    # Files either copied/deleted or not, depending on whether the import succeeded
    if expected_import_status == 1:
        self.assertTrue(os.path.exists(reads_out_1))
        self.assertTrue(os.path.exists(reads_out_2))
        self.assertFalse(os.path.exists(reads1))
        self.assertFalse(os.path.exists(reads2))
        self.assertFalse(os.path.exists(md5_file))
    else:
        self.assertFalse(os.path.exists(reads_out_1))
        self.assertFalse(os.path.exists(reads_out_2))
        self.assertTrue(os.path.exists(reads1))
        self.assertTrue(os.path.exists(reads2))
        self.assertTrue(os.path.exists(md5_file))
        os.unlink(reads1)
        os.unlink(reads2)
        os.unlink(md5_file)

    shutil.rmtree(pipeline_root)
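
# `utils.rsync_and_md5` above presumably copies the file and returns its MD5
# hex digest, matching what the `md5sum` call writes. A minimal sketch of
# the hashing half using only the standard library (an illustration, not
# the suite's actual helper):
import hashlib

def md5_of_file(filename, chunk_size=1 << 20):
    """Return the MD5 hex digest of a file, read in chunks."""
    md5 = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest()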
def test_nextflow_generic_pipeline(self):
    """test nextflow generic pipeline using database"""
    tmp_data_dir = "tmp.nextflow_generic_pipeline_db_input.data"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "generic_pipeline.nf")
    work_dir = "tmp.nextflow_generic_pipeline.work"
    dag_file = "nextflow.generic_pipeline.dag.pdf"
    pipeline_name = "generic_pipeline"
    script = os.path.join(data_dir, "script.pl")
    try:
        os.unlink(dag_file)
    except FileNotFoundError:
        pass
    command = " ".join(
        [
            "nextflow run",
            "--dataset_name g1",  # one read pair is from group 2 and should get ignored
            "--pipeline_name", pipeline_name,
            "--pipeline_root", pipeline_root,
            "--script", script,
            "--db_config_file", db_ini_file,
            "--max_ram", "0.5",
            "-with-dag", dag_file,
            "-c", nextflow_helper.config_file,
            "-w", work_dir,
            nextflow_file,
        ]
    )
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    expected_rows = [
        {"isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 1, "seqrep_id": 2, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 1, "seqrep_id": None, "seqrep_pool": "1_2", "version": clockwork_version, "pipeline_name": pipeline_name, "status": 1, "reference_id": None},
        {"isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": pipeline_name, "status": 1, "reference_id": None},
        {"isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": pipeline_name, "status": 1, "reference_id": None},
        {"isolate_id": 3, "seqrep_id": 5, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 3, "seqrep_id": None, "seqrep_pool": "1", "version": clockwork_version, "pipeline_name": pipeline_name, "status": -1, "reference_id": None},
        {"isolate_id": 4, "seqrep_id": 6, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
    ]
    expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    self.assertEqual(expected_rows, got_rows)

    # check that the expected output file from script.pl got made
    # (except for the sample that is expected to fail)
    ids = [
        {"sample": 1, "seqrep_id": "1_2", "isolate_id": 1, "seq_repl": "1_2"},
        {"sample": 2, "seqrep_id": 3, "isolate_id": 2, "seq_repl": "1"},
        {"sample": 2, "seqrep_id": 4, "isolate_id": 2, "seq_repl": "2"},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict["sample"], id_dict["isolate_id"]
        )
        pipeline_dir = iso_dir.pipeline_dir(
            id_dict["seq_repl"], pipeline_name, clockwork_version
        )
        counts_file = os.path.join(pipeline_dir, "count.txt")
        self.assertTrue(os.path.exists(counts_file))

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def write_seqrep_data_to_file(self, outfile):
    query = "select * from " + mysql_seqrep_isolate_sample_join
    columns = [
        "site_id",
        "subject_id",
        "sample_id_from_lab",  # = lab_id in import spreadsheet
        "isolate_number_from_lab",  # = isolate_number in import spreadsheet
        "sequence_replicate_number",
        "pool_sequence_replicates",
        "submission_date",
        "import_status",
        "dataset_name",
        "submit_to_ena",
        "ena_on_hold",
        "ena_center_name",
        "ena_experiment_accession",
        "ena_run_accession",
        "ena_sample_accession",
        "ena_study_accession",
        "instrument_model",
        "original_reads_file_1_md5",
        "original_reads_file_2_md5",
        "isolate_directory",
        "remove_contam_reads_1",
        "remove_contam_reads_file_1_md5",
        "remove_contam_reads_2",
        "remove_contam_reads_file_2_md5",
    ]
    if self.include_internal_ids:
        columns.extend(["sample_id", "isolate_id", "seqrep_id"])

    where_fields = []
    if self.include_withdrawn:
        columns.append("withdrawn")
    else:
        where_fields.append("withdrawn=0")
    if self.dataset_name is not None:
        where_fields.append('dataset_name="' + self.dataset_name + '"')
    if len(where_fields):
        query += " where " + " AND ".join(where_fields)

    rows = self.db.query_to_dict(query)
    rows.sort(
        key=itemgetter(
            "site_id",
            "subject_id",
            "sample_id_from_lab",
            "isolate_number_from_lab",
            "sequence_replicate_number",
        )
    )

    f = pyfastaq.utils.open_file_write(outfile)
    print(*columns, sep="\t", file=f)
    for row in rows:
        iso_dir_obj = isolate_dir.IsolateDir(
            self.pipeline_root, row["sample_id"], row["isolate_id"]
        )
        row["isolate_directory"] = iso_dir_obj.isolate_dir
        row["remove_contam_reads_1"] = iso_dir_obj.reads_filename(
            "remove_contam", row["sequence_replicate_number"], 1
        )
        row["remove_contam_reads_2"] = iso_dir_obj.reads_filename(
            "remove_contam", row["sequence_replicate_number"], 2
        )
        print(*[row[x] for x in columns], sep="\t", file=f)
    pyfastaq.utils.close(f)
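
# `write_seqrep_data_to_file` writes tab-separated rows with `print`. A
# minimal sketch of the same output via the standard `csv` module, which
# would also quote any field containing a tab; a hypothetical alternative,
# and it assumes a plain-text outfile (the original's
# `pyfastaq.utils.open_file_write` presumably also handles gzipped paths):
import csv

def write_tsv(outfile, columns, rows):
    """Write a header line, then one tab-separated line per row dict."""
    with open(outfile, "w", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(columns)
        for row in rows:
            writer.writerow([row[x] for x in columns])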
def test_nextflow_mykrobe_predict(self):
    '''test nextflow_mykrobe using database'''
    tmp_data_dir = 'tmp.nextflow_mykrobe_db_input.data'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
        + '; CREATE DATABASE ' + db_config_data['db'] + '"'
    )
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file + ' '
        + db_config_data['db'] + ' < ' + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'mykrobe_predict.nf')
    work_dir = 'tmp.nextflow_mykrobe_db_input.work'
    dag_file = 'nextflow.mykrobe.dag.db.pdf'
    try:
        os.unlink(dag_file)
    except FileNotFoundError:
        pass
    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair is from group 2 and should get ignored
        '--ref_id 2',
        '--references_root', os.path.abspath(references_root),
        '--pipeline_root', pipeline_root,
        '--db_config_file', db_ini_file,
        '--testing',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected.
    # The --testing option is set up so that the pooled sample fails,
    # hence it gets a status of -1.
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': -1, 'reference_id': 2},
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': 1, 'reference_id': 2},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': 1, 'reference_id': 2},
        {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1', 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': 1, 'reference_id': 2},
        {'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
    ]
    expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    self.assertEqual(expected_rows, got_rows)

    # check mykrobe output files etc got written. No need to check
    # contents, trust the tools. We're just checking nextflow runs OK here.
    ids = [
        {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2',
         'sample_name': 'site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2'},
        {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1',
         'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1'},
        {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2',
         'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2'},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict['sample'], id_dict['isolate_id']
        )
        pipeline_dir = iso_dir.pipeline_dir(
            id_dict['seq_repl'], 'mykrobe_predict', clockwork_version, reference_id=2
        )
        self.assertTrue(os.path.exists(pipeline_dir))
        log = os.path.join(pipeline_dir, 'log.txt')
        json_file = os.path.join(pipeline_dir, 'out.json')
        # the pooled replicate ('1_2') is the one that fails, so it should
        # have no output files
        if id_dict['sample_name'].endswith('1_2'):
            self.assertFalse(os.path.exists(log))
            self.assertFalse(os.path.exists(json_file))
        else:
            self.assertTrue(os.path.exists(log))
            self.assertTrue(os.path.exists(json_file))

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
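
# The mykrobe checks only assert that out.json exists. A minimal sketch of
# a slightly stronger check that the file is at least valid JSON; this is a
# hypothetical extension, since the suite deliberately trusts the tools and
# skips content checks:
import json

def check_json_parses(json_file):
    """Return the parsed contents of json_file; raises if it is not valid JSON."""
    with open(json_file) as f:
        return json.load(f)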
def test_nextflow_remove_contam_using_database(self):
    '''test nextflow_remove_contam using database'''
    tmp_data_dir = 'tmp.nextflow_remove_contam'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
        + '; CREATE DATABASE ' + db_config_data['db'] + '"'
    )
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file + ' '
        + db_config_data['db'] + ' < ' + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'remove_contam.nf')
    work_dir = 'tmp.nextflow_remove_contam.work'
    dag_file = 'nextflow.remove_contam.dag.db.pdf'
    try:
        os.unlink(dag_file)
    except FileNotFoundError:
        pass
    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair has group g2, so should get ignored
        '--ref_id 1',
        '--references_root', os.path.abspath(references_root),
        '--pipeline_root', os.path.abspath(pipeline_root),
        '--db_config_file', db_ini_file,
        '--testing',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('seqrep_id'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 2, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'remove_contam', 'status': -1, 'reference_id': 1},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check database Read_counts table updated
    got_rows = database.get_rows_from_table('Read_counts')
    got_rows.sort(key=itemgetter('seqrep_id'))
    expected_rows = [
        {'seqrep_id': 1, 'original_total': 198, 'contamination': 40, 'not_contamination': 132, 'unmapped': 26, 'total_after_remove_contam': 158},
        {'seqrep_id': 2, 'original_total': 156, 'contamination': 12, 'not_contamination': 132, 'unmapped': 12, 'total_after_remove_contam': 144},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check FASTQ files got written. No need to check contents, as that is
    # done elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {'sample': 1, 'isolate_id': 1, 'seq_repl': 43},
        {'sample': 2, 'isolate_id': 2, 'seq_repl': 45},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict['sample'], id_dict['isolate_id']
        )
        for read_type in ('original', 'remove_contam', 'contam'):
            for i in (1, 2):
                self.assertTrue(
                    os.path.exists(
                        iso_dir.reads_filename(read_type, id_dict['seq_repl'], i)
                    )
                )

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
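
# The expected Read_counts rows above are internally consistent:
# original_total = contamination + not_contamination + unmapped
# (198 = 40 + 132 + 26, 156 = 12 + 132 + 12), and
# total_after_remove_contam = not_contamination + unmapped
# (158 = 132 + 26, 144 = 132 + 12). A minimal sketch of an invariant check
# over such rows; the helper name is hypothetical, not part of the suite:
def check_read_count_invariants(rows):
    """Sanity-check the arithmetic relations between read-count columns."""
    for row in rows:
        assert row['original_total'] == (
            row['contamination'] + row['not_contamination'] + row['unmapped']
        )
        assert row['total_after_remove_contam'] == (
            row['not_contamination'] + row['unmapped']
        )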