def __init__(self, ini_file, dataset_name, pipeline_root, taxon_id, fq_upload_threads=1, use_test_server=False, unit_test=None):
    self.ini_file = os.path.abspath(ini_file)
    self.db = db.Db(self.ini_file)
    self.dataset_name = dataset_name
    self.pipeline_root = os.path.abspath(pipeline_root)
    self.taxon_id = taxon_id
    self.fq_upload_threads = fq_upload_threads
    self.use_test_server = use_test_server
    self.unit_test = unit_test
    self.project_xml_dir = DatasetSubmitter.dataset_xml_dir(self.pipeline_root)
    self.project_xml = DatasetSubmitter.dataset_xml_file(self.pipeline_root, self.dataset_name)
    self.centre_number_to_name = DatasetSubmitter._get_centres_from_ini_file(self.ini_file)
    self.broker_name = DatasetSubmitter._get_broker_name_from_ini_file(self.ini_file)
    self.study_prefix = DatasetSubmitter._get_key_from_ini_file(self.ini_file, 'ena_login', 'study_prefix')
    if self.study_prefix is None:
        raise Error('Error! Must provide study_prefix in [ena_login] section of ini file ' + self.ini_file)
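# The constructor above expects an ini file with (at least) an [ena_login]
# section containing a study_prefix key; centre numbers/names and the broker
# name are also read from the same file. A minimal sketch of such a file is
# shown below. The section names other than [ena_login] and all values are
# assumptions for illustration only, not the project's documented format.
#
#   [ena_login]
#   study_prefix = MYSTUDY
#
#   [sequencing_centres]
#   01 = Centre 1
#   02 = Centre 2
#
#   [broker]
#   name = Broker name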
def _import_reads_and_update_db(self):
    database = db.Db(self.db_ini_file)
    data = spreadsheet_helper.load_data_from_spreadsheet(self.xlsx_file)
    xlsx_dir = os.path.dirname(self.xlsx_file)
    data_errors = SpreadsheetImporter._validate_data(database, data, self.dropbox_dir)
    if len(data_errors) > 0:
        raise Exception("Error(s) importing spreadsheet:\n" + "\n".join(data_errors))

    try:
        f_out = open(self.jobs_outfile, "w")
    except:
        raise Exception('Error opening file "' + self.jobs_outfile + '". Cannot continue')

    print(
        "seqrep_id", "sample_id", "isolate_id", "sequence_replicate_number",
        "reads1", "reads2", "reads1_md5", "reads2_md5",
        sep="\t", file=f_out,
    )

    for data_dict in data:
        reads1 = os.path.join(xlsx_dir, data_dict["reads_file_1"])
        reads2 = os.path.join(xlsx_dir, data_dict["reads_file_2"])
        assert os.path.exists(reads1) and os.path.exists(reads2)
        seqrep_id, isolate_id, sample_id = database.add_one_seqrep(data_dict)
        print(
            seqrep_id, sample_id, isolate_id, data_dict["sequence_replicate_number"],
            reads1, reads2, data_dict["reads_file_1_md5"], data_dict["reads_file_2_md5"],
            sep="\t", file=f_out,
        )

    f_out.close()
    xlsx_backup_file = SpreadsheetImporter._archive_spreadsheet(self.xlsx_file, self.xlsx_archive_dir)
    jobs_backup_file = xlsx_backup_file + ".import_jobs.tsv"
    assert not os.path.exists(jobs_backup_file)
    utils.rsync_and_md5(self.jobs_outfile, jobs_backup_file)
    database.commit_and_close()
    if self.db_backup_dir is not None:
        database.backup(self.db_backup_dir)
def setUp(self):
    try:
        db_connection.DbConnection(db_ini_file, destroy=True)
    except:
        pass
    dbm = db_maker.DbMaker(db_ini_file)
    dbm.run()
    self.db = db.Db(db_ini_file)
def run(options):
    database = db.Db(options.db_config_file)
    database.add_mykrobe_custom_panel(
        options.species,
        options.panel_name,
        options.reference_root,
        probes_fasta=options.probes_fasta,
        var_to_res_json=options.var_to_res_json,
    )
def run(options):
    database = db.Db(options.db_config_file)
    lines = database.get_vcfs_and_reads_files_for_minos_multi_sample_calling(
        options.dataset_name,
        options.pipeline_root,
        options.reference_id,
        pipeline_version=options.pipeline_version,
    )
    print(*lines, sep='\n')
def run(options):
    database = db.Db(options.db_config_file)
    database.update_finished_pipeline_run_failed_jobs(
        options.jobs_tsv,
        options.success_jobs_file,
        options.pipeline_name,
        reference_id=options.reference_id,
        pipeline_version=options.pipeline_version,
    )
    database.commit_and_close()
def run(options): lock = lock_file.LockFile( os.path.join(options.pipeline_root, "remove_contam.lock")) database = db.Db(options.db_config_file) database.make_remove_contam_jobs_tsv( options.outfile, options.pipeline_root, options.reference_id, options.reference_root, dataset_name=options.dataset_name, ) database.commit_and_close() lock.stop()
def run(options): lock = lock_file.LockFile( os.path.join(options.pipeline_root, "generic_pipeline.lock")) database = db.Db(options.db_config_file) database.make_generic_pipeline_jobs_tsv( options.outfile, options.pipeline_root, options.pipeline_name, pipeline_version=options.pipeline_version, dataset_name=options.dataset_name, ) database.commit_and_close() lock.stop()
def run(options):
    lock = lock_file.LockFile(os.path.join(options.pipeline_root, 'qc.lock'))
    database = db.Db(options.db_config_file)
    database.make_qc_jobs_tsv(
        options.outfile,
        options.pipeline_root,
        options.reference_id,
        options.reference_root,
        pipeline_version=options.pipeline_version,
        dataset_name=options.dataset_name,
    )
    database.commit_and_close()
    lock.stop()
def run(options):
    lock = lock_file.LockFile(os.path.join(options.pipeline_root, 'remove_contam.lock'))
    database = db.Db(options.db_config_file)
    database.make_remove_contam_jobs_tsv(
        options.outfile,
        options.pipeline_root,
        0,
        '/fake/path/to/refs/',
        dataset_name=options.dataset_name,
        faking_it=True,
    )
    database.commit_and_close()
    lock.stop()
def run(options):
    lock = lock_file.LockFile(os.path.join(options.pipeline_root, 'mykrobe_predict.lock'))
    database = db.Db(options.db_config_file)
    database.make_variant_call_or_mykrobe_jobs_tsv(
        'mykrobe_predict',
        options.outfile,
        options.pipeline_root,
        options.reference_id,
        options.reference_root,
        pipeline_version=options.pipeline_version,
        dataset_name=options.dataset_name,
    )
    database.commit_and_close()
    lock.stop()
def __init__(self, db_ini_file, pipeline_root, include_withdrawn=False, include_internal_ids=False, dataset_name=None):
    self.db = db.Db(db_ini_file)
    self.pipeline_root = os.path.abspath(pipeline_root)
    if not os.path.exists(self.pipeline_root):
        raise Error('Pipeline root directory "' + self.pipeline_root + '" not found. Cannot continue')
    self.dataset_name = dataset_name
    self.include_withdrawn = include_withdrawn
    self.include_internal_ids = include_internal_ids
def run(options):
    using_db = None not in (
        options.db_config_file,
        options.pipeline_references_root,
        options.name,
    )

    if using_db and options.outdir:
        print(
            "Error! If adding to database, must use --db_config_file,--pipeline_references_root,--name.",
            file=sys.stderr,
        )
        print("Otherwise, use --outdir.", file=sys.stderr)
        sys.exit(1)

    if using_db:
        lock = lock_file.LockFile(os.path.join(options.pipeline_references_root, "add_reference.lock"))
        database = db.Db(options.db_config_file)
        ref_id = database.add_reference(options.name)
        database.commit_and_close()
        lock.stop()
    else:
        ref_id = None

    ref_dir = reference_dir.ReferenceDir(
        pipeline_references_root_dir=options.pipeline_references_root,
        reference_id=ref_id,
        directory=options.outdir,
    )
    genome_is_big = options.contam_tsv is not None
    using_cortex = options.contam_tsv is None
    ref_dir.make_index_files(
        options.fasta_file,
        genome_is_big,
        using_cortex,
        cortex_mem_height=options.cortex_mem_height,
    )
    if options.contam_tsv is not None:
        ref_dir.add_remove_contam_metadata_tsv(options.contam_tsv)
def run(options):
    if options.pool == 1:
        options.seqrep_id = None
    else:
        options.seqrep_pool = None
        options.seqrep_id = int(options.seqrep_id)

    database = db.Db(options.db_config_file)
    database.update_finished_pipeline_run(
        options.isolate_id,
        options.seqrep_id,
        options.seqrep_pool,
        options.pipeline_name,
        options.new_pipeline_status,
        reference_id=options.reference_id,
        pipeline_version=options.pipeline_version,
        pipeline_root=options.pipeline_root,
    )
    database.commit_and_close()
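# Hypothetical wiring for the run(options) helpers above: the attribute names
# on the parsed options object mirror what update_finished_pipeline_run() reads.
# This is a sketch only; the real project defines its own CLI, and the argument
# names and defaults here are assumptions.
import argparse

def make_parser():
    parser = argparse.ArgumentParser(description="Mark a finished pipeline run in the database")
    parser.add_argument("db_config_file")
    parser.add_argument("pipeline_root")
    parser.add_argument("pipeline_name")
    parser.add_argument("isolate_id", type=int)
    parser.add_argument("seqrep_id")
    parser.add_argument("seqrep_pool")
    parser.add_argument("new_pipeline_status", type=int)
    parser.add_argument("--pool", type=int, default=0, help="1 if sequence replicates were pooled")
    parser.add_argument("--reference_id", type=int, default=None)
    parser.add_argument("--pipeline_version", default=None)
    return parser

# Example invocation (values are made up):
# options = make_parser().parse_args(["db.ini", "/pipeline/root", "qc", "1", "2", "", "1", "--reference_id", "1"])
# run(options)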
def __init__(
    self,
    db_ini_file,
    pipeline_root_dir,
    seqrep_id,
    isolate_id,
    sample_id,
    sequence_replicate_number,
    reads_file_1,
    reads_file_2,
    reads_file_md5_1,
    reads_file_md5_2,
):
    self.db = db.Db(db_ini_file)
    self.pipeline_root_dir = os.path.abspath(pipeline_root_dir)
    self.seqrep_id = seqrep_id
    self.isolate_id = isolate_id
    self.sample_id = sample_id
    self.sequence_replicate_number = sequence_replicate_number
    self.reads_file_1 = os.path.abspath(reads_file_1)
    self.reads_file_2 = os.path.abspath(reads_file_2)
    self.reads_file_md5_1 = reads_file_md5_1
    self.reads_file_md5_2 = reads_file_md5_2
def test_nextflow_mykrobe_predict(self):
    """test nextflow_mykrobe using database"""
    tmp_data_dir = "tmp.nextflow_mykrobe_db_input.data"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "mykrobe_predict.nf")
    work_dir = "tmp.nextflow_mykrobe_db_input.work"
    dag_file = "nextflow.mykrobe.dag.db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass

    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one read pair is from group 2 and should get ignored
        "--ref_id 2",
        "--references_root", os.path.abspath(references_root),
        "--pipeline_root", pipeline_root,
        "--db_config_file", db_ini_file,
        "--testing",
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected.
    # The --testing option is set up so that the pooled
    # sample fails, hence it gets a status of -1.
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    expected_rows = [
        {"isolate_id": 1, "seqrep_id": None, "seqrep_pool": "1_2", "version": clockwork_version, "pipeline_name": "mykrobe_predict", "status": -1, "reference_id": 2},
        {"isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": "0.4.0", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 1, "seqrep_id": 2, "seqrep_pool": None, "version": "0.4.0", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "mykrobe_predict", "status": 1, "reference_id": 2},
        {"isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": "0.4.0", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": "0.4.0", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "mykrobe_predict", "status": 1, "reference_id": 2},
        {"isolate_id": 3, "seqrep_id": None, "seqrep_pool": "1", "version": clockwork_version, "pipeline_name": "mykrobe_predict", "status": 1, "reference_id": 2},
        {"isolate_id": 3, "seqrep_id": 5, "seqrep_pool": None, "version": "0.4.0", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 4, "seqrep_id": 6, "seqrep_pool": None, "version": "0.4.0", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
    ]
    expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    self.assertEqual(expected_rows, got_rows)

    # check mykrobe output files etc got written. No need to check contents, trust the tools.
    # We're just checking nextflow runs OK here.
    ids = [
        {"sample": 1, "seqrep_id": "1_2", "isolate_id": 1, "seq_repl": "1_2", "sample_name": "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2"},
        {"sample": 2, "seqrep_id": 3, "isolate_id": 2, "seq_repl": "1", "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1"},
        {"sample": 2, "seqrep_id": 4, "isolate_id": 2, "seq_repl": "2", "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2"},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"], id_dict["isolate_id"])
        pipeline_dir = iso_dir.pipeline_dir(
            id_dict["seq_repl"], "mykrobe_predict", clockwork_version, reference_id=2
        )
        self.assertTrue(os.path.exists(pipeline_dir))
        log = os.path.join(pipeline_dir, "log.txt")
        json_file = os.path.join(pipeline_dir, "out.json")
        if id_dict["sample_name"].endswith("1_2"):
            self.assertFalse(os.path.exists(log))
            self.assertFalse(os.path.exists(json_file))
        else:
            self.assertTrue(os.path.exists(log))
            self.assertTrue(os.path.exists(json_file))

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def setUp(self):
    self.pipeline_root = os.path.abspath('piperoot')
    os.mkdir(self.pipeline_root)
    try:
        db_connection.DbConnection(ini_file, destroy=True)
    except:
        pass
    dbm = db_maker.DbMaker(ini_file)
    dbm.run()
    self.db = db.Db(ini_file)

    sample_dicts = [
        {
            'subject_id': 'subject_1', 'site_id': '01', 'lab_id': 'lab_id_1',
            'isolate_number': '1', 'sequence_replicate_number': 1,
            'submission_date': datetime.date(2018, 4, 4),
            'reads_file_1': 'reads_1_1.fq', 'reads_file_1_md5': 'md5_1_1',
            'reads_file_2': 'reads_1_2.fq', 'reads_file_2_md5': 'md5_1_2',
            'dataset_name': 'set1', 'submit_to_ena': '0',
            'instrument_model': 'Illumina HiSeq 2500', 'ena_center_name': 'Centre 1',
            'ena_on_hold': '0', 'ena_run_accession': 'ERR123456', 'ena_sample_accession': 'ERS123456',
        },
        {
            'subject_id': 'subject_2', 'site_id': '01', 'lab_id': 'lab_id_2',
            'isolate_number': '1', 'sequence_replicate_number': 1,
            'submission_date': datetime.date(2018, 4, 4),
            'reads_file_1': 'reads_2_1.fq', 'reads_file_1_md5': 'md5_2_1',
            'reads_file_2': 'reads_2_2.fq', 'reads_file_2_md5': 'md5_2_2',
            'dataset_name': 'set1', 'submit_to_ena': '0',
            'instrument_model': 'Illumina HiSeq 2500', 'ena_center_name': 'Centre 1',
            'ena_on_hold': '0', 'ena_run_accession': 'ERR123457', 'ena_sample_accession': 'ERS123457',
        },
        {
            'subject_id': 'subject_3', 'site_id': '02', 'lab_id': 'lab_id_3',
            'isolate_number': '1', 'sequence_replicate_number': 1,
            'submission_date': datetime.date(2018, 4, 4),
            'reads_file_1': 'reads_3_1.fq', 'reads_file_1_md5': 'md5_3_1',
            'reads_file_2': 'reads_3_2.fq', 'reads_file_2_md5': 'md5_3_2',
            'dataset_name': 'set2', 'submit_to_ena': '0',
            'instrument_model': 'Illumina HiSeq 2500', 'ena_center_name': 'Centre 2',
            'ena_on_hold': '0', 'ena_run_accession': None, 'ena_sample_accession': None,
        },
        {
            'subject_id': 'subject_3', 'site_id': '02', 'lab_id': 'lab_id_3',
            'isolate_number': '1', 'sequence_replicate_number': 2,
            'submission_date': datetime.date(2018, 4, 4),
            'reads_file_1': 'reads_4_1.fq', 'reads_file_1_md5': 'md5_4_1',
            'reads_file_2': 'reads_4_2.fq', 'reads_file_2_md5': 'md5_4_2',
            'dataset_name': 'set2', 'submit_to_ena': '0',
            'instrument_model': 'Illumina HiSeq 2500', 'ena_center_name': 'Centre 2',
            'ena_on_hold': '0', 'ena_run_accession': None, 'ena_sample_accession': None,
        },
    ]

    for d in sample_dicts:
        self.db.add_one_seqrep(d)
        where_dict = {'original_reads_file_1_md5': d['reads_file_1_md5']}
        update_dict = {
            'remove_contam_reads_file_1_md5': d['reads_file_1_md5'] + '.remove_contam',
            'remove_contam_reads_file_2_md5': d['reads_file_2_md5'] + '.remove_contam',
        }
        self.db.update_row('Seqrep', where_dict, update_dict)

    seqrep_to_isolate = {1: 1, 2: 2, 3: 3, 4: 3}
    for seqrep, isolate in seqrep_to_isolate.items():
        ref_id = 1 if seqrep in {1, 2} else 2
        version = '0.1.1' if seqrep in {1, 2} else '0.1.3'
        d = {'isolate_id': isolate, 'seqrep_id': seqrep, 'seqrep_pool': None, 'version': version, 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': ref_id}
        self.db.add_row_to_table('Pipeline', d)
        d = {'isolate_id': isolate, 'seqrep_id': seqrep, 'seqrep_pool': None, 'version': version, 'pipeline_name': 'qc', 'status': 1, 'reference_id': ref_id + 2}
        self.db.add_row_to_table('Pipeline', d)

    var_call_rows = [
        {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1', 'version': '1.2.3', 'pipeline_name': 'variant_call', 'status': 1, 'reference_id': 10},
        {'isolate_id': 2, 'seqrep_id': None, 'seqrep_pool': '2', 'version': '1.2.3', 'pipeline_name': 'variant_call', 'status': 1, 'reference_id': 10},
        {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': '1.2.3', 'pipeline_name': 'variant_call', 'status': 1, 'reference_id': 10},
    ]
    for d in var_call_rows:
        self.db.add_row_to_table('Pipeline', d)
        d['pipeline_name'] = 'mykrobe_predict'
        self.db.add_row_to_table('Pipeline', d)

    self.db.commit()
def test_nextflow_remove_contam_using_database(self):
    '''test nextflow_remove_contam using database'''
    tmp_data_dir = 'tmp.nextflow_remove_contam'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
        + '; CREATE DATABASE ' + db_config_data['db'] + '"'
    )
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file + ' '
        + db_config_data['db'] + ' < ' + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'remove_contam.nf')
    work_dir = 'tmp.nextflow_remove_contam.work'
    dag_file = 'nextflow.remove_contam.dag.db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass

    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair has group g2, so should get ignored
        '--ref_id 1',
        '--references_root', os.path.abspath(references_root),
        '--pipeline_root', os.path.abspath(pipeline_root),
        '--db_config_file', db_ini_file,
        '--testing',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('seqrep_id'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 2, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'remove_contam', 'status': -1, 'reference_id': 1},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check database Read_counts table updated
    got_rows = database.get_rows_from_table('Read_counts')
    got_rows.sort(key=itemgetter('seqrep_id'))
    expected_rows = [
        {'seqrep_id': 1, 'original_total': 198, 'contamination': 40, 'not_contamination': 132, 'unmapped': 26, 'total_after_remove_contam': 158},
        {'seqrep_id': 2, 'original_total': 156, 'contamination': 12, 'not_contamination': 132, 'unmapped': 12, 'total_after_remove_contam': 144},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check FASTQ files got written. No need to check contents, as that is done
    # elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {'sample': 1, 'isolate_id': 1, 'seq_repl': 43},
        {'sample': 2, 'isolate_id': 2, 'seq_repl': 45},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id'])
        for read_type in ('original', 'remove_contam', 'contam'):
            for i in (1, 2):
                self.assertTrue(os.path.exists(iso_dir.reads_filename(read_type, id_dict['seq_repl'], i)))

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_qc_using_database(self): """test nextflow_qc using database""" tmp_data_dir = "tmp.nextflow_qc" if os.path.exists(tmp_data_dir): shutil.rmtree(tmp_data_dir) shutil.copytree(data_dir, tmp_data_dir) nextflow_helper.write_config_file() mysql_config_file = os.path.join(data_dir, "db.cnf") mysql_dump = os.path.join(data_dir, "mysql.dump") db_config_data = db_connection.DbConnection._parse_config_file( db_ini_file) utils.syscall("mysql --defaults-file=" + mysql_config_file + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] + "; CREATE DATABASE " + db_config_data["db"] + '"') utils.syscall("mysql --defaults-file=" + mysql_config_file + " " + db_config_data["db"] + " < " + mysql_dump) pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root") references_root = os.path.join(tmp_data_dir, "Pipeline_refs") nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "qc.nf") work_dir = "tmp.nextflow_qc.work" dag_file = "nextflow.qc.dag.db.pdf" try: os.unlink(dag_file) except: pass command = " ".join([ "nextflow run", "--dataset_name g1", # one of the samples is in group2 and should get ignored "--ref_id 1", "--references_root", os.path.abspath(references_root), "--pipeline_root", pipeline_root, "--db_config_file", db_ini_file, "-with-dag", dag_file, "-c", nextflow_helper.config_file, "-w", work_dir, nextflow_file, ]) utils.syscall(command) os.unlink(nextflow_helper.config_file) shutil.rmtree(work_dir) # check database Pipeline table updated as expected database = db.Db(db_ini_file) got_pipeline_rows = database.get_rows_from_table("Pipeline") got_pipeline_rows.sort(key=itemgetter("seqrep_id")) expected_pipeline_rows = [ { "isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": "0.0.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "qc", "status": 1, "reference_id": 1, }, { "isolate_id": 2, "seqrep_id": 2, "seqrep_pool": None, "version": "0.0.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 2, "seqrep_id": 2, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "qc", "status": 1, "reference_id": 1, }, { "isolate_id": 3, "seqrep_id": 3, "seqrep_pool": None, "version": "0.0.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 3, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "qc", "status": -1, "reference_id": 1, }, { "isolate_id": 4, "seqrep_id": 4, "seqrep_pool": None, "version": "0.0.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, ] self.assertEqual(expected_pipeline_rows, got_pipeline_rows) # check QC stats added to database got_qc_rows = database.get_rows_from_table("QC") got_qc_rows.sort(key=itemgetter("seqrep_id")) expected_qc_rows = [ { "seqrep_id": 1, "pipeline_version": clockwork_version, "fastqc1_adapter_content": "pass", "fastqc1_basic_statistics": "pass", "fastqc1_gc": 48.0, "fastqc1_kmer_content": "fail", "fastqc1_max_sequence_length": 75, "fastqc1_min_sequence_length": 75, "fastqc1_overrepresented_sequences": "fail", "fastqc1_per_base_n_content": "pass", "fastqc1_per_base_sequence_content": "fail", "fastqc1_per_base_sequence_quality": "pass", "fastqc1_per_sequence_gc_content": "fail", "fastqc1_per_sequence_quality_scores": "fail", "fastqc1_sequence_duplication_levels": "pass", "fastqc1_sequence_length_distribution": "pass", "fastqc1_sequences_flagged_as_poor_quality": 0, 
"fastqc1_total_sequences": 72, "fastqc2_adapter_content": "pass", "fastqc2_basic_statistics": "pass", "fastqc2_gc": 48.0, "fastqc2_kmer_content": "fail", "fastqc2_max_sequence_length": 75, "fastqc2_min_sequence_length": 75, "fastqc2_overrepresented_sequences": "fail", "fastqc2_per_base_n_content": "pass", "fastqc2_per_base_sequence_content": "fail", "fastqc2_per_base_sequence_quality": "pass", "fastqc2_per_sequence_gc_content": "fail", "fastqc2_per_sequence_quality_scores": "fail", "fastqc2_sequence_duplication_levels": "pass", "fastqc2_sequence_length_distribution": "pass", "fastqc2_sequences_flagged_as_poor_quality": 0, "fastqc2_total_sequences": 72, "samtools_average_quality": 40.0, "samtools_bases_mapped_cigar": 9900, "samtools_bases_trimmed": 0, "samtools_error_rate": 0.0, "samtools_insert_size_average": 199.6, "samtools_insert_size_standard_deviation": 1.0, "samtools_inward_oriented_pairs": 66, "samtools_outward_oriented_pairs": 0, "samtools_pairs_with_other_orientation": 0, "samtools_raw_total_sequences": 144, "samtools_reads_duplicated": 4, "samtools_reads_mapped": 132, "het_snp_het_calls": 0, "het_snp_positions": 983, "het_snp_total_snps": 0, }, { "seqrep_id": 2, "pipeline_version": clockwork_version, "fastqc1_adapter_content": "pass", "fastqc1_basic_statistics": "pass", "fastqc1_gc": 48.0, "fastqc1_kmer_content": "fail", "fastqc1_max_sequence_length": 75, "fastqc1_min_sequence_length": 75, "fastqc1_overrepresented_sequences": "fail", "fastqc1_per_base_n_content": "pass", "fastqc1_per_base_sequence_content": "fail", "fastqc1_per_base_sequence_quality": "pass", "fastqc1_per_sequence_gc_content": "fail", "fastqc1_per_sequence_quality_scores": "fail", "fastqc1_sequence_duplication_levels": "pass", "fastqc1_sequence_length_distribution": "pass", "fastqc1_sequences_flagged_as_poor_quality": 0, "fastqc1_total_sequences": 72, "fastqc2_adapter_content": "pass", "fastqc2_basic_statistics": "pass", "fastqc2_gc": 49.0, "fastqc2_kmer_content": "fail", "fastqc2_max_sequence_length": 75, "fastqc2_min_sequence_length": 75, "fastqc2_overrepresented_sequences": "fail", "fastqc2_per_base_n_content": "pass", "fastqc2_per_base_sequence_content": "fail", "fastqc2_per_base_sequence_quality": "pass", "fastqc2_per_sequence_gc_content": "warn", "fastqc2_per_sequence_quality_scores": "fail", "fastqc2_sequence_duplication_levels": "pass", "fastqc2_sequence_length_distribution": "pass", "fastqc2_sequences_flagged_as_poor_quality": 0, "fastqc2_total_sequences": 72, "samtools_average_quality": 40.0, "samtools_bases_mapped_cigar": 9900, "samtools_bases_trimmed": 0, "samtools_error_rate": 0.0, "samtools_insert_size_average": 199.7, "samtools_insert_size_standard_deviation": 1.1, "samtools_inward_oriented_pairs": 66, "samtools_outward_oriented_pairs": 0, "samtools_pairs_with_other_orientation": 0, "samtools_raw_total_sequences": 144, "samtools_reads_duplicated": 0, "samtools_reads_mapped": 132, "het_snp_het_calls": 0, "het_snp_positions": 983, "het_snp_total_snps": 0, }, ] self.assertEqual(expected_qc_rows, got_qc_rows) # check QC files got written. No need to check contents, as that is done # elsewhere. We're just checking nextflow runs OK here. 
ids = [ { "sample": 1, "isolate_id": 1, "seq_repl": 43 }, { "sample": 2, "isolate_id": 2, "seq_repl": 45 }, ] for id_dict in ids: iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"], id_dict["isolate_id"]) qc_root_dir = iso_dir.pipeline_dir(id_dict["seq_repl"], "qc", clockwork_version) self.assertTrue(os.path.exists(qc_root_dir)) for method in ["fastqc", "samtools_qc"]: this_qc_dir = os.path.join(qc_root_dir, method) self.assertTrue(os.path.exists(this_qc_dir)) self.assertTrue(len(os.listdir(this_qc_dir)) >= 1) shutil.rmtree(tmp_data_dir) nextflow_helper.clean_files()
def test_nextflow_generic_pipeline(self):
    """test nextflow generic pipeline using database"""
    tmp_data_dir = "tmp.nextflow_generic_pipeline_db_input.data"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "generic_pipeline.nf")
    work_dir = "tmp.nextflow_generic_pipeline.work"
    dag_file = "nextflow.generic_pipeline.dag.pdf"
    pipeline_name = "generic_pipeline"
    script = os.path.join(data_dir, "script.pl")
    try:
        os.unlink(dag_file)
    except:
        pass

    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one read pair is from group 2 and should get ignored
        "--pipeline_name", pipeline_name,
        "--pipeline_root", pipeline_root,
        "--script", script,
        "--db_config_file", db_ini_file,
        "--max_ram", "0.5",
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    expected_rows = [
        {"isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 1, "seqrep_id": 2, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 1, "seqrep_id": None, "seqrep_pool": "1_2", "version": clockwork_version, "pipeline_name": pipeline_name, "status": 1, "reference_id": None},
        {"isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": pipeline_name, "status": 1, "reference_id": None},
        {"isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": pipeline_name, "status": 1, "reference_id": None},
        {"isolate_id": 3, "seqrep_id": 5, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
        {"isolate_id": 3, "seqrep_id": None, "seqrep_pool": "1", "version": clockwork_version, "pipeline_name": pipeline_name, "status": -1, "reference_id": None},
        {"isolate_id": 4, "seqrep_id": 6, "seqrep_pool": None, "version": "0.1.2", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1},
    ]
    expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    self.assertEqual(expected_rows, got_rows)

    # check that the expected output file from the script.pl
    # got made (except for the sample that is expected to fail)
    ids = [
        {"sample": 1, "seqrep_id": "1_2", "isolate_id": 1, "seq_repl": "1_2"},
        {"sample": 2, "seqrep_id": 3, "isolate_id": 2, "seq_repl": "1"},
        {"sample": 2, "seqrep_id": 4, "isolate_id": 2, "seq_repl": "2"},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"], id_dict["isolate_id"])
        pipeline_dir = iso_dir.pipeline_dir(id_dict["seq_repl"], pipeline_name, clockwork_version)
        counts_file = os.path.join(pipeline_dir, "count.txt")
        self.assertTrue(os.path.exists(counts_file))

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_run(self):
    '''test run'''
    original_pipeline_root = os.path.join(data_dir, 'run', 'Pipeline_root')
    tmp_pipeline_root = 'tmp.dataset_submitter.pipeline_root'
    shutil.copytree(original_pipeline_root, tmp_pipeline_root)
    pipeline_test_dir = os.path.join(data_dir, 'run')
    mysql_dump = os.path.join(pipeline_test_dir, 'mysql.dump')
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    db_config_data = db_connection.DbConnection._parse_config_file(ini_file)
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
        + '; CREATE DATABASE ' + db_config_data['db'] + '"'
    )
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file + ' '
        + db_config_data['db'] + ' < ' + mysql_dump
    )

    gsub = dataset_submitter.DatasetSubmitter(ini_file, 'g1', tmp_pipeline_root, 42, unit_test='success')
    gsub.run()

    columns = (
        'Seqrep.seqrep_id, sequence_replicate_number, remove_contam_reads_file_1_md5, '
        'remove_contam_reads_file_2_md5, ena_center_name, ena_run_accession, Isolate.isolate_id, '
        'ena_experiment_accession, Sample.sample_id, site_id, instrument_model, '
        'ena_sample_accession, ena_study_accession'
    )
    join = (
        'Seqrep JOIN Isolate ON Seqrep.isolate_id = Isolate.isolate_id '
        'JOIN Sample ON Isolate.sample_id = Sample.sample_id'
    )
    where = 'submit_to_ena=1 AND import_status=1 AND dataset_name="g1"'
    query = ' '.join(['SELECT', columns, 'FROM', '(' + join + ')', 'WHERE', '(' + where + ')'])
    database = db.Db(ini_file)
    got_data = database.query_to_dict(query)

    for row in got_data:
        accessions = {row[x] for x in row if x.endswith('accession')}
        self.assertNotIn(None, accessions)

    run_accessions = {x['ena_run_accession'] for x in got_data}
    self.assertEqual(5, len(run_accessions))
    study_accessions = {x['ena_study_accession'] for x in got_data}
    self.assertEqual(1, len(study_accessions))

    # hash the rows by md5 of file 1, since we don't know the auto
    # generated IDs in the DB.
    data_by_md5_1 = {x['remove_contam_reads_file_1_md5']: x for x in got_data}
    md51 = '83d842db2d9ea84faa747cefa4b2f1b4'
    md52 = '67ff4c03bd637e027f372b4b5a833935'
    md53 = 'bfde82c3a5ec16ffefb32fdfcfd4cf53'
    md54 = 'be5c2e07716c119a2e86f6421df5f63b'
    md55 = '21544f51d9d620ca99bc445219b1018d'
    self.assertNotEqual(data_by_md5_1[md51]['ena_sample_accession'], data_by_md5_1[md52]['ena_sample_accession'])
    self.assertEqual(data_by_md5_1[md52]['ena_sample_accession'], data_by_md5_1[md53]['ena_sample_accession'])
    self.assertEqual(data_by_md5_1[md53]['ena_sample_accession'], data_by_md5_1[md54]['ena_sample_accession'])
    self.assertNotEqual(data_by_md5_1[md54]['ena_sample_accession'], data_by_md5_1[md55]['ena_sample_accession'])
    self.assertNotEqual(data_by_md5_1[md51]['ena_experiment_accession'], data_by_md5_1[md52]['ena_experiment_accession'])
    self.assertEqual(data_by_md5_1[md52]['ena_experiment_accession'], data_by_md5_1[md53]['ena_experiment_accession'])
    self.assertNotEqual(data_by_md5_1[md53]['ena_experiment_accession'], data_by_md5_1[md54]['ena_experiment_accession'])
    self.assertNotEqual(data_by_md5_1[md54]['ena_experiment_accession'], data_by_md5_1[md55]['ena_experiment_accession'])

    shutil.rmtree(tmp_pipeline_root)
def test_nextflow_variant_call_using_database(self):
    '''test nextflow_variant_call using database'''
    tmp_data_dir = 'tmp.nextflow_variant_call_db_input.data'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
        + '; CREATE DATABASE ' + db_config_data['db'] + '"'
    )
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file + ' '
        + db_config_data['db'] + ' < ' + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'variant_call.nf')
    work_dir = 'tmp.nextflow_variant_call_db_input.work'
    dag_file = 'nextflow.variant_call.dag.db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass

    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair is from group 2 and should get ignored
        '--ref_id 2',
        '--references_root', os.path.abspath(references_root),
        '--pipeline_root', pipeline_root,
        '--db_config_file', db_ini_file,
        '--cortex_mem_height 17',
        '--testing',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': clockwork_version, 'pipeline_name': 'variant_call', 'status': 1, 'reference_id': 2},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'variant_call', 'status': 1, 'reference_id': 2},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'variant_call', 'status': 1, 'reference_id': 2},
        {'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1', 'version': clockwork_version, 'pipeline_name': 'variant_call', 'status': -1, 'reference_id': 2},
        {'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check VCF files etc got written. No need to check contents, trust the tools.
    # We're just checking nextflow runs OK here.
    ids = [
        {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2'},
        {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1'},
        {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2'},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id'])
        pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], 'variant_call', clockwork_version, reference_id=2)
        expected_sample = '.'.join([str(id_dict[x]) for x in ['sample', 'isolate_id', 'seqrep_id', 'seq_repl']])
        self._files_are_present_and_correct(pipeline_dir, expected_sample)

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_variant_call_using_database(self): """test nextflow_variant_call using database""" tmp_data_dir = "tmp.nextflow_variant_call_db_input.data" if os.path.exists(tmp_data_dir): shutil.rmtree(tmp_data_dir) shutil.copytree(data_dir, tmp_data_dir) nextflow_helper.write_config_file() mysql_config_file = os.path.join(data_dir, "db.cnf") mysql_dump = os.path.join(data_dir, "mysql.dump") db_config_data = db_connection.DbConnection._parse_config_file( db_ini_file) utils.syscall("mysql --defaults-file=" + mysql_config_file + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] + "; CREATE DATABASE " + db_config_data["db"] + '"') utils.syscall("mysql --defaults-file=" + mysql_config_file + " " + db_config_data["db"] + " < " + mysql_dump) pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root") references_root = os.path.join(tmp_data_dir, "Pipeline_refs") nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "variant_call.nf") work_dir = "tmp.nextflow_variant_call_db_input.work" dag_file = "nextflow.variant_call.dag.db.pdf" try: os.unlink(dag_file) except: pass command = " ".join([ "nextflow run", "--dataset_name g1", # one read pair is from group 2 and should get ignored "--ref_id 2", "--references_root", os.path.abspath(references_root), "--pipeline_root", pipeline_root, "--db_config_file", db_ini_file, "--cortex_mem_height 17", "--testing", # Using truth ref is broken, and we nevr use it anyway, # so disable this for now #"--truth_ref", #os.path.join(tmp_data_dir, "truth_ref.fa"), "-with-dag", dag_file, "-c", nextflow_helper.config_file, "-w", work_dir, nextflow_file, ]) utils.syscall(command) os.unlink(nextflow_helper.config_file) shutil.rmtree(work_dir) # check database Pipeline table updated as expected database = db.Db(db_ini_file) got_rows = database.get_rows_from_table("Pipeline") got_rows.sort(key=itemgetter("isolate_id", "pipeline_name")) expected_rows = [ { "isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 1, "seqrep_id": 2, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 1, "seqrep_id": None, "seqrep_pool": "1_2", "version": clockwork_version, "pipeline_name": "variant_call", "status": 1, "reference_id": 2, }, { "isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 2, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "variant_call", "status": 1, "reference_id": 2, }, { "isolate_id": 2, "seqrep_id": 4, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "variant_call", "status": 1, "reference_id": 2, }, { "isolate_id": 3, "seqrep_id": 5, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, { "isolate_id": 3, "seqrep_id": None, "seqrep_pool": "1", "version": clockwork_version, "pipeline_name": "variant_call", "status": -1, "reference_id": 2, }, { "isolate_id": 4, "seqrep_id": 6, "seqrep_pool": None, "version": "0.3.1", "pipeline_name": "remove_contam", "status": 1, "reference_id": 1, }, ] self.assertEqual(expected_rows, got_rows) # check VCF files etc got written. 
No need to check contents, trust the tools # We're just checking nextflow runs OK here. ids = [ { "sample": 1, "seqrep_id": "1_2", "isolate_id": 1, "seq_repl": "1_2", "sample_name": "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2", }, { "sample": 2, "seqrep_id": 3, "isolate_id": 2, "seq_repl": "1", "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1", }, { "sample": 2, "seqrep_id": 4, "isolate_id": 2, "seq_repl": "2", "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2", }, ] for id_dict in ids: iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"], id_dict["isolate_id"]) pipeline_dir = iso_dir.pipeline_dir(id_dict["seq_repl"], "variant_call", clockwork_version, reference_id=2) self._files_are_present_and_correct(pipeline_dir, id_dict["sample_name"], expect_ref_check_files=False) shutil.rmtree(tmp_data_dir) nextflow_helper.clean_files()
def test_nextflow_fake_remove_contam(self):
    """test nextflow_fake_remove_contam"""
    tmp_data_dir = "tmp.nextflow_fake_remove_contam"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "fake_remove_contam.nf")
    work_dir = "tmp.nextflow_fake_remove_contam.work"
    dag_file = "nextflow.fake_remove_contam.dag.db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass

    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one read pair has group g2, so should get ignored
        "--pipeline_root", os.path.abspath(pipeline_root),
        "--db_config_file", db_ini_file,
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("seqrep_id"))
    expected_rows = [
        {"isolate_id": 1, "seqrep_id": 1, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "remove_contam", "status": 1, "reference_id": 0},
        {"isolate_id": 2, "seqrep_id": 2, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "remove_contam", "status": 1, "reference_id": 0},
        {"isolate_id": 3, "seqrep_id": 3, "seqrep_pool": None, "version": clockwork_version, "pipeline_name": "remove_contam", "status": -1, "reference_id": 0},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check database Read_counts table updated
    got_rows = database.get_rows_from_table("Read_counts")
    got_rows.sort(key=itemgetter("seqrep_id"))
    expected_rows = [
        {"seqrep_id": 1, "original_total": 12, "contamination": 0, "not_contamination": 12, "unmapped": 0, "total_after_remove_contam": 12},
        {"seqrep_id": 2, "original_total": 26, "contamination": 0, "not_contamination": 26, "unmapped": 0, "total_after_remove_contam": 26},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check FASTQ files got written. No need to check contents, as that is done
    # elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {"sample": 1, "isolate_id": 1, "seq_repl": 1},
        {"sample": 2, "isolate_id": 2, "seq_repl": 1},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"], id_dict["isolate_id"])
        for read_type in ("original", "remove_contam"):
            for i in (1, 2):
                self.assertTrue(os.path.exists(iso_dir.reads_filename(read_type, id_dict["seq_repl"], i)))

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_generic_pipeline(self):
    '''test nextflow generic pipeline using database'''
    tmp_data_dir = 'tmp.nextflow_generic_pipeline_db_input.data'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
        + '; CREATE DATABASE ' + db_config_data['db'] + '"'
    )
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file + ' '
        + db_config_data['db'] + ' < ' + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'generic_pipeline.nf')
    work_dir = 'tmp.nextflow_generic_pipeline.work'
    dag_file = 'nextflow.generic_pipeline.dag.pdf'
    pipeline_name = 'generic_pipeline'
    script = os.path.join(data_dir, 'script.pl')
    try:
        os.unlink(dag_file)
    except:
        pass

    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair is from group 2 and should get ignored
        '--pipeline_name', pipeline_name,
        '--pipeline_root', pipeline_root,
        '--script', script,
        '--db_config_file', db_ini_file,
        '--max_ram', '0.5',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
        {'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1', 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': -1, 'reference_id': None},
        {'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
    ]
    expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    self.assertEqual(expected_rows, got_rows)

    # check that the expected output file from the script.pl
    # got made (except for the sample that is expected to fail)
    ids = [
        {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2'},
        {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1'},
        {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2'},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id'])
        pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], pipeline_name, clockwork_version)
        counts_file = os.path.join(pipeline_dir, 'count.txt')
        self.assertTrue(os.path.exists(counts_file))

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_qc_using_database(self):
    '''test nextflow_qc using database'''
    tmp_data_dir = 'tmp.nextflow_qc'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
        + '; CREATE DATABASE ' + db_config_data['db'] + '"'
    )
    utils.syscall(
        'mysql --defaults-file=' + mysql_config_file + ' '
        + db_config_data['db'] + ' < ' + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'qc.nf')
    work_dir = 'tmp.nextflow_qc.work'
    dag_file = 'nextflow.qc.dag.db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass

    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one of the samples is in group2 and should get ignored
        '--ref_id 1',
        '--references_root', os.path.abspath(references_root),
        '--pipeline_root', pipeline_root,
        '--db_config_file', db_ini_file,
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_pipeline_rows = database.get_rows_from_table('Pipeline')
    got_pipeline_rows.sort(key=itemgetter('seqrep_id'))
    expected_pipeline_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'qc', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 2, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'qc', 'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'qc', 'status': -1, 'reference_id': 1},
        {'isolate_id': 4, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.0.1', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
    ]
    self.assertEqual(expected_pipeline_rows, got_pipeline_rows)

    # check QC stats added to database
    got_qc_rows = database.get_rows_from_table('QC')
    got_qc_rows.sort(key=itemgetter('seqrep_id'))
    expected_qc_rows = [
        {
            'seqrep_id': 1, 'pipeline_version': clockwork_version,
            'fastqc1_adapter_content': 'pass', 'fastqc1_basic_statistics': 'pass', 'fastqc1_gc': 48.0,
            'fastqc1_kmer_content': 'fail', 'fastqc1_max_sequence_length': 75, 'fastqc1_min_sequence_length': 75,
            'fastqc1_overrepresented_sequences': 'fail', 'fastqc1_per_base_n_content': 'pass',
            'fastqc1_per_base_sequence_content': 'fail', 'fastqc1_per_base_sequence_quality': 'pass',
            'fastqc1_per_sequence_gc_content': 'fail', 'fastqc1_per_sequence_quality_scores': 'fail',
            'fastqc1_sequence_duplication_levels': 'pass', 'fastqc1_sequence_length_distribution': 'pass',
            'fastqc1_sequences_flagged_as_poor_quality': 0, 'fastqc1_total_sequences': 72,
            'fastqc2_adapter_content': 'pass', 'fastqc2_basic_statistics': 'pass', 'fastqc2_gc': 48.0,
            'fastqc2_kmer_content': 'fail', 'fastqc2_max_sequence_length': 75, 'fastqc2_min_sequence_length': 75,
            'fastqc2_overrepresented_sequences': 'fail', 'fastqc2_per_base_n_content': 'pass',
            'fastqc2_per_base_sequence_content': 'fail', 'fastqc2_per_base_sequence_quality': 'pass',
            'fastqc2_per_sequence_gc_content': 'fail', 'fastqc2_per_sequence_quality_scores': 'fail',
            'fastqc2_sequence_duplication_levels': 'pass', 'fastqc2_sequence_length_distribution': 'pass',
            'fastqc2_sequences_flagged_as_poor_quality': 0, 'fastqc2_total_sequences': 72,
            'samtools_average_quality': 40.0, 'samtools_bases_mapped_cigar': 9900, 'samtools_bases_trimmed': 0,
            'samtools_error_rate': 0.0, 'samtools_insert_size_average': 199.6,
            'samtools_insert_size_standard_deviation': 1.0, 'samtools_inward_oriented_pairs': 66,
            'samtools_outward_oriented_pairs': 0, 'samtools_pairs_with_other_orientation': 0,
            'samtools_raw_total_sequences': 144, 'samtools_reads_duplicated': 4, 'samtools_reads_mapped': 132,
            'het_snp_het_calls': 0, 'het_snp_positions': 983, 'het_snp_total_snps': 0,
        },
        {
            'seqrep_id': 2, 'pipeline_version': clockwork_version,
            'fastqc1_adapter_content': 'pass', 'fastqc1_basic_statistics': 'pass', 'fastqc1_gc': 48.0,
            'fastqc1_kmer_content': 'fail', 'fastqc1_max_sequence_length': 75, 'fastqc1_min_sequence_length': 75,
            'fastqc1_overrepresented_sequences': 'fail', 'fastqc1_per_base_n_content': 'pass',
            'fastqc1_per_base_sequence_content': 'fail', 'fastqc1_per_base_sequence_quality': 'pass',
            'fastqc1_per_sequence_gc_content': 'fail', 'fastqc1_per_sequence_quality_scores': 'fail',
            'fastqc1_sequence_duplication_levels': 'pass', 'fastqc1_sequence_length_distribution': 'pass',
            'fastqc1_sequences_flagged_as_poor_quality': 0, 'fastqc1_total_sequences': 72,
            'fastqc2_adapter_content': 'pass', 'fastqc2_basic_statistics': 'pass', 'fastqc2_gc': 49.0,
            'fastqc2_kmer_content': 'fail', 'fastqc2_max_sequence_length': 75, 'fastqc2_min_sequence_length': 75,
            'fastqc2_overrepresented_sequences': 'fail', 'fastqc2_per_base_n_content': 'pass',
            'fastqc2_per_base_sequence_content': 'fail', 'fastqc2_per_base_sequence_quality': 'pass',
            'fastqc2_per_sequence_gc_content': 'warn', 'fastqc2_per_sequence_quality_scores': 'fail',
            'fastqc2_sequence_duplication_levels': 'pass', 'fastqc2_sequence_length_distribution': 'pass',
            'fastqc2_sequences_flagged_as_poor_quality': 0, 'fastqc2_total_sequences': 72,
            'samtools_average_quality': 40.0, 'samtools_bases_mapped_cigar': 9900, 'samtools_bases_trimmed': 0,
            'samtools_error_rate': 0.0, 'samtools_insert_size_average': 199.7,
            'samtools_insert_size_standard_deviation': 1.1, 'samtools_inward_oriented_pairs': 66,
            'samtools_outward_oriented_pairs': 0, 'samtools_pairs_with_other_orientation': 0,
            'samtools_raw_total_sequences': 144, 'samtools_reads_duplicated': 0, 'samtools_reads_mapped': 132,
            'het_snp_het_calls': 0, 'het_snp_positions': 983, 'het_snp_total_snps': 0,
        },
    ]
    self.assertEqual(expected_qc_rows, got_qc_rows)

    # check QC files got written. No need to check contents, as that is done
    # elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {'sample': 1, 'isolate_id': 1, 'seq_repl': 43},
        {'sample': 2, 'isolate_id': 2, 'seq_repl': 45},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id'])
        qc_root_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], 'qc', clockwork_version)
        self.assertTrue(os.path.exists(qc_root_dir))
        for method in ['fastqc', 'samtools_qc']:
            this_qc_dir = os.path.join(qc_root_dir, method)
            self.assertTrue(os.path.exists(this_qc_dir))
            self.assertTrue(len(os.listdir(this_qc_dir)) >= 1)

    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def setUp(self): self.pipeline_root = os.path.abspath("piperoot") os.mkdir(self.pipeline_root) try: db_connection.DbConnection(ini_file, destroy=True) except: pass dbm = db_maker.DbMaker(ini_file) dbm.run() self.db = db.Db(ini_file) sample_dicts = [ { "subject_id": "subject_1", "site_id": "01", "lab_id": "lab_id_1", "isolate_number": "1", "sequence_replicate_number": 1, "submission_date": datetime.date(2018, 4, 4), "reads_file_1": "reads_1_1.fq", "reads_file_1_md5": "md5_1_1", "reads_file_2_md5": "md5_1_2", "reads_file_2": "reads_1_2.fq", "dataset_name": "set1", "submit_to_ena": "0", "instrument_model": "Illumina HiSeq 2500", "ena_center_name": "Centre 1", "ena_on_hold": "0", "ena_run_accession": "ERR123456", "ena_sample_accession": "ERS123456", }, { "subject_id": "subject_2", "site_id": "01", "lab_id": "lab_id_2", "isolate_number": "1", "sequence_replicate_number": 1, "submission_date": datetime.date(2018, 4, 4), "reads_file_1": "reads_2_1.fq", "reads_file_1_md5": "md5_2_1", "reads_file_2_md5": "md5_2_2", "reads_file_2": "reads_2_2.fq", "dataset_name": "set1", "submit_to_ena": "0", "instrument_model": "Illumina HiSeq 2500", "ena_center_name": "Centre 1", "ena_on_hold": "0", "ena_run_accession": "ERR123457", "ena_sample_accession": "ERS123457", }, { "subject_id": "subject_3", "site_id": "02", "lab_id": "lab_id_3", "isolate_number": "1", "sequence_replicate_number": 1, "submission_date": datetime.date(2018, 4, 4), "reads_file_1": "reads_3_1.fq", "reads_file_1_md5": "md5_3_1", "reads_file_2_md5": "md5_3_2", "reads_file_2": "reads_3_2.fq", "dataset_name": "set2", "submit_to_ena": "0", "instrument_model": "Illumina HiSeq 2500", "ena_center_name": "Centre 2", "ena_on_hold": "0", "ena_run_accession": None, "ena_sample_accession": None, }, { "subject_id": "subject_3", "site_id": "02", "lab_id": "lab_id_3", "isolate_number": "1", "sequence_replicate_number": 2, "submission_date": datetime.date(2018, 4, 4), "reads_file_1": "reads_4_1.fq", "reads_file_1_md5": "md5_4_1", "reads_file_2_md5": "md5_4_2", "reads_file_2": "reads_4_2.fq", "dataset_name": "set2", "submit_to_ena": "0", "instrument_model": "Illumina HiSeq 2500", "ena_center_name": "Centre 2", "ena_on_hold": "0", "ena_run_accession": None, "ena_sample_accession": None, }, ] for d in sample_dicts: self.db.add_one_seqrep(d) where_dict = {"original_reads_file_1_md5": d["reads_file_1_md5"]} update_dict = { "remove_contam_reads_file_1_md5": d["reads_file_1_md5"] + ".remove_contam", "remove_contam_reads_file_2_md5": d["reads_file_2_md5"] + ".remove_contam", } self.db.update_row("Seqrep", where_dict, update_dict) seqrep_to_isolate = {1: 1, 2: 2, 3: 3, 4: 3} for seqrep, isolate in seqrep_to_isolate.items(): ref_id = 1 if seqrep in {1, 2} else 2 version = "0.1.1" if seqrep in {1, 2} else "0.1.3" d = { "isolate_id": isolate, "seqrep_id": seqrep, "seqrep_pool": None, "version": version, "pipeline_name": "remove_contam", "status": 1, "reference_id": ref_id, } self.db.add_row_to_table("Pipeline", d) d = { "isolate_id": isolate, "seqrep_id": seqrep, "seqrep_pool": None, "version": version, "pipeline_name": "qc", "status": 1, "reference_id": ref_id + 2, } self.db.add_row_to_table("Pipeline", d) var_call_rows = [ { "isolate_id": 1, "seqrep_id": None, "seqrep_pool": "1", "version": "1.2.3", "pipeline_name": "variant_call", "status": 1, "reference_id": 10, }, { "isolate_id": 2, "seqrep_id": None, "seqrep_pool": "2", "version": "1.2.3", "pipeline_name": "variant_call", "status": 1, "reference_id": 10, }, { "isolate_id": 3, "seqrep_id": None, "seqrep_pool": 
"1_2", "version": "1.2.3", "pipeline_name": "variant_call", "status": 1, "reference_id": 10, }, ] for d in var_call_rows: self.db.add_row_to_table("Pipeline", d) d["pipeline_name"] = "mykrobe_predict" self.db.add_row_to_table("Pipeline", d) self.db.commit()
def test_nextflow_mykrobe_predict(self): '''test nextflow_mykrobe using database''' tmp_data_dir = 'tmp.nextflow_mykrobe_db_input.data' if os.path.exists(tmp_data_dir): shutil.rmtree(tmp_data_dir) shutil.copytree(data_dir, tmp_data_dir) nextflow_helper.write_config_file() mysql_config_file = os.path.join(data_dir, 'db.cnf') mysql_dump = os.path.join(data_dir, 'mysql.dump') db_config_data = db_connection.DbConnection._parse_config_file( db_ini_file) utils.syscall('mysql --defaults-file=' + mysql_config_file + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] + '; CREATE DATABASE ' + db_config_data['db'] + '"') utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' + db_config_data['db'] + ' < ' + mysql_dump) pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root') references_root = os.path.join(tmp_data_dir, 'Pipeline_refs') nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'mykrobe_predict.nf') work_dir = 'tmp.nextflow_mykrobe_db_input.work' dag_file = 'nextflow.mykrobe.dag.db.pdf' try: os.unlink(dag_file) except: pass command = ' '.join([ 'nextflow run', '--dataset_name g1', # one read pair is from group 2 and should get ignored '--ref_id 2', '--references_root', os.path.abspath(references_root), '--pipeline_root', pipeline_root, '--db_config_file', db_ini_file, '--testing', '-with-dag', dag_file, '-c', nextflow_helper.config_file, '-w', work_dir, nextflow_file ]) utils.syscall(command) os.unlink(nextflow_helper.config_file) shutil.rmtree(work_dir) # check database Pipeline table updated as expected. # The --testing option is set up so that the pooled # sample fails, hence it gets a status of -1. database = db.Db(db_ini_file) got_rows = database.get_rows_from_table('Pipeline') got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name')) expected_rows = [ { 'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': -1, 'reference_id': 2 }, { 'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': 1, 'reference_id': 2 }, { 'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': 1, 'reference_id': 2 }, { 'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1', 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': 1, 'reference_id': 2 }, { 'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, ] expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name')) self.assertEqual(expected_rows, got_rows) # check mykrobe output files etc got written. No need to check contents, trust the tools # We're just checking nextflow runs OK here. 
ids = [ { 'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2', 'sample_name': 'site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2' }, { 'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1', 'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1' }, { 'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2', 'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2' }, ] for id_dict in ids: iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id']) pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], 'mykrobe_predict', clockwork_version, reference_id=2) self.assertTrue(os.path.exists(pipeline_dir)) log = os.path.join(pipeline_dir, 'log.txt') json_file = os.path.join(pipeline_dir, 'out.json') if id_dict['sample_name'].endswith('1_2'): self.assertFalse(os.path.exists(log)) self.assertFalse(os.path.exists(json_file)) else: self.assertTrue(os.path.exists(log)) self.assertTrue(os.path.exists(json_file)) shutil.rmtree(tmp_data_dir) nextflow_helper.clean_files()
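# Hedged sketch, not part of the original tests: the nextflow tests in this file assemble their
# command line with ' '.join([...]). The helper below expresses the same idea as a function; the
# option names mirror those used above, but the helper itself is illustrative and is not called
# by the original code. Bare flags with no value (for example '--testing') would need to be
# appended separately.
def _build_nextflow_command(nextflow_file, work_dir, dag_file, config_file, options):
    '''Builds a "nextflow run" command string from a dict of --option values'''
    parts = ['nextflow run']
    for key in sorted(options):
        parts.extend(['--' + key, str(options[key])])
    parts.extend(['-with-dag', dag_file, '-c', config_file, '-w', work_dir, nextflow_file])
    return ' '.join(parts)
# Illustrative usage with values from the mykrobe test above:
# command = _build_nextflow_command(nextflow_file, work_dir, dag_file, nextflow_helper.config_file,
#     {'dataset_name': 'g1', 'ref_id': 2, 'pipeline_root': pipeline_root, 'db_config_file': db_ini_file})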
def test_nextflow_import(self): '''test nextflow_import''' nextflow_helper.write_config_file() pipeline_root = 'tmp.nextflow_import.pipeline_root' os.mkdir(pipeline_root) try: db_connection.DbConnection(db_ini_file, destroy=True) except: pass dbm = db_maker.DbMaker(db_ini_file) dbm.run() dropbox_dir = 'tmp.nextflow_import.dropbox' shutil.copytree(os.path.join(data_dir, 'dropbox'), dropbox_dir) xlsx_archive_dir = 'tmp.nextflow_import.xlsx_archive' os.mkdir(xlsx_archive_dir) expected_xlsx_files = [ os.path.basename(x) for x in glob.glob(os.path.join(dropbox_dir, '*.xlsx')) ] nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'import.nf') work_dir = 'tmp.nextflow_import.work' dag_file = 'nextflow.import.dag.pdf' try: os.unlink(dag_file) except: pass command = ' '.join([ 'nextflow run', '--dropbox_dir', dropbox_dir, '--pipeline_root', pipeline_root, '--db_config_file', db_ini_file, '--xlsx_archive_dir', xlsx_archive_dir, '-with-dag', dag_file, '-c', nextflow_helper.config_file, '-w', work_dir, nextflow_file ]) utils.syscall(command) os.unlink(nextflow_helper.config_file) shutil.rmtree(work_dir) # All files should be gone from the dropbox self.assertEqual([], os.listdir(dropbox_dir)) shutil.rmtree(dropbox_dir) # The two spreadsheets should have been archived got_xlsx_files = [ os.path.basename(x) for x in glob.glob(os.path.join(xlsx_archive_dir, '**', '*.xlsx')) ] self.assertEqual(expected_xlsx_files, got_xlsx_files) shutil.rmtree(xlsx_archive_dir) # Check database updated correctly database = db.Db(db_ini_file) expected_sample_rows = [ { 'subject_id': 'p1', 'site_id': 's1', 'sample_id_from_lab': 'l1', 'dataset_name': 'g1', 'ena_center_name': 'Center A', 'ena_sample_accession': 'ERS123456', 'ena_study_accession': None }, { 'subject_id': 'p2', 'site_id': 's2', 'sample_id_from_lab': 'l2', 'dataset_name': 'g2', 'ena_center_name': 'Center A', 'ena_sample_accession': None, 'ena_study_accession': None }, { 'subject_id': 'p1', 'site_id': 's3', 'sample_id_from_lab': 'l1', 'dataset_name': 'g1', 'ena_center_name': 'Center B', 'ena_sample_accession': None, 'ena_study_accession': None }, ] got_sample_rows = sorted(database.get_rows_from_table('Sample'), key=itemgetter('site_id')) # the rows also have the sample_id, which is made by mysql auto increment, # We don't know the order in which things are added, so can't check the sample_id. 
for row in got_sample_rows: del row['sample_id'] self.assertEqual(expected_sample_rows, got_sample_rows) expected_rows = [ { 'sequence_replicate_number': 1, 'original_reads_file_1_md5': 'edc176f367fe8e5a014c819b9ec9b05c', 'original_reads_file_2_md5': '0dd551a0d76d90059808f6f7ddbb0e02', 'remove_contam_reads_file_1_md5': None, 'remove_contam_reads_file_2_md5': None, 'pool_sequence_replicates': 1, 'withdrawn': 0, 'import_status': 1, 'submission_date': datetime.date(2017, 12, 25), 'submit_to_ena': 0, 'ena_run_accession': 'ERR123456', 'ena_on_hold': 0, 'isolate_number_from_lab': '1', 'ena_experiment_accession': None, 'instrument_model': 'Illumina HiSeq 2000' }, { 'sequence_replicate_number': 1, 'original_reads_file_1_md5': 'fe5cd28cf9394be14794f0a56a2fe845', 'original_reads_file_2_md5': 'd026fd9a439294ed42795bd7f1e7df10', 'remove_contam_reads_file_1_md5': None, 'remove_contam_reads_file_2_md5': None, 'pool_sequence_replicates': 1, 'withdrawn': 0, 'import_status': 1, 'submission_date': datetime.date(2017, 12, 26), 'submit_to_ena': 1, 'ena_run_accession': None, 'ena_on_hold': 1, 'isolate_number_from_lab': '1', 'ena_experiment_accession': None, 'instrument_model': 'Illumina HiSeq 2000' }, { 'sequence_replicate_number': 1, 'original_reads_file_1_md5': 'aa8f077673c158c4f2a19fc3c50e3fa7', 'original_reads_file_2_md5': 'ae6bafef67da3c26576e799c32985ac9', 'remove_contam_reads_file_1_md5': None, 'remove_contam_reads_file_2_md5': None, 'pool_sequence_replicates': 1, 'withdrawn': 0, 'import_status': 1, 'submission_date': datetime.date(2017, 12, 26), 'submit_to_ena': 1, 'ena_run_accession': None, 'ena_on_hold': 1, 'isolate_number_from_lab': '2', 'ena_experiment_accession': None, 'instrument_model': 'Illumina HiSeq 2000' }, { 'sequence_replicate_number': 1, 'original_reads_file_1_md5': '6b9a34ed492dad739ac03e084f3b2ab9', 'original_reads_file_2_md5': '7ceffc5314ff7e305b4ab5bd859850c9', 'remove_contam_reads_file_1_md5': None, 'remove_contam_reads_file_2_md5': None, 'pool_sequence_replicates': 1, 'withdrawn': 0, 'import_status': 1, 'submission_date': datetime.date(2017, 12, 25), 'submit_to_ena': 1, 'ena_run_accession': None, 'ena_on_hold': 0, 'isolate_number_from_lab': '1', 'ena_experiment_accession': None, 'instrument_model': 'Illumina HiSeq 2500' }, { 'sequence_replicate_number': 2, 'original_reads_file_1_md5': 'ec0377e321c59c0b1b6392a3c6dfc2dc', 'original_reads_file_2_md5': 'd541ffdb43a0648233ec7408c3626bfd', 'remove_contam_reads_file_1_md5': None, 'remove_contam_reads_file_2_md5': None, 'pool_sequence_replicates': 1, 'withdrawn': 0, 'import_status': 1, 'submission_date': datetime.date(2017, 12, 25), 'submit_to_ena': 1, 'ena_run_accession': None, 'ena_on_hold': 0, 'isolate_number_from_lab': '1', 'ena_experiment_accession': None, 'instrument_model': 'Illumina HiSeq 2500' }, ] expected_rows.sort(key=itemgetter('original_reads_file_1_md5')) query = 'SELECT * FROM (Seqrep JOIN Isolate ON Seqrep.isolate_id = Isolate.isolate_id)' got_rows = database.query_to_dict(query) got_rows.sort(key=itemgetter('original_reads_file_1_md5')) # Check reads files etc written correctly for isolate_data in got_rows: iso_dir = isolate_dir.IsolateDir(pipeline_root, isolate_data['sample_id'], isolate_data['isolate_id']) self.assertTrue(os.path.exists(iso_dir.reads_dir)) for i in [1, 2]: self.assertTrue( os.path.exists( iso_dir.reads_filename( 'original', 
isolate_data['sequence_replicate_number'], i))) # similar to above, we don't know the sample_id, seqrep_id or isolate_id, which are auto generated. for row in got_rows: del row['sample_id'] del row['seqrep_id'] del row['isolate_id'] self.assertEqual(expected_rows, got_rows) shutil.rmtree(pipeline_root) nextflow_helper.clean_files() database.commit_and_close() db_connection.DbConnection(db_ini_file, destroy=True, must_exist=True)
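# Hedged sketch, not part of the original file: the import test above deletes the auto-increment
# columns from the database rows in two places (sample_id from the Sample rows, then sample_id,
# seqrep_id and isolate_id from the Seqrep/Isolate join) before comparing with the expected rows,
# because those values depend on insertion order. A small helper capturing that pattern could
# look like this; the name and default column list are illustrative only.
def _drop_auto_increment_columns(rows, columns=('sample_id', 'seqrep_id', 'isolate_id')):
    '''Removes auto-generated key columns from each row dict, in place'''
    for row in rows:
        for column in columns:
            row.pop(column, None)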