Example #1
    def __init__(self,
                 ini_file,
                 dataset_name,
                 pipeline_root,
                 taxon_id,
                 fq_upload_threads=1,
                 use_test_server=False,
                 unit_test=None):
        self.ini_file = os.path.abspath(ini_file)
        self.db = db.Db(self.ini_file)
        self.dataset_name = dataset_name
        self.pipeline_root = os.path.abspath(pipeline_root)
        self.taxon_id = taxon_id
        self.fq_upload_threads = fq_upload_threads
        self.use_test_server = use_test_server
        self.unit_test = unit_test
        self.project_xml_dir = DatasetSubmitter.dataset_xml_dir(
            self.pipeline_root)
        self.project_xml = DatasetSubmitter.dataset_xml_file(
            self.pipeline_root, self.dataset_name)
        self.centre_number_to_name = DatasetSubmitter._get_centres_from_ini_file(
            self.ini_file)
        self.broker_name = DatasetSubmitter._get_broker_name_from_ini_file(
            self.ini_file)
        self.study_prefix = DatasetSubmitter._get_key_from_ini_file(
            self.ini_file, 'ena_login', 'study_prefix')
        if self.study_prefix is None:
            raise Error(
                'Error! Must provide study_prefix in [ena_login] section of ini file '
                + self.ini_file)
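A minimal construction sketch, to show how the arguments fit together; the import path is an assumption, and the call shape follows the test in Example #21 later in this listing:

# Hypothetical usage sketch; the module path and argument values are
# illustrative, not taken from the listing.
from clockwork import dataset_submitter

submitter = dataset_submitter.DatasetSubmitter(
    'config.ini',              # must have an [ena_login] section with study_prefix
    'g1',                      # dataset_name
    '/path/to/Pipeline_root',
    42,                        # taxon_id
    fq_upload_threads=4,
    use_test_server=True,      # presumably submits to the ENA test server
)
submitter.run()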
Example #2
    def _import_reads_and_update_db(self):
        database = db.Db(self.db_ini_file)
        data = spreadsheet_helper.load_data_from_spreadsheet(self.xlsx_file)
        xlsx_dir = os.path.dirname(self.xlsx_file)
        data_errors = SpreadsheetImporter._validate_data(
            database, data, self.dropbox_dir)

        if len(data_errors) > 0:
            raise Exception("Error(s) importing spreadsheet:\n" +
                            "\n".join(data_errors))

        try:
            f_out = open(self.jobs_outfile, "w")
        except:
            raise Exception('Error opening file "' + self.jobs_outfile +
                            '". Cannot continue')

        print(
            "seqrep_id",
            "sample_id",
            "isolate_id",
            "sequence_replicate_number",
            "reads1",
            "reads2",
            "reads1_md5",
            "reads2_md5",
            sep="\t",
            file=f_out,
        )

        for data_dict in data:
            reads1 = os.path.join(xlsx_dir, data_dict["reads_file_1"])
            reads2 = os.path.join(xlsx_dir, data_dict["reads_file_2"])
            assert os.path.exists(reads1) and os.path.exists(reads2)
            seqrep_id, isolate_id, sample_id = database.add_one_seqrep(
                data_dict)
            print(
                seqrep_id,
                sample_id,
                isolate_id,
                data_dict["sequence_replicate_number"],
                reads1,
                reads2,
                data_dict["reads_file_1_md5"],
                data_dict["reads_file_2_md5"],
                sep="\t",
                file=f_out,
            )

        f_out.close()
        xlsx_backup_file = SpreadsheetImporter._archive_spreadsheet(
            self.xlsx_file, self.xlsx_archive_dir)
        jobs_backup_file = xlsx_backup_file + ".import_jobs.tsv"
        assert not os.path.exists(jobs_backup_file)
        utils.rsync_and_md5(self.jobs_outfile, jobs_backup_file)
        database.commit_and_close()

        if self.db_backup_dir is not None:
            database.backup(self.db_backup_dir)
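Each data_dict here is one spreadsheet row; reads_file_1 and reads_file_2 are resolved relative to the spreadsheet's directory. A representative row, with field names taken from the setUp in Example #17 later in this listing (values are illustrative), looks like this:

import datetime

# Illustrative spreadsheet row; exact value formats are assumptions.
example_row = {
    'subject_id': 'subject_1',
    'site_id': '01',
    'lab_id': 'lab_id_1',
    'isolate_number': '1',
    'sequence_replicate_number': 1,
    'submission_date': datetime.date(2018, 4, 4),
    'reads_file_1': 'reads_1_1.fq',
    'reads_file_1_md5': 'md5_1_1',
    'reads_file_2': 'reads_1_2.fq',
    'reads_file_2_md5': 'md5_1_2',
    'dataset_name': 'set1',
    'submit_to_ena': '0',
    'instrument_model': 'Illumina HiSeq 2500',
    'ena_center_name': 'Centre 1',
    'ena_on_hold': '0',
    'ena_run_accession': None,
    'ena_sample_accession': None,
}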
Example #3
    def setUp(self):
        try:
            db_connection.DbConnection(db_ini_file, destroy=True)
        except:
            pass

        dbm = db_maker.DbMaker(db_ini_file)
        dbm.run()
        self.db = db.Db(db_ini_file)
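A matching tearDown is not part of this listing; a minimal sketch, assuming destroy=True is also the right way to drop the test database afterwards, might be:

    def tearDown(self):
        # Assumption: closing the handle and re-creating the connection with
        # destroy=True removes the database built in setUp.
        self.db.commit_and_close()
        db_connection.DbConnection(db_ini_file, destroy=True)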
Example #4
def run(options):
    database = db.Db(options.db_config_file)
    database.add_mykrobe_custom_panel(
        options.species,
        options.panel_name,
        options.reference_root,
        probes_fasta=options.probes_fasta,
        var_to_res_json=options.var_to_res_json,
    )
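The options object only needs the attributes read above; an illustrative call, where the argparse wiring and values are assumptions:

import argparse

options = argparse.Namespace(
    db_config_file='db.ini',
    species='M. tuberculosis',
    panel_name='custom_panel_1',
    reference_root='/path/to/Pipeline_refs',
    probes_fasta='probes.fa',           # forwarded as a keyword argument
    var_to_res_json='var_to_res.json',  # forwarded as a keyword argument
)
run(options)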
Example #5
def run(options):
    database = db.Db(options.db_config_file)
    lines = database.get_vcfs_and_reads_files_for_minos_multi_sample_calling(
        options.dataset_name,
        options.pipeline_root,
        options.reference_id,
        pipeline_version=options.pipeline_version,
    )

    print(*lines, sep='\n')
Example #6
def run(options):
    database = db.Db(options.db_config_file)
    database.update_finished_pipeline_run_failed_jobs(
        options.jobs_tsv,
        options.success_jobs_file,
        options.pipeline_name,
        reference_id=options.reference_id,
        pipeline_version=options.pipeline_version,
    )

    database.commit_and_close()
Example #7
def run(options):
    lock = lock_file.LockFile(
        os.path.join(options.pipeline_root, "remove_contam.lock"))
    database = db.Db(options.db_config_file)
    database.make_remove_contam_jobs_tsv(
        options.outfile,
        options.pipeline_root,
        options.reference_id,
        options.reference_root,
        dataset_name=options.dataset_name,
    )
    database.commit_and_close()
    lock.stop()
Example #8
def run(options):
    lock = lock_file.LockFile(
        os.path.join(options.pipeline_root, "generic_pipeline.lock"))
    database = db.Db(options.db_config_file)
    database.make_generic_pipeline_jobs_tsv(
        options.outfile,
        options.pipeline_root,
        options.pipeline_name,
        pipeline_version=options.pipeline_version,
        dataset_name=options.dataset_name,
    )
    database.commit_and_close()
    lock.stop()
Example #9
def run(options):
    lock = lock_file.LockFile(os.path.join(options.pipeline_root, 'qc.lock'))
    database = db.Db(options.db_config_file)
    database.make_qc_jobs_tsv(
        options.outfile,
        options.pipeline_root,
        options.reference_id,
        options.reference_root,
        pipeline_version=options.pipeline_version,
        dataset_name=options.dataset_name,
    )
    database.commit_and_close()
    lock.stop()
Example #10
def run(options):
    lock = lock_file.LockFile(
        os.path.join(options.pipeline_root, 'remove_contam.lock'))
    database = db.Db(options.db_config_file)
    database.make_remove_contam_jobs_tsv(
        options.outfile,
        options.pipeline_root,
        0,
        '/fake/path/to/refs/',
        dataset_name=options.dataset_name,
        faking_it=True,
    )
    database.commit_and_close()
    lock.stop()
Example #11
def run(options):
    lock = lock_file.LockFile(os.path.join(options.pipeline_root, 'mykrobe_predict.lock'))
    database = db.Db(options.db_config_file)
    database.make_variant_call_or_mykrobe_jobs_tsv(
        'mykrobe_predict',
        options.outfile,
        options.pipeline_root,
        options.reference_id,
        options.reference_root,
        pipeline_version=options.pipeline_version,
        dataset_name=options.dataset_name,
    )
    database.commit_and_close()
    lock.stop()
Example #12
    def __init__(self,
                 db_ini_file,
                 pipeline_root,
                 include_withdrawn=False,
                 include_internal_ids=False,
                 dataset_name=None):
        self.db = db.Db(db_ini_file)
        self.pipeline_root = os.path.abspath(pipeline_root)
        if not os.path.exists(self.pipeline_root):
            raise Error('Pipeline root directory "' + self.pipeline_root +
                        '" not found. Cannot continue')
        self.dataset_name = dataset_name
        self.include_withdrawn = include_withdrawn
        self.include_internal_ids = include_internal_ids
Example #13
def run(options):
    using_db = None not in (
        options.db_config_file,
        options.pipeline_references_root,
        options.name,
    )
    if using_db and options.outdir:
        print(
            "Error! If adding to database, must use --db_config_file,--pipeline_references_root,--name.",
            file=sys.stderr,
        )
        print("Otherwise, use --outdir.", file=sys.stderr)
        sys.exit(1)

    if using_db:
        lock = lock_file.LockFile(
            os.path.join(options.pipeline_references_root,
                         "add_reference.lock"))
        database = db.Db(options.db_config_file)
        ref_id = database.add_reference(options.name)
        database.commit_and_close()
        lock.stop()
    else:
        ref_id = None

    ref_dir = reference_dir.ReferenceDir(
        pipeline_references_root_dir=options.pipeline_references_root,
        reference_id=ref_id,
        directory=options.outdir,
    )

    genome_is_big = options.contam_tsv is not None
    using_cortex = options.contam_tsv is None
    ref_dir.make_index_files(
        options.fasta_file,
        genome_is_big,
        using_cortex,
        cortex_mem_height=options.cortex_mem_height,
    )

    if options.contam_tsv is not None:
        ref_dir.add_remove_contam_metadata_tsv(options.contam_tsv)
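The two modes can be exercised with a plain namespace. The attribute names mirror those read by run() above; the argparse wiring and the values themselves are assumptions:

import argparse

# Add-to-database mode: the three db options are set and --outdir is not.
options = argparse.Namespace(
    db_config_file='db.ini',
    pipeline_references_root='/path/to/Pipeline_refs',
    name='my_reference',
    outdir=None,
    fasta_file='ref.fa',
    contam_tsv=None,        # None, so using_cortex is True and genome_is_big is False
    cortex_mem_height=17,
)
run(options)

# Standalone mode: only --outdir is given, so no lock file or database row is made.
options.db_config_file = options.pipeline_references_root = options.name = None
options.outdir = 'my_ref_dir'
run(options)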
Example #14
def run(options):
    if options.pool == 1:
        options.seqrep_id = None
    else:
        options.seqrep_pool = None
        options.seqrep_id = int(options.seqrep_id)

    database = db.Db(options.db_config_file)
    database.update_finished_pipeline_run(
        options.isolate_id,
        options.seqrep_id,
        options.seqrep_pool,
        options.pipeline_name,
        options.new_pipeline_status,
        reference_id=options.reference_id,
        pipeline_version=options.pipeline_version,
        pipeline_root=options.pipeline_root,
    )

    database.commit_and_close()
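Two illustrative calls showing how the pool flag switches between a seqrep pool and a single integer seqrep_id (attribute names come from run() above; everything else is assumed):

import argparse

common = dict(db_config_file='db.ini', isolate_id=1,
              pipeline_name='variant_call', new_pipeline_status=1,
              reference_id=2, pipeline_version=None,
              pipeline_root='/path/to/Pipeline_root')

# Pooled reads: seqrep_id is forced to None and seqrep_pool identifies the pool.
run(argparse.Namespace(pool=1, seqrep_id=None, seqrep_pool='1_2', **common))

# Single replicate: seqrep_pool is cleared and seqrep_id is cast to int.
run(argparse.Namespace(pool=0, seqrep_id='3', seqrep_pool=None, **common))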
Example #15
    def __init__(
        self,
        db_ini_file,
        pipeline_root_dir,
        seqrep_id,
        isolate_id,
        sample_id,
        sequence_replicate_number,
        reads_file_1,
        reads_file_2,
        reads_file_md5_1,
        reads_file_md5_2,
    ):
        self.db = db.Db(db_ini_file)
        self.pipeline_root_dir = os.path.abspath(pipeline_root_dir)
        self.seqrep_id = seqrep_id
        self.isolate_id = isolate_id
        self.sample_id = sample_id
        self.sequence_replicate_number = sequence_replicate_number
        self.reads_file_1 = os.path.abspath(reads_file_1)
        self.reads_file_2 = os.path.abspath(reads_file_2)
        self.reads_file_md5_1 = reads_file_md5_1
        self.reads_file_md5_2 = reads_file_md5_2
Example #16
    def test_nextflow_mykrobe_predict(self):
        """test nextflow_mykrobe using database"""
        tmp_data_dir = "tmp.nextflow_mykrobe_db_input.data"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall("mysql --defaults-file=" + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] +
                      "; CREATE DATABASE " + db_config_data["db"] + '"')
        utils.syscall("mysql --defaults-file=" + mysql_config_file + " " +
                      db_config_data["db"] + " < " + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     "mykrobe_predict.nf")
        work_dir = "tmp.nextflow_mykrobe_db_input.work"
        dag_file = "nextflow.mykrobe.dag.db.pdf"
        try:
            os.unlink(dag_file)
        except:
            pass

        command = " ".join([
            "nextflow run",
            "--dataset_name g1",  # one read pair is from group 2 and should get ignored
            "--ref_id 2",
            "--references_root",
            os.path.abspath(references_root),
            "--pipeline_root",
            pipeline_root,
            "--db_config_file",
            db_ini_file,
            "--testing",
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected.
        # The --testing option is set up so that the pooled
        # sample fails, hence it gets a status of -1.
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table("Pipeline")
        got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        expected_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": None,
                "seqrep_pool": "1_2",
                "version": clockwork_version,
                "pipeline_name": "mykrobe_predict",
                "status": -1,
                "reference_id": 2,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "mykrobe_predict",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "mykrobe_predict",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 3,
                "seqrep_id": None,
                "seqrep_pool": "1",
                "version": clockwork_version,
                "pipeline_name": "mykrobe_predict",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 5,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 4,
                "seqrep_id": 6,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
        ]
        expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        self.assertEqual(expected_rows, got_rows)

        # check mykrobe output files etc got written. No need to check contents, trust the tools
        # We're just checking nextflow runs OK here.
        ids = [
            {
                "sample": 1,
                "seqrep_id": "1_2",
                "isolate_id": 1,
                "seq_repl": "1_2",
                "sample_name":
                "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2",
            },
            {
                "sample": 2,
                "seqrep_id": 3,
                "isolate_id": 2,
                "seq_repl": "1",
                "sample_name":
                "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1",
            },
            {
                "sample": 2,
                "seqrep_id": 4,
                "isolate_id": 2,
                "seq_repl": "2",
                "sample_name":
                "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2",
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"],
                                             id_dict["isolate_id"])
            pipeline_dir = iso_dir.pipeline_dir(
                id_dict["seq_repl"],
                "mykrobe_predict",
                clockwork_version,
                reference_id=2,
            )
            self.assertTrue(os.path.exists(pipeline_dir))
            log = os.path.join(pipeline_dir, "log.txt")
            json_file = os.path.join(pipeline_dir, "out.json")

            if id_dict["sample_name"].endswith("1_2"):
                self.assertFalse(os.path.exists(log))
                self.assertFalse(os.path.exists(json_file))
            else:
                self.assertTrue(os.path.exists(log))
                self.assertTrue(os.path.exists(json_file))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #17
    def setUp(self):
        self.pipeline_root = os.path.abspath('piperoot')
        os.mkdir(self.pipeline_root)

        try:
            db_connection.DbConnection(ini_file, destroy=True)
        except:
            pass

        dbm = db_maker.DbMaker(ini_file)
        dbm.run()
        self.db = db.Db(ini_file)

        sample_dicts = [
            {
                'subject_id': 'subject_1',
                'site_id': '01',
                'lab_id': 'lab_id_1',
                'isolate_number': '1',
                'sequence_replicate_number': 1,
                'submission_date': datetime.date(2018, 4, 4),
                'reads_file_1': 'reads_1_1.fq',
                'reads_file_1_md5': 'md5_1_1',
                'reads_file_2_md5': 'md5_1_2',
                'reads_file_2': 'reads_1_2.fq',
                'dataset_name': 'set1',
                'submit_to_ena': '0',
                'instrument_model': 'Illumina HiSeq 2500',
                'ena_center_name': 'Centre 1',
                'ena_on_hold': '0',
                'ena_run_accession': 'ERR123456',
                'ena_sample_accession': 'ERS123456',
            },
            {
                'subject_id': 'subject_2',
                'site_id': '01',
                'lab_id': 'lab_id_2',
                'isolate_number': '1',
                'sequence_replicate_number': 1,
                'submission_date': datetime.date(2018, 4, 4),
                'reads_file_1': 'reads_2_1.fq',
                'reads_file_1_md5': 'md5_2_1',
                'reads_file_2_md5': 'md5_2_2',
                'reads_file_2': 'reads_2_2.fq',
                'dataset_name': 'set1',
                'submit_to_ena': '0',
                'instrument_model': 'Illumina HiSeq 2500',
                'ena_center_name': 'Centre 1',
                'ena_on_hold': '0',
                'ena_run_accession': 'ERR123457',
                'ena_sample_accession': 'ERS123457',
            },
            {
                'subject_id': 'subject_3',
                'site_id': '02',
                'lab_id': 'lab_id_3',
                'isolate_number': '1',
                'sequence_replicate_number': 1,
                'submission_date': datetime.date(2018, 4, 4),
                'reads_file_1': 'reads_3_1.fq',
                'reads_file_1_md5': 'md5_3_1',
                'reads_file_2_md5': 'md5_3_2',
                'reads_file_2': 'reads_3_2.fq',
                'dataset_name': 'set2',
                'submit_to_ena': '0',
                'instrument_model': 'Illumina HiSeq 2500',
                'ena_center_name': 'Centre 2',
                'ena_on_hold': '0',
                'ena_run_accession': None,
                'ena_sample_accession': None,
            },
            {
                'subject_id': 'subject_3',
                'site_id': '02',
                'lab_id': 'lab_id_3',
                'isolate_number': '1',
                'sequence_replicate_number': 2,
                'submission_date': datetime.date(2018, 4, 4),
                'reads_file_1': 'reads_4_1.fq',
                'reads_file_1_md5': 'md5_4_1',
                'reads_file_2_md5': 'md5_4_2',
                'reads_file_2': 'reads_4_2.fq',
                'dataset_name': 'set2',
                'submit_to_ena': '0',
                'instrument_model': 'Illumina HiSeq 2500',
                'ena_center_name': 'Centre 2',
                'ena_on_hold': '0',
                'ena_run_accession': None,
                'ena_sample_accession': None,
            },
        ]

        for d in sample_dicts:
            self.db.add_one_seqrep(d)
            where_dict = {'original_reads_file_1_md5': d['reads_file_1_md5']}
            update_dict = {
                'remove_contam_reads_file_1_md5':
                d['reads_file_1_md5'] + '.remove_contam',
                'remove_contam_reads_file_2_md5':
                d['reads_file_2_md5'] + '.remove_contam',
            }
            self.db.update_row('Seqrep', where_dict, update_dict)

        seqrep_to_isolate = {1: 1, 2: 2, 3: 3, 4: 3}
        for seqrep, isolate in seqrep_to_isolate.items():
            ref_id = 1 if seqrep in {1, 2} else 2
            version = '0.1.1' if seqrep in {1, 2} else '0.1.3'
            d = {
                'isolate_id': isolate,
                'seqrep_id': seqrep,
                'seqrep_pool': None,
                'version': version,
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': ref_id
            }
            self.db.add_row_to_table('Pipeline', d)
            d = {
                'isolate_id': isolate,
                'seqrep_id': seqrep,
                'seqrep_pool': None,
                'version': version,
                'pipeline_name': 'qc',
                'status': 1,
                'reference_id': ref_id + 2
            }
            self.db.add_row_to_table('Pipeline', d)

        var_call_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': None,
                'seqrep_pool': '1',
                'version': '1.2.3',
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 10
            },
            {
                'isolate_id': 2,
                'seqrep_id': None,
                'seqrep_pool': '2',
                'version': '1.2.3',
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 10
            },
            {
                'isolate_id': 3,
                'seqrep_id': None,
                'seqrep_pool': '1_2',
                'version': '1.2.3',
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 10
            },
        ]
        for d in var_call_rows:
            self.db.add_row_to_table('Pipeline', d)
            d['pipeline_name'] = 'mykrobe_predict'
            self.db.add_row_to_table('Pipeline', d)

        self.db.commit()
Example #18
    def test_nextflow_remove_contam_using_database(self):
        '''test nextflow_remove_contam using database'''
        tmp_data_dir = 'tmp.nextflow_remove_contam'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] +
                      '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                      db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'remove_contam.nf')
        work_dir = 'tmp.nextflow_remove_contam.work'
        dag_file = 'nextflow.remove_contam.dag.db.pdf'
        try:
            os.unlink(dag_file)
        except:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1',  # one read pair has group g2, so should get ignored
            '--ref_id 1',
            '--references_root',
            os.path.abspath(references_root),
            '--pipeline_root',
            os.path.abspath(pipeline_root),
            '--db_config_file',
            db_ini_file,
            '--testing',
            '-with-dag',
            dag_file,
            '-c',
            nextflow_helper.config_file,
            '-w',
            work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table('Pipeline')
        got_rows.sort(key=itemgetter('seqrep_id'))
        expected_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 3,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'remove_contam',
                'status': -1,
                'reference_id': 1
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check database Read_counts table updated
        got_rows = database.get_rows_from_table('Read_counts')
        got_rows.sort(key=itemgetter('seqrep_id'))
        expected_rows = [
            {
                'seqrep_id': 1,
                'original_total': 198,
                'contamination': 40,
                'not_contamination': 132,
                'unmapped': 26,
                'total_after_remove_contam': 158,
            },
            {
                'seqrep_id': 2,
                'original_total': 156,
                'contamination': 12,
                'not_contamination': 132,
                'unmapped': 12,
                'total_after_remove_contam': 144,
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check FASTQ files got written. No need to check contents, as that is done
        # elsewhere. We're just checking nextflow runs OK here.
        ids = [
            {
                'sample': 1,
                'isolate_id': 1,
                'seq_repl': 43
            },
            {
                'sample': 2,
                'isolate_id': 2,
                'seq_repl': 45
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                             id_dict['isolate_id'])
            for read_type in ('original', 'remove_contam', 'contam'):
                for i in (1, 2):
                    self.assertTrue(
                        os.path.exists(
                            iso_dir.reads_filename(read_type,
                                                   id_dict['seq_repl'], i)))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #19
    def test_nextflow_qc_using_database(self):
        """test nextflow_qc using database"""
        tmp_data_dir = "tmp.nextflow_qc"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall("mysql --defaults-file=" + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] +
                      "; CREATE DATABASE " + db_config_data["db"] + '"')
        utils.syscall("mysql --defaults-file=" + mysql_config_file + " " +
                      db_config_data["db"] + " < " + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "qc.nf")
        work_dir = "tmp.nextflow_qc.work"
        dag_file = "nextflow.qc.dag.db.pdf"
        try:
            os.unlink(dag_file)
        except:
            pass

        command = " ".join([
            "nextflow run",
            "--dataset_name g1",  #  one of the samples is in group2 and should get ignored
            "--ref_id 1",
            "--references_root",
            os.path.abspath(references_root),
            "--pipeline_root",
            pipeline_root,
            "--db_config_file",
            db_ini_file,
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_pipeline_rows = database.get_rows_from_table("Pipeline")
        got_pipeline_rows.sort(key=itemgetter("seqrep_id"))
        expected_pipeline_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": "0.0.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "qc",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": "0.0.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "qc",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": "0.0.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "qc",
                "status": -1,
                "reference_id": 1,
            },
            {
                "isolate_id": 4,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": "0.0.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
        ]
        self.assertEqual(expected_pipeline_rows, got_pipeline_rows)

        #  check QC stats added to database
        got_qc_rows = database.get_rows_from_table("QC")
        got_qc_rows.sort(key=itemgetter("seqrep_id"))
        expected_qc_rows = [
            {
                "seqrep_id": 1,
                "pipeline_version": clockwork_version,
                "fastqc1_adapter_content": "pass",
                "fastqc1_basic_statistics": "pass",
                "fastqc1_gc": 48.0,
                "fastqc1_kmer_content": "fail",
                "fastqc1_max_sequence_length": 75,
                "fastqc1_min_sequence_length": 75,
                "fastqc1_overrepresented_sequences": "fail",
                "fastqc1_per_base_n_content": "pass",
                "fastqc1_per_base_sequence_content": "fail",
                "fastqc1_per_base_sequence_quality": "pass",
                "fastqc1_per_sequence_gc_content": "fail",
                "fastqc1_per_sequence_quality_scores": "fail",
                "fastqc1_sequence_duplication_levels": "pass",
                "fastqc1_sequence_length_distribution": "pass",
                "fastqc1_sequences_flagged_as_poor_quality": 0,
                "fastqc1_total_sequences": 72,
                "fastqc2_adapter_content": "pass",
                "fastqc2_basic_statistics": "pass",
                "fastqc2_gc": 48.0,
                "fastqc2_kmer_content": "fail",
                "fastqc2_max_sequence_length": 75,
                "fastqc2_min_sequence_length": 75,
                "fastqc2_overrepresented_sequences": "fail",
                "fastqc2_per_base_n_content": "pass",
                "fastqc2_per_base_sequence_content": "fail",
                "fastqc2_per_base_sequence_quality": "pass",
                "fastqc2_per_sequence_gc_content": "fail",
                "fastqc2_per_sequence_quality_scores": "fail",
                "fastqc2_sequence_duplication_levels": "pass",
                "fastqc2_sequence_length_distribution": "pass",
                "fastqc2_sequences_flagged_as_poor_quality": 0,
                "fastqc2_total_sequences": 72,
                "samtools_average_quality": 40.0,
                "samtools_bases_mapped_cigar": 9900,
                "samtools_bases_trimmed": 0,
                "samtools_error_rate": 0.0,
                "samtools_insert_size_average": 199.6,
                "samtools_insert_size_standard_deviation": 1.0,
                "samtools_inward_oriented_pairs": 66,
                "samtools_outward_oriented_pairs": 0,
                "samtools_pairs_with_other_orientation": 0,
                "samtools_raw_total_sequences": 144,
                "samtools_reads_duplicated": 4,
                "samtools_reads_mapped": 132,
                "het_snp_het_calls": 0,
                "het_snp_positions": 983,
                "het_snp_total_snps": 0,
            },
            {
                "seqrep_id": 2,
                "pipeline_version": clockwork_version,
                "fastqc1_adapter_content": "pass",
                "fastqc1_basic_statistics": "pass",
                "fastqc1_gc": 48.0,
                "fastqc1_kmer_content": "fail",
                "fastqc1_max_sequence_length": 75,
                "fastqc1_min_sequence_length": 75,
                "fastqc1_overrepresented_sequences": "fail",
                "fastqc1_per_base_n_content": "pass",
                "fastqc1_per_base_sequence_content": "fail",
                "fastqc1_per_base_sequence_quality": "pass",
                "fastqc1_per_sequence_gc_content": "fail",
                "fastqc1_per_sequence_quality_scores": "fail",
                "fastqc1_sequence_duplication_levels": "pass",
                "fastqc1_sequence_length_distribution": "pass",
                "fastqc1_sequences_flagged_as_poor_quality": 0,
                "fastqc1_total_sequences": 72,
                "fastqc2_adapter_content": "pass",
                "fastqc2_basic_statistics": "pass",
                "fastqc2_gc": 49.0,
                "fastqc2_kmer_content": "fail",
                "fastqc2_max_sequence_length": 75,
                "fastqc2_min_sequence_length": 75,
                "fastqc2_overrepresented_sequences": "fail",
                "fastqc2_per_base_n_content": "pass",
                "fastqc2_per_base_sequence_content": "fail",
                "fastqc2_per_base_sequence_quality": "pass",
                "fastqc2_per_sequence_gc_content": "warn",
                "fastqc2_per_sequence_quality_scores": "fail",
                "fastqc2_sequence_duplication_levels": "pass",
                "fastqc2_sequence_length_distribution": "pass",
                "fastqc2_sequences_flagged_as_poor_quality": 0,
                "fastqc2_total_sequences": 72,
                "samtools_average_quality": 40.0,
                "samtools_bases_mapped_cigar": 9900,
                "samtools_bases_trimmed": 0,
                "samtools_error_rate": 0.0,
                "samtools_insert_size_average": 199.7,
                "samtools_insert_size_standard_deviation": 1.1,
                "samtools_inward_oriented_pairs": 66,
                "samtools_outward_oriented_pairs": 0,
                "samtools_pairs_with_other_orientation": 0,
                "samtools_raw_total_sequences": 144,
                "samtools_reads_duplicated": 0,
                "samtools_reads_mapped": 132,
                "het_snp_het_calls": 0,
                "het_snp_positions": 983,
                "het_snp_total_snps": 0,
            },
        ]
        self.assertEqual(expected_qc_rows, got_qc_rows)

        # check QC files got written. No need to check contents, as that is done
        # elsewhere. We're just checking nextflow runs OK here.
        ids = [
            {
                "sample": 1,
                "isolate_id": 1,
                "seq_repl": 43
            },
            {
                "sample": 2,
                "isolate_id": 2,
                "seq_repl": 45
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"],
                                             id_dict["isolate_id"])
            qc_root_dir = iso_dir.pipeline_dir(id_dict["seq_repl"], "qc",
                                               clockwork_version)
            self.assertTrue(os.path.exists(qc_root_dir))
            for method in ["fastqc", "samtools_qc"]:
                this_qc_dir = os.path.join(qc_root_dir, method)
                self.assertTrue(os.path.exists(this_qc_dir))
                self.assertTrue(len(os.listdir(this_qc_dir)) >= 1)

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #20
    def test_nextflow_generic_pipeline(self):
        """test nextflow generic pipeline using database"""
        tmp_data_dir = "tmp.nextflow_generic_pipeline_db_input.data"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
        utils.syscall(
            "mysql --defaults-file="
            + mysql_config_file
            + ' -e "DROP DATABASE IF EXISTS '
            + db_config_data["db"]
            + "; CREATE DATABASE "
            + db_config_data["db"]
            + '"'
        )
        utils.syscall(
            "mysql --defaults-file="
            + mysql_config_file
            + " "
            + db_config_data["db"]
            + " < "
            + mysql_dump
        )
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        nextflow_file = os.path.join(
            nextflow_helper.nextflow_dir, "generic_pipeline.nf"
        )
        work_dir = "tmp.nextflow_generic_pipeline.work"
        dag_file = "nextflow.generic_pipeline.dag.pdf"
        pipeline_name = "generic_pipeline"
        script = os.path.join(data_dir, "script.pl")

        try:
            os.unlink(dag_file)
        except:
            pass

        command = " ".join(
            [
                "nextflow run",
                "--dataset_name g1",  # one read pair is from group 2 and should get ignored
                "--pipeline_name",
                pipeline_name,
                "--pipeline_root",
                pipeline_root,
                "--script",
                script,
                "--db_config_file",
                db_ini_file,
                "--max_ram",
                "0.5",
                "-with-dag",
                dag_file,
                "-c",
                nextflow_helper.config_file,
                "-w",
                work_dir,
                nextflow_file,
            ]
        )
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table("Pipeline")
        got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        expected_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": None,
                "seqrep_pool": "1_2",
                "version": clockwork_version,
                "pipeline_name": pipeline_name,
                "status": 1,
                "reference_id": None,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": pipeline_name,
                "status": 1,
                "reference_id": None,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": pipeline_name,
                "status": 1,
                "reference_id": None,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 5,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 3,
                "seqrep_id": None,
                "seqrep_pool": "1",
                "version": clockwork_version,
                "pipeline_name": pipeline_name,
                "status": -1,
                "reference_id": None,
            },
            {
                "isolate_id": 4,
                "seqrep_id": 6,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
        ]
        expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        self.assertEqual(expected_rows, got_rows)

        # check that the expected output file from the script.pl
        # got made (except for the sample that is expected to fail)

        ids = [
            {"sample": 1, "seqrep_id": "1_2", "isolate_id": 1, "seq_repl": "1_2"},
            {"sample": 2, "seqrep_id": 3, "isolate_id": 2, "seq_repl": "1"},
            {"sample": 2, "seqrep_id": 4, "isolate_id": 2, "seq_repl": "2"},
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(
                pipeline_root, id_dict["sample"], id_dict["isolate_id"]
            )
            pipeline_dir = iso_dir.pipeline_dir(
                id_dict["seq_repl"], pipeline_name, clockwork_version
            )
            counts_file = os.path.join(pipeline_dir, "count.txt")
            self.assertTrue(os.path.exists(counts_file))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #21
    def test_run(self):
        '''test run'''
        original_pipeline_root = os.path.join(data_dir, 'run', 'Pipeline_root')
        tmp_pipeline_root = 'tmp.dataset_submitter.pipeline_root'
        shutil.copytree(original_pipeline_root, tmp_pipeline_root)
        pipeline_test_dir = os.path.join(data_dir, 'run')
        mysql_dump = os.path.join(pipeline_test_dir, 'mysql.dump')
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        db_config_data = db_connection.DbConnection._parse_config_file(
            ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] +
                      '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                      db_config_data['db'] + ' < ' + mysql_dump)

        gsub = dataset_submitter.DatasetSubmitter(ini_file,
                                                  'g1',
                                                  tmp_pipeline_root,
                                                  42,
                                                  unit_test='success')
        gsub.run()
        columns = 'Seqrep.seqrep_id, sequence_replicate_number, remove_contam_reads_file_1_md5, remove_contam_reads_file_2_md5, ena_center_name, ena_run_accession, Isolate.isolate_id, ena_experiment_accession, Sample.sample_id, site_id, instrument_model, ena_sample_accession, ena_study_accession'
        join = 'Seqrep JOIN Isolate ON Seqrep.isolate_id = Isolate.isolate_id JOIN Sample ON Isolate.sample_id = Sample.sample_id'
        where = 'submit_to_ena=1 AND import_status=1 AND dataset_name="g1"'
        query = ' '.join([
            'SELECT', columns, 'FROM', '(' + join + ')', 'WHERE',
            '(' + where + ')'
        ])
        database = db.Db(ini_file)
        got_data = database.query_to_dict(query)

        for row in got_data:
            accessions = {row[x] for x in row if x.endswith('accession')}
            self.assertNotIn(None, accessions)

        run_accessions = {x['ena_run_accession'] for x in got_data}
        self.assertEqual(5, len(run_accessions))
        study_accessions = {x['ena_study_accession'] for x in got_data}
        self.assertEqual(1, len(study_accessions))

        # hash the rows by md5 of file 1, since we don't know the auto
        # generated IDs in the DB.
        data_by_md5_1 = {
            x['remove_contam_reads_file_1_md5']: x
            for x in got_data
        }
        md51 = '83d842db2d9ea84faa747cefa4b2f1b4'
        md52 = '67ff4c03bd637e027f372b4b5a833935'
        md53 = 'bfde82c3a5ec16ffefb32fdfcfd4cf53'
        md54 = 'be5c2e07716c119a2e86f6421df5f63b'
        md55 = '21544f51d9d620ca99bc445219b1018d'
        self.assertNotEqual(data_by_md5_1[md51]['ena_sample_accession'],
                            data_by_md5_1[md52]['ena_sample_accession'])
        self.assertEqual(data_by_md5_1[md52]['ena_sample_accession'],
                         data_by_md5_1[md53]['ena_sample_accession'])
        self.assertEqual(data_by_md5_1[md53]['ena_sample_accession'],
                         data_by_md5_1[md54]['ena_sample_accession'])
        self.assertNotEqual(data_by_md5_1[md54]['ena_sample_accession'],
                            data_by_md5_1[md55]['ena_sample_accession'])
        self.assertNotEqual(data_by_md5_1[md51]['ena_experiment_accession'],
                            data_by_md5_1[md52]['ena_experiment_accession'])
        self.assertEqual(data_by_md5_1[md52]['ena_experiment_accession'],
                         data_by_md5_1[md53]['ena_experiment_accession'])
        self.assertNotEqual(data_by_md5_1[md53]['ena_experiment_accession'],
                            data_by_md5_1[md54]['ena_experiment_accession'])
        self.assertNotEqual(data_by_md5_1[md54]['ena_experiment_accession'],
                            data_by_md5_1[md55]['ena_experiment_accession'])
        shutil.rmtree(tmp_pipeline_root)
Example #22
    def test_nextflow_variant_call_using_database(self):
        '''test nextflow_variant_call using database'''
        tmp_data_dir = 'tmp.nextflow_variant_call_db_input.data'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] +
                      '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                      db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'variant_call.nf')
        work_dir = 'tmp.nextflow_variant_call_db_input.work'
        dag_file = 'nextflow.variant_call.dag.db.pdf'
        try:
            os.unlink(dag_file)
        except:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1',  # one read pair is from group 2 and should get ignored
            '--ref_id 2',
            '--references_root',
            os.path.abspath(references_root),
            '--pipeline_root',
            pipeline_root,
            '--db_config_file',
            db_ini_file,
            '--cortex_mem_height 17',
            '--testing',
            '-with-dag',
            dag_file,
            '-c',
            nextflow_helper.config_file,
            '-w',
            work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table('Pipeline')
        got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        expected_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 1,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 1,
                'seqrep_id': None,
                'seqrep_pool': '1_2',
                'version': clockwork_version,
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 2,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 2,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 3,
                'seqrep_id': 5,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 3,
                'seqrep_id': None,
                'seqrep_pool': '1',
                'version': clockwork_version,
                'pipeline_name': 'variant_call',
                'status': -1,
                'reference_id': 2
            },
            {
                'isolate_id': 4,
                'seqrep_id': 6,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check VCF files etc got written. No need to check contents, trust the tools
        # We're just checking nextflow runs OK here.
        ids = [
            {
                'sample': 1,
                'seqrep_id': '1_2',
                'isolate_id': 1,
                'seq_repl': '1_2'
            },
            {
                'sample': 2,
                'seqrep_id': 3,
                'isolate_id': 2,
                'seq_repl': '1'
            },
            {
                'sample': 2,
                'seqrep_id': 4,
                'isolate_id': 2,
                'seq_repl': '2'
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                             id_dict['isolate_id'])
            pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'],
                                                'variant_call',
                                                clockwork_version,
                                                reference_id=2)
            expected_sample = '.'.join([
                str(id_dict[x])
                for x in ['sample', 'isolate_id', 'seqrep_id', 'seq_repl']
            ])
            self._files_are_present_and_correct(pipeline_dir, expected_sample)

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #23
    def test_nextflow_variant_call_using_database(self):
        """test nextflow_variant_call using database"""
        tmp_data_dir = "tmp.nextflow_variant_call_db_input.data"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
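        # recreate the test database from scratch and reload the known MySQL dump,
        # so the pipeline run starts from the same database state every time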
        utils.syscall("mysql --defaults-file=" + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] +
                      "; CREATE DATABASE " + db_config_data["db"] + '"')
        utils.syscall("mysql --defaults-file=" + mysql_config_file + " " +
                      db_config_data["db"] + " < " + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     "variant_call.nf")
        work_dir = "tmp.nextflow_variant_call_db_input.work"
        dag_file = "nextflow.variant_call.dag.db.pdf"
        try:
            os.unlink(dag_file)
        except OSError:
            # ignore it if there is no leftover DAG file from a previous run
            pass

        command = " ".join([
            "nextflow run",
            "--dataset_name g1",  # one read pair is from group 2 and should get ignored
            "--ref_id 2",
            "--references_root",
            os.path.abspath(references_root),
            "--pipeline_root",
            pipeline_root,
            "--db_config_file",
            db_ini_file,
            "--cortex_mem_height 17",
            "--testing",
            # Using the truth ref is broken, and we never use it anyway,
            # so disable it for now
            #"--truth_ref",
            #os.path.join(tmp_data_dir, "truth_ref.fa"),
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table("Pipeline")
        got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        expected_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": None,
                "seqrep_pool": "1_2",
                "version": clockwork_version,
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 5,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 3,
                "seqrep_id": None,
                "seqrep_pool": "1",
                "version": clockwork_version,
                "pipeline_name": "variant_call",
                "status": -1,
                "reference_id": 2,
            },
            {
                "isolate_id": 4,
                "seqrep_id": 6,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check VCF files etc got written. No need to check contents; we trust the tools.
        # We're just checking that nextflow runs OK here.
        ids = [
            {
                "sample": 1,
                "seqrep_id": "1_2",
                "isolate_id": 1,
                "seq_repl": "1_2",
                "sample_name":
                "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2",
            },
            {
                "sample": 2,
                "seqrep_id": 3,
                "isolate_id": 2,
                "seq_repl": "1",
                "sample_name":
                "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1",
            },
            {
                "sample": 2,
                "seqrep_id": 4,
                "isolate_id": 2,
                "seq_repl": "2",
                "sample_name":
                "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2",
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"],
                                             id_dict["isolate_id"])
            pipeline_dir = iso_dir.pipeline_dir(id_dict["seq_repl"],
                                                "variant_call",
                                                clockwork_version,
                                                reference_id=2)
            self._files_are_present_and_correct(pipeline_dir,
                                                id_dict["sample_name"],
                                                expect_ref_check_files=False)

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
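    # The database-reset boilerplate used in these examples (drop the test
    # database, recreate it, then reload the MySQL dump) is repeated in several
    # of the tests. The sketch below shows one way it could be factored into a
    # single helper; `_reset_test_database` is hypothetical and not part of the
    # original test suite, and it assumes `utils.syscall`,
    # `db_connection.DbConnection._parse_config_file` and the module-level
    # `db_ini_file` behave exactly as they do in the tests above and below.
    def _reset_test_database(self, mysql_config_file, mysql_dump):
        """Hypothetical helper: restore the test database from a dump file."""
        db_name = db_connection.DbConnection._parse_config_file(db_ini_file)["db"]
        # drop and recreate the database so it is empty
        utils.syscall(
            "mysql --defaults-file=" + mysql_config_file
            + ' -e "DROP DATABASE IF EXISTS ' + db_name
            + "; CREATE DATABASE " + db_name + '"'
        )
        # load the known dump into the freshly created database
        utils.syscall(
            "mysql --defaults-file=" + mysql_config_file
            + " " + db_name + " < " + mysql_dump
        )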
Example #24
0
    def test_nextflow_fake_remove_contam(self):
        """test nextflow_fake_remove_contam"""
        tmp_data_dir = "tmp.nextflow_fake_remove_contam"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall("mysql --defaults-file=" + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] +
                      "; CREATE DATABASE " + db_config_data["db"] + '"')
        utils.syscall("mysql --defaults-file=" + mysql_config_file + " " +
                      db_config_data["db"] + " < " + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     "fake_remove_contam.nf")
        work_dir = "tmp.nextflow_fake_remove_contam.work"
        dag_file = "nextflow.fake_remove_contam.dag.db.pdf"
        try:
            os.unlink(dag_file)
        except OSError:
            pass

        command = " ".join([
            "nextflow run",
            "--dataset_name g1",  # one read pair has group g2, so should get ignored
            "--pipeline_root",
            os.path.abspath(pipeline_root),
            "--db_config_file",
            db_ini_file,
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table("Pipeline")
        got_rows.sort(key=itemgetter("seqrep_id"))
        expected_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 0,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 0,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "remove_contam",
                "status": -1,
                "reference_id": 0,
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check database Read_counts table updated
        got_rows = database.get_rows_from_table("Read_counts")
        got_rows.sort(key=itemgetter("seqrep_id"))
        expected_rows = [
            {
                "seqrep_id": 1,
                "original_total": 12,
                "contamination": 0,
                "not_contamination": 12,
                "unmapped": 0,
                "total_after_remove_contam": 12,
            },
            {
                "seqrep_id": 2,
                "original_total": 26,
                "contamination": 0,
                "not_contamination": 26,
                "unmapped": 0,
                "total_after_remove_contam": 26,
            },
        ]

        self.assertEqual(expected_rows, got_rows)

        # check FASTQ files got written. No need to check contents, as that is done
        # elsewhere. We're just checking nextflow runs OK here.
        ids = [
            {
                "sample": 1,
                "isolate_id": 1,
                "seq_repl": 1
            },
            {
                "sample": 2,
                "isolate_id": 2,
                "seq_repl": 1
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"],
                                             id_dict["isolate_id"])
            for read_type in ("original", "remove_contam"):
                for i in (1, 2):
                    self.assertTrue(
                        os.path.exists(
                            iso_dir.reads_filename(read_type,
                                                   id_dict["seq_repl"], i)))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #25
0
    def test_nextflow_generic_pipeline(self):
        '''test nextflow generic pipeline using database'''
        tmp_data_dir = 'tmp.nextflow_generic_pipeline_db_input.data'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] + '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' + db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'generic_pipeline.nf')
        work_dir = 'tmp.nextflow_generic_pipeline.work'
        dag_file = 'nextflow.generic_pipeline.dag.pdf'
        pipeline_name = 'generic_pipeline'
        script = os.path.join(data_dir, 'script.pl')

        try:
            os.unlink(dag_file)
        except OSError:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1', # one read pair is from group 2 and should get ignored
            '--pipeline_name', pipeline_name,
            '--pipeline_root', pipeline_root,
            '--script', script,
            '--db_config_file', db_ini_file,
            '--max_ram', '0.5',
            '-with-dag', dag_file,
            '-c', nextflow_helper.config_file,
            '-w', work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table('Pipeline')
        got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        expected_rows = [
            {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
            {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
            {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
            {'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1', 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': -1, 'reference_id': None},
            {'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        ]
        expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        self.assertEqual(expected_rows, got_rows)

        # check that the expected output file from script.pl
        # was made (except for the sample that is expected to fail)

        ids = [
            {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2'},
            {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1'},
            {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2'},
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id'])
            pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], pipeline_name, clockwork_version)
            counts_file = os.path.join(pipeline_dir, 'count.txt')
            self.assertTrue(os.path.exists(counts_file))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #26
0
    def test_nextflow_qc_using_database(self):
        '''test nextflow_qc using database'''
        tmp_data_dir = 'tmp.nextflow_qc'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] +
                      '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                      db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'qc.nf')
        work_dir = 'tmp.nextflow_qc.work'
        dag_file = 'nextflow.qc.dag.db.pdf'
        try:
            os.unlink(dag_file)
        except OSError:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1',  # one of the samples is in group2 and should get ignored
            '--ref_id 1',
            '--references_root',
            os.path.abspath(references_root),
            '--pipeline_root',
            pipeline_root,
            '--db_config_file',
            db_ini_file,
            '-with-dag',
            dag_file,
            '-c',
            nextflow_helper.config_file,
            '-w',
            work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_pipeline_rows = database.get_rows_from_table('Pipeline')
        got_pipeline_rows.sort(key=itemgetter('seqrep_id'))
        expected_pipeline_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'qc',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'qc',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 3,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 3,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'qc',
                'status': -1,
                'reference_id': 1
            },
            {
                'isolate_id': 4,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
        ]
        self.assertEqual(expected_pipeline_rows, got_pipeline_rows)

        # check QC stats added to database
        got_qc_rows = database.get_rows_from_table('QC')
        got_qc_rows.sort(key=itemgetter('seqrep_id'))
        expected_qc_rows = [{
            'seqrep_id': 1,
            'pipeline_version': clockwork_version,
            'fastqc1_adapter_content': 'pass',
            'fastqc1_basic_statistics': 'pass',
            'fastqc1_gc': 48.0,
            'fastqc1_kmer_content': 'fail',
            'fastqc1_max_sequence_length': 75,
            'fastqc1_min_sequence_length': 75,
            'fastqc1_overrepresented_sequences': 'fail',
            'fastqc1_per_base_n_content': 'pass',
            'fastqc1_per_base_sequence_content': 'fail',
            'fastqc1_per_base_sequence_quality': 'pass',
            'fastqc1_per_sequence_gc_content': 'fail',
            'fastqc1_per_sequence_quality_scores': 'fail',
            'fastqc1_sequence_duplication_levels': 'pass',
            'fastqc1_sequence_length_distribution': 'pass',
            'fastqc1_sequences_flagged_as_poor_quality': 0,
            'fastqc1_total_sequences': 72,
            'fastqc2_adapter_content': 'pass',
            'fastqc2_basic_statistics': 'pass',
            'fastqc2_gc': 48.0,
            'fastqc2_kmer_content': 'fail',
            'fastqc2_max_sequence_length': 75,
            'fastqc2_min_sequence_length': 75,
            'fastqc2_overrepresented_sequences': 'fail',
            'fastqc2_per_base_n_content': 'pass',
            'fastqc2_per_base_sequence_content': 'fail',
            'fastqc2_per_base_sequence_quality': 'pass',
            'fastqc2_per_sequence_gc_content': 'fail',
            'fastqc2_per_sequence_quality_scores': 'fail',
            'fastqc2_sequence_duplication_levels': 'pass',
            'fastqc2_sequence_length_distribution': 'pass',
            'fastqc2_sequences_flagged_as_poor_quality': 0,
            'fastqc2_total_sequences': 72,
            'samtools_average_quality': 40.0,
            'samtools_bases_mapped_cigar': 9900,
            'samtools_bases_trimmed': 0,
            'samtools_error_rate': 0.0,
            'samtools_insert_size_average': 199.6,
            'samtools_insert_size_standard_deviation': 1.0,
            'samtools_inward_oriented_pairs': 66,
            'samtools_outward_oriented_pairs': 0,
            'samtools_pairs_with_other_orientation': 0,
            'samtools_raw_total_sequences': 144,
            'samtools_reads_duplicated': 4,
            'samtools_reads_mapped': 132,
            'het_snp_het_calls': 0,
            'het_snp_positions': 983,
            'het_snp_total_snps': 0,
        }, {
            'seqrep_id': 2,
            'pipeline_version': clockwork_version,
            'fastqc1_adapter_content': 'pass',
            'fastqc1_basic_statistics': 'pass',
            'fastqc1_gc': 48.0,
            'fastqc1_kmer_content': 'fail',
            'fastqc1_max_sequence_length': 75,
            'fastqc1_min_sequence_length': 75,
            'fastqc1_overrepresented_sequences': 'fail',
            'fastqc1_per_base_n_content': 'pass',
            'fastqc1_per_base_sequence_content': 'fail',
            'fastqc1_per_base_sequence_quality': 'pass',
            'fastqc1_per_sequence_gc_content': 'fail',
            'fastqc1_per_sequence_quality_scores': 'fail',
            'fastqc1_sequence_duplication_levels': 'pass',
            'fastqc1_sequence_length_distribution': 'pass',
            'fastqc1_sequences_flagged_as_poor_quality': 0,
            'fastqc1_total_sequences': 72,
            'fastqc2_adapter_content': 'pass',
            'fastqc2_basic_statistics': 'pass',
            'fastqc2_gc': 49.0,
            'fastqc2_kmer_content': 'fail',
            'fastqc2_max_sequence_length': 75,
            'fastqc2_min_sequence_length': 75,
            'fastqc2_overrepresented_sequences': 'fail',
            'fastqc2_per_base_n_content': 'pass',
            'fastqc2_per_base_sequence_content': 'fail',
            'fastqc2_per_base_sequence_quality': 'pass',
            'fastqc2_per_sequence_gc_content': 'warn',
            'fastqc2_per_sequence_quality_scores': 'fail',
            'fastqc2_sequence_duplication_levels': 'pass',
            'fastqc2_sequence_length_distribution': 'pass',
            'fastqc2_sequences_flagged_as_poor_quality': 0,
            'fastqc2_total_sequences': 72,
            'samtools_average_quality': 40.0,
            'samtools_bases_mapped_cigar': 9900,
            'samtools_bases_trimmed': 0,
            'samtools_error_rate': 0.0,
            'samtools_insert_size_average': 199.7,
            'samtools_insert_size_standard_deviation': 1.1,
            'samtools_inward_oriented_pairs': 66,
            'samtools_outward_oriented_pairs': 0,
            'samtools_pairs_with_other_orientation': 0,
            'samtools_raw_total_sequences': 144,
            'samtools_reads_duplicated': 0,
            'samtools_reads_mapped': 132,
            'het_snp_het_calls': 0,
            'het_snp_positions': 983,
            'het_snp_total_snps': 0,
        }]
        self.assertEqual(expected_qc_rows, got_qc_rows)

        # check QC files got written. No need to check contents, as that is done
        # elsewhere. We're just checking nextflow runs OK here.
        ids = [
            {
                'sample': 1,
                'isolate_id': 1,
                'seq_repl': 43
            },
            {
                'sample': 2,
                'isolate_id': 2,
                'seq_repl': 45
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                             id_dict['isolate_id'])
            qc_root_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], 'qc',
                                               clockwork_version)
            self.assertTrue(os.path.exists(qc_root_dir))
            for method in ['fastqc', 'samtools_qc']:
                this_qc_dir = os.path.join(qc_root_dir, method)
                self.assertTrue(os.path.exists(this_qc_dir))
                self.assertTrue(len(os.listdir(this_qc_dir)) >= 1)

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #27
0
    def setUp(self):
        self.pipeline_root = os.path.abspath("piperoot")
        os.mkdir(self.pipeline_root)

        try:
            # destroy any existing test database; ignore the error if it does not exist
            db_connection.DbConnection(ini_file, destroy=True)
        except Exception:
            pass

        dbm = db_maker.DbMaker(ini_file)
        dbm.run()
        self.db = db.Db(ini_file)

        sample_dicts = [
            {
                "subject_id": "subject_1",
                "site_id": "01",
                "lab_id": "lab_id_1",
                "isolate_number": "1",
                "sequence_replicate_number": 1,
                "submission_date": datetime.date(2018, 4, 4),
                "reads_file_1": "reads_1_1.fq",
                "reads_file_1_md5": "md5_1_1",
                "reads_file_2_md5": "md5_1_2",
                "reads_file_2": "reads_1_2.fq",
                "dataset_name": "set1",
                "submit_to_ena": "0",
                "instrument_model": "Illumina HiSeq 2500",
                "ena_center_name": "Centre 1",
                "ena_on_hold": "0",
                "ena_run_accession": "ERR123456",
                "ena_sample_accession": "ERS123456",
            },
            {
                "subject_id": "subject_2",
                "site_id": "01",
                "lab_id": "lab_id_2",
                "isolate_number": "1",
                "sequence_replicate_number": 1,
                "submission_date": datetime.date(2018, 4, 4),
                "reads_file_1": "reads_2_1.fq",
                "reads_file_1_md5": "md5_2_1",
                "reads_file_2_md5": "md5_2_2",
                "reads_file_2": "reads_2_2.fq",
                "dataset_name": "set1",
                "submit_to_ena": "0",
                "instrument_model": "Illumina HiSeq 2500",
                "ena_center_name": "Centre 1",
                "ena_on_hold": "0",
                "ena_run_accession": "ERR123457",
                "ena_sample_accession": "ERS123457",
            },
            {
                "subject_id": "subject_3",
                "site_id": "02",
                "lab_id": "lab_id_3",
                "isolate_number": "1",
                "sequence_replicate_number": 1,
                "submission_date": datetime.date(2018, 4, 4),
                "reads_file_1": "reads_3_1.fq",
                "reads_file_1_md5": "md5_3_1",
                "reads_file_2_md5": "md5_3_2",
                "reads_file_2": "reads_3_2.fq",
                "dataset_name": "set2",
                "submit_to_ena": "0",
                "instrument_model": "Illumina HiSeq 2500",
                "ena_center_name": "Centre 2",
                "ena_on_hold": "0",
                "ena_run_accession": None,
                "ena_sample_accession": None,
            },
            {
                "subject_id": "subject_3",
                "site_id": "02",
                "lab_id": "lab_id_3",
                "isolate_number": "1",
                "sequence_replicate_number": 2,
                "submission_date": datetime.date(2018, 4, 4),
                "reads_file_1": "reads_4_1.fq",
                "reads_file_1_md5": "md5_4_1",
                "reads_file_2_md5": "md5_4_2",
                "reads_file_2": "reads_4_2.fq",
                "dataset_name": "set2",
                "submit_to_ena": "0",
                "instrument_model": "Illumina HiSeq 2500",
                "ena_center_name": "Centre 2",
                "ena_on_hold": "0",
                "ena_run_accession": None,
                "ena_sample_accession": None,
            },
        ]

        for d in sample_dicts:
            self.db.add_one_seqrep(d)
            where_dict = {"original_reads_file_1_md5": d["reads_file_1_md5"]}
            update_dict = {
                "remove_contam_reads_file_1_md5":
                d["reads_file_1_md5"] + ".remove_contam",
                "remove_contam_reads_file_2_md5":
                d["reads_file_2_md5"] + ".remove_contam",
            }
            self.db.update_row("Seqrep", where_dict, update_dict)

        seqrep_to_isolate = {1: 1, 2: 2, 3: 3, 4: 3}
        for seqrep, isolate in seqrep_to_isolate.items():
            ref_id = 1 if seqrep in {1, 2} else 2
            version = "0.1.1" if seqrep in {1, 2} else "0.1.3"
            d = {
                "isolate_id": isolate,
                "seqrep_id": seqrep,
                "seqrep_pool": None,
                "version": version,
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": ref_id,
            }
            self.db.add_row_to_table("Pipeline", d)
            d = {
                "isolate_id": isolate,
                "seqrep_id": seqrep,
                "seqrep_pool": None,
                "version": version,
                "pipeline_name": "qc",
                "status": 1,
                "reference_id": ref_id + 2,
            }
            self.db.add_row_to_table("Pipeline", d)

        var_call_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": None,
                "seqrep_pool": "1",
                "version": "1.2.3",
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 10,
            },
            {
                "isolate_id": 2,
                "seqrep_id": None,
                "seqrep_pool": "2",
                "version": "1.2.3",
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 10,
            },
            {
                "isolate_id": 3,
                "seqrep_id": None,
                "seqrep_pool": "1_2",
                "version": "1.2.3",
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 10,
            },
        ]
        for d in var_call_rows:
            self.db.add_row_to_table("Pipeline", d)
            d["pipeline_name"] = "mykrobe_predict"
            self.db.add_row_to_table("Pipeline", d)

        self.db.commit()
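    # setUp above creates self.pipeline_root and builds a fresh, populated
    # database. A matching tearDown is the natural companion; the sketch below
    # is hypothetical (not shown in the original example) and reuses the cleanup
    # idioms seen elsewhere in these tests: close the Db handle, destroy the
    # test database, and remove the pipeline root directory.
    def tearDown(self):
        self.db.commit_and_close()
        db_connection.DbConnection(ini_file, destroy=True, must_exist=True)
        shutil.rmtree(self.pipeline_root)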
Example #28
0
    def test_nextflow_mykrobe_predict(self):
        '''test nextflow_mykrobe using database'''
        tmp_data_dir = 'tmp.nextflow_mykrobe_db_input.data'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] +
                      '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                      db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'mykrobe_predict.nf')
        work_dir = 'tmp.nextflow_mykrobe_db_input.work'
        dag_file = 'nextflow.mykrobe.dag.db.pdf'
        try:
            os.unlink(dag_file)
        except OSError:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1',  # one read pair is from group 2 and should get ignored
            '--ref_id 2',
            '--references_root',
            os.path.abspath(references_root),
            '--pipeline_root',
            pipeline_root,
            '--db_config_file',
            db_ini_file,
            '--testing',
            '-with-dag',
            dag_file,
            '-c',
            nextflow_helper.config_file,
            '-w',
            work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected.
        # The --testing option is set up so that the pooled
        # sample fails, hence it gets a status of -1.
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table('Pipeline')
        got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        expected_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': None,
                'seqrep_pool': '1_2',
                'version': clockwork_version,
                'pipeline_name': 'mykrobe_predict',
                'status': -1,
                'reference_id': 2
            },
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 1,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'mykrobe_predict',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 2,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'mykrobe_predict',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 3,
                'seqrep_id': None,
                'seqrep_pool': '1',
                'version': clockwork_version,
                'pipeline_name': 'mykrobe_predict',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 3,
                'seqrep_id': 5,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 4,
                'seqrep_id': 6,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
        ]
        expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        self.assertEqual(expected_rows, got_rows)

        # check mykrobe output files etc got written. No need to check contents; we trust the tools.
        # We're just checking that nextflow runs OK here.
        ids = [
            {
                'sample': 1,
                'seqrep_id': '1_2',
                'isolate_id': 1,
                'seq_repl': '1_2',
                'sample_name':
                'site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2'
            },
            {
                'sample': 2,
                'seqrep_id': 3,
                'isolate_id': 2,
                'seq_repl': '1',
                'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1'
            },
            {
                'sample': 2,
                'seqrep_id': 4,
                'isolate_id': 2,
                'seq_repl': '2',
                'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2'
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                             id_dict['isolate_id'])
            pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'],
                                                'mykrobe_predict',
                                                clockwork_version,
                                                reference_id=2)
            self.assertTrue(os.path.exists(pipeline_dir))
            log = os.path.join(pipeline_dir, 'log.txt')
            json_file = os.path.join(pipeline_dir, 'out.json')

            if id_dict['sample_name'].endswith('1_2'):
                self.assertFalse(os.path.exists(log))
                self.assertFalse(os.path.exists(json_file))
            else:
                self.assertTrue(os.path.exists(log))
                self.assertTrue(os.path.exists(json_file))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #29
0
    def test_nextflow_import(self):
        '''test nextflow_import'''
        nextflow_helper.write_config_file()
        pipeline_root = 'tmp.nextflow_import.pipeline_root'
        os.mkdir(pipeline_root)
        try:
            db_connection.DbConnection(db_ini_file, destroy=True)
        except Exception:
            pass

        dbm = db_maker.DbMaker(db_ini_file)
        dbm.run()
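        # the import test starts from an empty schema; the nextflow import run below adds the rows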

        dropbox_dir = 'tmp.nextflow_import.dropbox'
        shutil.copytree(os.path.join(data_dir, 'dropbox'), dropbox_dir)
        xlsx_archive_dir = 'tmp.nextflow_import.xlsx_archive'
        os.mkdir(xlsx_archive_dir)
        expected_xlsx_files = [
            os.path.basename(x)
            for x in glob.glob(os.path.join(dropbox_dir, '*.xlsx'))
        ]

        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'import.nf')
        work_dir = 'tmp.nextflow_import.work'
        dag_file = 'nextflow.import.dag.pdf'
        try:
            os.unlink(dag_file)
        except OSError:
            pass

        command = ' '.join([
            'nextflow run', '--dropbox_dir', dropbox_dir, '--pipeline_root',
            pipeline_root, '--db_config_file', db_ini_file,
            '--xlsx_archive_dir', xlsx_archive_dir, '-with-dag', dag_file,
            '-c', nextflow_helper.config_file, '-w', work_dir, nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # All files should be gone from the dropbox
        self.assertEqual([], os.listdir(dropbox_dir))
        shutil.rmtree(dropbox_dir)

        # The two spreadsheets should have been archived
        got_xlsx_files = [
            os.path.basename(x)
            for x in glob.glob(os.path.join(xlsx_archive_dir, '**', '*.xlsx'))
        ]
        self.assertEqual(expected_xlsx_files, got_xlsx_files)
        shutil.rmtree(xlsx_archive_dir)

        # Check database updated correctly
        database = db.Db(db_ini_file)
        expected_sample_rows = [
            {
                'subject_id': 'p1',
                'site_id': 's1',
                'sample_id_from_lab': 'l1',
                'dataset_name': 'g1',
                'ena_center_name': 'Center A',
                'ena_sample_accession': 'ERS123456',
                'ena_study_accession': None
            },
            {
                'subject_id': 'p2',
                'site_id': 's2',
                'sample_id_from_lab': 'l2',
                'dataset_name': 'g2',
                'ena_center_name': 'Center A',
                'ena_sample_accession': None,
                'ena_study_accession': None
            },
            {
                'subject_id': 'p1',
                'site_id': 's3',
                'sample_id_from_lab': 'l1',
                'dataset_name': 'g1',
                'ena_center_name': 'Center B',
                'ena_sample_accession': None,
                'ena_study_accession': None
            },
        ]
        got_sample_rows = sorted(database.get_rows_from_table('Sample'),
                                 key=itemgetter('site_id'))
        # The rows also have the sample_id, which is generated by MySQL auto increment.
        # We don't know the order in which rows are added, so we can't check the sample_id.
        for row in got_sample_rows:
            del row['sample_id']

        self.assertEqual(expected_sample_rows, got_sample_rows)

        expected_rows = [
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                'edc176f367fe8e5a014c819b9ec9b05c',
                'original_reads_file_2_md5':
                '0dd551a0d76d90059808f6f7ddbb0e02',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 25),
                'submit_to_ena': 0,
                'ena_run_accession': 'ERR123456',
                'ena_on_hold': 0,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2000'
            },
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                'fe5cd28cf9394be14794f0a56a2fe845',
                'original_reads_file_2_md5':
                'd026fd9a439294ed42795bd7f1e7df10',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 26),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 1,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2000'
            },
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                'aa8f077673c158c4f2a19fc3c50e3fa7',
                'original_reads_file_2_md5':
                'ae6bafef67da3c26576e799c32985ac9',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 26),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 1,
                'isolate_number_from_lab': '2',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2000'
            },
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                '6b9a34ed492dad739ac03e084f3b2ab9',
                'original_reads_file_2_md5':
                '7ceffc5314ff7e305b4ab5bd859850c9',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 25),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 0,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2500'
            },
            {
                'sequence_replicate_number': 2,
                'original_reads_file_1_md5':
                'ec0377e321c59c0b1b6392a3c6dfc2dc',
                'original_reads_file_2_md5':
                'd541ffdb43a0648233ec7408c3626bfd',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 25),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 0,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2500'
            },
        ]

        expected_rows.sort(key=itemgetter('original_reads_file_1_md5'))
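        # join Seqrep to Isolate so each row carries both the seqrep and the isolate columns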
        query = 'SELECT * FROM (Seqrep JOIN Isolate ON Seqrep.isolate_id = Isolate.isolate_id)'
        got_rows = database.query_to_dict(query)
        got_rows.sort(key=itemgetter('original_reads_file_1_md5'))

        # Check reads files etc written correctly
        for isolate_data in got_rows:
            iso_dir = isolate_dir.IsolateDir(pipeline_root,
                                             isolate_data['sample_id'],
                                             isolate_data['isolate_id'])
            self.assertTrue(os.path.exists(iso_dir.reads_dir))

            for i in [1, 2]:
                self.assertTrue(
                    os.path.exists(
                        iso_dir.reads_filename(
                            'original',
                            isolate_data['sequence_replicate_number'], i)))

        # Similar to above: we don't know the sample_id, seqrep_id or isolate_id, which are auto-generated.
        for row in got_rows:
            del row['sample_id']
            del row['seqrep_id']
            del row['isolate_id']

        self.assertEqual(expected_rows, got_rows)

        shutil.rmtree(pipeline_root)
        nextflow_helper.clean_files()
        database.commit_and_close()
        db_connection.DbConnection(db_ini_file, destroy=True, must_exist=True)