Example #1
0
    def test_run(self):
        '''Test that DbMaker.run() creates the database with the expected
        tables and writes the schema version to the Version table.'''
        # use the test ini file. But we don't want the db line,
        # because we're making a new db
        ini_file = os.path.join(modules_dir, 'tests', 'data', 'db.ini')

        # in case database already exists from a previous (crashed) run.
        # except Exception, not bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            db_connection.DbConnection(ini_file, destroy=True)
        except Exception:
            pass
        # throws error because database doesn't exist
        with self.assertRaises(db_connection.Error):
            db_connection.DbConnection(ini_file)

        dbm = db_maker.DbMaker(ini_file)
        dbm.run()

        # We'll just check that the database got created, and the
        # expected tables are in there. OTT to check complete
        # schema, methinks.
        dbc = db_connection.DbConnection(ini_file)
        cursor = dbc.connection.cursor()
        cursor.execute('USE ' + dbc.db)
        cursor.execute('show tables')
        got_tables = list(cursor.fetchall())
        got_tables.sort()
        expected_tables = [(x, ) for x in sorted(db_schema.tables)]
        self.assertEqual(expected_tables, got_tables)

        # check version got added. The original code assigned the return of
        # cursor.execute() (a row count, not the rows) and never asserted
        # anything; also [(db_schema.version)] is a list of scalars, not
        # tuples, because of the missing comma. Fetch the rows and compare.
        cursor.execute('SELECT * FROM Version;')
        got_rows = list(cursor.fetchall())
        # assumes the Version table has a single one-column row -- matches
        # how fetchall() reports rows as tuples
        expected_rows = [(db_schema.version, )]
        self.assertEqual(expected_rows, got_rows)
        dbc.close()
        db_connection.DbConnection(ini_file, destroy=True)
Example #2
0
    def setUp(self):
        '''Make a fresh test database before each test.'''
        # Destroy any database left over from a previous (crashed) run.
        # except Exception, not bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            db_connection.DbConnection(db_ini_file, destroy=True)
        except Exception:
            pass

        dbm = db_maker.DbMaker(db_ini_file)
        dbm.run()
        self.db = db.Db(db_ini_file)
Example #3
0
    def setUp(self):
        '''Make a pipeline root directory and a fresh test database,
        populated with sample/seqrep rows and remove_contam, qc,
        variant_call and mykrobe_predict Pipeline rows used by the tests.'''
        self.pipeline_root = os.path.abspath('piperoot')
        os.mkdir(self.pipeline_root)

        # Destroy any database left over from a previous (crashed) run.
        # except Exception, not bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            db_connection.DbConnection(ini_file, destroy=True)
        except Exception:
            pass

        dbm = db_maker.DbMaker(ini_file)
        dbm.run()
        self.db = db.Db(ini_file)

        sample_dicts = [
            {
                'subject_id': 'subject_1',
                'site_id': '01',
                'lab_id': 'lab_id_1',
                'isolate_number': '1',
                'sequence_replicate_number': 1,
                'submission_date': datetime.date(2018, 4, 4),
                'reads_file_1': 'reads_1_1.fq',
                'reads_file_1_md5': 'md5_1_1',
                'reads_file_2_md5': 'md5_1_2',
                'reads_file_2': 'reads_1_2.fq',
                'dataset_name': 'set1',
                'submit_to_ena': '0',
                'instrument_model': 'Illumina HiSeq 2500',
                'ena_center_name': 'Centre 1',
                'ena_on_hold': '0',
                'ena_run_accession': 'ERR123456',
                'ena_sample_accession': 'ERS123456',
            },
            {
                'subject_id': 'subject_2',
                'site_id': '01',
                'lab_id': 'lab_id_2',
                'isolate_number': '1',
                'sequence_replicate_number': 1,
                'submission_date': datetime.date(2018, 4, 4),
                'reads_file_1': 'reads_2_1.fq',
                'reads_file_1_md5': 'md5_2_1',
                'reads_file_2_md5': 'md5_2_2',
                'reads_file_2': 'reads_2_2.fq',
                'dataset_name': 'set1',
                'submit_to_ena': '0',
                'instrument_model': 'Illumina HiSeq 2500',
                'ena_center_name': 'Centre 1',
                'ena_on_hold': '0',
                'ena_run_accession': 'ERR123457',
                'ena_sample_accession': 'ERS123457',
            },
            {
                'subject_id': 'subject_3',
                'site_id': '02',
                'lab_id': 'lab_id_3',
                'isolate_number': '1',
                'sequence_replicate_number': 1,
                'submission_date': datetime.date(2018, 4, 4),
                'reads_file_1': 'reads_3_1.fq',
                'reads_file_1_md5': 'md5_3_1',
                'reads_file_2_md5': 'md5_3_2',
                'reads_file_2': 'reads_3_2.fq',
                'dataset_name': 'set2',
                'submit_to_ena': '0',
                'instrument_model': 'Illumina HiSeq 2500',
                'ena_center_name': 'Centre 2',
                'ena_on_hold': '0',
                'ena_run_accession': None,
                'ena_sample_accession': None,
            },
            {
                'subject_id': 'subject_3',
                'site_id': '02',
                'lab_id': 'lab_id_3',
                'isolate_number': '1',
                'sequence_replicate_number': 2,
                'submission_date': datetime.date(2018, 4, 4),
                'reads_file_1': 'reads_4_1.fq',
                'reads_file_1_md5': 'md5_4_1',
                'reads_file_2_md5': 'md5_4_2',
                'reads_file_2': 'reads_4_2.fq',
                'dataset_name': 'set2',
                'submit_to_ena': '0',
                'instrument_model': 'Illumina HiSeq 2500',
                'ena_center_name': 'Centre 2',
                'ena_on_hold': '0',
                'ena_run_accession': None,
                'ena_sample_accession': None,
            },
        ]

        # Add each seqrep, then mark its reads as having been through
        # decontamination by filling in the remove_contam md5 columns.
        for d in sample_dicts:
            self.db.add_one_seqrep(d)
            where_dict = {'original_reads_file_1_md5': d['reads_file_1_md5']}
            update_dict = {
                'remove_contam_reads_file_1_md5':
                d['reads_file_1_md5'] + '.remove_contam',
                'remove_contam_reads_file_2_md5':
                d['reads_file_2_md5'] + '.remove_contam',
            }
            self.db.update_row('Seqrep', where_dict, update_dict)

        # seqreps 1 and 2 belong to isolates 1 and 2; seqreps 3 and 4 are
        # two sequence replicates of isolate 3.
        seqrep_to_isolate = {1: 1, 2: 2, 3: 3, 4: 3}
        for seqrep, isolate in seqrep_to_isolate.items():
            ref_id = 1 if seqrep in {1, 2} else 2
            version = '0.1.1' if seqrep in {1, 2} else '0.1.3'
            d = {
                'isolate_id': isolate,
                'seqrep_id': seqrep,
                'seqrep_pool': None,
                'version': version,
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': ref_id
            }
            self.db.add_row_to_table('Pipeline', d)
            d = {
                'isolate_id': isolate,
                'seqrep_id': seqrep,
                'seqrep_pool': None,
                'version': version,
                'pipeline_name': 'qc',
                'status': 1,
                'reference_id': ref_id + 2
            }
            self.db.add_row_to_table('Pipeline', d)

        var_call_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': None,
                'seqrep_pool': '1',
                'version': '1.2.3',
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 10
            },
            {
                'isolate_id': 2,
                'seqrep_id': None,
                'seqrep_pool': '2',
                'version': '1.2.3',
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 10
            },
            {
                'isolate_id': 3,
                'seqrep_id': None,
                'seqrep_pool': '1_2',
                'version': '1.2.3',
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 10
            },
        ]
        # Each variant_call row gets a matching mykrobe_predict row
        # (same dict, mutated in place before the second insert).
        for d in var_call_rows:
            self.db.add_row_to_table('Pipeline', d)
            d['pipeline_name'] = 'mykrobe_predict'
            self.db.add_row_to_table('Pipeline', d)

        self.db.commit()
Example #4
0
 def tearDown(self):
     '''Tidy up after each test: flush pending writes, then drop the test db.'''
     # Persist anything the test wrote and release the connection.
     self.db.commit_and_close()
     # must_exist=True: tearDown only runs after setUp created the database,
     # so failing to find it here would be a real error worth surfacing.
     db_connection.DbConnection(db_ini_file, destroy=True, must_exist=True)
Example #5
0
    def setUp(self):
        """Make a pipeline root directory and a fresh test database,
        populated with sample/seqrep rows and remove_contam, qc,
        variant_call and mykrobe_predict Pipeline rows used by the tests."""
        self.pipeline_root = os.path.abspath("piperoot")
        os.mkdir(self.pipeline_root)

        # Destroy any database left over from a previous (crashed) run.
        # except Exception, not bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            db_connection.DbConnection(ini_file, destroy=True)
        except Exception:
            pass

        dbm = db_maker.DbMaker(ini_file)
        dbm.run()
        self.db = db.Db(ini_file)

        sample_dicts = [
            {
                "subject_id": "subject_1",
                "site_id": "01",
                "lab_id": "lab_id_1",
                "isolate_number": "1",
                "sequence_replicate_number": 1,
                "submission_date": datetime.date(2018, 4, 4),
                "reads_file_1": "reads_1_1.fq",
                "reads_file_1_md5": "md5_1_1",
                "reads_file_2_md5": "md5_1_2",
                "reads_file_2": "reads_1_2.fq",
                "dataset_name": "set1",
                "submit_to_ena": "0",
                "instrument_model": "Illumina HiSeq 2500",
                "ena_center_name": "Centre 1",
                "ena_on_hold": "0",
                "ena_run_accession": "ERR123456",
                "ena_sample_accession": "ERS123456",
            },
            {
                "subject_id": "subject_2",
                "site_id": "01",
                "lab_id": "lab_id_2",
                "isolate_number": "1",
                "sequence_replicate_number": 1,
                "submission_date": datetime.date(2018, 4, 4),
                "reads_file_1": "reads_2_1.fq",
                "reads_file_1_md5": "md5_2_1",
                "reads_file_2_md5": "md5_2_2",
                "reads_file_2": "reads_2_2.fq",
                "dataset_name": "set1",
                "submit_to_ena": "0",
                "instrument_model": "Illumina HiSeq 2500",
                "ena_center_name": "Centre 1",
                "ena_on_hold": "0",
                "ena_run_accession": "ERR123457",
                "ena_sample_accession": "ERS123457",
            },
            {
                "subject_id": "subject_3",
                "site_id": "02",
                "lab_id": "lab_id_3",
                "isolate_number": "1",
                "sequence_replicate_number": 1,
                "submission_date": datetime.date(2018, 4, 4),
                "reads_file_1": "reads_3_1.fq",
                "reads_file_1_md5": "md5_3_1",
                "reads_file_2_md5": "md5_3_2",
                "reads_file_2": "reads_3_2.fq",
                "dataset_name": "set2",
                "submit_to_ena": "0",
                "instrument_model": "Illumina HiSeq 2500",
                "ena_center_name": "Centre 2",
                "ena_on_hold": "0",
                "ena_run_accession": None,
                "ena_sample_accession": None,
            },
            {
                "subject_id": "subject_3",
                "site_id": "02",
                "lab_id": "lab_id_3",
                "isolate_number": "1",
                "sequence_replicate_number": 2,
                "submission_date": datetime.date(2018, 4, 4),
                "reads_file_1": "reads_4_1.fq",
                "reads_file_1_md5": "md5_4_1",
                "reads_file_2_md5": "md5_4_2",
                "reads_file_2": "reads_4_2.fq",
                "dataset_name": "set2",
                "submit_to_ena": "0",
                "instrument_model": "Illumina HiSeq 2500",
                "ena_center_name": "Centre 2",
                "ena_on_hold": "0",
                "ena_run_accession": None,
                "ena_sample_accession": None,
            },
        ]

        # Add each seqrep, then mark its reads as having been through
        # decontamination by filling in the remove_contam md5 columns.
        for d in sample_dicts:
            self.db.add_one_seqrep(d)
            where_dict = {"original_reads_file_1_md5": d["reads_file_1_md5"]}
            update_dict = {
                "remove_contam_reads_file_1_md5":
                d["reads_file_1_md5"] + ".remove_contam",
                "remove_contam_reads_file_2_md5":
                d["reads_file_2_md5"] + ".remove_contam",
            }
            self.db.update_row("Seqrep", where_dict, update_dict)

        # seqreps 1 and 2 belong to isolates 1 and 2; seqreps 3 and 4 are
        # two sequence replicates of isolate 3.
        seqrep_to_isolate = {1: 1, 2: 2, 3: 3, 4: 3}
        for seqrep, isolate in seqrep_to_isolate.items():
            ref_id = 1 if seqrep in {1, 2} else 2
            version = "0.1.1" if seqrep in {1, 2} else "0.1.3"
            d = {
                "isolate_id": isolate,
                "seqrep_id": seqrep,
                "seqrep_pool": None,
                "version": version,
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": ref_id,
            }
            self.db.add_row_to_table("Pipeline", d)
            d = {
                "isolate_id": isolate,
                "seqrep_id": seqrep,
                "seqrep_pool": None,
                "version": version,
                "pipeline_name": "qc",
                "status": 1,
                "reference_id": ref_id + 2,
            }
            self.db.add_row_to_table("Pipeline", d)

        var_call_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": None,
                "seqrep_pool": "1",
                "version": "1.2.3",
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 10,
            },
            {
                "isolate_id": 2,
                "seqrep_id": None,
                "seqrep_pool": "2",
                "version": "1.2.3",
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 10,
            },
            {
                "isolate_id": 3,
                "seqrep_id": None,
                "seqrep_pool": "1_2",
                "version": "1.2.3",
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 10,
            },
        ]
        # Each variant_call row gets a matching mykrobe_predict row
        # (same dict, mutated in place before the second insert).
        for d in var_call_rows:
            self.db.add_row_to_table("Pipeline", d)
            d["pipeline_name"] = "mykrobe_predict"
            self.db.add_row_to_table("Pipeline", d)

        self.db.commit()
Example #6
0
 def __init__(self, ini_file):
     '''Connect to the database described by ini_file, creating it if
     it does not exist. Raises Error if connecting fails.'''
     try:
         self.dbc = db_connection.DbConnection(ini_file, create=True)
     except Exception as err:
         # Chain the original exception so the root cause is not lost, and
         # use except Exception (not bare except) so KeyboardInterrupt and
         # SystemExit still propagate.
         raise Error("Error connecting to database") from err
Example #7
0
    def test_nextflow_import(self):
        '''Test the nextflow import pipeline: run import.nf on a dropbox of
        reads + spreadsheets, then check the dropbox is emptied, the
        spreadsheets are archived, the database rows are correct, and the
        reads files are written into the isolate directories.'''
        nextflow_helper.write_config_file()
        pipeline_root = 'tmp.nextflow_import.pipeline_root'
        os.mkdir(pipeline_root)
        # Destroy any database left over from a previous (crashed) run.
        # except Exception, not bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            db_connection.DbConnection(db_ini_file, destroy=True)
        except Exception:
            pass

        dbm = db_maker.DbMaker(db_ini_file)
        dbm.run()

        dropbox_dir = 'tmp.nextflow_import.dropbox'
        shutil.copytree(os.path.join(data_dir, 'dropbox'), dropbox_dir)
        xlsx_archive_dir = 'tmp.nextflow_import.xlsx_archive'
        os.mkdir(xlsx_archive_dir)
        expected_xlsx_files = [
            os.path.basename(x)
            for x in glob.glob(os.path.join(dropbox_dir, '*.xlsx'))
        ]

        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'import.nf')
        work_dir = 'tmp.nextflow_import.work'
        dag_file = 'nextflow.import.dag.pdf'
        # Remove a leftover DAG file; only a missing file is expected here,
        # so catch just FileNotFoundError rather than everything.
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = ' '.join([
            'nextflow run', '--dropbox_dir', dropbox_dir, '--pipeline_root',
            pipeline_root, '--db_config_file', db_ini_file,
            '--xlsx_archive_dir', xlsx_archive_dir, '-with-dag', dag_file,
            '-c', nextflow_helper.config_file, '-w', work_dir, nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # All files should be gone from the dropbox
        self.assertEqual([], os.listdir(dropbox_dir))
        shutil.rmtree(dropbox_dir)

        # The two spreadsheets should have been archived
        got_xlsx_files = [
            os.path.basename(x)
            for x in glob.glob(os.path.join(xlsx_archive_dir, '**', '*.xlsx'))
        ]
        self.assertEqual(expected_xlsx_files, got_xlsx_files)
        shutil.rmtree(xlsx_archive_dir)

        # Check database updated correctly
        database = db.Db(db_ini_file)
        expected_sample_rows = [
            {
                'subject_id': 'p1',
                'site_id': 's1',
                'sample_id_from_lab': 'l1',
                'dataset_name': 'g1',
                'ena_center_name': 'Center A',
                'ena_sample_accession': 'ERS123456',
                'ena_study_accession': None
            },
            {
                'subject_id': 'p2',
                'site_id': 's2',
                'sample_id_from_lab': 'l2',
                'dataset_name': 'g2',
                'ena_center_name': 'Center A',
                'ena_sample_accession': None,
                'ena_study_accession': None
            },
            {
                'subject_id': 'p1',
                'site_id': 's3',
                'sample_id_from_lab': 'l1',
                'dataset_name': 'g1',
                'ena_center_name': 'Center B',
                'ena_sample_accession': None,
                'ena_study_accession': None
            },
        ]
        got_sample_rows = sorted(database.get_rows_from_table('Sample'),
                                 key=itemgetter('site_id'))
        # the rows also have the sample_id, which is made by mysql auto increment,
        # We don't know the order in which things are added, so can't check the sample_id.
        for row in got_sample_rows:
            del row['sample_id']

        self.assertEqual(expected_sample_rows, got_sample_rows)

        # NOTE: the original dict literals listed 'pool_sequence_replicates'
        # twice per row (the later duplicate silently overrode the earlier,
        # identical one); the duplicates have been removed.
        expected_rows = [
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                'edc176f367fe8e5a014c819b9ec9b05c',
                'original_reads_file_2_md5':
                '0dd551a0d76d90059808f6f7ddbb0e02',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 25),
                'submit_to_ena': 0,
                'ena_run_accession': 'ERR123456',
                'ena_on_hold': 0,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2000'
            },
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                'fe5cd28cf9394be14794f0a56a2fe845',
                'original_reads_file_2_md5':
                'd026fd9a439294ed42795bd7f1e7df10',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 26),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 1,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2000'
            },
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                'aa8f077673c158c4f2a19fc3c50e3fa7',
                'original_reads_file_2_md5':
                'ae6bafef67da3c26576e799c32985ac9',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 26),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 1,
                'isolate_number_from_lab': '2',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2000'
            },
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                '6b9a34ed492dad739ac03e084f3b2ab9',
                'original_reads_file_2_md5':
                '7ceffc5314ff7e305b4ab5bd859850c9',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 25),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 0,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2500'
            },
            {
                'sequence_replicate_number': 2,
                'original_reads_file_1_md5':
                'ec0377e321c59c0b1b6392a3c6dfc2dc',
                'original_reads_file_2_md5':
                'd541ffdb43a0648233ec7408c3626bfd',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 25),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 0,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2500'
            },
        ]

        expected_rows.sort(key=itemgetter('original_reads_file_1_md5'))
        query = 'SELECT * FROM (Seqrep JOIN Isolate ON Seqrep.isolate_id = Isolate.isolate_id)'
        got_rows = database.query_to_dict(query)
        got_rows.sort(key=itemgetter('original_reads_file_1_md5'))

        # Check reads files etc written correctly
        for isolate_data in got_rows:
            iso_dir = isolate_dir.IsolateDir(pipeline_root,
                                             isolate_data['sample_id'],
                                             isolate_data['isolate_id'])
            self.assertTrue(os.path.exists(iso_dir.reads_dir))

            for i in [1, 2]:
                self.assertTrue(
                    os.path.exists(
                        iso_dir.reads_filename(
                            'original',
                            isolate_data['sequence_replicate_number'], i)))

        # similar to above, we don't know the sample_id, seqrep_id or isolate_id, which are auto generated.
        for row in got_rows:
            del row['sample_id']
            del row['seqrep_id']
            del row['isolate_id']

        self.assertEqual(expected_rows, got_rows)

        shutil.rmtree(pipeline_root)
        nextflow_helper.clean_files()
        database.commit_and_close()
        db_connection.DbConnection(db_ini_file, destroy=True, must_exist=True)