Example #1
0
    def test_run(self):
        """Run the importer end-to-end and check that a lock file blocks it.

        Detailed checks of the imported data live in
        test_import_reads_and_update_db; here we only verify that run()
        raises when a lock file exists and completes once it is removed.
        """
        archive_dir = "tmp.test_spreadsheet_importer_run.archive"
        os.mkdir(archive_dir)
        tsv_file = "tmp.test_spreadsheet_importer_run.out.tsv"

        # run() moves the xlsx file out of the dropbox directory, so work
        # on a copy instead of mutating the checked-in test data
        original_dropbox_dir = os.path.join(data_dir, "run.dropbox")
        dropbox_dir = "tmp.test_spreadsheet_importer_run.out.dropbox"
        shutil.copytree(original_dropbox_dir, dropbox_dir)
        xlsx_file = os.path.join(dropbox_dir, "import.xlsx")

        importer = spreadsheet_importer.SpreadsheetImporter(
            dropbox_dir, xlsx_file, db_ini_file, archive_dir, tsv_file
        )

        # a pre-existing lock file must stop the importer from running
        utils.make_empty_file(importer.lock_file)
        with self.assertRaises(Exception):
            importer.run()
        os.unlink(importer.lock_file)

        # with the lock gone, run() should succeed - the details are
        # checked in test_import_reads_and_update_db
        importer.run()
        shutil.rmtree(archive_dir)
        shutil.rmtree(dropbox_dir)
        os.unlink(tsv_file)
def run(options):
    """Entry point: build a SpreadsheetImporter from parsed CLI options and run it."""
    spreadsheet_importer.SpreadsheetImporter(
        options.dropbox_dir,
        options.xlsx_file,
        options.db_config_file,
        options.xls_archive_dir,
        options.jobs_outfile,
        db_backup_dir=options.db_backup_dir,
    ).run()
Example #3
0
    def test_import_reads_and_update_db(self):
        """test _import_reads_and_update_db"""
        archive = "tmp.test_spreadsheet_importer_run.archive"
        os.mkdir(archive)
        backup = "tmp.test_spreadsheet_importer_run.db_backup"
        os.mkdir(backup)
        jobs_tsv = "tmp.test_spreadsheet_importer_run.out.tsv"

        # run() moves the xlsx file, so operate on a copy of the dropbox
        # directory rather than the checked-in original
        dropbox = "tmp.test_spreadsheet_importer_run.out.dropbox"
        shutil.copytree(os.path.join(data_dir, "run.dropbox"), dropbox)
        xlsx = os.path.join(dropbox, "import.xlsx")
        mtime_date = utils.date_string_from_file_mtime(xlsx)

        importer = spreadsheet_importer.SpreadsheetImporter(
            dropbox,
            xlsx,
            db_ini_file,
            archive,
            jobs_tsv,
            db_backup_dir=backup,
        )
        importer._import_reads_and_update_db()

        # the xlsx file and its import jobs file should have been archived,
        # and the xlsx removed from the dropbox
        archived_xlsx = os.path.join(archive, mtime_date, "import.xlsx")
        self.assertTrue(os.path.exists(archived_xlsx))
        self.assertTrue(os.path.exists(archived_xlsx + ".import_jobs.tsv"))
        self.assertFalse(os.path.exists(xlsx))
        shutil.rmtree(archive)

        # exactly one database backup should have been written
        self.assertEqual(1, len(os.listdir(backup)))
        shutil.rmtree(backup)

        # check the import jobs tsv contents
        reads_prefix = os.path.abspath(os.path.join(dropbox, "reads"))
        header = [
            "seqrep_id",
            "sample_id",
            "isolate_id",
            "sequence_replicate_number",
            "reads1",
            "reads2",
            "reads1_md5",
            "reads2_md5",
        ]
        row1 = [
            "1",
            "1",
            "1",
            "43",
            reads_prefix + ".1_1.fq",
            reads_prefix + ".1_2.fq",
            "abcdefghijklmnopqrstuvwyx123456",
            "abcdefghijklmnopqrstuvwyx123457",
        ]
        row2 = [
            "2",
            "2",
            "2",
            "45",
            reads_prefix + ".2_1.fq",
            reads_prefix + ".2_2.fq",
            "a73817805eb1d44ca88eb5cb794c7de7",
            "d468360c689d482b227256d887a05996",
        ]
        expected_tsv_lines = ["\t".join(fields) for fields in (header, row1, row2)]
        with open(jobs_tsv) as f:
            got_tsv_lines = [line.rstrip() for line in f]
        self.assertEqual(expected_tsv_lines, got_tsv_lines)
        os.unlink(jobs_tsv)

        # check the Sample table contents
        self.maxDiff = None
        expected_sample = [
            dict(
                sample_id=1,
                subject_id="p1",
                site_id="s1",
                sample_id_from_lab="l1",
                dataset_name="g1",
                ena_center_name="Center 1",
                ena_sample_accession="ERS123456",
                ena_study_accession=None,
            ),
            dict(
                sample_id=2,
                subject_id="p2",
                site_id="s2",
                sample_id_from_lab="l2",
                dataset_name="g2",
                ena_center_name="Center 1",
                ena_sample_accession=None,
                ena_study_accession=None,
            ),
        ]
        got_sample = self.db.get_rows_from_table("Sample", order_by="sample_id")
        self.assertEqual(expected_sample, got_sample)

        # check the Seqrep table contents
        expected_seqrep = [
            dict(
                seqrep_id=1,
                isolate_id=1,
                sequence_replicate_number=43,
                original_reads_file_1_md5="abcdefghijklmnopqrstuvwyx123456",
                original_reads_file_2_md5="abcdefghijklmnopqrstuvwyx123457",
                remove_contam_reads_file_1_md5=None,
                remove_contam_reads_file_2_md5=None,
                withdrawn=0,
                import_status=0,
                submission_date=datetime.date(2017, 12, 25),
                instrument_model="Illumina HiSeq 2000",
                submit_to_ena=0,
                ena_run_accession="ERR123456",
                ena_on_hold=0,
            ),
            dict(
                seqrep_id=2,
                isolate_id=2,
                sequence_replicate_number=45,
                original_reads_file_1_md5="a73817805eb1d44ca88eb5cb794c7de7",
                original_reads_file_2_md5="d468360c689d482b227256d887a05996",
                remove_contam_reads_file_1_md5=None,
                remove_contam_reads_file_2_md5=None,
                withdrawn=0,
                import_status=0,
                submission_date=datetime.date(2017, 12, 26),
                instrument_model="Illumina HiSeq 2000",
                submit_to_ena=1,
                ena_run_accession=None,
                ena_on_hold=1,
            ),
        ]
        got_seqrep = self.db.get_rows_from_table("Seqrep", order_by="seqrep_id")
        self.assertEqual(expected_seqrep, got_seqrep)
        shutil.rmtree(dropbox)
    # NOTE(review): this re-defines test_import_reads_and_update_db; if both
    # definitions live in the same class body, this later one shadows the
    # earlier one - TODO confirm the duplicate is intentional.
    def test_import_reads_and_update_db(self):
        '''Run _import_reads_and_update_db on a copied dropbox directory and
        check the archived files, database backup, output tsv, and the
        resulting Sample and Seqrep database rows.'''
        archive_dir = 'tmp.test_spreadsheet_importer_run.archive'
        os.mkdir(archive_dir)
        db_backup_dir = 'tmp.test_spreadsheet_importer_run.db_backup'
        os.mkdir(db_backup_dir)
        tsv_file = 'tmp.test_spreadsheet_importer_run.out.tsv'

        # need to copy the dropbox directory, because run() will move the
        # xlsx file
        original_dropbox_dir = os.path.join(data_dir, 'run.dropbox')
        dropbox_dir = 'tmp.test_spreadsheet_importer_run.out.dropbox'
        shutil.copytree(original_dropbox_dir, dropbox_dir)
        xlsx_file = os.path.join(dropbox_dir, 'import.xlsx')
        # the archive subdirectory is named after the xlsx file's mtime date
        date = utils.date_string_from_file_mtime(xlsx_file)

        importer = spreadsheet_importer.SpreadsheetImporter(
            dropbox_dir,
            xlsx_file,
            db_ini_file,
            archive_dir,
            tsv_file,
            db_backup_dir=db_backup_dir)
        importer._import_reads_and_update_db()

        # check xlsx file and import jobs file got archived
        xlsx_archive_file = os.path.join(archive_dir, date, 'import.xlsx')
        self.assertTrue(os.path.exists(xlsx_archive_file))
        self.assertTrue(os.path.exists(xlsx_archive_file + '.import_jobs.tsv'))
        # the xlsx file should have been moved out of the dropbox
        self.assertFalse(os.path.exists(xlsx_file))
        shutil.rmtree(archive_dir)

        # check database got backed up (exactly one backup file expected)
        backup_files = os.listdir(db_backup_dir)
        self.assertEqual(1, len(backup_files))
        shutil.rmtree(db_backup_dir)

        # check tsv file is correct
        reads_prefix = os.path.abspath(os.path.join(dropbox_dir, 'reads'))
        expected_tsv_lines = [
            '\t'.join([
                'seqrep_id', 'sample_id', 'isolate_id',
                'sequence_replicate_number', 'reads1', 'reads2', 'reads1_md5',
                'reads2_md5'
            ]),
            '\t'.join([
                '1', '1', '1', '43', reads_prefix + '.1_1.fq',
                reads_prefix + '.1_2.fq', 'abcdefghijklmnopqrstuvwyx123456',
                'abcdefghijklmnopqrstuvwyx123457'
            ]),
            '\t'.join([
                '2', '2', '2', '45', reads_prefix + '.2_1.fq',
                reads_prefix + '.2_2.fq', 'a73817805eb1d44ca88eb5cb794c7de7',
                'd468360c689d482b227256d887a05996'
            ]),
        ]
        with open(tsv_file) as f:
            got_tsv_lines = [line.rstrip() for line in f]

        self.assertEqual(expected_tsv_lines, got_tsv_lines)
        os.unlink(tsv_file)

        # check database is correct
        got_sample = self.db.get_rows_from_table('Sample',
                                                 order_by='sample_id')
        expected_sample = [{
            'sample_id': 1,
            'subject_id': 'p1',
            'site_id': 's1',
            'sample_id_from_lab': 'l1',
            'dataset_name': 'g1',
            'ena_center_name': 'Center 1',
            'ena_sample_accession': 'ERS123456',
            'ena_study_accession': None,
        }, {
            'sample_id': 2,
            'subject_id': 'p2',
            'site_id': 's2',
            'sample_id_from_lab': 'l2',
            'dataset_name': 'g2',
            'ena_center_name': 'Center 1',
            'ena_sample_accession': None,
            'ena_study_accession': None,
        }]
        # show full diffs if the dict comparisons below fail
        self.maxDiff = None
        self.assertEqual(expected_sample, got_sample)

        got_seqrep = self.db.get_rows_from_table('Seqrep',
                                                 order_by='seqrep_id')
        expected_seqrep = [{
            'seqrep_id': 1,
            'isolate_id': 1,
            'sequence_replicate_number': 43,
            'original_reads_file_1_md5': 'abcdefghijklmnopqrstuvwyx123456',
            'original_reads_file_2_md5': 'abcdefghijklmnopqrstuvwyx123457',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'withdrawn': 0,
            'import_status': 0,
            'submission_date': datetime.date(2017, 12, 25),
            'instrument_model': 'Illumina HiSeq 2000',
            'submit_to_ena': 0,
            'ena_run_accession': 'ERR123456',
            'ena_on_hold': 0,
        }, {
            'seqrep_id': 2,
            'isolate_id': 2,
            'sequence_replicate_number': 45,
            'original_reads_file_1_md5': 'a73817805eb1d44ca88eb5cb794c7de7',
            'original_reads_file_2_md5': 'd468360c689d482b227256d887a05996',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'withdrawn': 0,
            'import_status': 0,
            'submission_date': datetime.date(2017, 12, 26),
            'instrument_model': 'Illumina HiSeq 2000',
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 1,
        }]

        self.assertEqual(expected_seqrep, got_seqrep)
        shutil.rmtree(dropbox_dir)