Example #1
    def test_run(self):
        """test run"""
        archive_dir = "tmp.test_spreadsheet_importer_run.archive"
        os.mkdir(archive_dir)
        tsv_file = "tmp.test_spreadsheet_importer_run.out.tsv"

        # need to copy the dropbox directory, because run() will move the
        # xlsx file
        original_dropbox_dir = os.path.join(data_dir, "run.dropbox")
        dropbox_dir = "tmp.test_spreadsheet_importer_run.out.dropbox"
        shutil.copytree(original_dropbox_dir, dropbox_dir)
        xlsx_file = os.path.join(dropbox_dir, "import.xlsx")
        date = utils.date_string_from_file_mtime(xlsx_file)

        importer = spreadsheet_importer.SpreadsheetImporter(
            dropbox_dir, xlsx_file, db_ini_file, archive_dir, tsv_file
        )

        # test lock file stops it running
        utils.make_empty_file(importer.lock_file)

        with self.assertRaises(Exception):
            importer.run()

        os.unlink(importer.lock_file)

        # we'll just run it - the details are checked in test_import_reads_and_update_db
        importer.run()
        shutil.rmtree(archive_dir)
        shutil.rmtree(dropbox_dir)
        os.unlink(tsv_file)
Example #2
    @classmethod
    def _archive_spreadsheet(cls, xlsx_file, archive_dir):
        date_string = utils.date_string_from_file_mtime(xlsx_file)
        date_directory = os.path.join(archive_dir, date_string)
        if not os.path.exists(date_directory):
            try:
                os.mkdir(date_directory)
            except OSError as error:
                raise Exception("Error mkdir " + date_directory) from error

        xlsx_basename = os.path.basename(xlsx_file)
        existing_xlsx_files = set(os.listdir(date_directory))

        # This is unlikely to happen, but if the name of the archived xlsx
        # file already exists, append .N to the end (where N is the smallest
        # int such that the file does not already exist)
        if xlsx_basename in existing_xlsx_files:
            i = 1
            while xlsx_basename + "." + str(i) in existing_xlsx_files:
                i += 1
            xlsx_basename += "." + str(i)

        new_name = os.path.join(date_directory, xlsx_basename)
        utils.rsync_and_md5(xlsx_file, new_name)
        os.unlink(xlsx_file)
        return new_name
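
Example #2 leans on utils.rsync_and_md5, which is not shown on this page. Judging only by how _archive_spreadsheet uses it (the archived copy must exist at new_name afterwards), a hypothetical stand-in might look like the sketch below; the real helper presumably shells out to rsync and verifies checksums, so treat every detail here as an assumption.

import hashlib
import shutil


def rsync_and_md5(old_name, new_name):
    """Hypothetical stand-in: copy old_name to new_name and record the MD5 of
    the copy in new_name + '.md5'. The real utils.rsync_and_md5 may differ."""
    shutil.copyfile(old_name, new_name)
    md5 = hashlib.md5()
    with open(new_name, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            md5.update(chunk)
    with open(new_name + ".md5", "w") as f:
        print(md5.hexdigest(), file=f)
    return md5.hexdigest()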
Example #3
    def test_date_string_from_file_mtime(self):
        '''test date_string_from_file_mtime'''
        tmpfile = 'tmp.test_date_string_from_file_mtime'
        with open(tmpfile, 'w'):
            pass

        today = datetime.date.today()
        got = utils.date_string_from_file_mtime(tmpfile)

        def int_to_str(x):
            if x < 10:
                return '0' + str(x)
            else:
                return str(x)

        self.assertEqual(str(today.year), got[0:4])
        self.assertEqual(int_to_str(today.month), got[4:6])
        self.assertEqual(int_to_str(today.day), got[6:])
        os.unlink(tmpfile)
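
Every example on this page calls utils.date_string_from_file_mtime. The assertions above pin down its contract: a zero-padded YYYYMMDD string built from the file's modification time. A minimal sketch matching that contract (an assumption, not the project's actual implementation) is:

import datetime
import os


def date_string_from_file_mtime(filename):
    """Return the file's mtime as a zero-padded YYYYMMDD string (sketch only)."""
    mtime = os.path.getmtime(filename)
    return datetime.date.fromtimestamp(mtime).strftime("%Y%m%d")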
Example #4
    def test_archive_spreadsheet(self):
        """test _archive_spreadsheet"""
        archive_dir = "tmp.test_archive_spreadsheet_archive"
        os.mkdir(archive_dir)
        xlsx_file = "tmp.test_archive_spreadsheet.xlsx"
        xlsx_file2 = "tmp.test_archive_spreadsheet2.xlsx"
        with open(xlsx_file, "w"):
            pass

        date = utils.date_string_from_file_mtime(xlsx_file)
        date_dir = os.path.join(archive_dir, date)
        got_filename = spreadsheet_importer.SpreadsheetImporter._archive_spreadsheet(
            xlsx_file, archive_dir
        )
        expected_filename = os.path.join(date_dir, xlsx_file)
        self.assertEqual(expected_filename, got_filename)
        self.assertFalse(os.path.exists(xlsx_file))
        self.assertTrue(os.path.exists(expected_filename))

        with open(xlsx_file, "w"):
            pass

        got_filename = spreadsheet_importer.SpreadsheetImporter._archive_spreadsheet(
            xlsx_file, archive_dir
        )
        expected_filename = os.path.join(date_dir, xlsx_file + ".1")
        self.assertEqual(expected_filename, got_filename)
        self.assertFalse(os.path.exists(xlsx_file))
        self.assertTrue(os.path.exists(expected_filename))

        with open(xlsx_file2, "w"):
            pass

        got_filename = spreadsheet_importer.SpreadsheetImporter._archive_spreadsheet(
            xlsx_file2, archive_dir
        )
        expected_filename = os.path.join(date_dir, xlsx_file2)
        self.assertEqual(expected_filename, got_filename)
        self.assertFalse(os.path.exists(xlsx_file2))
        self.assertTrue(os.path.exists(expected_filename))

        shutil.rmtree(archive_dir)
Example #5
    def test_archive_spreadsheet(self):
        '''test _archive_spreadsheet'''
        archive_dir = 'tmp.test_archive_spreadsheet_archive'
        os.mkdir(archive_dir)
        xlsx_file = 'tmp.test_archive_spreadsheet.xlsx'
        xlsx_file2 = 'tmp.test_archive_spreadsheet2.xlsx'
        with open(xlsx_file, 'w'):
            pass

        date = utils.date_string_from_file_mtime(xlsx_file)
        date_dir = os.path.join(archive_dir, date)
        got_filename = spreadsheet_importer.SpreadsheetImporter._archive_spreadsheet(
            xlsx_file, archive_dir)
        expected_filename = os.path.join(date_dir, xlsx_file)
        self.assertEqual(expected_filename, got_filename)
        self.assertFalse(os.path.exists(xlsx_file))
        self.assertTrue(os.path.exists(expected_filename))

        with open(xlsx_file, 'w'):
            pass

        got_filename = spreadsheet_importer.SpreadsheetImporter._archive_spreadsheet(
            xlsx_file, archive_dir)
        expected_filename = os.path.join(date_dir, xlsx_file + '.1')
        self.assertEqual(expected_filename, got_filename)
        self.assertFalse(os.path.exists(xlsx_file))
        self.assertTrue(os.path.exists(expected_filename))

        with open(xlsx_file2, 'w'):
            pass

        got_filename = spreadsheet_importer.SpreadsheetImporter._archive_spreadsheet(
            xlsx_file2, archive_dir)
        expected_filename = os.path.join(date_dir, xlsx_file2)
        self.assertEqual(expected_filename, got_filename)
        self.assertFalse(os.path.exists(xlsx_file2))
        self.assertTrue(os.path.exists(expected_filename))

        shutil.rmtree(archive_dir)
Example #6
    def test_import_reads_and_update_db(self):
        """test _import_reads_and_update_db"""
        archive_dir = "tmp.test_spreadsheet_importer_run.archive"
        os.mkdir(archive_dir)
        db_backup_dir = "tmp.test_spreadsheet_importer_run.db_backup"
        os.mkdir(db_backup_dir)
        tsv_file = "tmp.test_spreadsheet_importer_run.out.tsv"

        # need to copy the dropbox directory, because run() will move the
        # xlsx file
        original_dropbox_dir = os.path.join(data_dir, "run.dropbox")
        dropbox_dir = "tmp.test_spreadsheet_importer_run.out.dropbox"
        shutil.copytree(original_dropbox_dir, dropbox_dir)
        xlsx_file = os.path.join(dropbox_dir, "import.xlsx")
        date = utils.date_string_from_file_mtime(xlsx_file)

        importer = spreadsheet_importer.SpreadsheetImporter(
            dropbox_dir,
            xlsx_file,
            db_ini_file,
            archive_dir,
            tsv_file,
            db_backup_dir=db_backup_dir,
        )
        importer._import_reads_and_update_db()

        # check xlsx file and import jobs file got archived
        xlsx_archive_file = os.path.join(archive_dir, date, "import.xlsx")
        self.assertTrue(os.path.exists(xlsx_archive_file))
        self.assertTrue(os.path.exists(xlsx_archive_file + ".import_jobs.tsv"))
        self.assertFalse(os.path.exists(xlsx_file))
        shutil.rmtree(archive_dir)

        # check database got backed up
        backup_files = os.listdir(db_backup_dir)
        self.assertEqual(1, len(backup_files))
        shutil.rmtree(db_backup_dir)

        # check tsv file is correct
        reads_prefix = os.path.abspath(os.path.join(dropbox_dir, "reads"))
        expected_tsv_lines = [
            "\t".join(
                [
                    "seqrep_id",
                    "sample_id",
                    "isolate_id",
                    "sequence_replicate_number",
                    "reads1",
                    "reads2",
                    "reads1_md5",
                    "reads2_md5",
                ]
            ),
            "\t".join(
                [
                    "1",
                    "1",
                    "1",
                    "43",
                    reads_prefix + ".1_1.fq",
                    reads_prefix + ".1_2.fq",
                    "abcdefghijklmnopqrstuvwyx123456",
                    "abcdefghijklmnopqrstuvwyx123457",
                ]
            ),
            "\t".join(
                [
                    "2",
                    "2",
                    "2",
                    "45",
                    reads_prefix + ".2_1.fq",
                    reads_prefix + ".2_2.fq",
                    "a73817805eb1d44ca88eb5cb794c7de7",
                    "d468360c689d482b227256d887a05996",
                ]
            ),
        ]
        with open(tsv_file) as f:
            got_tsv_lines = [line.rstrip() for line in f]

        self.assertEqual(expected_tsv_lines, got_tsv_lines)
        os.unlink(tsv_file)

        # check database is correct
        got_sample = self.db.get_rows_from_table("Sample", order_by="sample_id")
        expected_sample = [
            {
                "sample_id": 1,
                "subject_id": "p1",
                "site_id": "s1",
                "sample_id_from_lab": "l1",
                "dataset_name": "g1",
                "ena_center_name": "Center 1",
                "ena_sample_accession": "ERS123456",
                "ena_study_accession": None,
            },
            {
                "sample_id": 2,
                "subject_id": "p2",
                "site_id": "s2",
                "sample_id_from_lab": "l2",
                "dataset_name": "g2",
                "ena_center_name": "Center 1",
                "ena_sample_accession": None,
                "ena_study_accession": None,
            },
        ]
        self.maxDiff = None
        self.assertEqual(expected_sample, got_sample)

        got_seqrep = self.db.get_rows_from_table("Seqrep", order_by="seqrep_id")
        expected_seqrep = [
            {
                "seqrep_id": 1,
                "isolate_id": 1,
                "sequence_replicate_number": 43,
                "original_reads_file_1_md5": "abcdefghijklmnopqrstuvwyx123456",
                "original_reads_file_2_md5": "abcdefghijklmnopqrstuvwyx123457",
                "remove_contam_reads_file_1_md5": None,
                "remove_contam_reads_file_2_md5": None,
                "withdrawn": 0,
                "import_status": 0,
                "submission_date": datetime.date(2017, 12, 25),
                "instrument_model": "Illumina HiSeq 2000",
                "submit_to_ena": 0,
                "ena_run_accession": "ERR123456",
                "ena_on_hold": 0,
            },
            {
                "seqrep_id": 2,
                "isolate_id": 2,
                "sequence_replicate_number": 45,
                "original_reads_file_1_md5": "a73817805eb1d44ca88eb5cb794c7de7",
                "original_reads_file_2_md5": "d468360c689d482b227256d887a05996",
                "remove_contam_reads_file_1_md5": None,
                "remove_contam_reads_file_2_md5": None,
                "withdrawn": 0,
                "import_status": 0,
                "submission_date": datetime.date(2017, 12, 26),
                "instrument_model": "Illumina HiSeq 2000",
                "submit_to_ena": 1,
                "ena_run_accession": None,
                "ena_on_hold": 1,
            },
        ]

        self.assertEqual(expected_seqrep, got_seqrep)
        shutil.rmtree(dropbox_dir)
Example #7
    def test_import_reads_and_update_db(self):
        '''test _import_reads_and_update_db'''
        archive_dir = 'tmp.test_spreadsheet_importer_run.archive'
        os.mkdir(archive_dir)
        db_backup_dir = 'tmp.test_spreadsheet_importer_run.db_backup'
        os.mkdir(db_backup_dir)
        tsv_file = 'tmp.test_spreadsheet_importer_run.out.tsv'

        # need to copy the dropbox directory, because run() will move the
        # xlsx file
        original_dropbox_dir = os.path.join(data_dir, 'run.dropbox')
        dropbox_dir = 'tmp.test_spreadsheet_importer_run.out.dropbox'
        shutil.copytree(original_dropbox_dir, dropbox_dir)
        xlsx_file = os.path.join(dropbox_dir, 'import.xlsx')
        date = utils.date_string_from_file_mtime(xlsx_file)

        importer = spreadsheet_importer.SpreadsheetImporter(
            dropbox_dir,
            xlsx_file,
            db_ini_file,
            archive_dir,
            tsv_file,
            db_backup_dir=db_backup_dir)
        importer._import_reads_and_update_db()

        # check xlsx file and import jobs file got archived
        xlsx_archive_file = os.path.join(archive_dir, date, 'import.xlsx')
        self.assertTrue(os.path.exists(xlsx_archive_file))
        self.assertTrue(os.path.exists(xlsx_archive_file + '.import_jobs.tsv'))
        self.assertFalse(os.path.exists(xlsx_file))
        shutil.rmtree(archive_dir)

        # check database got backed up
        backup_files = os.listdir(db_backup_dir)
        self.assertEqual(1, len(backup_files))
        shutil.rmtree(db_backup_dir)

        # check tsv file is correct
        reads_prefix = os.path.abspath(os.path.join(dropbox_dir, 'reads'))
        expected_tsv_lines = [
            '\t'.join([
                'seqrep_id', 'sample_id', 'isolate_id',
                'sequence_replicate_number', 'reads1', 'reads2', 'reads1_md5',
                'reads2_md5'
            ]),
            '\t'.join([
                '1', '1', '1', '43', reads_prefix + '.1_1.fq',
                reads_prefix + '.1_2.fq', 'abcdefghijklmnopqrstuvwyx123456',
                'abcdefghijklmnopqrstuvwyx123457'
            ]),
            '\t'.join([
                '2', '2', '2', '45', reads_prefix + '.2_1.fq',
                reads_prefix + '.2_2.fq', 'a73817805eb1d44ca88eb5cb794c7de7',
                'd468360c689d482b227256d887a05996'
            ]),
        ]
        with open(tsv_file) as f:
            got_tsv_lines = [line.rstrip() for line in f]

        self.assertEqual(expected_tsv_lines, got_tsv_lines)
        os.unlink(tsv_file)

        # check database is correct
        got_sample = self.db.get_rows_from_table('Sample',
                                                 order_by='sample_id')
        expected_sample = [{
            'sample_id': 1,
            'subject_id': 'p1',
            'site_id': 's1',
            'sample_id_from_lab': 'l1',
            'dataset_name': 'g1',
            'ena_center_name': 'Center 1',
            'ena_sample_accession': 'ERS123456',
            'ena_study_accession': None,
        }, {
            'sample_id': 2,
            'subject_id': 'p2',
            'site_id': 's2',
            'sample_id_from_lab': 'l2',
            'dataset_name': 'g2',
            'ena_center_name': 'Center 1',
            'ena_sample_accession': None,
            'ena_study_accession': None,
        }]
        self.maxDiff = None
        self.assertEqual(expected_sample, got_sample)

        got_seqrep = self.db.get_rows_from_table('Seqrep',
                                                 order_by='seqrep_id')
        expected_seqrep = [{
            'seqrep_id': 1,
            'isolate_id': 1,
            'sequence_replicate_number': 43,
            'original_reads_file_1_md5': 'abcdefghijklmnopqrstuvwyx123456',
            'original_reads_file_2_md5': 'abcdefghijklmnopqrstuvwyx123457',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'withdrawn': 0,
            'import_status': 0,
            'submission_date': datetime.date(2017, 12, 25),
            'instrument_model': 'Illumina HiSeq 2000',
            'submit_to_ena': 0,
            'ena_run_accession': 'ERR123456',
            'ena_on_hold': 0,
        }, {
            'seqrep_id': 2,
            'isolate_id': 2,
            'sequence_replicate_number': 45,
            'original_reads_file_1_md5': 'a73817805eb1d44ca88eb5cb794c7de7',
            'original_reads_file_2_md5': 'd468360c689d482b227256d887a05996',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'withdrawn': 0,
            'import_status': 0,
            'submission_date': datetime.date(2017, 12, 26),
            'instrument_model': 'Illumina HiSeq 2000',
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 1,
        }]

        self.assertEqual(expected_seqrep, got_seqrep)
        shutil.rmtree(dropbox_dir)
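
Taken together, the examples exercise only a small slice of SpreadsheetImporter. The rough skeleton below is reconstructed from these tests alone: the constructor arguments and the lock_file/run() behaviour come from Examples #1, #6 and #7, but the lock-file name, the error message, and everything inside _import_reads_and_update_db are assumptions, and the real class clearly does far more (spreadsheet parsing, database backup and updates, TSV output).

import os


class SpreadsheetImporter:
    """Skeleton inferred from the tests above; not the real implementation."""

    def __init__(self, dropbox_dir, xlsx_file, db_ini_file, archive_dir,
                 tsv_file, db_backup_dir=None):
        self.dropbox_dir = dropbox_dir
        self.xlsx_file = xlsx_file
        self.db_ini_file = db_ini_file
        self.archive_dir = archive_dir
        self.tsv_file = tsv_file
        self.db_backup_dir = db_backup_dir
        # Example #1 shows run() must refuse to start when this file exists;
        # the exact filename is a guess.
        self.lock_file = os.path.join(dropbox_dir, "import.lock")

    def _import_reads_and_update_db(self):
        # Per Examples #6/#7: back up the database, parse the spreadsheet,
        # write the import-jobs TSV next to the archived xlsx, update the
        # Sample/Seqrep tables, and archive the xlsx via _archive_spreadsheet.
        raise NotImplementedError

    def run(self):
        if os.path.exists(self.lock_file):
            raise Exception("Lock file found: " + self.lock_file)
        try:
            with open(self.lock_file, "w"):
                pass
            self._import_reads_and_update_db()
        finally:
            if os.path.exists(self.lock_file):
                os.unlink(self.lock_file)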