def _archive_spreadsheet(cls, xlsx_file, archive_dir):
        date_string = utils.date_string_from_file_mtime(xlsx_file)
        date_directory = os.path.join(archive_dir, date_string)
        if not os.path.exists(date_directory):
            try:
                os.mkdir(date_directory)
            except OSError as error:
                raise Exception("Error mkdir " + date_directory) from error

        xlsx_basename = os.path.basename(xlsx_file)
        existing_xlsx_files = set(os.listdir(date_directory))

        # this is unlikely to happen, but if the name of the archived xlsx
        # file already exists, append .N to the end (where N is the smallest
        # integer such that the file does not already exist)
        if xlsx_basename in existing_xlsx_files:
            i = 1
            while xlsx_basename + "." + str(i) in existing_xlsx_files:
                i += 1
            xlsx_basename += "." + str(i)

        new_name = os.path.join(date_directory, xlsx_basename)
        utils.rsync_and_md5(xlsx_file, new_name)
        os.unlink(xlsx_file)
        return new_name
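The `_archive_spreadsheet` method above files each spreadsheet into a per-date subdirectory named by `utils.date_string_from_file_mtime`, which is not shown on this page. A minimal sketch of what such a helper might look like, assuming it derives a date string from the file's modification time (the YYYYMMDD format is an assumption, not taken from the source):

import datetime
import os


def date_string_from_file_mtime(filename):
    # Build a date string from the file's modification time; the YYYYMMDD
    # format used here is an assumption, not confirmed by the source.
    mtime = os.path.getmtime(filename)
    return datetime.datetime.fromtimestamp(mtime).strftime("%Y%m%d")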
Example 2
    def _tidy_files(self):
        shutil.rmtree(os.path.join(self.cortex_outdir, 'tmp_filelists'))

        for filename in glob.glob(
                os.path.join(self.cortex_outdir, 'binaries', 'uncleaned', '**',
                             '**')):
            if not (filename.endswith('log') or filename.endswith('.covg')):
                os.unlink(filename)

        for filename in glob.glob(
                os.path.join(self.cortex_outdir, 'calls', '**')):
            if os.path.isdir(filename):
                for filename2 in os.listdir(filename):
                    if not filename2.endswith('log'):
                        os.unlink(os.path.join(filename, filename2))
            elif not (filename.endswith('log')
                      or filename.endswith('callsets.genotyped')):
                os.unlink(filename)

        for filename in glob.glob(
                os.path.join(self.cortex_outdir, 'vcfs', '**')):
            if filename.endswith('.vcf'):
                tmp_vcf = filename + '.tmp'
                _replace_sample_name_in_vcf(filename, tmp_vcf,
                                            self.sample_name)
                utils.rsync_and_md5(tmp_vcf, filename)
                os.unlink(tmp_vcf)

            if not ((filename.endswith('.vcf') and 'FINAL' in filename)
                    or filename.endswith('log')
                    or filename.endswith('aligned_branches')):
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.unlink(filename)
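This example (and its double-quoted duplicate in Example 3 below) relies on a module-level helper, `_replace_sample_name_in_vcf`, whose definition is not shown on this page. A minimal sketch of what it might do, assuming a single-sample VCF whose sample name is the last column of the `#CHROM` header line (the single-sample assumption is mine, not confirmed by the source):

def _replace_sample_name_in_vcf(vcf_in, vcf_out, sample_name):
    # Rewrite the final column of the "#CHROM ..." header line, which holds
    # the sample name in a single-sample VCF, and copy every other line through.
    with open(vcf_in) as f_in, open(vcf_out, "w") as f_out:
        for line in f_in:
            if line.startswith("#CHROM"):
                fields = line.rstrip("\n").split("\t")
                fields[-1] = sample_name
                line = "\t".join(fields) + "\n"
            f_out.write(line)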
Example 3
    def _tidy_files(self):
        shutil.rmtree(os.path.join(self.cortex_outdir, "tmp_filelists"))

        for filename in glob.glob(
                os.path.join(self.cortex_outdir, "binaries", "uncleaned", "**",
                             "**")):
            if not (filename.endswith("log") or filename.endswith(".covg")):
                os.unlink(filename)

        for filename in glob.glob(
                os.path.join(self.cortex_outdir, "calls", "**")):
            if os.path.isdir(filename):
                for filename2 in os.listdir(filename):
                    if not filename2.endswith("log"):
                        os.unlink(os.path.join(filename, filename2))
            elif not (filename.endswith("log")
                      or filename.endswith("callsets.genotyped")):
                os.unlink(filename)

        for filename in glob.glob(
                os.path.join(self.cortex_outdir, "vcfs", "**")):
            if filename.endswith(".vcf"):
                tmp_vcf = filename + ".tmp"
                _replace_sample_name_in_vcf(filename, tmp_vcf,
                                            self.sample_name)
                utils.rsync_and_md5(tmp_vcf, filename)
                os.unlink(tmp_vcf)

            if not ((filename.endswith(".vcf") and "FINAL" in filename)
                    or filename.endswith("log")
                    or filename.endswith("aligned_branches")):
                if os.path.isdir(filename):
                    shutil.rmtree(filename)
                else:
                    os.unlink(filename)
    def _import_reads_and_update_db(self):
        database = db.Db(self.db_ini_file)
        data = spreadsheet_helper.load_data_from_spreadsheet(self.xlsx_file)
        xlsx_dir = os.path.dirname(self.xlsx_file)
        data_errors = SpreadsheetImporter._validate_data(
            database, data, self.dropbox_dir)

        if len(data_errors) > 0:
            raise Exception("Error(s) importing spreadsheet:\n" +
                            "\n".join(data_errors))

        try:
            f_out = open(self.jobs_outfile, "w")
        except OSError as error:
            raise Exception('Error opening file "' + self.jobs_outfile +
                            '". Cannot continue') from error

        print(
            "seqrep_id",
            "sample_id",
            "isolate_id",
            "sequence_replicate_number",
            "reads1",
            "reads2",
            "reads1_md5",
            "reads2_md5",
            sep="\t",
            file=f_out,
        )

        for data_dict in data:
            reads1 = os.path.join(xlsx_dir, data_dict["reads_file_1"])
            reads2 = os.path.join(xlsx_dir, data_dict["reads_file_2"])
            assert os.path.exists(reads1) and os.path.exists(reads2)
            seqrep_id, isolate_id, sample_id = database.add_one_seqrep(
                data_dict)
            print(
                seqrep_id,
                sample_id,
                isolate_id,
                data_dict["sequence_replicate_number"],
                reads1,
                reads2,
                data_dict["reads_file_1_md5"],
                data_dict["reads_file_2_md5"],
                sep="\t",
                file=f_out,
            )

        f_out.close()
        xlsx_backup_file = SpreadsheetImporter._archive_spreadsheet(
            self.xlsx_file, self.xlsx_archive_dir)
        jobs_backup_file = xlsx_backup_file + ".import_jobs.tsv"
        assert not os.path.exists(jobs_backup_file)
        utils.rsync_and_md5(self.jobs_outfile, jobs_backup_file)
        database.commit_and_close()

        if self.db_backup_dir is not None:
            database.backup(self.db_backup_dir)
Example 5
    def add_remove_contam_metadata_tsv(self, infile):
        utils.rsync_and_md5(infile, self.remove_contam_metadata_tsv)
        data_by_group, sequence_is_contam = contam_remover.ContamRemover._load_metadata_file(
            self.remove_contam_metadata_tsv)
        names_in_contam = set(sequence_is_contam.keys())
        names_in_fasta = {}
        pyfastaq.tasks.lengths_from_fai(self.ref_fai, names_in_fasta)
        names_in_fasta = set(names_in_fasta.keys())
        if names_in_fasta != names_in_contam:
            raise Error("Mismatch in names between metadata tsv and fasta files")
    def _copy_reads_file(cls, old_name, new_name, expected_md5):
        old_name_md5 = utils.md5(old_name)
        if old_name_md5 != expected_md5:
            raise Error(
                "MD5 given by submitter "
                + expected_md5
                + " does not match calculated MD5 "
                + old_name_md5
            )
        utils.rsync_and_md5(old_name, new_name, expected_md5)
Example 7
    def test_rsync_and_md5(self):
        '''test rsync_and_md5'''
        old_name = os.path.join(data_dir, 'rsync_and_md5.txt')
        new_name = 'tmp.test_rsync_and_md5'
        got = utils.rsync_and_md5(old_name, new_name)
        expected = 'a00096ee7316167c6ceef09f43433667'
        self.assertEqual(expected, got)
        self.assertTrue(filecmp.cmp(old_name, new_name, shallow=False))
        os.unlink(new_name)
Example 8
    def test_rsync_and_md5(self):
        """test rsync_and_md5"""
        old_name = os.path.join(data_dir, "rsync_and_md5.txt")
        new_name = "tmp.test_rsync_and_md5"
        got = utils.rsync_and_md5(old_name, new_name)
        expected = "a00096ee7316167c6ceef09f43433667"
        self.assertEqual(expected, got)
        self.assertTrue(filecmp.cmp(old_name, new_name, shallow=False))
        os.unlink(new_name)
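The `utils.rsync_and_md5` helper itself never appears on this page, but the two tests above pin down its contract: it copies `old_name` to `new_name` and returns the MD5 hex digest of the copied file, and Example 6 suggests a pre-computed MD5 can be passed as an optional third argument. A minimal sketch under those assumptions (shelling out to rsync and verifying the copy are my choices, not the project's confirmed implementation):

import hashlib
import subprocess


def md5(filename):
    # Hash the file in chunks so large read files never sit fully in memory.
    h = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def rsync_and_md5(old_name, new_name, md5sum=None):
    # Copy the file with rsync, then verify the copy against the expected
    # (or freshly computed) MD5 and return that digest, as the tests above expect.
    if md5sum is None:
        md5sum = md5(old_name)
    subprocess.check_call(["rsync", old_name, new_name])
    new_md5 = md5(new_name)
    if new_md5 != md5sum:
        raise RuntimeError("MD5 mismatch copying " + old_name + " to " + new_name)
    return new_md5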
Example 9
    def setup_files(self, species, panel_name, probes_fasta, var_to_res_json):
        try:
            os.mkdir(self.root_dir)
        except OSError as error:
            raise Exception("Error mkdir " + self.root_dir) from error

        self.metadata = {
            "species": species,
            "name": panel_name,
            "is_built_in": species in built_in_panels
            and panel_name in built_in_panels[species],
        }

        with open(self.json_file, "w") as f:
            print(json.dumps(self.metadata), file=f)

        if not self.metadata["is_built_in"]:
            utils.rsync_and_md5(probes_fasta, self.probes_fasta)
            utils.rsync_and_md5(var_to_res_json, self.var_to_res_json)
Example 10
    def setup_files(self, species, panel_name, probes_fasta, var_to_res_json):
        try:
            os.mkdir(self.root_dir)
        except OSError as error:
            raise Error('Error mkdir ' + self.root_dir) from error

        self.metadata = {
            'species': species,
            'name': panel_name,
            'is_built_in': species in built_in_panels
            and panel_name in built_in_panels[species],
        }

        with open(self.json_file, 'w') as f:
            print(json.dumps(self.metadata), file=f)

        if not self.metadata['is_built_in']:
            utils.rsync_and_md5(probes_fasta, self.probes_fasta)
            utils.rsync_and_md5(var_to_res_json, self.var_to_res_json)
Example 11
    def _test_run(self, original_reads1, original_reads2, expected_import_status):
        '''test run'''
        pipeline_root = 'tmp.read_pair_importer.run.root'
        if os.path.exists(pipeline_root):
            shutil.rmtree(pipeline_root)
        os.mkdir(pipeline_root)
        seqrep_id = 1
        sample = 3
        isolate = 2
        sequence_replicate_number = 42

        # copy the reads because the pipeline will delete them later
        reads1 = 'tmp.read_pair_importer.reads1.fq'
        reads2 = 'tmp.read_pair_importer.reads2.fq'
        md5_1 = utils.rsync_and_md5(original_reads1, reads1)
        md5_2 = utils.rsync_and_md5(original_reads2, reads2)

        # write an md5 file, to check it gets deleted later
        md5_file = reads1 + '.md5'
        utils.syscall('md5sum ' + reads1 + ' > ' + md5_file)

        importer = read_pair_importer.ReadPairImporter(ini_file, pipeline_root,
            seqrep_id, isolate, sample, sequence_replicate_number, reads1, reads2, md5_1, md5_2)

        # no row in Seqrep table
        with self.assertRaises(read_pair_importer.Error):
            importer.run()

        seqrep_row = {'seqrep_id': seqrep_id, 'isolate_id': isolate, 'sequence_replicate_number' : sequence_replicate_number,
            'original_reads_file_1_md5': md5_1, 'original_reads_file_2_md5': md5_2,
            'remove_contam_reads_file_1_md5': None, 'remove_contam_reads_file_2_md5': None,
            'withdrawn': 0, 'import_status': 0, 'instrument_model': 'Illumina HiSeq 2000',
            'submission_date': '20170101', 'submit_to_ena': 0, 'ena_run_accession': None, 'ena_on_hold': 0}
        self.db.add_row_to_table('Seqrep', seqrep_row)
        self.db.commit()

        # need to create a new object so the database changes get picked up.
        # Separate connections do not see changes made by each other.
        importer = read_pair_importer.ReadPairImporter(ini_file, pipeline_root,
            seqrep_id, isolate, sample, sequence_replicate_number, reads1, reads2, md5_1, md5_2)

        # reads file doesn't exist
        importer.reads_file_1 = 'oops'
        with self.assertRaises(read_pair_importer.Error):
            importer.run()
        importer.reads_file_1 = reads1

        # check lock file works
        iso_dir = isolate_dir.IsolateDir(pipeline_root, sample, isolate)
        iso_dir.make_essential_dirs()
        lock_file = os.path.join(iso_dir.reads_dir, 'import_lock.' + str(seqrep_id))
        utils.make_empty_file(lock_file)
        with self.assertRaises(read_pair_importer.Error):
            importer.run()
        os.unlink(lock_file)

        # should run ok
        where_query = ' and '.join(['sample_id=' + str(sample), 'isolate_number=' + str(isolate), 'sequence_replicate_number=' + str(seqrep_id)])
        rows = self.db.get_rows_from_table('Seqrep', where='seqrep_id=' + str(seqrep_id))
        self.assertEqual(1, len(rows))
        self.assertEqual(0, rows[0]['import_status'])

        importer = read_pair_importer.ReadPairImporter(ini_file, pipeline_root,
            seqrep_id, isolate, sample, sequence_replicate_number, reads1, reads2, md5_1, md5_2)
        importer.run()
        # reconnect so that we pick up the changes made by the previous line
        self.db.reconnect()
        reads_out_1 = iso_dir.reads_filename('original', sequence_replicate_number, 1)
        reads_out_2 = iso_dir.reads_filename('original', sequence_replicate_number, 2)
        rows = self.db.get_rows_from_table('Seqrep', where='seqrep_id=' + str(seqrep_id))
        self.assertEqual(1, len(rows))
        self.assertEqual(expected_import_status, rows[0]['import_status'])

        # Files are either copied/deleted or not, depending on whether the import was successful
        if expected_import_status == 1:
            self.assertTrue(os.path.exists(reads_out_1))
            self.assertTrue(os.path.exists(reads_out_2))
            self.assertFalse(os.path.exists(reads1))
            self.assertFalse(os.path.exists(reads2))
            self.assertFalse(os.path.exists(md5_file))
        else:
            self.assertFalse(os.path.exists(reads_out_1))
            self.assertFalse(os.path.exists(reads_out_2))
            self.assertTrue(os.path.exists(reads1))
            self.assertTrue(os.path.exists(reads2))
            self.assertTrue(os.path.exists(md5_file))
            os.unlink(reads1)
            os.unlink(reads2)
            os.unlink(md5_file)

        shutil.rmtree(pipeline_root)
Example 12
    def _test_run(self, original_reads1, original_reads2,
                  expected_import_status):
        """test run"""
        pipeline_root = "tmp.read_pair_importer.run.root"
        if os.path.exists(pipeline_root):
            shutil.rmtree(pipeline_root)
        os.mkdir(pipeline_root)
        seqrep_id = 1
        sample = 3
        isolate = 2
        sequence_replicate_number = 42

        # copy the reads because the pipeline will delete them later
        reads1 = "tmp.read_pair_importer.reads1.fq"
        reads2 = "tmp.read_pair_importer.reads2.fq"
        md5_1 = utils.rsync_and_md5(original_reads1, reads1)
        md5_2 = utils.rsync_and_md5(original_reads2, reads2)

        # write an md5 file, to check it gets deleted later
        md5_file = reads1 + ".md5"
        utils.syscall("md5sum " + reads1 + " > " + md5_file)

        importer = read_pair_importer.ReadPairImporter(
            ini_file,
            pipeline_root,
            seqrep_id,
            isolate,
            sample,
            sequence_replicate_number,
            reads1,
            reads2,
            md5_1,
            md5_2,
        )

        # no row in Seqrep table
        with self.assertRaises(read_pair_importer.Error):
            importer.run()

        seqrep_row = {
            "seqrep_id": seqrep_id,
            "isolate_id": isolate,
            "sequence_replicate_number": sequence_replicate_number,
            "original_reads_file_1_md5": md5_1,
            "original_reads_file_2_md5": md5_2,
            "remove_contam_reads_file_1_md5": None,
            "remove_contam_reads_file_2_md5": None,
            "withdrawn": 0,
            "import_status": 0,
            "instrument_model": "Illumina HiSeq 2000",
            "submission_date": "20170101",
            "submit_to_ena": 0,
            "ena_run_accession": None,
            "ena_on_hold": 0,
        }
        self.db.add_row_to_table("Seqrep", seqrep_row)
        self.db.commit()

        # need to create a new object so the database changes get picked up.
        # Separate connections do not see changes made by each other.
        importer = read_pair_importer.ReadPairImporter(
            ini_file,
            pipeline_root,
            seqrep_id,
            isolate,
            sample,
            sequence_replicate_number,
            reads1,
            reads2,
            md5_1,
            md5_2,
        )

        # reads file doesn't exist
        importer.reads_file_1 = "oops"
        with self.assertRaises(read_pair_importer.Error):
            importer.run()
        importer.reads_file_1 = reads1

        # check lock file works
        iso_dir = isolate_dir.IsolateDir(pipeline_root, sample, isolate)
        iso_dir.make_essential_dirs()
        lock_file = os.path.join(iso_dir.reads_dir,
                                 "import_lock." + str(seqrep_id))
        utils.make_empty_file(lock_file)
        with self.assertRaises(read_pair_importer.Error):
            importer.run()
        os.unlink(lock_file)

        # should run ok
        where_query = " and ".join([
            "sample_id=" + str(sample),
            "isolate_number=" + str(isolate),
            "sequence_replicate_number=" + str(seqrep_id),
        ])
        rows = self.db.get_rows_from_table("Seqrep",
                                           where="seqrep_id=" + str(seqrep_id))
        self.assertEqual(1, len(rows))
        self.assertEqual(0, rows[0]["import_status"])

        importer = read_pair_importer.ReadPairImporter(
            ini_file,
            pipeline_root,
            seqrep_id,
            isolate,
            sample,
            sequence_replicate_number,
            reads1,
            reads2,
            md5_1,
            md5_2,
        )
        importer.run()
        # reconnect so that we pick up the changes made by the previous line
        self.db.reconnect()
        reads_out_1 = iso_dir.reads_filename("original",
                                             sequence_replicate_number, 1)
        reads_out_2 = iso_dir.reads_filename("original",
                                             sequence_replicate_number, 2)
        rows = self.db.get_rows_from_table("Seqrep",
                                           where="seqrep_id=" + str(seqrep_id))
        self.assertEqual(1, len(rows))
        self.assertEqual(expected_import_status, rows[0]["import_status"])

        # Files are either copied/deleted or not, depending on whether the import was successful
        if expected_import_status == 1:
            self.assertTrue(os.path.exists(reads_out_1))
            self.assertTrue(os.path.exists(reads_out_2))
            self.assertFalse(os.path.exists(reads1))
            self.assertFalse(os.path.exists(reads2))
            self.assertFalse(os.path.exists(md5_file))
        else:
            self.assertFalse(os.path.exists(reads_out_1))
            self.assertFalse(os.path.exists(reads_out_2))
            self.assertTrue(os.path.exists(reads1))
            self.assertTrue(os.path.exists(reads2))
            self.assertTrue(os.path.exists(md5_file))
            os.unlink(reads1)
            os.unlink(reads2)
            os.unlink(md5_file)

        shutil.rmtree(pipeline_root)