Example #1
    def test_check_and_report_dependencies(self):
        """test check_and_report_dependencies"""
        tmpfile = "tmp.check_and_report_dependencies.out"
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
        dependencies.check_and_report_dependencies(outfile=tmpfile)
        # We don't know what the versions etc. will be, so just
        # check that the file got written
        self.assertTrue(os.path.exists(tmpfile))
        os.unlink(tmpfile)

        os.environ["MINOS_BWA"] = "oops_this_is_wrong"
        with self.assertRaises(Exception):
            dependencies.check_and_report_dependencies(outfile=tmpfile)
        del os.environ["MINOS_BWA"]
        self.assertTrue(os.path.exists(tmpfile))
        os.unlink(tmpfile)
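
For context, a minimal sketch of what a checker like check_and_report_dependencies might look like, assuming (as the test above suggests) that a MINOS_BWA-style environment variable can override the binary that gets looked up. This is an illustration under those assumptions, not the actual minos implementation:

import os
import shutil


def check_and_report_dependencies(outfile, programs=("bwa",)):
    """Hypothetical sketch: resolve each program (honouring a
    MINOS_<NAME> environment variable override), write one report
    line per program, then raise if any binary was not found."""
    missing = []
    with open(outfile, "w") as f:
        for program in programs:
            # e.g. MINOS_BWA overrides which bwa binary is checked
            binary = os.environ.get("MINOS_" + program.upper(), program)
            path = shutil.which(binary)
            print(program, path if path is not None else "NOT_FOUND",
                  sep="\t", file=f)
            if path is None:
                missing.append(program)
    # The report file exists even on failure, matching the test above
    if missing:
        raise Exception("Dependencies not found: " + ", ".join(missing))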
Example #2
    def test_check_and_report_dependencies(self):
        '''test check_and_report_dependencies'''
        tmpfile = 'tmp.check_and_report_dependencies.out'
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
        dependencies.check_and_report_dependencies(outfile=tmpfile)
        # We don't know what the versions etc. will be, so just
        # check that the file got written
        self.assertTrue(os.path.exists(tmpfile))
        os.unlink(tmpfile)

        os.environ['MINOS_BWA'] = 'oops_this_is_wrong'
        with self.assertRaises(dependencies.Error):
            dependencies.check_and_report_dependencies(outfile=tmpfile)
        del os.environ['MINOS_BWA']
        self.assertTrue(os.path.exists(tmpfile))
        os.unlink(tmpfile)
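
Unlike Example #1, this version expects the module-specific dependencies.Error rather than a bare Exception. Such module-level exception classes are conventionally declared as below; this is an assumption about the minos code, whose actual definition is not shown here:

class Error(Exception):
    pass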
Example #3
def run(options):
    dependencies.check_and_report_dependencies("-")
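
Passing "-" as the outfile presumably directs the report to stdout rather than a file, following the usual CLI convention. A sketch of a helper that could support that convention (a hypothetical illustration, not the actual minos code):

import sys
from contextlib import contextmanager


@contextmanager
def open_outfile(filename):
    # Treat "-" as stdout, per the common CLI convention
    if filename == "-":
        yield sys.stdout
    else:
        with open(filename, "w") as f:
            yield f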
Example #4
    def run(self):
        self._make_output_dir()
        fh = logging.FileHandler(self.log_file, mode='w')
        log = logging.getLogger()
        formatter = logging.Formatter(
            '[minos %(asctime)s %(levelname)s] %(message)s',
            datefmt='%d-%m-%Y %H:%M:%S')
        fh.setFormatter(formatter)
        log.addHandler(fh)
        dependencies.check_and_report_dependencies(programs=['nextflow'])

        self._prepare_nextflow_input_files()
        original_dir = os.getcwd()
        os.chdir(self.output_dir)
        nextflow_script = 'nextflow.run.nf'
        MultiSamplePipeline._write_nextflow_script(nextflow_script)
        logging.info('Prepared nextflow files. cd ' + self.output_dir)

        nextflow = dependencies.find_binary('nextflow')
        nextflow_command = [
            nextflow,
            'run',
            '-work-dir',
            self.nextflow_work_dir,
            '-with-dag',
            'nextflow.out.dag.pdf',
            '-with-trace',
            'nextflow.out.trace.txt',
        ]

        if self.nextflow_config_file is not None:
            nextflow_command.extend(['-c', self.nextflow_config_file])

        nextflow_command += [
            nextflow_script,
            '--ref_fasta',
            self.ref_fasta,
            '--data_in_tsv',
            self.nextflow_input_tsv,
            '--max_alleles_per_cluster',
            str(self.max_alleles_per_cluster),
            '--min_large_ref_length',
            str(self.min_large_ref_length),
            '--final_outdir',
            self.output_dir,
            '--gramtools_max_read_length',
            str(self.gramtools_max_read_length),
            '--cluster_small_vars_ram',
            str(self.nf_ram_cluster_small_vars),
            '--gramtools_build_small_vars_ram',
            str(self.nf_ram_gramtools_build_small),
            '--gramtools_kmer_size',
            str(self.gramtools_kmer_size),
            '--gramtools_build_threads',
            str(self.gramtools_build_threads),
            '--minos_small_vars_ram',
            str(self.nf_ram_minos_small_vars),
            '--merge_small_vars_ram',
            str(self.nf_ram_merge_small_vars),
        ]

        if self.testing:
            nextflow_command.append('--testing')

        if self.use_unmapped_reads:
            nextflow_command.append('--use_unmapped_reads')

        if self.variants_per_split is not None:
            nextflow_command.append('--variants_per_split ' +
                                    str(self.variants_per_split))
        if self.alleles_per_split is not None:
            nextflow_command.append('--alleles_per_split ' +
                                    str(self.alleles_per_split))
        elif self.total_splits is not None:
            nextflow_command.append('--total_splits ' + str(self.total_splits))

        nextflow_command = ' '.join(nextflow_command)

        if self.no_run:
            print(
                'Prepared nextflow pipeline. --no_run used, so not running. The nextflow command to run is:'
            )
            print(nextflow_command)
            return
        else:
            logging.info('Start running nextflow: ' + nextflow_command)
            syscall_process = utils.syscall(nextflow_command)
            logging.info(
                'Finish running nextflow. Writing nextflow stdout/stderr to files'
            )
            with open('nextflow.stdout', 'w') as f:
                print(syscall_process.stdout.rstrip(), file=f)
            with open('nextflow.stderr', 'w') as f:
                print(syscall_process.stderr.rstrip(), file=f)

            logging.info('cd ' + original_dir)

        if self.clean:
            logging.info('Delete nextflow work directory ' +
                         self.nextflow_work_dir)
            shutil.rmtree(self.nextflow_work_dir)
            logging.info('Delete .nextflow directory')
            shutil.rmtree('.nextflow')

        logging.info('Rename .nextflow.log -> nextflow.log')
        os.rename('.nextflow.log', 'nextflow.log')
        os.chdir(original_dir)
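
The utils.syscall call above returns an object with stdout and stderr attributes. A minimal sketch of such a wrapper built on subprocess.run (an assumption about minos's utils module, shown for illustration only):

import subprocess


def syscall(command):
    """Hypothetical sketch: run a shell command, capture its output,
    and fail loudly on a non-zero exit code."""
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            "Command failed with exit code "
            + str(completed.returncode) + ": " + command
            + "\n" + completed.stderr
        )
    return completed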
Example #5
    def run(self):
        if os.path.exists(self.outdir) and self.overwrite_outdir:
            shutil.rmtree(self.outdir)

        try:
            os.mkdir(self.outdir)
        except OSError as error:
            raise Error('Error making output directory ' + self.outdir) from error

        fh = logging.FileHandler(self.log_file, mode='w')
        log = logging.getLogger()
        formatter = logging.Formatter(
            '[minos %(asctime)s %(levelname)s] %(message)s',
            datefmt='%d-%m-%Y %H:%M:%S')
        fh.setFormatter(formatter)
        log.addHandler(fh)
        logging.info('Command run: ' + ' '.join(sys.argv))
        dependencies.check_and_report_dependencies(programs=['gramtools'])
        logging.info('Dependencies look OK')

        if self.read_error_rate is None or self.max_read_length is None:
            logging.info(
                'One or both of read_error_rate and max_read_length not known. Estimate from first 10,000 reads...'
            )
            (
                estimated_read_length,
                estimated_read_error_rate,
            ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                self.reads_files[0])
            logging.info('Estimated max_read_length=' +
                         str(estimated_read_length) + ' and read_error_rate=' +
                         str(estimated_read_error_rate))

        self.read_error_rate = (estimated_read_error_rate
                                if self.read_error_rate is None else
                                self.read_error_rate)
        self.max_read_length = (estimated_read_length
                                if self.max_read_length is None else
                                self.max_read_length)
        logging.info('Using max_read_length=' + str(self.max_read_length) +
                     ' and read_error_rate=' + str(self.read_error_rate))

        if self.user_supplied_gramtools_build_dir:
            logging.info(
                'User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering'
            )
            assert len(self.vcf_files) == 1
            self.clustered_vcf = self.vcf_files[0]
        else:
            logging.info(
                'Clustering VCF file(s), to make one VCF input file for gramtools'
            )
            clusterer = vcf_clusterer.VcfClusterer(
                self.vcf_files,
                self.ref_fasta,
                self.clustered_vcf,
                max_distance_between_variants=1,
                max_alleles_per_cluster=self.max_alleles_per_cluster,
            )
            clusterer.run()

            logging.info('Finished clustering VCF file(s)')

        if not vcf_file_read.vcf_file_has_at_least_one_record(
                self.clustered_vcf):
            error_message = 'No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant'
            logging.error(error_message)
            raise Error(error_message)

        if (self.total_splits is not None
                or self.variants_per_split is not None
                or self.alleles_per_split is not None
                or os.path.exists(
                    os.path.join(self.split_input_dir, 'data.pickle'))):
            self._run_gramtools_with_split_vcf()
        else:
            self._run_gramtools_not_split_vcf()

        logging.info('Making plots from final.vcf')
        plots.plots_from_minos_vcf(self.final_vcf, self.plots_prefix)

        logging.info('All done! Thank you for using minos :)')
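
The read error rate above is estimated from FASTQ quality scores. For reference, a minimal sketch of that idea using the standard Phred conversion, error probability = 10^(-Q/10). This is a simplified stand-in for minos's utils.estimate_max_read_length_and_read_error_rate_from_qual_scores, assuming plain 4-line FASTQ records with Phred+33 encoding:

def estimate_from_fastq(filename, max_reads=10000):
    """Hypothetical sketch: scan the first max_reads records of a
    FASTQ file and return (max read length, mean base error rate)."""
    max_length = 0
    error_sum = 0.0
    base_count = 0
    with open(filename) as f:
        for i, line in enumerate(f):
            if i // 4 >= max_reads:
                break
            if i % 4 == 3:  # quality line of each 4-line record
                quals = line.rstrip("\n")
                max_length = max(max_length, len(quals))
                for char in quals:
                    q = ord(char) - 33  # Phred+33 encoding assumed
                    error_sum += 10 ** (-q / 10)
                base_count += len(quals)
    error_rate = error_sum / base_count if base_count else 0.0
    return max_length, error_rate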
Example #6
    def run(self):
        self._make_output_dir()
        fh = logging.FileHandler(self.log_file, mode="w")
        log = logging.getLogger()
        formatter = logging.Formatter(
            "[minos %(asctime)s %(levelname)s] %(message)s",
            datefmt="%d-%m-%Y %H:%M:%S")
        fh.setFormatter(formatter)
        log.addHandler(fh)
        dependencies.check_and_report_dependencies(programs=["nextflow"])

        self._prepare_nextflow_input_files()
        original_dir = os.getcwd()
        os.chdir(self.output_dir)
        nextflow_script = "nextflow.run.nf"
        MultiSamplePipeline._write_nextflow_script(nextflow_script)
        logging.info("Prepared nextflow files. cd " + self.output_dir)

        nextflow = dependencies.find_binary("nextflow")
        nextflow_command = [
            nextflow,
            "run",
            "-work-dir",
            self.nextflow_work_dir,
            "-with-dag",
            "nextflow.out.dag.pdf",
            "-with-trace",
            "nextflow.out.trace.txt",
        ]

        if self.nextflow_config_file is not None:
            nextflow_command.extend(["-c", self.nextflow_config_file])

        nextflow_command += [
            nextflow_script,
            "--ref_fasta",
            self.ref_fasta,
            "--data_in_tsv",
            self.nextflow_input_tsv,
            "--max_alleles_per_cluster",
            str(self.max_alleles_per_cluster),
            "--min_large_ref_length",
            str(self.min_large_ref_length),
            "--final_outdir",
            self.output_dir,
            "--gramtools_max_read_length",
            str(self.gramtools_max_read_length),
            "--cluster_small_vars_ram",
            str(self.nf_ram_cluster_small_vars),
            "--gramtools_build_small_vars_ram",
            str(self.nf_ram_gramtools_build_small),
            "--gramtools_kmer_size",
            str(self.gramtools_kmer_size),
            "--gramtools_build_threads",
            str(self.gramtools_build_threads),
            "--minos_small_vars_ram",
            str(self.nf_ram_minos_small_vars),
            "--merge_small_vars_ram",
            str(self.nf_ram_merge_small_vars),
        ]

        if self.testing:
            nextflow_command.append("--testing")

        if self.use_unmapped_reads:
            nextflow_command.append("--use_unmapped_reads")

        if self.variants_per_split is not None:
            nextflow_command.append("--variants_per_split " +
                                    str(self.variants_per_split))
        if self.alleles_per_split is not None:
            nextflow_command.append("--alleles_per_split " +
                                    str(self.alleles_per_split))
        elif self.total_splits is not None:
            nextflow_command.append("--total_splits " + str(self.total_splits))

        nextflow_command = " ".join(nextflow_command)

        if self.no_run:
            print(
                "Prepared nextflow pipeline. --no_run used, so not running. The nextflow command to run is:"
            )
            print(nextflow_command)
            return
        else:
            logging.info("Start running nextflow: " + nextflow_command)
            syscall_process = utils.syscall(nextflow_command)
            logging.info(
                "Finish running nextflow. Writing nextflow stdout/stderr to files"
            )
            with open("nextflow.stdout", "w") as f:
                print(syscall_process.stdout.rstrip(), file=f)
            with open("nextflow.stderr", "w") as f:
                print(syscall_process.stderr.rstrip(), file=f)

            logging.info("cd " + original_dir)

        if self.clean:
            logging.info("Delete nextflow work directory " +
                         self.nextflow_work_dir)
            shutil.rmtree(self.nextflow_work_dir)
            logging.info("Delete .nextflow directory")
            shutil.rmtree(".nextflow")

        logging.info("Rename .nextflow.log -> nextflow.log")
        os.rename(".nextflow.log", "nextflow.log")
        os.chdir(original_dir)
Example #7
    def run(self):
        self.build_output_dir()

        fh = logging.FileHandler(self.log_file, mode="w")
        log = logging.getLogger()
        formatter = logging.Formatter(
            "[minos %(asctime)s %(levelname)s] %(message)s",
            datefmt="%d-%m-%Y %H:%M:%S")
        fh.setFormatter(formatter)
        log.addHandler(fh)
        logging.info("Command run: " + " ".join(sys.argv))
        to_check = [
            "gramtools",
            "vcfbreakmulti",
            "vcfallelicprimitives",
            "vcfuniq",
            "vt",
        ]
        dependencies.check_and_report_dependencies(programs=to_check)
        logging.info("Dependencies look OK")

        self.ref_fasta = os.path.join(self.outdir, "ref.fa")
        utils.fasta_to_upper_and_ACGT_only(self.original_ref_fasta,
                                           self.ref_fasta)

        if self.read_error_rate is None:
            logging.info(
                "read_error_rate unknown. Estimate from first 10,000 reads...")
            (
                estimated_read_length,
                estimated_read_error_rate,
            ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                self.reads_files[0])
            logging.info(
                f"Estimated read_error_rate={estimated_read_error_rate}")

            self.read_error_rate = (estimated_read_error_rate
                                    if self.read_error_rate is None else
                                    self.read_error_rate)
            logging.info(f"Using read_error_rate={self.read_error_rate}")

        if self.user_supplied_gramtools_build_dir:
            logging.info(
                "User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering"
            )
            assert len(self.vcf_files) == 1
            self.clustered_vcf = self.vcf_files[0]
        elif not self.cluster_input_vcfs:
            logging.info(
                "Skipping VCF clustering because user requested to skip")
        else:
            logging.info(
                "Clustering VCF file(s), to make one VCF input file for gramtools"
            )
            tracker = variant_tracking.VariantTracker(self.cluster_dir,
                                                      self.ref_fasta)
            tracker.merge_vcf_files(self.vcf_files)
            tracker.cluster(self.clustered_vcf_prefix,
                            float("Inf"),
                            max_alleles=5000)
            if not self.debug:
                os.unlink(f"{self.clustered_vcf_prefix}.excluded.tsv")
                utils.rm_rf(self.cluster_dir)
            logging.info("Finished clustering VCF file(s)")

        if not vcf_file_read.vcf_file_has_at_least_one_record(
                self.clustered_vcf):
            error_message = "No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant"
            logging.error(error_message)
            raise Exception(error_message)

        if (self.total_splits is not None
                or self.variants_per_split is not None
                or self.alleles_per_split is not None or os.path.exists(
                    os.path.join(self.split_input_dir, "data.pickle"))):
            self._run_gramtools_with_split_vcf()
        else:
            self._run_gramtools_not_split_vcf()

        logging.info("All done! Thank you for using minos :)")
Example #8
    def run(self):
        self.build_output_dir()

        fh = logging.FileHandler(self.log_file, mode="w")
        log = logging.getLogger()
        formatter = logging.Formatter(
            "[minos %(asctime)s %(levelname)s] %(message)s",
            datefmt="%d-%m-%Y %H:%M:%S")
        fh.setFormatter(formatter)
        log.addHandler(fh)
        logging.info("Command run: " + " ".join(sys.argv))
        dependencies.check_and_report_dependencies(programs=["gramtools"])
        logging.info("Dependencies look OK")

        if self.read_error_rate is None or self.max_read_length is None:
            logging.info(
                "One or both of read_error_rate and max_read_length not known. Estimate from first 10,000 reads..."
            )
            (
                estimated_read_length,
                estimated_read_error_rate,
            ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                self.reads_files[0])
            logging.info("Estimated max_read_length=" +
                         str(estimated_read_length) + " and read_error_rate=" +
                         str(estimated_read_error_rate))

            self.read_error_rate = (estimated_read_error_rate
                                    if self.read_error_rate is None else
                                    self.read_error_rate)
            self.max_read_length = (estimated_read_length
                                    if self.max_read_length is None else
                                    self.max_read_length)
        logging.info("Using max_read_length=" + str(self.max_read_length) +
                     " and read_error_rate=" + str(self.read_error_rate))

        if self.user_supplied_gramtools_build_dir:
            logging.info(
                "User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering"
            )
            assert len(self.vcf_files) == 1
            self.clustered_vcf = self.vcf_files[0]
        else:
            logging.info(
                "Clustering VCF file(s), to make one VCF input file for gramtools"
            )
            clusterer = vcf_clusterer.VcfClusterer(
                self.vcf_files,
                self.ref_fasta,
                self.clustered_vcf,
                cluster_boundary_size=0,
                max_alleles_per_cluster=self.max_alleles_per_cluster,
            )
            clusterer.run()

            logging.info("Finished clustering VCF file(s)")

        if not vcf_file_read.vcf_file_has_at_least_one_record(
                self.clustered_vcf):
            error_message = "No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant"
            logging.error(error_message)
            raise Exception(error_message)

        if (self.total_splits is not None
                or self.variants_per_split is not None
                or self.alleles_per_split is not None or os.path.exists(
                    os.path.join(self.split_input_dir, "data.pickle"))):
            self._run_gramtools_with_split_vcf()
        else:
            self._run_gramtools_not_split_vcf()

        logging.info("Making plots from final.vcf")
        plots.plots_from_minos_vcf(self.final_vcf, self.plots_prefix)

        logging.info("All done! Thank you for using minos :)")