Example #1
    def _index_vcf(cls, vcffile):
        """Index VCF file"""
        bgzip_binary = dependencies.find_binary("bgzip")
        command = " ".join([bgzip_binary, "-c", vcffile, ">", vcffile + ".gz"])
        utils.syscall(command)

        tabix_binary = dependencies.find_binary("tabix")
        command = " ".join([tabix_binary, "-p", "vcf", vcffile + ".gz"])
        utils.syscall(command)
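Every example on this page passes a single shell-style command string, often with ">" redirection, to utils.syscall, and reads .stdout, .stderr, and .returncode off the result. The helper itself is not shown; here is a minimal sketch of what it plausibly looks like, assuming a subprocess.run wrapper with shell=True and the allow_fail and cwd keywords seen in later examples:

import subprocess

def syscall(command, allow_fail=False, cwd=None):
    # Hypothetical sketch of the utils.syscall helper used throughout.
    # shell=True because callers build command strings with ">" redirection.
    completed = subprocess.run(
        command,
        shell=True,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if completed.returncode != 0 and not allow_fail:
        raise RuntimeError(f"Error running command: {command}")
    return completed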
Example #2
def get_version_of_program(program, binary=None, allow_fail=False):
    if binary is None:
        binary = find_binary(program, allow_fail=allow_fail)

    if program == "bcftools":
        bcftools_process = utils.syscall(binary, allow_fail=True)
        for line in bcftools_process.stderr.split("\n"):
            # example version line:
            # Version: 1.3.1 (using htslib 1.3.1)
            if line.startswith("Version:"):
                try:
                    version = line.rstrip().split(maxsplit=1)[1]
                except IndexError:
                    return None
                return version
        return None
    elif program == "gramtools":
        gramtools_process = utils.syscall(binary + " --version")
        gramtools_json = json.loads(gramtools_process.stdout)
        return gramtools_json.get("version_number", None)
    elif program == "bwa":
        # To get version of BWA, need to run it with no options.
        # This returns an error code of 1, which we need to ignore
        bwa_process = utils.syscall(binary, allow_fail=True)
        for line in bwa_process.stderr.split("\n"):
            if line.strip().startswith("Version:"):
                try:
                    version = line.rstrip().split()[-1]
                except IndexError:
                    return None
                return version
        return None
    elif program == "dnadiff":
        dnadiff_process = utils.syscall(binary + " --version", allow_fail=True)
        for line in dnadiff_process.stderr.split("\n"):
            if line.strip().startswith("DNAdiff version"):
                try:
                    version = line.rstrip().split()[-1]
                except IndexError:
                    return None
                return version
        return None
    elif program == "nextflow":
        nextflow_process = utils.syscall(binary + " -version", allow_fail=True)
        # example line that we want to capture:
        #       version 0.27.6 build 4775
        for line in nextflow_process.stdout.split("\n"):
            if line.strip().startswith("version"):
                try:
                    version = line.rstrip().split(maxsplit=1)[1]
                except IndexError:
                    return None
                return version
        return None
    else:
        raise Exception('Program name "' + program +
                        '" not recognised. Cannot continue')
Example #3
 def _map_seqs_to_ref(cls, seqs_file, ref_file, outfile):
     """Map seqs_file to ref_file using BWA MEM.
     Output is SAM file written to outfile"""
     bwa_binary = dependencies.find_binary("bwa")
     command = " ".join([
         bwa_binary,
         "mem",
         "-a",  # report all mappings
         "-Y",  # use soft clipping for supplementary alignments
         ref_file,
         seqs_file,
         ">",
         outfile,
     ])
     utils.syscall(command)
Example #4
 def _map_seqs_to_ref(cls, seqs_file, ref_file, outfile):
     '''Map seqs_file to ref_file using BWA MEM.
     Output is SAM file written to outfile'''
     bwa_binary = dependencies.find_binary('bwa')
     command = ' '.join([
         bwa_binary,
         'mem',
         '-a',  # report all mappings
         '-Y',  # use soft clipping for supplementary alignments
         ref_file,
         seqs_file,
         '>',
         outfile,
     ])
     utils.syscall(command)
Example #5
def test_regenotype_pipeline():
    outdir = "tmp.nextflow_regeno_test.out"
    utils.rm_rf(outdir)
    os.mkdir(outdir)
    manifest = "tmp.nextflow_regeno_test.tsv"
    _write_manifest(os.path.join(outdir, manifest))

    regeno_nf = os.path.join(minos_dir, "nextflow", "regenotype.nf")
    regeno_config = os.path.join(minos_dir, "nextflow", "regenotype.config")
    dag = "tmp.nextflow_regeno_test.dag.pdf"
    ref_fasta = os.path.join(data_dir, "data.ref.fa")
    mask_bed = os.path.join(data_dir, "mask.bed")
    command = f"nextflow run -c {regeno_config} -profile tiny -with-dag {dag} {regeno_nf} --make_distance_matrix --mask_bed_file {mask_bed} --max_variants_per_sample 10 --ref_fasta {ref_fasta} --manifest {manifest} --outdir OUT"
    utils.syscall(command, cwd=outdir)

    expect_failed_samples = os.path.join(data_dir, "expect.failed_samples.txt")
    got_failed_samples = os.path.join(outdir, "OUT", "failed_samples.txt")
    assert filecmp.cmp(got_failed_samples,
                       expect_failed_samples,
                       shallow=False)

    expect_dist_matrix = os.path.join(data_dir, "expect.distance_matrix.txt")
    got_dist_matrix = os.path.join(outdir, "OUT", "distance_matrix.txt")
    assert filecmp.cmp(got_dist_matrix, expect_dist_matrix, shallow=False)

    # We don't know the order of lines in the manifest tsv, or the filename
    # that will be given to each sample. Load each VCF and check it matches
    # the sample name from the manifest. Also check that the info in the
    # json and tsv files matches.
    manifest_json = os.path.join(outdir, "OUT", "manifest.json")
    assert os.path.exists(manifest_json)
    manifest_tsv = os.path.join(outdir, "OUT", "manifest.tsv")

    with open(manifest_json) as f:
        manifest_data = json.load(f)

    with open(manifest_tsv) as f:
        reader = csv.DictReader(f, delimiter="\t")
        for d in reader:
            vcf = os.path.join(outdir, "OUT", d["vcf_file"])
            assert d["sample"] == vcf_file_read.get_sample_name_from_vcf_file(
                vcf)
            assert os.path.exists(os.path.join(outdir, "OUT", d["log_file"]))
            assert manifest_data[d["sample"]]["log_file"] == d["log_file"]
            assert manifest_data[d["sample"]]["vcf_file"] == d["vcf_file"]

    utils.rm_rf(outdir)
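Note the shallow=False in the filecmp.cmp calls above: with the default shallow=True, files that agree on their os.stat() signature (size, mtime) can compare equal without their contents ever being read, which is not what a regression test wants. For example:

import filecmp
# Compares byte-by-byte instead of trusting size/mtime alone.
same = filecmp.cmp("expected.txt", "got.txt", shallow=False)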
Example #6
def run_gramtools_build(outdir,
                        vcf_file,
                        ref_file,
                        max_read_length,
                        kmer_size=10):
    '''Runs gramtools build. Makes new directory called 'outdir' for
    the output'''
    gramtools_exe = dependencies.find_binary('gramtools')
    build_command = ' '.join([
        gramtools_exe,
        'build',
        '--gram-directory',
        outdir,
        '--vcf',
        vcf_file,
        '--reference',
        ref_file,
        '--max-read-length',
        str(max_read_length),
        '--all-kmers',
        '--kmer-size',
        str(kmer_size),
    ])
    logging.info('Running gramtools build: ' + build_command)
    completed_process = utils.syscall(build_command, allow_fail=True)
    logging.info('Finished running gramtools build. Return code: ' +
                 str(completed_process.returncode))
    build_report = os.path.join(outdir, 'build_report.json')
    ran_ok = _build_json_file_is_good(
        build_report) and completed_process.returncode == 0
    if not ran_ok:
        logging.info('Error running gramtools build. See build report file ' +
                     build_report)
        raise Error('Error running gramtools build: ' + build_command)

    # Older gramtools called the perl generated VCF file perl_generated_vcf.
    # New gramtools calls it perl_generated.vcf.
    # Whichever one doesn't exist, symlink it to the one that does
    cwd = os.getcwd()
    os.chdir(outdir)
    vcf1 = 'perl_generated_vcf'
    vcf2 = 'perl_generated.vcf'
    if os.path.exists(vcf1):
        assert not os.path.exists(vcf2)
        os.symlink(vcf1, vcf2)
    elif os.path.exists(vcf2):
        assert not os.path.exists(vcf1)
        os.symlink(vcf2, vcf1)
    else:
        message = f'Could not find perl generated VCF file in directory {outdir}. Looked for {vcf1}, {vcf2}. Cannot continue'
        logging.error(message)
        raise Error(message)
    os.chdir(cwd)

    logging.info('Build report file looks good from gramtools build: ' +
                 build_report)
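_build_json_file_is_good is referenced but not shown in these snippets. A minimal sketch of what such a check could look like, assuming it only needs to confirm the report exists, parses as JSON, and records success (the "success" key is an assumption, not the documented gramtools report format):

import json
import os

def _build_json_file_is_good(json_file):
    # Hypothetical sketch: treat a missing or unparseable report as failure.
    if not os.path.exists(json_file):
        return False
    try:
        with open(json_file) as f:
            report = json.load(f)
    except ValueError:
        return False
    return report.get("success", False)  # assumed key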
Example #7
    def run(self):
        snps_file = self.outprefix + ".snps"
        qdiff_file = self.outprefix + ".qdiff"
        self.unmerged_vcf = self.outprefix + ".raw.vcf"
        self.merged_vcf = self.outprefix + ".merged.vcf"
        for filename in [snps_file, qdiff_file]:
            if os.path.exists(filename):
                os.unlink(filename)
        tmp_prefix = self.outprefix + ".tmp"

        for ref_name, query_name in zip(self.ref_seq_names, self.query_seq_names):
            ref_fasta = tmp_prefix + ".ref.fa"
            query_fasta = tmp_prefix + ".query.fa"
            with open(ref_fasta, "w") as f:
                print(self.ref_seqs[ref_name], file=f)
            with open(query_fasta, "w") as f:
                print(self.query_seqs[query_name], file=f)
            Dnadiff._run_dnadiff(ref_fasta, query_fasta, tmp_prefix)
            utils.syscall("cat " + tmp_prefix + ".snps >> " + snps_file)
            utils.syscall("cat " + tmp_prefix + ".qdiff >> " + qdiff_file)
            Dnadiff.clean_dnadiff_files(tmp_prefix)
            os.unlink(ref_fasta)
            os.unlink(query_fasta)

        Dnadiff._snps_file_file_to_unmerged_vcf(
            self.outprefix + ".snps", self.query_seqs, self.unmerged_vcf
        )
        clusterer = vcf_clusterer.VcfClusterer(
            [self.unmerged_vcf],
            self.query_fasta,
            self.merged_vcf,
            merge_method="simple",
            cluster_boundary_size=31,
        )
        clusterer.run()
        header, self.variants = vcf_file_read.vcf_file_to_dict(
            self.merged_vcf, remove_useless_start_nucleotides=True
        )
        self.big_variant_intervals = Dnadiff._load_qdiff_file(self.outprefix + ".qdiff")
        self.all_variant_intervals = Dnadiff._make_all_variants_intervals(
            self.variants, self.big_variant_intervals
        )
Example #8
    def _index_vcf(cls, vcffile):
        '''Index VCF file'''
        bgzip_binary = dependencies.find_binary('bgzip')
        command = ' '.join([
            bgzip_binary,
            '-c',
            vcffile,
            '>',
            vcffile + ".gz",
        ])
        utils.syscall(command)

        tabix_binary = dependencies.find_binary('tabix')
        command = ' '.join([
            tabix_binary,
            '-p',
            'vcf',
            vcffile + ".gz",
        ])
        utils.syscall(command)
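For reference, the two commands above implement the standard bgzip-then-tabix indexing protocol: bgzip -c writes a block-compressed copy next to the input, and tabix -p vcf then writes the index beside it. A usage sketch (the call site is hypothetical, since the owning class is not shown):

SomeClass._index_vcf("calls.vcf")  # hypothetical caller
# Expected outputs if both tools succeed:
#   calls.vcf.gz      bgzip-compressed copy of the VCF
#   calls.vcf.gz.tbi  tabix index for region queries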
Example #9
    def _map_seqs_to_seqs(cls, seqs_file_ref, seqs_file_query, outfile):
        '''Map seqs_file_query to seqs_file_ref using BWA aln/samse.
        Output is SAM file written to outfile'''
        bwa_binary = dependencies.find_binary('bwa')
        command = ' '.join([
            bwa_binary,
            'index',
            seqs_file_ref,
        ])
        utils.syscall(command)

        command = ' '.join([
            bwa_binary,
            'aln',
            seqs_file_ref,
            seqs_file_query,
            '>',
            outfile + ".tmp",
        ])
        utils.syscall(command)

        command = ' '.join([
            bwa_binary,
            'samse',
            seqs_file_ref,
            outfile + ".tmp",
            seqs_file_query,
            '>',
            outfile,
        ])
        utils.syscall(command)
Example #10
    def run(self):
        snps_file = self.outprefix + '.snps'
        qdiff_file = self.outprefix + '.qdiff'
        self.unmerged_vcf = self.outprefix + '.raw.vcf'
        self.merged_vcf = self.outprefix + '.merged.vcf'
        for filename in [snps_file, qdiff_file]:
            if os.path.exists(filename):
                os.unlink(filename)
        tmp_prefix = self.outprefix + '.tmp'

        for ref_name, query_name in zip(self.ref_seq_names,
                                        self.query_seq_names):
            ref_fasta = tmp_prefix + '.ref.fa'
            query_fasta = tmp_prefix + '.query.fa'
            with open(ref_fasta, 'w') as f:
                print(self.ref_seqs[ref_name], file=f)
            with open(query_fasta, 'w') as f:
                print(self.query_seqs[query_name], file=f)
            Dnadiff._run_dnadiff(ref_fasta, query_fasta, tmp_prefix)
            utils.syscall('cat ' + tmp_prefix + '.snps >> ' + snps_file)
            utils.syscall('cat ' + tmp_prefix + '.qdiff >> ' + qdiff_file)
            Dnadiff.clean_dnadiff_files(tmp_prefix)
            os.unlink(ref_fasta)
            os.unlink(query_fasta)

        Dnadiff._snps_file_file_to_unmerged_vcf(self.outprefix + '.snps',
                                                self.query_seqs,
                                                self.unmerged_vcf)
        clusterer = vcf_clusterer.VcfClusterer([self.unmerged_vcf],
                                               self.query_fasta,
                                               self.merged_vcf,
                                               merge_method='simple')
        clusterer.run()
        header, self.variants = vcf_file_read.vcf_file_to_dict(
            self.merged_vcf, remove_useless_start_nucleotides=True)
        self.big_variant_intervals = Dnadiff._load_qdiff_file(self.outprefix +
                                                              '.qdiff')
        self.all_variant_intervals = Dnadiff._make_all_variants_intervals(
            self.variants, self.big_variant_intervals)
Example #11
def get_version_of_program(program, binary=None, allow_fail=False):
    if binary is None:
        binary = find_binary(program, allow_fail=allow_fail)

    if program == "gramtools":
        gramtools_process = utils.syscall(binary + " --version")
        gramtools_json = json.loads(gramtools_process.stdout)
        return gramtools_json.get("version_number", None)
    elif program == "nextflow":
        nextflow_process = utils.syscall(binary + " -version", allow_fail=True)
        # example line that we want to capture:
        #       version 0.27.6 build 4775
        for line in nextflow_process.stdout.split("\n"):
            if line.strip().startswith("version"):
                try:
                    version = line.rstrip().split(maxsplit=1)[1]
                except IndexError:
                    return None
                return version
        return None
    elif program == "vt":
        vt_process = utils.syscall(binary + " -v", allow_fail=True)
        # Example line:
        # vt v0.57721
        for line in vt_process.stderr.split("\n"):
            if line.rstrip().startswith("vt "):
                try:
                    version = line.rstrip().split(maxsplit=1)[1]
                except IndexError:
                    return None
                return version
        return None
    elif program in [
            "vcfbreakmulti", "vcfallelicprimitives", "vcfuniq", "vcflib"
    ]:
        return "Unknown"
    else:
        raise Exception('Program name "' + program +
                        '" not recognised. Cannot continue')
Example #12
def run_gramtools_build(outdir,
                        vcf_file,
                        ref_file,
                        max_read_length,
                        kmer_size=10):
    """Runs gramtools build. Makes new directory called 'outdir' for
    the output"""
    if os.path.exists(outdir):
        raise FileExistsError(
            f"Gramtools build output directory '{outdir}' already exists. Cannot continue"
        )
    os.mkdir(outdir)
    gramtools_exe = dependencies.find_binary("gramtools")
    build_command = " ".join([
        gramtools_exe,
        "build",
        "--gram-directory",
        outdir,
        "--vcf",
        vcf_file,
        "--reference",
        ref_file,
        "--max-read-length",
        str(max_read_length),
        "--kmer-size",
        str(kmer_size),
    ])
    logging.info("Running gramtools build: " + build_command)
    completed_process = utils.syscall(build_command, allow_fail=True)
    logging.info("Finished running gramtools build. Return code: " +
                 str(completed_process.returncode))
    build_report = os.path.join(outdir, "build_report.json")
    ran_ok = (_build_json_file_is_good(build_report)
              and completed_process.returncode == 0)
    if not ran_ok:
        logging.info("Error running gramtools build. See build report file " +
                     build_report)
        raise Exception(
            f"Error running gramtools build: {build_command}\nstdout:{completed_process.stdout}\nstderr:\n{completed_process.stderr}"
        )

    logging.info("Build report file looks good from gramtools build: " +
                 build_report)
Example #13
    def _map_seqs_to_seqs(cls, seqs_file_ref, seqs_file_query, outfile):
        """Map seqs_file to ref_file using BWA MEM.
        Output is SAM file written to outfile"""
        bwa_binary = dependencies.find_binary("bwa")
        command = " ".join([bwa_binary, "index", seqs_file_ref])
        utils.syscall(command)

        command = " ".join([
            bwa_binary, "aln", seqs_file_ref, seqs_file_query, ">",
            outfile + ".tmp"
        ])
        utils.syscall(command)

        command = " ".join([
            bwa_binary,
            "samse",
            seqs_file_ref,
            outfile + ".tmp",
            seqs_file_query,
            ">",
            outfile,
        ])
        utils.syscall(command)
Example #14
def compress_file(filenames):
    infile, outfile = filenames
    zipper = "bgzip" if infile.endswith(".vcf") else "gzip -9"
    utils.syscall(f"{zipper} -c {infile} > {outfile}")
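compress_file takes both filenames packed in a single tuple rather than as two parameters, which is the usual signature for a multiprocessing.Pool.map worker, where each task must arrive as one argument. A usage sketch under that assumption (file names are illustrative):

import multiprocessing

pairs = [
    ("sample1.vcf", "sample1.vcf.gz"),  # .vcf files go through bgzip
    ("report.txt", "report.txt.gz"),    # everything else through gzip -9
]
with multiprocessing.Pool(processes=2) as pool:
    pool.map(compress_file, pairs)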
Example #15
    def run(self):
        self._make_output_dir()
        fh = logging.FileHandler(self.log_file, mode='w')
        log = logging.getLogger()
        formatter = logging.Formatter(
            '[minos %(asctime)s %(levelname)s] %(message)s',
            datefmt='%d-%m-%Y %H:%M:%S')
        fh.setFormatter(formatter)
        log.addHandler(fh)
        dependencies.check_and_report_dependencies(programs=['nextflow'])

        self._prepare_nextflow_input_files()
        original_dir = os.getcwd()
        os.chdir(self.output_dir)
        nextflow_script = 'nextflow.run.nf'
        MultiSamplePipeline._write_nextflow_script(nextflow_script)
        logging.info('Prepared nextflow files. cd ' + self.output_dir)

        nextflow = dependencies.find_binary('nextflow')
        nextflow_command = [
            nextflow,
            'run',
            '-work-dir',
            self.nextflow_work_dir,
            '-with-dag',
            'nextflow.out.dag.pdf',
            '-with-trace',
            'nextflow.out.trace.txt',
        ]

        if self.nextflow_config_file is not None:
            nextflow_command.extend(['-c', self.nextflow_config_file])

        nextflow_command += [
            nextflow_script,
            '--ref_fasta',
            self.ref_fasta,
            '--data_in_tsv',
            self.nextflow_input_tsv,
            '--max_alleles_per_cluster',
            str(self.max_alleles_per_cluster),
            '--min_large_ref_length',
            str(self.min_large_ref_length),
            '--final_outdir',
            self.output_dir,
            '--gramtools_max_read_length',
            str(self.gramtools_max_read_length),
            '--cluster_small_vars_ram',
            str(self.nf_ram_cluster_small_vars),
            '--gramtools_build_small_vars_ram',
            str(self.nf_ram_gramtools_build_small),
            '--gramtools_kmer_size',
            str(self.gramtools_kmer_size),
            '--gramtools_build_threads',
            str(self.gramtools_build_threads),
            '--minos_small_vars_ram',
            str(self.nf_ram_minos_small_vars),
            '--merge_small_vars_ram',
            str(self.nf_ram_merge_small_vars),
        ]

        if self.testing:
            nextflow_command.append('--testing')

        if self.use_unmapped_reads:
            nextflow_command.append('--use_unmapped_reads')

        if self.variants_per_split is not None:
            nextflow_command.append('--variants_per_split ' +
                                    str(self.variants_per_split))
        if self.alleles_per_split is not None:
            nextflow_command.append('--alleles_per_split ' +
                                    str(self.alleles_per_split))
        elif self.total_splits is not None:
            nextflow_command.append('--total_splits ' + str(self.total_splits))

        nextflow_command = ' '.join(nextflow_command)

        if self.no_run:
            print(
                'Prepared nextflow pipeline. --no_run used, so not running. The nextflow command to run is:'
            )
            print(nextflow_command)
            return
        else:
            logging.info('Start running nextflow: ' + nextflow_command)
            syscall_process = utils.syscall(nextflow_command)
            logging.info(
                'Finish running nextflow. Writing nextflow stdout/stderr to files'
            )
            with open('nextflow.stdout', 'w') as f:
                print(syscall_process.stdout.rstrip(), file=f)
            with open('nextflow.stderr', 'w') as f:
                print(syscall_process.stderr.rstrip(), file=f)

            logging.info('cd ' + original_dir)

        if self.clean:
            logging.info('Delete nextflow work directory ' +
                         self.nextflow_work_dir)
            shutil.rmtree(self.nextflow_work_dir)
            logging.info('Delete .nextflow directory')
            shutil.rmtree('.nextflow')

        logging.info('Rename .nextflow.log -> nextflow.log')
        os.rename('.nextflow.log', 'nextflow.log')
        os.chdir(original_dir)
Example #16
def run_gramtools(
    build_dir,
    quasimap_dir,
    vcf_file,
    ref_file,
    reads,
    max_read_length,
    kmer_size=10,
    seed=42,
):
    """If build_dir does not exist, runs runs gramtools build and quasimap.
    Otherwise, just runs quasimap. quasimap output is in new
    directory called quasimap_dir.
    "reads" can be one filename, or a list of filenames.
    Raises Error if either of the expected json coverage
    files made by quasimap are not found."""
    gramtools_exe = dependencies.find_binary("gramtools")
    if not os.path.exists(build_dir):
        run_gramtools_build(build_dir,
                            vcf_file,
                            ref_file,
                            max_read_length,
                            kmer_size=kmer_size)

    if type(reads) is not list:
        assert type(reads) is str
        reads = [reads]

    quasimap_command = " ".join([
        gramtools_exe,
        "quasimap",
        f"--seed {seed}",
        "--gram-dir",
        build_dir,
        "--run-dir",
        quasimap_dir,
        " ".join(["--reads " + x for x in reads]),
    ])
    logging.info("Running gramtools quasimap: " + quasimap_command)
    utils.syscall(quasimap_command)
    logging.info("Finished running gramtools quasimap")

    build_report = os.path.join(build_dir, "build_report.json")
    quasimap_report = os.path.join(quasimap_dir, "quasimap_outputs",
                                   "quasimap_report.json")
    allele_base_counts_file = os.path.join(quasimap_dir, "quasimap_outputs",
                                           "allele_base_coverage.json")
    grouped_allele_counts_file = os.path.join(
        quasimap_dir, "quasimap_outputs",
        "grouped_allele_counts_coverage.json")
    files_ok = True
    for filename in (
            build_report,
            quasimap_report,
            allele_base_counts_file,
            grouped_allele_counts_file,
    ):
        if not (os.path.exists(filename)):
            files_ok = False
            logging.error("gramtools file not found: " + filename)

    if not files_ok:
        error_message = "Looks like something went wrong during gramtools run. At least one output file not present. Cannot continue."
        logging.error(error_message)
        raise Exception(error_message)

    with open(build_report) as f:
        json_build_report = json.load(f)
    with open(quasimap_report) as f:
        json_quasimap_report = json.load(f)

    return json_build_report, json_quasimap_report
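A hedged usage sketch of this newer run_gramtools (paths and read length are illustrative): the build directory is created on first use and reused afterwards, and the two parsed JSON reports come back as a 2-tuple.

build_report, quasimap_report = run_gramtools(
    "gram.build",                  # built only if the directory is absent
    "gram.quasimap",
    "calls.vcf",
    "ref.fa",
    ["reads_1.fq", "reads_2.fq"],  # a single filename also works
    150,                           # max_read_length
)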
Example #17
def run_gramtools(build_dir,
                  quasimap_dir,
                  vcf_file,
                  ref_file,
                  reads,
                  max_read_length,
                  kmer_size=10,
                  seed=42):
    '''If build_dir does not exist, runs gramtools build and quasimap.
    Otherwise, just runs quasimap. quasimap output goes in a new
    directory called quasimap_dir.
    "reads" can be one filename, or a list of filenames.
    Raises Error if any of the expected json coverage
    files made by quasimap are not found.'''
    gramtools_exe = dependencies.find_binary('gramtools')
    if not os.path.exists(build_dir):
        run_gramtools_build(build_dir,
                            vcf_file,
                            ref_file,
                            max_read_length,
                            kmer_size=kmer_size)

    if type(reads) is not list:
        assert type(reads) is str
        reads = [reads]

    quasimap_command = ' '.join([
        gramtools_exe,
        'quasimap',
        f'--seed {seed}',
        '--gram-directory',
        build_dir,
        '--output-directory',
        quasimap_dir,
        ' '.join(['--reads ' + x for x in reads]),
    ])
    logging.info('Running gramtools quasimap: ' + quasimap_command)
    utils.syscall(quasimap_command)
    logging.info('Finished running gramtools quasimap')

    build_report = os.path.join(build_dir, 'build_report.json')
    quasimap_report = os.path.join(quasimap_dir, 'report.json')
    allele_base_counts_file = os.path.join(quasimap_dir,
                                           'allele_base_coverage.json')
    grouped_allele_counts_file = os.path.join(
        quasimap_dir, 'grouped_allele_counts_coverage.json')
    files_ok = True
    for filename in build_report, quasimap_report, allele_base_counts_file, grouped_allele_counts_file:
        if not (os.path.exists(filename)):
            files_ok = False
            logging.error('gramtools file not found: ' + filename)

    if not files_ok:
        error_message = 'Looks like something went wrong during gramtools run. At least one output file not present. Cannot continue.'
        logging.error(error_message)
        raise Error(error_message)

    with open(build_report) as f:
        json_build_report = json.load(f)
    with open(quasimap_report) as f:
        json_quasimap_report = json.load(f)

    return json_build_report, json_quasimap_report
Example #18
    def run(self):
        self._make_output_dir()
        fh = logging.FileHandler(self.log_file, mode="w")
        log = logging.getLogger()
        formatter = logging.Formatter(
            "[minos %(asctime)s %(levelname)s] %(message)s",
            datefmt="%d-%m-%Y %H:%M:%S")
        fh.setFormatter(formatter)
        log.addHandler(fh)
        dependencies.check_and_report_dependencies(programs=["nextflow"])

        self._prepare_nextflow_input_files()
        original_dir = os.getcwd()
        os.chdir(self.output_dir)
        nextflow_script = "nextflow.run.nf"
        MultiSamplePipeline._write_nextflow_script(nextflow_script)
        logging.info("Prepared nextflow files. cd " + self.output_dir)

        nextflow = dependencies.find_binary("nextflow")
        nextflow_command = [
            nextflow,
            "run",
            "-work-dir",
            self.nextflow_work_dir,
            "-with-dag",
            "nextflow.out.dag.pdf",
            "-with-trace",
            "nextflow.out.trace.txt",
        ]

        if self.nextflow_config_file is not None:
            nextflow_command.extend(["-c", self.nextflow_config_file])

        nextflow_command += [
            nextflow_script,
            "--ref_fasta",
            self.ref_fasta,
            "--data_in_tsv",
            self.nextflow_input_tsv,
            "--max_alleles_per_cluster",
            str(self.max_alleles_per_cluster),
            "--min_large_ref_length",
            str(self.min_large_ref_length),
            "--final_outdir",
            self.output_dir,
            "--gramtools_max_read_length",
            str(self.gramtools_max_read_length),
            "--cluster_small_vars_ram",
            str(self.nf_ram_cluster_small_vars),
            "--gramtools_build_small_vars_ram",
            str(self.nf_ram_gramtools_build_small),
            "--gramtools_kmer_size",
            str(self.gramtools_kmer_size),
            "--gramtools_build_threads",
            str(self.gramtools_build_threads),
            "--minos_small_vars_ram",
            str(self.nf_ram_minos_small_vars),
            "--merge_small_vars_ram",
            str(self.nf_ram_merge_small_vars),
        ]

        if self.testing:
            nextflow_command.append("--testing")

        if self.use_unmapped_reads:
            nextflow_command.append("--use_unmapped_reads")

        if self.variants_per_split is not None:
            nextflow_command.append("--variants_per_split " +
                                    str(self.variants_per_split))
        if self.alleles_per_split is not None:
            nextflow_command.append("--alleles_per_split " +
                                    str(self.alleles_per_split))
        elif self.total_splits is not None:
            nextflow_command.append("--total_splits " + str(self.total_splits))

        nextflow_command = " ".join(nextflow_command)

        if self.no_run:
            print(
                "Prepared nextflow pipeline. --no_run used, so not running. The nextflow command to run is:"
            )
            print(nextflow_command)
            return
        else:
            logging.info("Start running nextflow: " + nextflow_command)
            syscall_process = utils.syscall(nextflow_command)
            logging.info(
                "Finish running nextflow. Writing nextflow stdout/stderr to files"
            )
            with open("nextflow.stdout", "w") as f:
                print(syscall_process.stdout.rstrip(), file=f)
            with open("nextflow.stderr", "w") as f:
                print(syscall_process.stderr.rstrip(), file=f)

            logging.info("cd " + original_dir)

        if self.clean:
            logging.info("Delete nextflow work directory " +
                         self.nextflow_work_dir)
            shutil.rmtree(self.nextflow_work_dir)
            logging.info("Delete .nextflow directory")
            shutil.rmtree(".nextflow")

        logging.info("Rename .nextflow.log -> nextflow.log")
        os.rename(".nextflow.log", "nextflow.log")
        os.chdir(original_dir)
Example #19
 def _run_dnadiff(cls, ref_fasta, query_fasta, outprefix):
     command = ' '.join(['dnadiff -p', outprefix, ref_fasta, query_fasta])
     utils.syscall(command)
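A usage sketch for context (names illustrative): dnadiff -p writes its outputs next to the given prefix, which is how Example #7's run() finds the .snps and .qdiff files it concatenates.

Dnadiff._run_dnadiff("ref.fa", "query.fa", "tmp.out")
# MUMmer's dnadiff writes tmp.out.snps, tmp.out.qdiff, tmp.out.report, ...
# run() above cats the .snps and .qdiff files into per-run totals.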