Exemple #1
0
def test_parse_manifest_file():
    """Check the files written by regeno_helper.parse_manifest_file.

    Writes a three-sample manifest TSV, runs parse_manifest_file on it, then
    compares the adjudicate TSV with the expected file in data_dir and checks
    the lines of the merge file-of-filenames.
    """
    prefix = os.path.join(data_dir, "parse_manifest_file")
    vcf1, vcf2, vcf3 = (f"{prefix}.{i}.vcf" for i in (1, 2, 3))
    manifest_rows = [
        ("name", "vcf", "reads"),
        ("sample1", vcf1, "1.reads.fq"),
        ("sample2", vcf2, "2.reads.1.fq 2.reads.2.fq"),
        ("sample3", vcf3, "3.reads.1.fq 3.reads.2.fq"),
    ]

    manifest_tsv = "tmp.parse_manifest_file.in.tsv"
    with open(manifest_tsv, "w") as handle:
        for row in manifest_rows:
            print(*row, sep="\t", file=handle)

    fofn_out = "tmp.parse_manifest_file.merge.fofn"
    adjudicate_tsv = "tmp.parse_manifest_file.adjudicate.tsv"
    utils.rm_rf(fofn_out, adjudicate_tsv)
    regeno_helper.parse_manifest_file(
        manifest_tsv,
        fofn_out,
        adjudicate_tsv,
        os.path.join(data_dir, "parse_manifest_file.ref.fa"),
    )
    os.unlink(manifest_tsv)

    expected_tsv = os.path.join(data_dir, "parse_manifest_file.out.tsv")
    assert filecmp.cmp(expected_tsv, adjudicate_tsv, shallow=False)
    os.unlink(adjudicate_tsv)

    with open(fofn_out) as handle:
        fofn_lines = list(map(str.rstrip, handle))
    # Only the first and third VCF are expected in the merge list.
    assert fofn_lines == [vcf1, vcf3]
    os.unlink(fofn_out)
Exemple #2
0
 def test_fasta_to_upper_and_ACGT_only(self):
     """Compare utils.fasta_to_upper_and_ACGT_only output with the expected FASTA."""
     fasta_in = os.path.join(data_dir, "fasta_to_upper_and_ACGT_only.in.fa")
     fasta_expected = os.path.join(
         data_dir, "fasta_to_upper_and_ACGT_only.expect.fa"
     )
     fasta_out = "tmp.fasta_to_upper_and_ACGT_only.fa"

     utils.rm_rf(fasta_out)
     utils.fasta_to_upper_and_ACGT_only(fasta_in, fasta_out)

     # Byte-for-byte comparison (shallow=False), not just os.stat signatures.
     files_match = filecmp.cmp(fasta_out, fasta_expected, shallow=False)
     self.assertTrue(files_match)
     os.unlink(fasta_out)
Exemple #3
0
def test_compress_file():
    """Check regeno_helper.compress_file writes an output for a VCF and a text file."""
    # Each job is the (input, output) tuple that compress_file takes as its
    # single argument.
    jobs = [
        (os.path.join(data_dir, "compress_file.vcf"),
         "tmp.compress_file.vcf.gz"),
        (os.path.join(data_dir, "compress_file.txt"),
         "tmp.compress_file.txt.gz"),
    ]
    outputs = [out_path for _, out_path in jobs]

    utils.rm_rf(*outputs)
    for job in jobs:
        regeno_helper.compress_file(job)

    for out_path in outputs:
        assert os.path.exists(out_path)
    for out_path in outputs:
        os.unlink(out_path)
def test_distance_matrix_from_vcf_file():
    """Run dist_matrix.distance_matrix_from_vcf_file without and with a mask BED.

    Each run's output is compared byte-for-byte with the matching expected
    file in data_dir, then removed.
    """

    def data_path(suffix):
        # All fixtures for this test share one filename prefix.
        return os.path.join(data_dir,
                            f"distance_matrix_from_vcf_file.{suffix}")

    vcf_in = data_path("vcf")
    matrix_out = "tmp.distance_matrix_from_vcf_file.out"
    utils.rm_rf(matrix_out)

    # First pass: no mask.
    dist_matrix.distance_matrix_from_vcf_file(vcf_in, matrix_out)
    assert filecmp.cmp(matrix_out, data_path("expect.no_mask"), shallow=False)
    os.unlink(matrix_out)

    # Second pass: same VCF, with a mask BED file.
    dist_matrix.distance_matrix_from_vcf_file(
        vcf_in, matrix_out, mask_bed_file=data_path("mask.bed"))
    assert filecmp.cmp(matrix_out, data_path("expect.mask"), shallow=False)
    os.unlink(matrix_out)
def test_regenotype_pipeline():
    """Run the regenotype nextflow pipeline and check its output files.

    The pipeline is run from inside a temporary output directory. The test
    checks failed_samples.txt and distance_matrix.txt against expected files
    in data_dir, then cross-checks manifest.tsv, manifest.json and the
    per-sample VCF/log files.
    """
    outdir = "tmp.nextflow_regeno_test.out"
    utils.rm_rf(outdir)
    os.mkdir(outdir)

    # The manifest is written inside outdir and referenced by bare filename,
    # because the command below runs with cwd=outdir.
    manifest_name = "tmp.nextflow_regeno_test.tsv"
    _write_manifest(os.path.join(outdir, manifest_name))

    nextflow_dir = os.path.join(minos_dir, "nextflow")
    pipeline_script = os.path.join(nextflow_dir, "regenotype.nf")
    config_file = os.path.join(nextflow_dir, "regenotype.config")
    dag_file = "tmp.nextflow_regeno_test.dag.pdf"
    ref_fasta = os.path.join(data_dir, "data.ref.fa")
    mask_bed = os.path.join(data_dir, "mask.bed")
    command = (
        f"nextflow run -c {config_file} -profile tiny -with-dag {dag_file} "
        f"{pipeline_script} --make_distance_matrix "
        f"--mask_bed_file {mask_bed} --max_variants_per_sample 10 "
        f"--ref_fasta {ref_fasta} --manifest {manifest_name} --outdir OUT"
    )
    utils.syscall(command, cwd=outdir)

    results_dir = os.path.join(outdir, "OUT")

    assert filecmp.cmp(
        os.path.join(results_dir, "failed_samples.txt"),
        os.path.join(data_dir, "expect.failed_samples.txt"),
        shallow=False,
    )
    assert filecmp.cmp(
        os.path.join(results_dir, "distance_matrix.txt"),
        os.path.join(data_dir, "expect.distance_matrix.txt"),
        shallow=False,
    )

    # Neither the line order of the manifest TSV nor the filename assigned to
    # each sample is known in advance. So each listed VCF is loaded and its
    # sample name checked against the manifest, and the JSON and TSV
    # manifests are checked for agreement.
    manifest_json = os.path.join(results_dir, "manifest.json")
    assert os.path.exists(manifest_json)
    with open(manifest_json) as handle:
        manifest_data = json.load(handle)

    with open(os.path.join(results_dir, "manifest.tsv")) as handle:
        for row in csv.DictReader(handle, delimiter="\t"):
            sample = row["sample"]
            vcf_path = os.path.join(results_dir, row["vcf_file"])
            assert sample == vcf_file_read.get_sample_name_from_vcf_file(
                vcf_path)
            assert os.path.exists(os.path.join(results_dir, row["log_file"]))
            json_entry = manifest_data[sample]
            assert json_entry["log_file"] == row["log_file"]
            assert json_entry["vcf_file"] == row["vcf_file"]

    utils.rm_rf(outdir)
Exemple #6
0
def test_make_per_sample_vcfs_dir():
    """Check the output tree written by regeno_helper.make_per_sample_vcfs_dir.

    Five samples are listed in a temporary manifest and processed with two
    samples per directory. The resulting manifest.tsv and manifest.json are
    compared with expected files, and every copied VCF/log is compared with
    its original.
    """
    indir = os.path.join(data_dir, "make_per_sample_vcfs_dir")
    minos_indirs = {
        f"sample.{i}": os.path.join(indir, f"minos.{i}")
        for i in range(1, 6)
    }

    manifest_file = "tmp.make_per_sample_vcfs_dir.tsv"
    with open(manifest_file, "w") as handle:
        for sample_name, sample_dir in minos_indirs.items():
            print(sample_name, sample_dir, sep="\t", file=handle)

    root_out = "tmp.make_per_sample_vcfs_dir.out"
    utils.rm_rf(root_out)
    regeno_helper.make_per_sample_vcfs_dir(
        manifest_file, root_out, samples_per_dir=2, cpus=2)
    os.unlink(manifest_file)

    got_tsv = os.path.join(root_out, "manifest.tsv")
    expect_tsv = os.path.join(data_dir, "make_per_sample_vcfs_dir.expect.tsv")
    assert filecmp.cmp(expect_tsv, got_tsv, shallow=False)

    got_json = os.path.join(root_out, "manifest.json")
    expect_json = os.path.join(data_dir,
                               "make_per_sample_vcfs_dir.expect.json")
    assert filecmp.cmp(expect_json, got_json, shallow=False)

    with open(got_json) as handle:
        json_data = json.load(handle)

    # The JSON manifest stores paths relative to root_out.
    for sample_name, sample_dir in minos_indirs.items():
        entry = json_data[sample_name]
        file_pairs = (
            (os.path.join(sample_dir, "debug.calls_with_zero_cov_alleles.vcf"),
             os.path.join(root_out, entry["vcf_file"])),
            (os.path.join(sample_dir, "log.txt"),
             os.path.join(root_out, entry["log_file"])),
        )
        for original_path, new_path in file_pairs:
            assert _file_contents_the_same(original_path, new_path)

    utils.rm_rf(root_out)
Exemple #7
0
    def run(self):
        """Run the whole pipeline for this object.

        Steps, in order:

        1. Build the output directory and attach a file handler for
           ``self.log_file`` to the root logger.
        2. Check the external programs listed below via
           ``dependencies.check_and_report_dependencies``.
        3. Write ``ref.fa`` into ``self.outdir`` from
           ``self.original_ref_fasta`` using
           ``utils.fasta_to_upper_and_ACGT_only``.
        4. If ``self.read_error_rate`` is None, estimate it from the first
           reads file.
        5. Choose the clustered VCF: take the single input VCF as-is when a
           gramtools build dir was supplied, skip clustering when the user
           asked to, otherwise merge and cluster the input VCF files.
        6. Run gramtools on split or unsplit VCF input.

        Raises:
            Exception: if the clustered VCF file has no records.
        """
        self.build_output_dir()

        # NOTE(review): this handler is added to the root logger and is not
        # removed in this method - confirm whether callers rely on that.
        fh = logging.FileHandler(self.log_file, mode="w")
        log = logging.getLogger()
        formatter = logging.Formatter(
            "[minos %(asctime)s %(levelname)s] %(message)s",
            datefmt="%d-%m-%Y %H:%M:%S")
        fh.setFormatter(formatter)
        log.addHandler(fh)
        logging.info("Command run: " + " ".join(sys.argv))
        to_check = [
            "gramtools",
            "vcfbreakmulti",
            "vcfallelicprimitives",
            "vcfuniq",
            "vt",
        ]
        dependencies.check_and_report_dependencies(programs=to_check)
        logging.info("Dependencies look OK")

        self.ref_fasta = os.path.join(self.outdir, "ref.fa")
        utils.fasta_to_upper_and_ACGT_only(self.original_ref_fasta,
                                           self.ref_fasta)

        if self.read_error_rate is None:
            logging.info(
                "read_error_rate unknown. Estimate from first 10,000 reads...")
            # The helper also returns an estimated read length, which is not
            # needed here.
            (
                _,
                estimated_read_error_rate,
            ) = utils.estimate_max_read_length_and_read_error_rate_from_qual_scores(
                self.reads_files[0])
            logging.info(
                f"Estimated read_error_rate={estimated_read_error_rate}")

            # This branch is only reached when read_error_rate is None, so
            # the estimate is always the value to use.
            self.read_error_rate = estimated_read_error_rate
            logging.info(f"Using read_error_rate={self.read_error_rate}")

        if self.user_supplied_gramtools_build_dir:
            logging.info(
                "User supplied gramtools build dir. Assuming VCF already clustered, so skipping clustering"
            )
            assert len(self.vcf_files) == 1
            self.clustered_vcf = self.vcf_files[0]
        elif not self.cluster_input_vcfs:
            # self.clustered_vcf is not assigned in this branch; presumably
            # it is set elsewhere in the class - TODO confirm.
            logging.info(
                "Skipping VCF clustering because user requested to skip")
        else:
            logging.info(
                "Clustering VCF file(s), to make one VCF input file for gramtools"
            )
            tracker = variant_tracking.VariantTracker(self.cluster_dir,
                                                      self.ref_fasta)
            tracker.merge_vcf_files(self.vcf_files)
            tracker.cluster(self.clustered_vcf_prefix,
                            float("Inf"),
                            max_alleles=5000)
            if not self.debug:
                # Intermediate clustering files are only kept in debug mode.
                os.unlink(f"{self.clustered_vcf_prefix}.excluded.tsv")
                utils.rm_rf(self.cluster_dir)
            logging.info("Finished clustering VCF file(s)")

        if not vcf_file_read.vcf_file_has_at_least_one_record(
                self.clustered_vcf):
            error_message = "No VCF records. Cannot continue. Please check that the input VCF files contained at least one variant"
            logging.error(error_message)
            raise Exception(error_message)

        # Use the split-VCF path if any split option was given, or if a
        # data.pickle file already exists in the split input directory.
        split_requested = (
            self.total_splits is not None
            or self.variants_per_split is not None
            or self.alleles_per_split is not None
            or os.path.exists(
                os.path.join(self.split_input_dir, "data.pickle"))
        )
        if split_requested:
            self._run_gramtools_with_split_vcf()
        else:
            self._run_gramtools_not_split_vcf()

        logging.info("All done! Thank you for using minos :)")
Exemple #8
0
def make_per_sample_vcfs_dir(sample_data_tsv,
                             root_outdir,
                             original_manifest=None,
                             samples_per_dir=1000,
                             cpus=1):
    """Collect per-sample VCF and log files into one output directory tree.

    Reads ``sample_data_tsv``, which has one sample per line as two
    whitespace-separated fields: the sample name and its minos output
    directory. For each sample, ``debug.calls_with_zero_cov_alleles.vcf`` and
    ``log.txt`` from that directory are passed to ``compress_file`` and
    written to ``VCFs/<d>/<i>.vcf.gz`` and ``Logs/<d>/<i>.log.gz`` under
    ``root_outdir``. Here ``<i>`` is the zero-based sample index and ``<d>``
    is ``<i> // samples_per_dir``.

    Also written under ``root_outdir``:

    * ``manifest.tsv`` - columns ``sample``, ``vcf_file``, ``log_file``, with
      paths relative to ``root_outdir``.
    * ``manifest.json`` - the same data keyed by sample name.
    * ``failed_samples.txt`` - only when ``original_manifest`` is given and
      it names samples missing from ``sample_data_tsv``. One sorted sample
      name per line.

    Args:
        sample_data_tsv: path of the input sample/minos-directory file.
        root_outdir: output directory, created if it does not exist.
            Existing ``VCFs`` and ``Logs`` subdirectories and manifest files
            inside it are removed first.
        original_manifest: optional manifest passed to
            ``manifest_to_set_of_sample_names`` to get the expected samples.
        samples_per_dir: maximum number of samples per numbered
            subdirectory.
        cpus: number of worker processes used to run ``compress_file``.
    """
    vcf_root_out = "VCFs"
    logs_root_out = "Logs"
    if not os.path.exists(root_outdir):
        os.mkdir(root_outdir)

    # Clear any previous output INSIDE root_outdir. Removing the bare
    # relative names would act on the current working directory instead,
    # leaving stale output in place so that the mkdir calls below fail.
    vcf_root_full = os.path.join(root_outdir, vcf_root_out)
    logs_root_full = os.path.join(root_outdir, logs_root_out)
    utils.rm_rf(vcf_root_full)
    utils.rm_rf(logs_root_full)
    os.mkdir(vcf_root_full)
    os.mkdir(logs_root_full)

    sample_number = 0
    tsv_out = os.path.join(root_outdir, "manifest.tsv")
    utils.rm_rf(tsv_out)
    json_out = os.path.join(root_outdir, "manifest.json")
    utils.rm_rf(json_out)
    data = {}
    parallel_jobs_data = []

    with open(sample_data_tsv) as f_in, open(tsv_out, "w") as f_out:
        print("sample", "vcf_file", "log_file", sep="\t", file=f_out)
        for line in f_in:
            fields = line.rstrip().split()
            if not fields:
                # Tolerate blank lines, e.g. a trailing newline at the end
                # of the file.
                continue
            sample_name, minos_indir = fields

            # Start a new numbered subdirectory every samples_per_dir
            # samples.
            if sample_number % samples_per_dir == 0:
                outdir = str(sample_number // samples_per_dir)
                vcf_dir = os.path.join(vcf_root_out, outdir)
                os.mkdir(os.path.join(root_outdir, vcf_dir))
                log_dir = os.path.join(logs_root_out, outdir)
                os.mkdir(os.path.join(root_outdir, log_dir))

            vcf_in = os.path.join(minos_indir,
                                  "debug.calls_with_zero_cov_alleles.vcf")
            log_in = os.path.join(minos_indir, "log.txt")
            # The manifests store paths relative to root_outdir; the
            # compression jobs need the full paths.
            vcf_out = os.path.join(vcf_dir, f"{sample_number}.vcf.gz")
            log_out = os.path.join(log_dir, f"{sample_number}.log.gz")
            parallel_jobs_data.append(
                (vcf_in, os.path.join(root_outdir, vcf_out)))
            parallel_jobs_data.append(
                (log_in, os.path.join(root_outdir, log_out)))
            sample_number += 1
            print(sample_name, vcf_out, log_out, sep="\t", file=f_out)
            data[sample_name] = {"vcf_file": vcf_out, "log_file": log_out}

    with multiprocessing.Pool(cpus) as pool:
        pool.map(compress_file, parallel_jobs_data)

    with open(json_out, "w") as f:
        json.dump(data, f, indent=2, sort_keys=True)

    if original_manifest is None:
        return

    # Samples in the original manifest with no entry in sample_data_tsv are
    # reported as failed.
    expect_samples = manifest_to_set_of_sample_names(original_manifest)
    failed_samples = sorted(expect_samples.difference(data))
    if failed_samples:
        with open(os.path.join(root_outdir, "failed_samples.txt"), "w") as f:
            print(*failed_samples, sep="\n", file=f)