Esempio n. 1
0
    def test_init(self):
        '''Check which argument combinations ReferenceDir.__init__ accepts'''
        # Either keyword given without the other is expected to be rejected.
        for incomplete_kwargs in (
            {'pipeline_references_root_dir': 'foo'},
            {'reference_id': 42},
        ):
            with self.assertRaises(reference_dir.Error):
                reference_dir.ReferenceDir(**incomplete_kwargs)

        # Root dir + reference id: directory should be <cwd>/<root>/<id>.
        from_root_and_id = reference_dir.ReferenceDir(
            pipeline_references_root_dir='foo',
            reference_id=42,
        )
        expected_dir = os.path.join(os.getcwd(), 'foo', '42')
        self.assertEqual(from_root_and_id.directory, expected_dir)

        # Explicit directory: should be made absolute relative to the cwd.
        from_directory = reference_dir.ReferenceDir(directory='bar')
        expected_dir = os.path.join(os.getcwd(), 'bar')
        self.assertEqual(from_directory.directory, expected_dir)
    def test_init(self):
        """Check which argument combinations ReferenceDir.__init__ accepts"""
        make_ref_dir = reference_dir.ReferenceDir

        # Giving only one of root dir / reference id must raise.
        self.assertRaises(Exception, make_ref_dir, pipeline_references_root_dir="foo")
        self.assertRaises(Exception, make_ref_dir, reference_id=42)

        # Root dir plus reference id resolves to <cwd>/foo/42.
        by_id = make_ref_dir(pipeline_references_root_dir="foo", reference_id=42)
        self.assertEqual(by_id.directory, os.path.join(os.getcwd(), "foo", "42"))

        # An explicit directory resolves to <cwd>/bar.
        by_path = make_ref_dir(directory="bar")
        self.assertEqual(by_path.directory, os.path.join(os.getcwd(), "bar"))
    def test_add_remove_contam_metadata_tsv(self):
        """Check add_remove_contam_metadata_tsv() rejects bad TSVs and accepts a good one"""
        tmp_root_dir = "tmp.reference_dir.add_remove_contam_metadata_tsv"
        # Start from a clean slate in case a previous run left files behind.
        if os.path.exists(tmp_root_dir):
            shutil.rmtree(tmp_root_dir)

        fasta_in = os.path.join(data_dir, "add_remove_contam_metadata_tsv.ref.fa")
        ref_dir = reference_dir.ReferenceDir(
            pipeline_references_root_dir=tmp_root_dir,
            reference_id=42,
        )
        ref_dir.make_index_files(fasta_in, False, False, cortex_mem_height=17)

        # Each of these TSV files is expected to be refused with
        # reference_dir.Error.
        bad_files = [
            os.path.join(data_dir, basename)
            for basename in (
                "add_remove_contam_metadata_tsv.ref.missing_from_tsv.tsv",
                "add_remove_contam_metadata_tsv.ref.extra_in_tsv.tsv",
            )
        ]
        for bad_file in bad_files:
            with self.assertRaises(reference_dir.Error):
                ref_dir.add_remove_contam_metadata_tsv(bad_file)

        # The good TSV must be accepted without raising.
        good_file = os.path.join(
            data_dir, "add_remove_contam_metadata_tsv.ref.good.tsv"
        )
        ref_dir.add_remove_contam_metadata_tsv(good_file)
        shutil.rmtree(tmp_root_dir)
    def test_make_index_files(self):
        """Check make_index_files() rejects a missing FASTA and writes every index file"""
        tmp_root_dir = "tmp.reference_dir.make_index_files"
        # Start from a clean slate in case a previous run left files behind.
        if os.path.exists(tmp_root_dir):
            shutil.rmtree(tmp_root_dir)

        fasta_in = os.path.join(data_dir, "make_index_files.ref.in.fa.gz")
        expected_ref = os.path.join(data_dir, "make_index_files.ref.expected.fa")
        ref_dir = reference_dir.ReferenceDir(
            pipeline_references_root_dir=tmp_root_dir,
            reference_id=42,
        )

        # A non-existent input file must be reported as reference_dir.Error.
        with self.assertRaises(reference_dir.Error):
            ref_dir.make_index_files(
                "file_does_not_exist", False, True, cortex_mem_height=17
            )

        ref_dir.make_index_files(fasta_in, False, True, cortex_mem_height=17)

        # The reference FASTA must exist and match the expected content exactly.
        for created_path in (ref_dir.directory, ref_dir.ref_fasta):
            self.assertTrue(os.path.exists(created_path))
        self.assertTrue(filecmp.cmp(ref_dir.ref_fasta, expected_ref, shallow=False))

        # Remaining index files, checked in the same order as before.
        index_files = (
            ref_dir.ref_fai,
            ref_dir.ref_fasta + ".bwt",
            ref_dir.ref_fasta_prefix + ".stampy.sthash",
            ref_dir.ref_fasta_prefix + ".stampy.stidx",
            ref_dir.ref_fasta_prefix + ".k31.ctx",
        )
        for index_file in index_files:
            self.assertTrue(os.path.exists(index_file))

        shutil.rmtree(tmp_root_dir)
    def test_run(self):
        """test run: variant call a simulated sample carrying one known SNP"""
        root_outdir = "tmp.var_call_one_sample"
        utils.syscall(f"rm -rf {root_outdir}")
        os.mkdir(root_outdir)

        # Build a 1000-base reference from a fixed seed so the test is
        # reproducible, forcing index 499 to "A". The mutated copy differs
        # only at that index ("T"), i.e. 1-based position 500.
        ref_fa = os.path.join(root_outdir, "ref.fa")
        ref_fa_mutated = f"{ref_fa}.mut.fa"
        random.seed(42)
        ref_seq = random.choices(["A", "C", "G", "T"], k=1000)
        ref_seq[499] = "A"
        with open(ref_fa, "w") as f:
            print(">ref", "".join(ref_seq), sep="\n", file=f)
        ref_seq[499] = "T"
        with open(ref_fa_mutated, "w") as f:
            print(">ref_mutated", "".join(ref_seq), sep="\n", file=f)

        # Generate paired reads from the mutated sequence with fastaq.
        reads1 = os.path.join(root_outdir, "reads1.fq")
        reads2 = os.path.join(root_outdir, "reads2.fq")
        utils.syscall(
            f"fastaq to_perfect_reads {ref_fa_mutated} - 200 1 20 75 | fastaq deinterleave - {reads1} {reads2}"
        )

        # Index the unmutated reference, then run the pipeline against it.
        ref_dir = reference_dir.ReferenceDir(
            directory=os.path.join(root_outdir, "ref_dir")
        )
        ref_dir.make_index_files(ref_fa, False, True, cortex_mem_height=21)
        var_call_out = os.path.join(root_outdir, "varcall")
        var_call_one_sample_pipeline.run(
            [reads1],
            [reads2],
            ref_dir.directory,
            var_call_out,
            sample_name="test_sample",
            debug=False,
            keep_bam=True,
            cortex_mem_height=21,
        )

        # With debug=False and keep_bam=True only the BAM, its index and the
        # three VCF files are expected to remain in the output directory.
        got_files = sorted(os.listdir(var_call_out))
        expect_files = [
            "cortex.vcf",
            "final.vcf",
            "map.bam",
            "map.bam.bai",
            "samtools.vcf",
        ]
        self.assertEqual(got_files, expect_files)

        # Exactly one call is expected: A -> T at position 500.
        with open(os.path.join(var_call_out, "final.vcf")) as f:
            calls = [x for x in f if not x.startswith("#")]
        self.assertEqual(len(calls), 1)
        fields = calls[0].split("\t")
        self.assertEqual(fields[1], "500")
        self.assertEqual(fields[3], "A")
        self.assertEqual(fields[4], "T")
        utils.syscall(f"rm -r {root_outdir}")
Esempio n. 6
0
def run(options):
    """Create a reference directory and its index files from parsed options.

    Two modes are supported:

    * database mode, when ``options.db_config_file``,
      ``options.pipeline_references_root`` and ``options.name`` are all set:
      the reference is added to the database and the returned id is passed
      to ``reference_dir.ReferenceDir`` together with the references root;
    * stand-alone mode otherwise: ``options.outdir`` is passed as the
      directory and the reference id is ``None``.

    Combining database mode with ``options.outdir`` is rejected: an error is
    printed to stderr and the process exits with status 1.

    Other attributes read from ``options``: ``fasta_file``,
    ``cortex_mem_height`` and ``contam_tsv``. When ``contam_tsv`` is set,
    ``make_index_files`` is called with its second positional argument True
    and third False, and the TSV is then registered with
    ``add_remove_contam_metadata_tsv``.
    """
    using_db = None not in (
        options.db_config_file,
        options.pipeline_references_root,
        options.name,
    )
    if using_db and options.outdir:
        print(
            "Error! If adding to database, must use --db_config_file,--pipeline_references_root,--name.",
            file=sys.stderr,
        )
        print("Otherwise, use --outdir.", file=sys.stderr)
        sys.exit(1)

    ref_id = None
    if using_db:
        lock = lock_file.LockFile(
            os.path.join(options.pipeline_references_root,
                         "add_reference.lock"))
        try:
            database = db.Db(options.db_config_file)
            ref_id = database.add_reference(options.name)
            database.commit_and_close()
        finally:
            # Always stop the lock, even if the database step fails, so an
            # error here cannot leave it held for later runs.
            # NOTE(review): assumes LockFile.stop() releases the lock - confirm.
            lock.stop()

    ref_dir = reference_dir.ReferenceDir(
        pipeline_references_root_dir=options.pipeline_references_root,
        reference_id=ref_id,
        directory=options.outdir,
    )

    # Both flags are driven purely by whether a contamination TSV was given.
    has_contam_tsv = options.contam_tsv is not None
    genome_is_big = has_contam_tsv
    using_cortex = not has_contam_tsv
    ref_dir.make_index_files(
        options.fasta_file,
        genome_is_big,
        using_cortex,
        cortex_mem_height=options.cortex_mem_height,
    )

    if has_contam_tsv:
        ref_dir.add_remove_contam_metadata_tsv(options.contam_tsv)
Esempio n. 7
0
    def test_make_index_files(self):
        '''Check make_index_files() rejects a missing FASTA and writes every index file'''
        tmp_root_dir = 'tmp.reference_dir.make_index_files'
        # Remove leftovers from any earlier, interrupted run.
        if os.path.exists(tmp_root_dir):
            shutil.rmtree(tmp_root_dir)

        fasta_in = os.path.join(data_dir, 'make_index_files.ref.in.fa.gz')
        expected_ref = os.path.join(data_dir, 'make_index_files.ref.expected.fa')
        ref_dir = reference_dir.ReferenceDir(
            pipeline_references_root_dir=tmp_root_dir,
            reference_id=42,
        )

        # A non-existent input file must be reported as reference_dir.Error.
        self.assertRaises(
            reference_dir.Error,
            ref_dir.make_index_files,
            'file_does_not_exist',
            False,
            True,
            cortex_mem_height=17,
        )

        ref_dir.make_index_files(fasta_in, False, True, cortex_mem_height=17)

        path_exists = os.path.exists
        self.assertTrue(path_exists(ref_dir.directory))
        self.assertTrue(path_exists(ref_dir.ref_fasta))

        # The written reference must be byte-for-byte the expected FASTA.
        same_content = filecmp.cmp(ref_dir.ref_fasta, expected_ref, shallow=False)
        self.assertTrue(same_content)

        self.assertTrue(path_exists(ref_dir.ref_fai))
        self.assertTrue(path_exists(ref_dir.ref_fasta + '.bwt'))
        # Files named after the FASTA prefix, checked in the original order.
        for suffix in ('.stampy.sthash', '.stampy.stidx', '.k31.ctx'):
            self.assertTrue(path_exists(ref_dir.ref_fasta_prefix + suffix))

        shutil.rmtree(tmp_root_dir)
Esempio n. 8
0
def run(
    reads1_list,
    reads2_list,
    ref_dir,
    outdir,
    sample_name="sample",
    cortex_mem_height=22,
    debug=False,
    keep_bam=False,
):
    """Run the single-sample variant calling pipeline, writing final.vcf in outdir.

    Steps, in order: trim each read pair with ``read_trim.run_trimmomatic``,
    map all trimmed reads with ``read_map.map_reads_set`` (``rmdup=True``),
    index the BAM with ``samtools index``, call variants with
    ``bcftools mpileup | bcftools call`` and with cortex, then combine both
    VCFs using ``minos adjudicate``.

    Args:
        reads1_list: forward reads files, one per read pair.
        reads2_list: reverse reads files; must be the same length as
            ``reads1_list``.
        ref_dir: directory passed to ``reference_dir.ReferenceDir``.
        outdir: output directory; created with ``os.mkdir``, so it must not
            already exist.
        sample_name: used in the read group when mapping and passed to cortex.
        cortex_mem_height: passed to cortex as ``mem_height``.
        debug: if True, keep the trimmed reads, the cortex and minos working
            directories and the BAM file.
        keep_bam: if True, keep ``map.bam`` and ``map.bam.bai``.

    Raises:
        Exception: if the reads lists differ in length, if cortex did not
            produce exactly one ``*raw.vcf`` file, or if the final VCF file
            is missing at the end.
    """
    if len(reads1_list) != len(reads2_list):
        # This must be an f-string, otherwise the placeholders are emitted
        # literally and the caller never sees the offending lists.
        raise Exception(
            f"Must give same number of forward and reverse reads files. Got:\nForward:{reads1_list}\nReverse:{reads2_list}"
        )

    os.mkdir(outdir)

    trimmed_reads_1 = []
    trimmed_reads_2 = []
    for i, (reads1, reads2) in enumerate(zip(reads1_list, reads2_list)):
        trimmed_reads_1.append(
            os.path.join(outdir, f"trimmed_reads.{i}.1.fq.gz"))
        trimmed_reads_2.append(
            os.path.join(outdir, f"trimmed_reads.{i}.2.fq.gz"))
        read_trim.run_trimmomatic(
            reads1,
            reads2,
            trimmed_reads_1[-1],
            trimmed_reads_2[-1],
        )

    refdir = reference_dir.ReferenceDir(directory=ref_dir)
    rmdup_bam = os.path.join(outdir, "map.bam")
    read_map.map_reads_set(
        refdir.ref_fasta,
        trimmed_reads_1,
        trimmed_reads_2,
        rmdup_bam,
        rmdup=True,
        read_group=("1", sample_name),
    )
    utils.syscall(f"samtools index {rmdup_bam}")
    if not debug:
        # Trimmed reads are no longer needed once the BAM has been made.
        for filename in trimmed_reads_1 + trimmed_reads_2:
            os.unlink(filename)

    samtools_vcf = os.path.join(outdir, "samtools.vcf")
    cmd = f"bcftools mpileup --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -vm -O v -o {samtools_vcf}"
    utils.syscall(cmd)

    cortex_dir = os.path.join(outdir, "cortex")
    ctx = cortex.CortexRunCalls(
        refdir.directory,
        rmdup_bam,
        cortex_dir,
        sample_name,
        mem_height=cortex_mem_height,
    )
    ctx.run(run_mccortex_view_kmers=False)
    ctx_vcf_dir = os.path.join(cortex_dir, "cortex.out", "vcfs")
    cortex_vcfs = [
        os.path.join(ctx_vcf_dir, x) for x in os.listdir(ctx_vcf_dir)
        if x.endswith("raw.vcf")
    ]
    # Exactly one raw VCF is required; zero or several is treated as failure.
    if len(cortex_vcfs) != 1:
        raise Exception("Error running cortex. Could not find output VCF file")
    cortex_vcf = os.path.join(outdir, "cortex.vcf")
    os.rename(cortex_vcfs[0], cortex_vcf)
    if not debug:
        utils.syscall(f"rm -rf {cortex_dir}")

    minos_dir = os.path.join(outdir, "minos")
    final_vcf = os.path.join(outdir, "final.vcf")
    cmd = f"minos adjudicate --reads {rmdup_bam} {minos_dir} {refdir.ref_fasta} {samtools_vcf} {cortex_vcf}"
    utils.syscall(cmd)
    # Move the minos result out of its working directory before cleaning up.
    os.rename(os.path.join(minos_dir, "final.vcf"), final_vcf)
    if not debug:
        utils.syscall(f"rm -rf {minos_dir}")

    if not (keep_bam or debug):
        os.unlink(rmdup_bam)
        os.unlink(rmdup_bam + ".bai")

    if not os.path.exists(final_vcf):
        raise Exception(f"Error. Final VCF file not found: {final_vcf}")

    logging.info(f"Finished variant calling. Final VCF file: {final_vcf}")