def test_init(self): """test __init__""" with self.assertRaises(Exception): refdir = reference_dir.ReferenceDir(pipeline_references_root_dir="foo") with self.assertRaises(Exception): refdir = reference_dir.ReferenceDir(reference_id=42) refdir = reference_dir.ReferenceDir( pipeline_references_root_dir="foo", reference_id=42 ) self.assertEqual(refdir.directory, os.path.join(os.getcwd(), "foo", "42")) refdir = reference_dir.ReferenceDir(directory="bar") self.assertEqual(refdir.directory, os.path.join(os.getcwd(), "bar"))
def test_add_remove_contam_metadata_tsv(self):
    """test add_remove_contam_metadata_tsv"""
    tmp_root_dir = "tmp.reference_dir.add_remove_contam_metadata_tsv"
    if os.path.exists(tmp_root_dir):
        shutil.rmtree(tmp_root_dir)
    fasta_in = os.path.join(data_dir, "add_remove_contam_metadata_tsv.ref.fa")
    ref_dir = reference_dir.ReferenceDir(
        pipeline_references_root_dir=tmp_root_dir, reference_id=42
    )
    ref_dir.make_index_files(fasta_in, False, False, cortex_mem_height=17)
    bad_files = [
        os.path.join(
            data_dir, "add_remove_contam_metadata_tsv.ref.missing_from_tsv.tsv"
        ),
        os.path.join(
            data_dir, "add_remove_contam_metadata_tsv.ref.extra_in_tsv.tsv"
        ),
    ]
    for bad_file in bad_files:
        with self.assertRaises(reference_dir.Error):
            ref_dir.add_remove_contam_metadata_tsv(bad_file)
    ref_dir.add_remove_contam_metadata_tsv(
        os.path.join(data_dir, "add_remove_contam_metadata_tsv.ref.good.tsv")
    )
    shutil.rmtree(tmp_root_dir)
def test_make_index_files(self): """test make_index_files""" tmp_root_dir = "tmp.reference_dir.make_index_files" if os.path.exists(tmp_root_dir): shutil.rmtree(tmp_root_dir) fasta_in = os.path.join(data_dir, "make_index_files.ref.in.fa.gz") expected_ref = os.path.join(data_dir, "make_index_files.ref.expected.fa") ref_dir = reference_dir.ReferenceDir( pipeline_references_root_dir=tmp_root_dir, reference_id=42) with self.assertRaises(reference_dir.Error): ref_dir.make_index_files("file_does_not_exist", False, True, cortex_mem_height=17) ref_dir.make_index_files(fasta_in, False, True, cortex_mem_height=17) self.assertTrue(os.path.exists(ref_dir.directory)) self.assertTrue(os.path.exists(ref_dir.ref_fasta)) self.assertTrue( filecmp.cmp(ref_dir.ref_fasta, expected_ref, shallow=False)) self.assertTrue(os.path.exists(ref_dir.ref_fai)) self.assertTrue(os.path.exists(ref_dir.ref_fasta + ".bwt")) self.assertTrue( os.path.exists(ref_dir.ref_fasta_prefix + ".stampy.sthash")) self.assertTrue( os.path.exists(ref_dir.ref_fasta_prefix + ".stampy.stidx")) self.assertTrue(os.path.exists(ref_dir.ref_fasta_prefix + ".k31.ctx")) shutil.rmtree(tmp_root_dir)
def test_run(self):
    root_outdir = "tmp.var_call_one_sample"
    utils.syscall(f"rm -rf {root_outdir}")
    os.mkdir(root_outdir)
    ref_fa = os.path.join(root_outdir, "ref.fa")
    ref_fa_mutated = f"{ref_fa}.mut.fa"
    # Make a random reference, and a mutated copy with one SNP at position 500
    random.seed(42)
    ref_seq = random.choices(["A", "C", "G", "T"], k=1000)
    ref_seq[499] = "A"
    with open(ref_fa, "w") as f:
        print(">ref", "".join(ref_seq), sep="\n", file=f)
    ref_seq[499] = "T"
    with open(ref_fa_mutated, "w") as f:
        print(">ref_mutated", "".join(ref_seq), sep="\n", file=f)
    # Simulate perfect reads from the mutated genome, then call variants
    # against the original reference
    reads1 = os.path.join(root_outdir, "reads1.fq")
    reads2 = os.path.join(root_outdir, "reads2.fq")
    utils.syscall(
        f"fastaq to_perfect_reads {ref_fa_mutated} - 200 1 20 75 | fastaq deinterleave - {reads1} {reads2}"
    )
    ref_dir = reference_dir.ReferenceDir(
        directory=os.path.join(root_outdir, "ref_dir")
    )
    ref_dir.make_index_files(ref_fa, False, True, cortex_mem_height=21)
    var_call_out = os.path.join(root_outdir, "varcall")
    var_call_one_sample_pipeline.run(
        [reads1],
        [reads2],
        ref_dir.directory,
        var_call_out,
        sample_name="test_sample",
        debug=False,
        keep_bam=True,
        cortex_mem_height=21,
    )
    got_files = sorted(list(os.listdir(var_call_out)))
    expect_files = [
        "cortex.vcf",
        "final.vcf",
        "map.bam",
        "map.bam.bai",
        "samtools.vcf",
    ]
    self.assertEqual(got_files, expect_files)
    # Expect exactly one call: the A->T SNP at position 500
    with open(os.path.join(var_call_out, "final.vcf")) as f:
        calls = [x for x in f if not x.startswith("#")]
    self.assertEqual(len(calls), 1)
    fields = calls[0].split("\t")
    self.assertEqual(fields[1], "500")
    self.assertEqual(fields[3], "A")
    self.assertEqual(fields[4], "T")
    utils.syscall(f"rm -r {root_outdir}")
def run(options):
    using_db = None not in (
        options.db_config_file,
        options.pipeline_references_root,
        options.name,
    )
    if using_db and options.outdir:
        print(
            "Error! Must use either --db_config_file,--pipeline_references_root,--name "
            "(to add the reference to the database), or --outdir, but not both",
            file=sys.stderr,
        )
        sys.exit(1)

    if using_db:
        # Lock the references root while getting a new reference ID from the database
        lock = lock_file.LockFile(
            os.path.join(options.pipeline_references_root, "add_reference.lock")
        )
        database = db.Db(options.db_config_file)
        ref_id = database.add_reference(options.name)
        database.commit_and_close()
        lock.stop()
    else:
        ref_id = None

    ref_dir = reference_dir.ReferenceDir(
        pipeline_references_root_dir=options.pipeline_references_root,
        reference_id=ref_id,
        directory=options.outdir,
    )

    genome_is_big = options.contam_tsv is not None
    using_cortex = options.contam_tsv is None
    ref_dir.make_index_files(
        options.fasta_file,
        genome_is_big,
        using_cortex,
        cortex_mem_height=options.cortex_mem_height,
    )

    if options.contam_tsv is not None:
        ref_dir.add_remove_contam_metadata_tsv(options.contam_tsv)
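# A minimal sketch (not part of the task module) of driving run() above without
# the full command-line interface: an argparse.Namespace stands in for the
# parsed options. Only the attributes that run() reads are set. The file names,
# the --outdir mode, and the cortex_mem_height value are assumptions made for
# illustration, not the project's real CLI defaults.
if __name__ == "__main__":
    import argparse

    example_options = argparse.Namespace(
        db_config_file=None,  # not adding to the database in this sketch
        pipeline_references_root=None,
        name=None,
        outdir="example_ref_dir",  # hypothetical output directory
        fasta_file="example_ref.fa",  # hypothetical input FASTA
        contam_tsv=None,  # no remove-contam metadata
        cortex_mem_height=22,  # assumed value
    )
    run(example_options)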
def run(
    reads1_list,
    reads2_list,
    ref_dir,
    outdir,
    sample_name="sample",
    cortex_mem_height=22,
    debug=False,
    keep_bam=False,
):
    if len(reads1_list) != len(reads2_list):
        raise Exception(
            f"Must give same number of forward and reverse reads files. "
            f"Got:\nForward:{reads1_list}\nReverse:{reads2_list}"
        )

    os.mkdir(outdir)

    # Trim each pair of reads files with trimmomatic
    trimmed_reads_1 = []
    trimmed_reads_2 = []
    for i in range(len(reads1_list)):
        trimmed_reads_1.append(os.path.join(outdir, f"trimmed_reads.{i}.1.fq.gz"))
        trimmed_reads_2.append(os.path.join(outdir, f"trimmed_reads.{i}.2.fq.gz"))
        read_trim.run_trimmomatic(
            reads1_list[i],
            reads2_list[i],
            trimmed_reads_1[-1],
            trimmed_reads_2[-1],
        )

    # Map the trimmed reads (removing duplicates), then index the BAM
    refdir = reference_dir.ReferenceDir(directory=ref_dir)
    rmdup_bam = os.path.join(outdir, "map.bam")
    read_map.map_reads_set(
        refdir.ref_fasta,
        trimmed_reads_1,
        trimmed_reads_2,
        rmdup_bam,
        rmdup=True,
        read_group=("1", sample_name),
    )
    utils.syscall(f"samtools index {rmdup_bam}")
    if not debug:
        for filename in trimmed_reads_1 + trimmed_reads_2:
            os.unlink(filename)

    # Call variants with bcftools
    samtools_vcf = os.path.join(outdir, "samtools.vcf")
    cmd = f"bcftools mpileup --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -vm -O v -o {samtools_vcf}"
    utils.syscall(cmd)

    # Call variants with cortex
    cortex_dir = os.path.join(outdir, "cortex")
    ctx = cortex.CortexRunCalls(
        refdir.directory,
        rmdup_bam,
        cortex_dir,
        sample_name,
        mem_height=cortex_mem_height,
    )
    ctx.run(run_mccortex_view_kmers=False)
    ctx_vcf_dir = os.path.join(cortex_dir, "cortex.out", "vcfs")
    cortex_vcfs = [
        os.path.join(ctx_vcf_dir, x)
        for x in os.listdir(ctx_vcf_dir)
        if x.endswith("raw.vcf")
    ]
    if len(cortex_vcfs) != 1:
        raise Exception("Error running cortex. Could not find output VCF file")
    cortex_vcf = os.path.join(outdir, "cortex.vcf")
    os.rename(cortex_vcfs[0], cortex_vcf)
    if not debug:
        utils.syscall(f"rm -rf {cortex_dir}")

    # Adjudicate the bcftools and cortex call sets with minos to get the final VCF
    minos_dir = os.path.join(outdir, "minos")
    cmd = f"minos adjudicate --reads {rmdup_bam} {minos_dir} {refdir.ref_fasta} {samtools_vcf} {cortex_vcf}"
    utils.syscall(cmd)
    os.rename(os.path.join(minos_dir, "final.vcf"), os.path.join(outdir, "final.vcf"))
    if not debug:
        utils.syscall(f"rm -rf {minos_dir}")

    if not (keep_bam or debug):
        os.unlink(rmdup_bam)
        os.unlink(rmdup_bam + ".bai")

    final_vcf = os.path.join(outdir, "final.vcf")
    if not os.path.exists(final_vcf):
        raise Exception(f"Error. Final VCF file not found: {final_vcf}")
    logging.info(f"Finished variant calling. Final VCF file: {final_vcf}")
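# A minimal usage sketch (illustration only, file names hypothetical): run()
# above expects parallel lists of forward/reverse FASTQ files, a reference
# directory already built by reference_dir.ReferenceDir.make_index_files(), and
# an output directory that does not yet exist. See test_run() earlier in this
# section for a worked end-to-end example with simulated reads.
#
#     run(
#         ["sample.reads_1.fq.gz"],
#         ["sample.reads_2.fq.gz"],
#         "ref_dir",
#         "sample.varcall",
#         sample_name="sample1",
#         keep_bam=True,
#     )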