def pe_align(log, sample, sample_dir, ref, cores, r1, r2):
    """Align paired-end reads with ``bwa sampe`` and write the result as BAM.

    A ``.sai`` file is created for each read file via ``create_sai``; the
    output of ``bwa sampe`` is then piped through ``samtools view -bS`` into
    ``<sample_dir>/<sample>.bam``.  stderr of each tool goes to its own log
    file in ``sample_dir``.  Both ``.sai`` files are deleted afterwards.

    ``r1`` and ``r2`` must expose a ``pth`` attribute (the read file path).
    Exit statuses of the tools are not checked.

    Returns the path of the BAM file.
    """
    r1sai = create_sai(log, sample, sample_dir, ref, cores, r1, 1)
    r2sai = create_sai(log, sample, sample_dir, ref, cores, r2, 2)
    cmd1 = [
        get_user_path("bwa", "bwa"),
        "sampe",
        "-a",
        "700",
        ref,
        r1sai,
        r2sai,
        r1.pth,
        r2.pth
    ]
    cmd2 = [
        get_user_path("samtools", "samtools"),
        "view",
        "-bS",
        "-"
    ]
    sampe_out_fname = os.path.join(sample_dir, '{}.pe.bwa-sampe-out.log'.format(sample))
    samtools_out_fname = os.path.join(sample_dir, '{}.pe.samtools-out.log'.format(sample))
    bam_out_fname = os.path.join(sample_dir, '{}.bam'.format(sample))
    log.info("Building BAM for {}".format(sample))
    with open(sampe_out_fname, 'w') as sampe_out, \
            open(samtools_out_fname, 'w') as samtools_out, \
            open(bam_out_fname, 'w') as bam_out:
        proc1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, stderr=sampe_out)
        proc2 = subprocess.Popen(cmd2, stdin=proc1.stdout, stdout=bam_out, stderr=samtools_out)
        # close our copy of the pipe so bwa can receive SIGPIPE if samtools
        # exits first
        proc1.stdout.close()
        proc2.communicate()
        # reap the upstream process too; the original never waited on it
        proc1.wait()
    # remove the sai files (they'll be stale soon)
    os.remove(r1sai)
    os.remove(r2sai)
    return bam_out_fname
def phase(log, sample, sample_dir, reference, bam):
    """Pipe ``samtools calmd`` into ``samtools phase`` for ``bam``.

    ``sample_dir`` is used as the output prefix passed to ``phase -b`` and as
    the prefix of both log files.  stderr of ``calmd`` goes to
    ``<sample_dir>.samtools-calmd-phase-out.log``; stdout and stderr of
    ``phase`` go to ``<sample_dir>.samtools-phase-out.log``.  Exit statuses
    are not checked.

    Returns the tuple ``("<sample_dir>.0.bam", "<sample_dir>.1.bam")``.
    """
    log.info("Phasing BAM file with CALMD for {}".format(sample))
    cmd1 = [
        get_user_path("binaries", "samtools"),
        "calmd",
        "-A",
        "-E",
        "-u",
        "-r",
        bam,
        reference
    ]
    cmd2 = [
        get_user_path("binaries", "samtools"),
        "phase",
        "-A",
        "-F",
        "-Q",
        "20",
        "-b",
        sample_dir,
        "-"
    ]
    samtools_calmd_out_fname = '{}.samtools-calmd-phase-out.log'.format(sample_dir)
    samtools_phase_out_fname = '{}.samtools-phase-out.log'.format(sample_dir)
    with open(samtools_calmd_out_fname, 'w') as samtools_calmd_out, \
            open(samtools_phase_out_fname, 'w') as samtools_phase_out:
        proc1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, stderr=samtools_calmd_out)
        proc2 = subprocess.Popen(cmd2, stdin=proc1.stdout, stdout=samtools_phase_out, stderr=subprocess.STDOUT)
        # close our copy of the pipe so calmd can receive SIGPIPE if phase
        # exits first
        proc1.stdout.close()
        proc2.communicate()
        # reap the upstream process too; the original never waited on it
        proc1.wait()
    return "{}.0.bam".format(sample_dir), "{}.1.bam".format(sample_dir)
def phase(log, sample, sample_dir, reference, bam):
    """Run ``samtools calmd | samtools phase`` on ``bam``.

    ``sample_dir`` doubles as the ``phase -b`` output prefix and as the
    prefix for the two log files
    (``.samtools-calmd-phase-out.log`` for calmd stderr,
    ``.samtools-phase-out.log`` for phase stdout+stderr).  Exit statuses are
    not checked.

    Returns ``("<sample_dir>.0.bam", "<sample_dir>.1.bam")``.
    """
    log.info("Phasing BAM file with CALMD for {}".format(sample))
    samtools_calmd = get_user_path("binaries", "samtools")
    cmd1 = [samtools_calmd, "calmd", "-A", "-E", "-u", "-r", bam, reference]
    samtools_phase = get_user_path("binaries", "samtools")
    cmd2 = [
        samtools_phase,
        "phase",
        "-A",
        "-F",
        "-Q",
        "20",
        "-b",
        sample_dir,
        "-",
    ]
    samtools_calmd_out_fname = '{}.samtools-calmd-phase-out.log'.format(
        sample_dir)
    samtools_phase_out_fname = '{}.samtools-phase-out.log'.format(sample_dir)
    with open(samtools_calmd_out_fname, 'w') as samtools_calmd_out, \
            open(samtools_phase_out_fname, 'w') as samtools_phase_out:
        proc1 = subprocess.Popen(
            cmd1, stdout=subprocess.PIPE, stderr=samtools_calmd_out)
        proc2 = subprocess.Popen(
            cmd2,
            stdin=proc1.stdout,
            stdout=samtools_phase_out,
            stderr=subprocess.STDOUT)
        # drop the parent's handle on the pipe so calmd sees SIGPIPE if the
        # downstream process goes away
        proc1.stdout.close()
        proc2.communicate()
        # also wait for calmd so it does not linger as an unreaped child
        proc1.wait()
    return "{}.0.bam".format(sample_dir), "{}.1.bam".format(sample_dir)
def mem_pe_align(log, sample, sample_dir, ref, cores, r1, r2):
    """Align paired-end reads with ``bwa mem`` and write the result as BAM.

    ``bwa mem -t <cores> -M`` output is piped through ``samtools view -bS``
    into ``<sample_dir>/<sample>.bam``.  stderr of bwa is written to
    ``<sample>.pe.bwa-sampe-out.log`` and stderr of samtools to
    ``<sample>.pe.samtools-view-out.log``, both in ``sample_dir``.

    ``r1`` and ``r2`` must expose a ``pth`` attribute (the read file path).
    Exit statuses of the tools are not checked.

    Returns the path of the BAM file.
    """
    cmd1 = [
        get_user_path("bwa", "bwa"),
        "mem",
        "-t",
        str(cores),
        "-M",
        ref,
        r1.pth,
        r2.pth
    ]
    cmd2 = [
        get_user_path("samtools", "samtools"),
        "view",
        "-bS",
        "-"
    ]
    # the bwa log keeps its historical "sampe" file name
    sampe_out_fname = os.path.join(sample_dir, '{}.pe.bwa-sampe-out.log'.format(sample))
    samtools_out_fname = os.path.join(sample_dir, '{}.pe.samtools-view-out.log'.format(sample))
    bam_out_fname = os.path.join(sample_dir, '{}.bam'.format(sample))
    log.info("Building BAM for {}".format(sample))
    with open(sampe_out_fname, 'w') as sampe_out, \
            open(samtools_out_fname, 'w') as samtools_out, \
            open(bam_out_fname, 'w') as bam_out:
        proc1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, stderr=sampe_out)
        proc2 = subprocess.Popen(cmd2, stdin=proc1.stdout, stdout=bam_out, stderr=samtools_out)
        # close our copy of the pipe so bwa can receive SIGPIPE if samtools
        # exits first
        proc1.stdout.close()
        proc2.communicate()
        # reap the upstream process too; the original never waited on it
        proc1.wait()
    return bam_out_fname
def __init__(self, target, query, coverage, identity, out=False, min_match=None):
    """Prepare the lastz command line and store it in ``self.cli``.

    ``self.output`` is ``out`` when given, otherwise a fresh temporary file
    with a ``.lastz`` suffix.

    When ``identity`` is truthy and ``min_match`` is not, hits are filtered
    with ``--coverage=<coverage>``; when ``min_match`` is truthy they are
    filtered with ``--matchcount=<min_match>`` instead.

    The command is assembled from individual options joined by single
    spaces.  The previous literal used backslash line continuations inside
    the string, so the amount of whitespace between options depended on the
    source indentation.
    NOTE(review): this assumes whoever consumes ``self.cli`` splits on
    whitespace (e.g. a shell) -- confirm against the caller.
    """
    # if not an output file, create a temp file to hold output
    if not out:
        fd, self.output = tempfile.mkstemp(suffix='.lastz')
        os.close(fd)
    else:
        self.output = out
    if identity and not min_match:
        match_filter = '--coverage={}'.format(coverage)
    elif min_match:
        match_filter = '--matchcount={}'.format(min_match)
    else:
        # NOTE(review): unchanged from the original -- with neither a truthy
        # identity nor a truthy min_match no command is built and
        # ``self.cli`` is left unset.
        return
    self.cli = ' '.join([
        get_user_path("lastz", "lastz"),
        '{}[multiple,nameparse=full]'.format(target),
        '{}[nameparse=full]'.format(query),
        '--strand=both',
        '--seed=12of19',
        '--transition',
        '--nogfextend',
        '--nochain',
        '--gap=400,30',
        '--xdrop=910',
        '--ydrop=8370',
        '--hspthresh=3000',
        '--gappedthresh=3000',
        '--noentropy',
        match_filter,
        '--identity={}'.format(identity),
        '--output={}'.format(self.output),
        '--format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity',
    ])
def coverage(log, sample, assembly_pth, assembly, cores, bam):
    """Run GATK ``DepthOfCoverage`` for ``bam`` from inside ``assembly_pth``.

    The process working directory is switched to ``assembly_pth`` while GATK
    runs and is always restored afterwards, even if launching GATK fails.
    GATK's stdout and stderr are written to
    ``<assembly_pth>/<sample>.GATK-coverage-out.log``.  The exit status is
    not checked.

    Returns ``<assembly_pth>/<sample>-coverage``, the output name given to
    GATK via ``-o``.
    """
    log.info("Computing coverage with GATK for {}".format(sample))
    cwd = os.getcwd()
    # move into reference directory
    os.chdir(assembly_pth)
    try:
        cmd = [
            get_user_path("binaries", "gatk"),
            "-T",
            "DepthOfCoverage",
            "-R",
            assembly,
            "-I",
            bam,
            "-o",
            "{}-coverage".format(sample),
            "-nt",
            str(cores),
            "--omitIntervalStatistics",
            "--omitLocusTable"
        ]
        gatk_coverage_fname = os.path.join(assembly_pth, '{}.GATK-coverage-out.log'.format(sample))
        with open(gatk_coverage_fname, 'w') as gatk_out:
            proc = subprocess.Popen(cmd, stdout=gatk_out, stderr=subprocess.STDOUT)
            proc.communicate()
    finally:
        # move back to the original working directory, even on error
        os.chdir(cwd)
    return os.path.join(assembly_pth, "{}-coverage".format(sample))
def test_config_directories_exist(self):
    """Check that every configured directory resolves to an existing directory."""
    for entry in self.directories:
        section, option = entry[0], entry[1]
        resolved = get_user_path(section, option, package_only=True)
        message = "Directory {} is missing".format(resolved)
        self.assertTrue(os.path.isdir(resolved), message)
def test_binaries_exist(self):
    """Check that every configured binary is a regular, executable file."""
    for entry in self.binaries:
        section, option = entry[0], entry[1]
        resolved = get_user_path(section, option, package_only=True)
        usable = os.path.isfile(resolved) and os.access(resolved, os.X_OK)
        self.assertTrue(usable, "Binary {} is missing".format(resolved))
def test_binaries_exist(self):
    """Each binary named in the config must exist and be executable."""
    for spec in self.binaries:
        path = get_user_path(spec[0], spec[1], package_only=True)
        found = os.path.isfile(path) and os.access(path, os.X_OK)
        failure = "Binary {} is missing".format(path)
        self.assertTrue(found, failure)
def test_config_directories_exist(self):
    """Each directory named in the config must exist."""
    for spec in self.directories:
        path = get_user_path(spec[0], spec[1], package_only=True)
        exists = os.path.isdir(path)
        self.assertTrue(exists, "Directory {} is missing".format(path))
def test_config_binaries(self):
    """Check that each configured binary resolves to ``<sys.prefix>/bin/<name>``."""
    for entry in self.binaries:
        section, option, binary_name = entry[0], entry[1], entry[2]
        resolved = get_user_path(section, option, package_only=True)
        wanted = os.path.join(sys.prefix, "bin", binary_name)
        message = "Config entry {} != {} (expected)".format(resolved, wanted)
        self.assertEqual(resolved, wanted, message)
def test_config_directories(self):
    """Check that each configured directory resolves to ``<sys.prefix>/<option>``."""
    for entry in self.directories:
        section, option = entry[0], entry[1]
        resolved = get_user_path(section, option, package_only=True)
        wanted = os.path.join(sys.prefix, option)
        message = "Directory {} is missing".format(option)
        self.assertEqual(resolved, wanted, message)
def __init__(self, target, query, coverage, identity, out=False, min_match=None):
    """Build the lastz command string (``self.cli``) and pick ``self.output``.

    ``self.output`` is ``out`` when supplied; otherwise a temporary
    ``.lastz`` file is created for it.

    The hit filter is ``--coverage=<coverage>`` when ``identity`` is truthy
    and ``min_match`` is not, and ``--matchcount=<min_match>`` when
    ``min_match`` is truthy.  ``--identity=<identity>`` is passed in both
    cases.

    Options are joined with single spaces.  The former string literal relied
    on backslash line continuations, which embedded the source indentation
    in the command.
    NOTE(review): assumes the consumer of ``self.cli`` is insensitive to
    the amount of whitespace between options -- confirm against the caller.
    """
    # if not an output file, create a temp file to hold output
    if not out:
        fd, self.output = tempfile.mkstemp(suffix=".lastz")
        os.close(fd)
    else:
        self.output = out
    if identity and not min_match:
        hit_filter = "--coverage={}".format(coverage)
    elif min_match:
        hit_filter = "--matchcount={}".format(min_match)
    else:
        # NOTE(review): same as before -- when neither condition holds,
        # ``self.cli`` is never assigned.
        return
    options = [
        get_user_path("lastz", "lastz"),
        "{}[multiple,nameparse=full]".format(target),
        "{}[nameparse=full]".format(query),
        "--strand=both",
        "--seed=12of19",
        "--transition",
        "--nogfextend",
        "--nochain",
        "--gap=400,30",
        "--xdrop=910",
        "--ydrop=8370",
        "--hspthresh=3000",
        "--gappedthresh=3000",
        "--noentropy",
        hit_filter,
        "--identity={}".format(identity),
        "--output={}".format(self.output),
        "--format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity",
    ]
    self.cli = " ".join(options)
def test_config_directories(self):
    """Configured directories must sit directly under ``sys.prefix``."""
    for spec in self.directories:
        option = spec[1]
        actual = get_user_path(spec[0], option, package_only=True)
        expected = os.path.join(sys.prefix, option)
        self.assertEqual(
            actual, expected, "Directory {} is missing".format(option)
        )
def test_config_binaries(self):
    """Configured binaries must resolve to ``<sys.prefix>/bin/<name>``."""
    bin_dir_parts = (sys.prefix, "bin")
    for spec in self.binaries:
        actual = get_user_path(spec[0], spec[1], package_only=True)
        expected = os.path.join(*(bin_dir_parts + (spec[2],)))
        self.assertEqual(
            actual,
            expected,
            "Config entry {} != {} (expected)".format(actual, expected),
        )
def index(log, sample, sample_dir, bam):
    """Run ``samtools index`` on ``bam``.

    Combined stdout/stderr of samtools is written to
    ``<sample_dir>/<sample>.samtools-index-out.log``.  Nothing is returned
    and the exit status is not checked.
    """
    log.info("Indexing BAM for {}".format(sample))
    samtools = get_user_path("samtools", "samtools")
    log_name = '{}.samtools-index-out.log'.format(sample)
    log_path = os.path.join(sample_dir, log_name)
    with open(log_path, 'w') as log_handle:
        subprocess.Popen(
            [samtools, "index", bam],
            stdout=log_handle,
            stderr=subprocess.STDOUT,
        ).communicate()
def create_faidx(log, sample, sample_dir, fasta):
    """Run ``samtools faidx`` on ``fasta``.

    Combined stdout/stderr of samtools is written to
    ``<sample_dir>/<sample>.samtools-faidx-out.log``.  Nothing is returned
    and the exit status is not checked.
    """
    log.info("Indexing fasta for {}".format(sample))
    samtools = get_user_path("samtools", "samtools")
    log_name = '{}.samtools-faidx-out.log'.format(sample)
    log_path = os.path.join(sample_dir, log_name)
    with open(log_path, 'w') as log_handle:
        subprocess.Popen(
            [samtools, "faidx", fasta],
            stdout=log_handle,
            stderr=subprocess.STDOUT,
        ).communicate()
def create_faidx(log, sample, sample_dir, fasta):
    """Index ``fasta`` with ``samtools faidx``.

    samtools output (stdout and stderr together) is captured in
    ``<sample_dir>/<sample>.samtools-faidx-out.log``.  The function returns
    nothing and does not inspect the exit status.
    """
    log.info("Indexing fasta for {}".format(sample))
    faidx_cmd = [get_user_path("samtools", "samtools")]
    faidx_cmd.extend(["faidx", fasta])
    faidx_log = os.path.join(
        sample_dir, '{}.samtools-faidx-out.log'.format(sample))
    with open(faidx_log, 'w') as sink:
        child = subprocess.Popen(faidx_cmd, stdout=sink, stderr=subprocess.STDOUT)
        child.communicate()
def create_index_files(log, reference):
    """Run ``bwa index`` on ``reference`` from inside the reference's directory.

    The working directory is switched to the directory containing
    ``reference`` while bwa runs and is always restored afterwards, even if
    launching bwa fails.  bwa's stdout and stderr are appended to
    ``bwa-index-file.log`` in that directory.  The exit status is not
    checked and nothing is returned.
    """
    log.info("Running bwa indexing against {}".format(reference))
    cwd = os.getcwd()
    # move into reference directory; a bare file name has an empty dirname,
    # which os.chdir() rejects, so fall back to the current directory
    os.chdir(os.path.dirname(reference) or os.curdir)
    try:
        cmd = [get_user_path("bwa", "bwa"), "index", reference]
        with open('bwa-index-file.log', 'a') as outf:
            proc = subprocess.Popen(cmd, stdout=outf, stderr=subprocess.STDOUT)
            proc.communicate()
    finally:
        # move back to working directory, even on error
        os.chdir(cwd)
def __init__(self, target, query, out=False):
    """Prepare a default-parameter lastz command line in ``self.cli``.

    ``self.output`` is ``out`` when given, otherwise a fresh temporary file
    with a ``.lastz`` suffix.

    The command is assembled from individual options joined by single
    spaces.  The previous literal used backslash line continuations inside
    the string, so the whitespace between options depended on the source
    indentation.
    NOTE(review): this assumes whoever consumes ``self.cli`` splits on
    whitespace (e.g. a shell) -- confirm against the caller.
    """
    # if not an output file, create a temp file to hold output
    if not out:
        fd, self.output = tempfile.mkstemp(suffix='.lastz')
        os.close(fd)
    else:
        self.output = out
    self.cli = ' '.join([
        get_user_path("lastz", "lastz"),
        '{}[multiple,nameparse=full]'.format(target),
        '{}[nameparse=full]'.format(query),
        '--output={}'.format(self.output),
        '--format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity',
    ])
def index(log, sample, sample_dir, bam):
    """Index ``bam`` with ``samtools index``.

    samtools output (stdout and stderr together) is captured in
    ``<sample_dir>/<sample>.samtools-index-out.log``.  The function returns
    nothing and does not inspect the exit status.
    """
    log.info("Indexing BAM for {}".format(sample))
    index_cmd = [get_user_path("samtools", "samtools")]
    index_cmd.extend(["index", bam])
    index_log = os.path.join(
        sample_dir, '{}.samtools-index-out.log'.format(sample))
    with open(index_log, 'w') as sink:
        child = subprocess.Popen(index_cmd, stdout=sink, stderr=subprocess.STDOUT)
        child.communicate()
def call(log, sample, sample_dir, reference, bam, phase=None):
    """Produce a FASTQ from ``bam`` via mpileup | bcftools view | vcf2fq.

    ``samtools mpileup -u -f <reference> <bam>`` is piped into
    ``bcftools view -cg -`` and then into ``vcfutils vcf2fq``.  ``sample_dir``
    is used as the prefix of every file written: one stderr log per tool and
    the FASTQ itself, ``<sample_dir>.fq`` when ``phase`` is ``None`` and
    ``<sample_dir>.<phase>.fq`` otherwise.  Exit statuses are not checked.

    Returns the FASTQ file name.
    """
    if phase is None:
        log.info("Creating REF/ALT allele FASTQ file --Unphased--")
    else:
        log.info("Creating REF/ALT allele FASTQ file {}".format(phase))
    cmd1 = [
        get_user_path("samtools", "samtools"),
        "mpileup",
        "-u",
        "-f",
        reference,
        bam
    ]
    cmd2 = [
        get_user_path("samtools", "bcftools"),
        "view",
        "-cg",
        "-"
    ]
    cmd3 = [
        get_user_path("samtools", "vcfutils"),
        "vcf2fq"
    ]
    mpileup_out_fname = "{}.samtools-mpileup-out.log".format(sample_dir)
    bcftools_out_fname = "{}.samtools-bcftools-out.log".format(sample_dir)
    vcfutils_out_fname = "{}.samtools-vcfutils-out.log".format(sample_dir)
    if phase is None:
        vcfutils_fastq_fname = "{}.fq".format(sample_dir)
    else:
        vcfutils_fastq_fname = "{}.{}.fq".format(sample_dir, phase)
    with open(mpileup_out_fname, 'w') as mpileup_out, \
            open(bcftools_out_fname, 'w') as bcftools_out, \
            open(vcfutils_out_fname, 'w') as vcfutils_out, \
            open(vcfutils_fastq_fname, 'w') as vcfutils_fastq:
        proc1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, stderr=mpileup_out)
        proc2 = subprocess.Popen(cmd2, stdin=proc1.stdout, stdout=subprocess.PIPE, stderr=bcftools_out)
        proc3 = subprocess.Popen(cmd3, stdin=proc2.stdout, stdout=vcfutils_fastq, stderr=vcfutils_out)
        # close our copies of the pipes so an upstream tool can receive
        # SIGPIPE if its consumer exits first
        proc1.stdout.close()
        proc2.stdout.close()
        proc3.communicate()
        # reap the upstream processes too; the original never waited on them
        proc2.wait()
        proc1.wait()
    return vcfutils_fastq_fname
def __init__(self, target, query, out=False):
    """Build a default-parameter lastz command string in ``self.cli``.

    ``self.output`` is ``out`` when supplied; otherwise a temporary
    ``.lastz`` file is created for it.

    Options are joined with single spaces.  The former string literal relied
    on backslash line continuations, which embedded the source indentation
    in the command.
    NOTE(review): assumes the consumer of ``self.cli`` is insensitive to
    the amount of whitespace between options -- confirm against the caller.
    """
    # if not an output file, create a temp file to hold output
    if not out:
        fd, self.output = tempfile.mkstemp(suffix=".lastz")
        os.close(fd)
    else:
        self.output = out
    options = [
        get_user_path("lastz", "lastz"),
        "{}[multiple,nameparse=full]".format(target),
        "{}[nameparse=full]".format(query),
        "--output={}".format(self.output),
        "--format=general-:score,name1,strand1,zstart1,end1,length1,name2,strand2,zstart2,end2,length2,diff,cigar,identity,continuity",
    ]
    self.cli = " ".join(options)
def lastz_params(target, query, coverage, identity, outfile):
    """Return the lastz command as an argument list.

    The list starts with the configured lastz executable, followed by the
    target (``[multiple]``) and query (``[nameparse=full]``) specifiers, a
    fixed set of search options, and finally the coverage, identity, output
    and format options.
    """
    columns = (
        "score", "name1", "strand1", "zstart1", "end1", "length1",
        "name2", "strand2", "zstart2", "end2", "length2",
        "diff", "cigar", "identity", "continuity", "coverage",
    )
    output_format = "general-:" + ",".join(columns)
    cmd = [
        get_user_path("lastz", "lastz"),
        "{0}[multiple]".format(target),
        "{0}[nameparse=full]".format(query),
    ]
    # search options that never vary
    cmd.extend([
        "--strand=both",
        "--seed=12of19",
        "--transition",
        "--nogfextend",
        "--nochain",
        "--gap=400,30",
        "--xdrop=910",
        "--ydrop=8370",
        "--hspthresh=3000",
        "--gappedthresh=3000",
        "--noentropy",
    ])
    # caller-supplied options
    for flag, value in (
        ("coverage", coverage),
        ("identity", identity),
        ("output", outfile),
        ("format", output_format),
    ):
        cmd.append("--{0}={1}".format(flag, value))
    return cmd
def create_reference_dict(log, sample, sample_dir, reference):
    """Run picard ``CreateSequenceDictionary`` for ``reference``.

    The dictionary is written next to the reference, with the reference's
    extension replaced by ``.dict``.  picard's stdout and stderr go to
    ``<sample_dir>/<sample>.picard-reference-dict-out.log``.  Nothing is
    returned and the exit status is not checked.
    """
    log.info("Creating FASTA dict for {}".format(sample))
    stem = os.path.splitext(reference)[0]
    dict_path = stem + ".dict"
    picard = get_user_path("binaries", "picard")
    picard_cmd = [picard, "CreateSequenceDictionary"]
    picard_cmd.append("R={}".format(reference))
    picard_cmd.append("O={}".format(dict_path))
    log_path = os.path.join(
        sample_dir, '{}.picard-reference-dict-out.log'.format(sample))
    with open(log_path, 'w') as log_handle:
        child = subprocess.Popen(
            picard_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
def se_align(log, sample, sample_dir, ref, cores, rS):
    """Align single-end reads with ``ngm`` and return the BAM path.

    ngm is asked (``-b -o``) to write ``<sample_dir>/<sample>-se.bam``
    itself; its stdout and stderr go to
    ``<sample_dir>/<sample>.ngm.se.log``.  ``rS`` must expose a ``pth``
    attribute (the read file path).  The exit status is not checked.
    """
    bam_path = os.path.join(sample_dir, '{}-se.bam'.format(sample))
    ngm = get_user_path("ngm", "ngm")
    ngm_cmd = [ngm, "-r", ref, "-q", rS.pth]
    ngm_cmd += ["-b", "-o", bam_path]
    ngm_cmd += ["-t", str(cores), "--no-progress"]
    log_path = os.path.join(sample_dir, '{}.ngm.se.log'.format(sample))
    log.info("Building BAM for {}".format(sample))
    with open(log_path, 'w') as log_handle:
        child = subprocess.Popen(
            ngm_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    return bam_path
def sort(log, sample, sample_dir, bam):
    """Sort ``bam`` with ``samtools sort -o`` and return the sorted BAM path.

    The output name is ``bam`` with its extension replaced by
    ``.sorted.bam``.  samtools' stdout and stderr are appended to
    ``<sample_dir>.samtools-sort-out.log`` (``sample_dir`` is used as a
    file-name prefix here).  The exit status is not checked.
    """
    log.info("Sorting BAM for {}".format(sample))
    stem = os.path.splitext(bam)[0]
    sorted_bam = "{}.sorted.bam".format(stem)
    samtools = get_user_path("binaries", "samtools")
    sort_cmd = [samtools, "sort", bam, "-o", sorted_bam]
    log_path = '{}.samtools-sort-out.log'.format(sample_dir)
    with open(log_path, 'a') as log_handle:
        child = subprocess.Popen(
            sort_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    return sorted_bam
def sort(log, sample, sample_dir, bam):
    """Sort ``bam`` with ``samtools sort <in> <prefix>`` and return the result.

    The prefix handed to samtools is ``bam`` with its extension replaced by
    ``.sorted``; the returned path is that prefix plus ``.bam``.  samtools'
    stdout and stderr are appended to
    ``<sample_dir>.samtools-sort-out.log`` (``sample_dir`` is used as a
    file-name prefix here).  The exit status is not checked.
    """
    log.info("Sorting BAM for {}".format(sample))
    stem = os.path.splitext(bam)[0]
    sorted_prefix = "{}.sorted".format(stem)
    samtools = get_user_path("samtools", "samtools")
    sort_cmd = [samtools, "sort", bam, sorted_prefix]
    log_path = '{}.samtools-sort-out.log'.format(sample_dir)
    with open(log_path, 'a') as log_handle:
        child = subprocess.Popen(
            sort_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    return "{}.bam".format(sorted_prefix)
def clean_up_bam(log, sample, sample_dir, bam, type):
    """Run picard ``CleanSam`` on ``bam`` and return the new BAM path.

    The output name comes from ``new_bam_name(bam, "CL")``.  picard's stdout
    and stderr go to
    ``<sample_dir>/<sample>.<type>.picard-clean-out.log``.  The input BAM is
    deleted afterwards.

    NOTE(review): the input is removed without looking at picard's exit
    status -- confirm that is intended.
    """
    log.info("Cleaning BAM for {}".format(sample))
    cleaned_bam = new_bam_name(bam, "CL")
    picard = get_user_path("binaries", "picard")
    picard_cmd = [picard, "CleanSam"]
    picard_cmd.append("I={}".format(bam))
    picard_cmd.append("O={}".format(cleaned_bam))
    log_name = '{}.{}.picard-clean-out.log'.format(sample, type)
    log_path = os.path.join(sample_dir, log_name)
    with open(log_path, 'w') as log_handle:
        child = subprocess.Popen(
            picard_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    # the uncleaned input is no longer kept
    os.remove(bam)
    return cleaned_bam
def fix_mate_information(log, sample, sample_dir, bam, type):
    """Run picard ``FixMateInformation`` on ``bam`` and return the new BAM path.

    The output name comes from ``new_bam_name(bam, "CL")``.  picard runs
    with ``VALIDATION_STRINGENCY=SILENT``; its stdout and stderr go to
    ``<sample_dir>/<sample>.<type>.picard.fixmate.log``.  The input BAM is
    deleted afterwards, without looking at picard's exit status.

    NOTE(review): the "CL" suffix is the same one ``CleanSam`` output uses
    elsewhere in this source -- possibly a copy/paste leftover; confirm.
    """
    log.info("Fixing mate information for {}".format(sample))
    fixed_bam = new_bam_name(bam, "CL")
    picard = get_user_path("binaries", "picard")
    picard_cmd = [picard, "FixMateInformation"]
    picard_cmd.append("I={}".format(bam))
    picard_cmd.append("O={}".format(fixed_bam))
    picard_cmd.append("VALIDATION_STRINGENCY=SILENT")
    log_name = '{}.{}.picard.fixmate.log'.format(sample, type)
    log_path = os.path.join(sample_dir, log_name)
    with open(log_path, 'w') as log_handle:
        child = subprocess.Popen(
            picard_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    # the input BAM is no longer kept
    os.remove(bam)
    return fixed_bam
def create_sai(log, sample, sample_dir, ref, cores, reads, read):
    """Run ``bwa aln`` for one read file and return the ``.sai`` path.

    bwa's stdout is captured as ``<sample_dir>/<sample>-r<read>.sai`` and
    its stderr as ``<sample_dir>/<sample>-r<read>.bwa-aln-out.log``.
    ``reads`` must expose ``file`` (used in the log message) and ``pth``
    (the path handed to bwa).  The exit status is not checked.
    """
    log.info("Creating read index file for {}".format(reads.file))
    bwa = get_user_path("bwa", "bwa")
    aln_cmd = [bwa, "aln", "-t", str(cores), ref, reads.pth]
    sai_path = os.path.join(sample_dir, '{}-r{}.sai'.format(sample, read))
    err_path = os.path.join(
        sample_dir, '{}-r{}.bwa-aln-out.log'.format(sample, read))
    with open(sai_path, 'w') as sai_handle, open(err_path, 'w') as err_handle:
        child = subprocess.Popen(aln_cmd, stdout=sai_handle, stderr=err_handle)
        child.communicate()
    return sai_path
def coverage(log, sample, assembly_pth, assembly, cores, bam):
    """Compute depth of coverage for ``bam`` with GATK ``DepthOfCoverage``.

    GATK is launched with ``assembly_pth`` as the working directory; the
    previous working directory is always restored, even when launching GATK
    raises.  stdout and stderr of GATK are captured in
    ``<assembly_pth>/<sample>.GATK-coverage-out.log``.  The exit status is
    not checked.

    Returns ``<assembly_pth>/<sample>-coverage``, the output name given to
    GATK via ``-o``.
    """
    log.info("Computing coverage with GATK for {}".format(sample))
    cwd = os.getcwd()
    # move into reference directory
    os.chdir(assembly_pth)
    try:
        cmd = [
            get_user_path("binaries", "gatk"),
            "-T",
            "DepthOfCoverage",
            "-R",
            assembly,
            "-I",
            bam,
            "-o",
            "{}-coverage".format(sample),
            "-nt",
            str(cores),
            "--omitIntervalStatistics",
            "--omitLocusTable",
        ]
        gatk_coverage_fname = os.path.join(
            assembly_pth, '{}.GATK-coverage-out.log'.format(sample))
        with open(gatk_coverage_fname, 'w') as gatk_out:
            proc = subprocess.Popen(
                cmd, stdout=gatk_out, stderr=subprocess.STDOUT)
            proc.communicate()
    finally:
        # restore the caller's working directory no matter what happened
        os.chdir(cwd)
    return os.path.join(assembly_pth, "{}-coverage".format(sample))
def run_alignment(self, clean=True):
    """Align ``self.input`` with MUSCLE and store the result.

    MUSCLE writes to a temporary ``.muscle`` file, which is then parsed as
    FASTA into ``self.alignment``.  MUSCLE's stdout and stderr are captured
    and discarded.  When ``clean`` is true the temporary file is handed to
    ``self._clean``.
    """
    # create results file
    fd, aln = tempfile.mkstemp(suffix='.muscle')
    os.close(fd)
    # run MUSCLE on the temp file
    cmd = [get_user_path("binaries", "muscle"), "-in", self.input, "-out", aln]
    proc = subprocess.Popen(
        cmd,
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    # drain both pipes; the captured output is not used
    proc.communicate()
    # read through a context manager so the handle is closed promptly
    # (the original left the file object open)
    with open(aln, 'rU') as handle:
        self.alignment = AlignIO.read(
            handle, "fasta", alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
    # cleanup temp files
    if clean:
        self._clean(aln)
def calculate_hs_metrics(log, sample, sample_dir, reference, bam, target, bait):
    """Run picard ``CollectHsMetrics`` on ``bam`` and return the metrics path.

    Metrics are written to ``<sample_dir>/<sample>.reads-on-target.txt``.
    picard runs with ``VALIDATION_STRINGENCY=LENIENT``; its stdout and
    stderr go to ``<sample_dir>/<sample>.picard-hs-metrics-out.log``.  The
    exit status is not checked.
    """
    log.info("Calculating coverage metrics for {}".format(sample))
    metrics_path = os.path.join(
        sample_dir, "{}.reads-on-target.txt".format(sample))
    picard_cmd = [get_user_path("binaries", "picard"), "CollectHsMetrics"]
    for key, value in (
        ("I", bam),
        ("O", metrics_path),
        ("REFERENCE_SEQUENCE", reference),
        ("TARGET_INTERVALS", target),
        ("BAIT_INTERVALS", bait),
    ):
        picard_cmd.append("{}={}".format(key, value))
    picard_cmd.append("VALIDATION_STRINGENCY=LENIENT")
    log_path = os.path.join(
        sample_dir, '{}.picard-hs-metrics-out.log'.format(sample))
    with open(log_path, 'w') as log_handle:
        child = subprocess.Popen(
            picard_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    return metrics_path
def phase(log, sample, sample_dir, bam):
    """Run ``samtools phase`` on ``bam``.

    ``sample_dir`` is passed to ``phase -b`` as the output prefix and is
    also the prefix of the log file
    ``<sample_dir>.samtools-phase-out.log``, which receives samtools' stdout
    and stderr.  The exit status is not checked.

    Returns ``("<sample_dir>.0.bam", "<sample_dir>.1.bam")``.
    """
    log.info("Phasing BAM file for {}".format(sample))
    samtools = get_user_path("samtools", "samtools")
    phase_cmd = [samtools, "phase", "-A", "-F", "-Q", "20"]
    phase_cmd += ["-b", sample_dir, bam]
    log_path = '{}.samtools-phase-out.log'.format(sample_dir)
    with open(log_path, 'w') as log_handle:
        child = subprocess.Popen(
            phase_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    phased = tuple("{}.{}.bam".format(sample_dir, n) for n in (0, 1))
    return phased
def fq_to_fa(log, sample, sample_dir, fastq, phase=None):
    """Convert ``fastq`` to FASTA with ``seqtk seq -a`` and return the FASTA name.

    ``sample_dir`` is used as a file-name prefix: seqtk's stdout becomes
    ``<sample_dir>.fasta`` (or ``<sample_dir>.<phase>.fasta`` when ``phase``
    is given) and its stderr goes to ``<sample_dir>.seqtk-seq-out.log``.
    The exit status is not checked.
    """
    unphased = phase is None
    if unphased:
        message = "Converting --Unphased-- FASTQ files to FASTA files"
    else:
        message = "Creating REF/ALT allele FASTA file {0} from FASTQ {0}".format(
            phase)
    log.info(message)
    seqtk_cmd = [get_user_path("seqtk", "seqtk"), "seq", "-a", fastq]
    err_path = "{}.seqtk-seq-out.log".format(sample_dir)
    fasta_path = (
        "{}.fasta".format(sample_dir)
        if unphased
        else "{}.{}.fasta".format(sample_dir, phase)
    )
    with open(err_path, 'w') as err_handle, open(fasta_path, 'w') as fasta_handle:
        child = subprocess.Popen(
            seqtk_cmd, stdout=fasta_handle, stderr=err_handle)
        child.communicate()
    return fasta_path
def run_alignment(self, clean=True):
    """Align ``self.input`` with MAFFT and store the result.

    MAFFT (``--adjustdirection --maxiterate 1000``) writes its alignment to
    stdout, which is redirected into a temporary ``.mafft`` file; that file
    is then parsed as FASTA into ``self.alignment``.  MAFFT's stderr is
    captured and discarded.  When ``clean`` is true the temporary file is
    handed to ``self._clean``.
    """
    # create results file
    fd, aln = tempfile.mkstemp(suffix='.mafft')
    os.close(fd)
    # the context manager closes the output file even if launching MAFFT
    # fails (the original closed it by hand on the success path only)
    with open(aln, 'w') as aln_stdout:
        # run MAFFT on the temp file
        cmd = [get_user_path("mafft", "mafft"), "--adjustdirection", "--maxiterate", "1000", self.input]
        # just pass all ENV params
        proc = subprocess.Popen(
            cmd,
            stderr=subprocess.PIPE,
            stdout=aln_stdout
        )
        # drain stderr; the captured output is not used
        proc.communicate()
    # read through a context manager so the handle is closed promptly
    with open(aln, 'rU') as handle:
        self.alignment = AlignIO.read(
            handle, "fasta", alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
    if clean:
        self._clean(aln)
def run_alignment(self, clean=True):
    """Run MAFFT on ``self.input`` and load the alignment it produces.

    MAFFT's stdout is redirected into a temporary ``.mafft`` file, which is
    then read as FASTA into ``self.alignment``.  stderr is captured and
    dropped.  The temporary file is passed to ``self._clean`` when ``clean``
    is true.
    """
    # create results file
    fd, aln = tempfile.mkstemp(suffix='.mafft')
    os.close(fd)
    # run MAFFT on the temp file
    cmd = [
        get_user_path("mafft", "mafft"),
        "--adjustdirection",
        "--maxiterate",
        "1000",
        self.input,
    ]
    # `with` guarantees the output file is closed even when Popen raises
    with open(aln, 'w') as aln_stdout:
        # just pass all ENV params
        proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=aln_stdout)
        # drain stderr; the captured output is not used
        proc.communicate()
    # `with` also closes the read handle, which was previously left open
    with open(aln, 'rU') as handle:
        self.alignment = AlignIO.read(
            handle, "fasta", alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
    if clean:
        self._clean(aln)
def fq_to_fa(log, sample, sample_dir, fastq, phase=None):
    """Write a FASTA version of ``fastq`` using ``seqtk seq -a``.

    Files are named with ``sample_dir`` as prefix: the FASTA is
    ``<sample_dir>.fasta`` for unphased input (``phase`` is ``None``) or
    ``<sample_dir>.<phase>.fasta`` otherwise, and seqtk's stderr is written
    to ``<sample_dir>.seqtk-seq-out.log``.  The exit status is not checked.

    Returns the FASTA file name.
    """
    if phase is not None:
        log.info("Creating REF/ALT allele FASTA file {0} from FASTQ {0}".format(phase))
    else:
        log.info("Converting --Unphased-- FASTQ files to FASTA files")
    seqtk = get_user_path("seqtk", "seqtk")
    convert_cmd = [seqtk, "seq", "-a", fastq]
    stderr_log = "{}.seqtk-seq-out.log".format(sample_dir)
    if phase is not None:
        fasta_out = "{}.{}.fasta".format(sample_dir, phase)
    else:
        fasta_out = "{}.fasta".format(sample_dir)
    with open(stderr_log, 'w') as err_sink:
        with open(fasta_out, 'w') as fasta_sink:
            subprocess.Popen(
                convert_cmd, stdout=fasta_sink, stderr=err_sink
            ).communicate()
    return fasta_out
def merge_two_bams(log, sample, sample_dir, bam, bam_se):
    """Merge ``bam`` and ``bam_se`` with picard ``MergeSamFiles``.

    The output name comes from ``new_bam_name(bam, "M")``.  picard runs with
    ``SO=coordinate``, ``AS=true`` and ``VALIDATION_STRINGENCY=LENIENT``;
    its stdout and stderr go to
    ``<sample_dir>/<sample>.picard-merge-out.log``.  Both input BAMs are
    deleted afterwards, without looking at picard's exit status.

    Returns the merged BAM path.
    """
    log.info("Merging BAMs for {}".format(sample))
    merged_bam = new_bam_name(bam, "M")
    picard = get_user_path("binaries", "picard")
    picard_cmd = [picard, "MergeSamFiles", "SO=coordinate", "AS=true"]
    picard_cmd.extend("I={}".format(source) for source in (bam, bam_se))
    picard_cmd.append("O={}".format(merged_bam))
    picard_cmd.append("VALIDATION_STRINGENCY=LENIENT")
    log_path = os.path.join(
        sample_dir, '{}.picard-merge-out.log'.format(sample))
    with open(log_path, 'w') as log_handle:
        child = subprocess.Popen(
            picard_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    # the two inputs are no longer kept
    for stale in (bam, bam_se):
        os.remove(stale)
    return merged_bam
def mark_duplicates(log, sample, sample_dir, bam, type):
    """Run picard ``MarkDuplicates`` on ``bam`` and return the new BAM path.

    The output name comes from ``new_bam_name(bam, "MD")``; duplicates are
    marked, not removed (``REMOVE_DUPLICATES=false``).  The metrics file is
    ``<sample_dir>/<sample>.<type>.picard-metricsfile.txt`` and picard's
    stdout and stderr go to
    ``<sample_dir>/<sample>.<type>.picard-MD-out.log``.  The input BAM is
    deleted afterwards, without looking at picard's exit status.
    """
    log.info("Marking read duplicates from BAM for {}".format(sample))
    marked_bam = new_bam_name(bam, "MD")
    metrics_name = "{}.{}.picard-metricsfile.txt".format(sample, type)
    metrics_path = os.path.join(sample_dir, metrics_name)
    picard = get_user_path("binaries", "picard")
    picard_cmd = [
        picard,
        "MarkDuplicates",
        "I={}".format(bam),
        "O={}".format(marked_bam),
        "METRICS_FILE={}".format(metrics_path),
    ]
    picard_cmd += [
        "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=250",
        "ASSUME_SORTED=true",
        "VALIDATION_STRINGENCY=SILENT",
        "REMOVE_DUPLICATES=false",
    ]
    log_name = '{}.{}.picard-MD-out.log'.format(sample, type)
    log_path = os.path.join(sample_dir, log_name)
    with open(log_path, 'w') as log_handle:
        child = subprocess.Popen(
            picard_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    # the input BAM is no longer kept
    os.remove(bam)
    return marked_bam
def add_rg_header_info(log, sample, sample_dir, flowcell, bam, type):
    """Run picard ``AddOrReplaceReadGroups`` on ``bam`` and return the new path.

    The output name comes from ``new_bam_name(bam, "RG")``.  The read group
    uses ``sample`` for both ``RGID`` and ``RGSM``, ``flowcell`` for
    ``RGPU``, and the fixed values ``RGPL=illumina`` and ``RGLB=Lib1``;
    output is coordinate-sorted.  picard's stdout and stderr go to
    ``<sample_dir>/<sample>.<type>.picard-RG-out.log``.  The input BAM is
    deleted afterwards, without looking at picard's exit status.
    """
    log.info("Adding RG header to BAM for {}".format(sample))
    tagged_bam = new_bam_name(bam, "RG")
    picard = get_user_path("binaries", "picard")
    picard_cmd = [
        picard,
        "AddOrReplaceReadGroups",
        "I={}".format(bam),
        "O={}".format(tagged_bam),
    ]
    picard_cmd += [
        "SORT_ORDER=coordinate",
        "RGPL=illumina",
        "RGPU={}".format(flowcell),
        "RGLB=Lib1",
        "RGID={}".format(sample),
        "RGSM={}".format(sample),
        "VALIDATION_STRINGENCY=LENIENT",
    ]
    log_name = '{}.{}.picard-RG-out.log'.format(sample, type)
    log_path = os.path.join(sample_dir, log_name)
    with open(log_path, 'w') as log_handle:
        child = subprocess.Popen(
            picard_cmd, stdout=log_handle, stderr=subprocess.STDOUT)
        child.communicate()
    # the input BAM is no longer kept
    os.remove(bam)
    return tagged_bam
This code is distributed under a 3-clause BSD license. Please see LICENSE.txt for more information. Created on 26 June 2014 17:13 PDT (-0700) """ import os import subprocess from phyluce.pth import get_user_path, get_user_param JAVA = get_user_param("java", "executable") JAVA_PARAMS = get_user_param("java", "mem") JAR_PATH = get_user_path("java", "jar") def new_bam_name(bam, append): pth, bamfname = os.path.split(bam) bamfname = os.path.splitext(bamfname)[0] new_bamfname = "{}-{}.bam".format(bamfname, append) new_bam = os.path.join(pth, new_bamfname) return new_bam def create_reference_dict(log, sample, sample_dir, reference): log.info("Creating FASTA dict for {}".format(sample)) outf = os.path.splitext(reference)[0] + ".dict" cmd = [ JAVA,
import os import re import gzip import glob import numpy import subprocess from collections import OrderedDict from phyluce.pth import get_user_param, get_user_path from Bio import SeqIO JAVA = get_user_param("java", "executable") JAVA_PARAMS = get_user_param("java", "mem") JAR_PATH = get_user_path("java", "jar") GATK = get_user_param("java", "gatk") def coverage(log, sample, assembly_pth, assembly, cores, bam): log.info("Computing coverage with GATK for {}".format(sample)) cwd = os.getcwd() # move into reference directory os.chdir(assembly_pth) cmd = [ JAVA, JAVA_PARAMS, "-jar", os.path.join(JAR_PATH, GATK), "-T", "DepthOfCoverage", "-R", assembly, "-I", bam, "-o", "{}-coverage".format(sample), "-nt", str(cores), "--omitIntervalStatistics", "--omitLocusTable" ] gatk_coverage_fname = os.path.join(