def _aln(ref, fastq, tmp="/tmp", threads=8, threshold=0.05):
    """Run `bwa aln` on one fastq file and return the resulting .sai path.

    The .sai file is written inside `tmp` under a randomly generated,
    zero-padded numeric name.

    Args:
        ref (str): reference / bwa index prefix handed to `bwa aln`
        fastq (str): path to the reads to align
        tmp (str): directory in which the .sai file is created
        threads (int): value passed to `bwa aln -t`
        threshold (float): value passed to `bwa aln -n`

    Returns:
        str: path of the .sai file
    """
    # randrange() needs integer bounds: a float such as 1e10 is rejected by
    # recent Python 3 releases, so spell the upper bound as an int.
    sai = os.path.join(tmp, '%09d.sai' % random.randrange(0, 10 ** 10))
    with file_transaction(sai) as tx:
        # Redirect into the transaction path; the final name is only
        # populated once the command has finished.
        cmd = ("bwa aln -n {threshold} -t {threads} "
               "{ref} {fastq} > {tx}").format(threshold=threshold,
                                              threads=threads,
                                              ref=ref,
                                              fastq=fastq,
                                              tx=tx)
        run(cmd)
    return sai
def samse_aln(ref, reads, bamout, tmp="/tmp", threads=8, threshold=0.05):
    """Align single-end reads with `bwa aln` / `bwa samse` into a sorted BAM.

    The SAM stream from `bwa samse` is piped through `samtools view`
    (`-F0x0004`, i.e. unmapped reads are dropped) and `samtools sort`.

    Args:
        ref (str): reference passed to `bwa_index`
        reads (str): path to the single-end reads
        bamout (str): path of the sorted BAM to create
        tmp (str): directory used for the intermediate .sai file
        threads (int): thread count forwarded to `bwa aln`
        threshold (float): `-n` value forwarded to `bwa aln`

    Returns:
        str: `bamout`
    """
    with bwa_index(ref) as bwaidx:
        r_sai = _aln(bwaidx, reads, tmp, threads, threshold)
        with file_transaction(bamout) as tx:
            # `samtools sort -f` treats its last argument as the full output
            # file name, so the transaction path can be passed directly.
            # NOTE(review): `-m 8` carries no unit suffix while bwa_mem uses
            # `-m 8G` -- confirm the intended memory limit.
            samse = ("bwa samse {ref} {r_sai} {reads} | samtools view -bSF0x0004 - "
                     "| samtools sort -f -m 8 - {bam_sorted}").format(
                         ref=bwaidx, r_sai=r_sai, reads=reads, bam_sorted=tx)
            run(samse)
    return bamout
def index_bam(bam_file):
    """
    Make sure a `.bai` index exists for a bam file.

    Nothing is run when the index file is already present.

    parameters
        bam_file : alignment file path

    returns
        index file name : string
    """
    index_path = bam_file + '.bai'
    if file_exists(index_path):
        return index_path

    with file_transaction(index_path) as tx_index:
        index_cmd = 'samtools index %s %s' % (bam_file, tx_index)
        run(index_cmd)
    return index_path
def sampe_aln(ref, reads, bam_sorted, tmp="/tmp", threads=1, threshold=0.05):
    """Align paired reads with `bwa aln` / `bwa sampe` into a sorted BAM.

    `reads` is first split into two files by `tmp_split_reads`
    (presumably one file per mate -- verify against that helper), each file
    is aligned separately with `bwa aln`, and the pair is combined with
    `bwa sampe`. The SAM stream is piped through `samtools view`
    (`-F0x0004`, i.e. unmapped reads are dropped) and `samtools sort`.

    Args:
        ref (str): reference passed to `bwa_index`
        reads (str): path to the reads handed to `tmp_split_reads`
        bam_sorted (str): path of the sorted BAM to create
        tmp (str): directory used for the split reads and .sai files
        threads (int): thread count forwarded to `bwa aln`
        threshold (float): `-n` value forwarded to `bwa aln`

    Returns:
        str: `bam_sorted`
    """
    r1, r2 = tmp_split_reads(reads, tmp)
    with bwa_index(ref) as bwaidx:
        r1_sai = _aln(bwaidx, r1, tmp, threads, threshold)
        r2_sai = _aln(bwaidx, r2, tmp, threads, threshold)
        with file_transaction(bam_sorted) as tx:
            # Write to the transaction path rather than the final one so an
            # interrupted pipeline does not leave a partial BAM at
            # `bam_sorted`. `samtools sort -f` takes the full output name.
            # NOTE(review): `-m 8` carries no unit suffix while bwa_mem uses
            # `-m 8G` -- confirm the intended memory limit.
            sampe = ("bwa sampe {ref} {r1_sai} {r2_sai} {r1} {r2} "
                     "| samtools view -bSF0x0004 - "
                     "| samtools sort -f -m 8 - {bam_sorted}").format(
                         ref=bwaidx,
                         r1_sai=r1_sai,
                         r2_sai=r2_sai,
                         r1=r1,
                         r2=r2,
                         bam_sorted=tx)
            run(sampe)
    return bam_sorted
def extract_fastq(bam, out_fastq):
    '''
    Dump the reads of a bam file to fastq with `bedtools bamtofastq`.

    Args:
        bam (string): path to bam alignment file
        out_fastq (string): fastq path to write

    Returns:
        out_fastq (string): the same path that was passed in

    >> bam = 'Tara_test1_vs_Simons_LoCos_Conc.pctid95.overlap0.minlen100.bam'
    >> outfastq = 'testout.fastq'
    >> extract_fastq(bam, outfastq) == outfastq
    '''
    template = "bedtools bamtofastq -i {bam} -fq {fastq}"
    with file_transaction(out_fastq) as tx_fastq:
        # bedtools writes to the transaction path, not the final one
        run(template.format(bam=bam, fastq=tx_fastq))
    return out_fastq
def sag_checkm_completeness(fasta, cores):
    '''run checkm lineage_wf on SAG to get completeness values

    The fasta is copied into a temporary "bin" directory, `checkm lineage_wf`
    is run on that directory, and the tab-separated table it writes is loaded
    with pandas.

    Args:
        fasta (str): full path to SAG genomic contigs in fasta format
        cores (int): number of cores to use to run checkm

    Returns:
        "completeness" statistics as a pandas dataframe, or None when `fasta`
        is a directory, does not exist, or cannot be copied into the
        temporary directory
    '''
    logger.info("Running checkm on %s " % fasta)
    fasta = op.abspath(fasta)

    # The predicates have to be *called* on the path; comparing the function
    # objects themselves to True/False never matches.
    if op.isdir(fasta) or not op.exists(fasta):
        return None

    with tmp_dir() as tdir:
        bindir = op.join(tdir, "bindir")
        safe_makedir(bindir)
        outdir = op.join(tdir, "outdir")
        safe_makedir(outdir)

        tmp_fasta = op.join(bindir, op.basename(fasta))
        try:
            shutil.copy(fasta, tmp_fasta)
            assert op.exists(tmp_fasta)
            print("%s created" % tmp_fasta)
        except Exception as e:
            # Best effort: report the failure and give up on this SAG.
            print("copying %s to the temporary directory failed, %s" % (fasta, e))
            return None

        logger.info("Running lineage workflow on %s" % fasta)

        completeness_tsv = op.join(outdir, "completeness.tsv")

        # NOTE(review): `-x fasta` is hard-coded, so this appears to assume
        # the input file name ends in ".fasta" -- confirm against callers.
        cmd = "checkm lineage_wf -f {outfile} --tab_table -q -x fasta -t {cores} {binpath} {outdir}".format(
            outfile=completeness_tsv, outdir=outdir, cores=cores, binpath=bindir)

        logger.info("running checkm lineage, command is: {cmd}".format(cmd=cmd))
        run(cmd)
        # Load the table before the temporary directory goes away.
        completeness = pd.read_csv(completeness_tsv, sep="\t", header=0)
    return completeness
def run_seqtk_sample(fastq, outfile, n, seed=37):
    """Subsample a fastq to `n` reads with `seqtk sample`, gzipping the output.

    If `outfile` already exists nothing is run and it is returned as is.

    Args:
        fastq (str): path to fastq
        outfile (str): path of the output fastq; the output is always gzipped
        n (int): number of subsampled reads
        seed (int): seed handed to `seqtk sample -s`

    Returns:
        str: subsampled reads file path
    """
    if not file_exists(outfile):
        logger.info("Subsampling to %d reads" % n)
        template = "seqtk sample -s {seed} {fastq} {number} | gzip > {out}"
        with file_transaction(outfile) as tx_path:
            run(template.format(seed=seed, fastq=fastq, number=n, out=tx_path))
        print("%s created" % outfile)
    return outfile
def bwa_mem(fastq, out_file, reference, options, cores=1):
    """
    align reads using bwa mem.

    The alignments are piped through `samtools view` (`-F4 -q2`: unmapped
    reads and reads with mapping quality below 2 are dropped) and
    `samtools sort`, and the resulting bam is indexed. When `out_file`
    already exists it is returned untouched.

    parameters
        fastq : path to reads
        out_file : path to aligned reads bam
        reference : path to the reference handed to `bwa_index`
        options : bwa mem options, or None; passed through `filter_options`
            together with the predefined `-t` entry
        cores : int, value used for `bwa mem -t`

    returns
        output file path : string
    """
    if file_exists(out_file):
        return out_file

    # `-t` is set from `cores` below, so it is declared as predefined when
    # filtering the caller's options.
    predefined_options = [('-t', False)]
    if options is not None:
        options = filter_options(options, predefined_options)
        opts = " ".join(options)
    else:
        opts = ""

    logger.info("Mapping %s to %s using bwa mem" % (fastq, reference))
    # NOTE(review): samse_aln/sampe_aln use bwa_index as a context manager
    # (`with bwa_index(ref) as idx`); here its return value is used directly
    # as the index path -- confirm which usage is correct.
    reference = bwa_index(reference)

    with file_transaction(out_file) as tx_out_file:
        # NOTE(review): the sort prefix "tmp" is relative to the working
        # directory; concurrent runs in one directory may collide -- confirm.
        cmd = ("bwa mem -t {cores} {options} {index} {fastq} | samtools view "
               "-ShuF4q2 - | samtools sort -o -m 8G - tmp > {result}"
               ).format(cores=cores,
                        options=opts,
                        index=reference,
                        fastq=fastq,
                        result=tx_out_file)
        run(cmd)

    # Index the committed file, not the transaction's temporary path, so the
    # .bai reliably ends up next to `out_file`.
    index_bam(out_file)
    return out_file