Example 1
def _aln(ref, fastq, tmp="/tmp", threads=8, threshold=0.05):
    # int() needed: random.randrange no longer accepts float bounds on current Python
    sai = os.path.join(tmp, '%09d.sai' % random.randrange(0, int(1e10)))
    with file_transaction(sai) as tx:
        cmd = ("bwa aln -n {threshold} -t {threads} "
               "{ref} {fastq} > {tx}").format(**locals())
        run(cmd)
    return sai
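These snippets rely on a few helpers that are not shown in the excerpts (run, file_exists, file_transaction). file_transaction is assumed to behave like bcbio's helper of the same name: it yields a temporary path and moves it onto the final path only when the block succeeds. run is assumed to be a thin shell wrapper along these lines (a minimal sketch, not the project's actual implementation):

import subprocess


def run(cmd):
    """Minimal stand-in for the `run` helper used throughout these examples:
    execute a shell pipeline and raise if it exits non-zero."""
    # shell=True because the commands above build pipes and redirects
    subprocess.check_call(cmd, shell=True)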
Example 2
def chop_reads(fastq, out_file, length, minlength, cores=10):
    """Chop reads from `fastq` into `length`-sized pieces (dropping pieces
    shorter than `minlength`), writing a gzipped fastq and a small summary
    table of read counts and lengths alongside it."""

    if not out_file.endswith('.gz'):
        out_file = out_file + '.gz'

    if file_exists(out_file):
        print("output file exists, chop reads will not repeat itself.")
        return out_file

    pd.set_option('display.float_format', lambda x: '%.0f' % x)
    p = multiprocessing.Pool(cores)

    original_count = 0
    readcount = 0
    bpcount = 0
    size_var = OnlineVariance(ddof=0)

    with file_transaction(out_file) as tx_outfile:
        with open(tx_outfile, "w") as txoh:
            for result in multiprocess(chop_read,
                                       readfx(fastq),
                                       length,
                                       minlength,
                                       pool=p):

                if isinstance(result, list):
                    for r in result:
                        print(r, file=txoh)
                        original_count += 1
                        lines = r.split("\n")
                        for name, seq, qual in read_fq_string(lines):
                            readcount += 1
                            bpcount += len(seq)
                            size_var.include(len(seq))
                else:
                    print(result, file=txoh)
                    original_count += 1
                    lines = result.split("\n")
                    for name, seq, qual in read_fq_string(lines):

                        readcount += 1
                        bpcount += len(seq)
                        size_var.include(len(seq))

    # release the worker pool before summarizing
    p.close()
    p.join()

    meanbp = bpcount / readcount
    readbp_std = size_var.std
    out_file = pigz_file(out_file, cores)

    outdata = pd.Series(
        data=[original_count, readcount, bpcount, meanbp, readbp_std],
        index=[
            'original_count', 'read_count', 'bp_count', 'mean_read_len',
            'read_len_std'
        ])
    print(outdata)
    outdata.to_csv(out_file.replace("fastq.gz", "chop_data"), sep="\t")

    return out_file
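OnlineVariance is not defined in this excerpt; the code only needs an include() method and a std property. A minimal single-pass (Welford) implementation that satisfies that interface might look like this (an assumption, not the original class):

import math


class OnlineVariance(object):
    """Welford's online algorithm: track mean/variance without storing values."""

    def __init__(self, ddof=0):
        self.ddof = ddof
        self.n = 0
        self.mean = 0.0
        self.M2 = 0.0

    def include(self, x):
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.M2 += delta * (x - self.mean)

    @property
    def variance(self):
        return self.M2 / (self.n - self.ddof) if self.n > self.ddof else 0.0

    @property
    def std(self):
        return math.sqrt(self.variance)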
Example 3
def samse_aln(ref, reads, bam_sorted, tmp="/tmp", threads=8, threshold=0.05):
    with bwa_index(ref) as bwaidx:
        r_sai = _aln(bwaidx, reads, tmp, threads, threshold)
        with file_transaction(bam_sorted) as tx:
            samse = ("bwa samse {ref} {r_sai} {reads} "
                     "| samtools view -bSF0x0004 - "
                     "| samtools sort -f -m 8 - {out}").format(ref=bwaidx,
                                                               r_sai=r_sai,
                                                               reads=reads,
                                                               out=tx)
            run(samse)
        return bam_sorted
Example 4
def combine_fasta_qual(fas, qual, outfile, cores=8):
    if not outfile.endswith(".gz"):
        outfile = outfile + ".gz"

    with file_transaction(outfile) as tx_out:
        with open(fas) as fin, open(qual) as qin, open(tx_out, "w") as oh:
            for rec in PairedFastaQualIterator(fin, qin):
                SeqIO.write(rec, oh, "fastq")
    outfile = pigz_outfile(outfile, cores)
    return outfile
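SeqIO and PairedFastaQualIterator come from Biopython; the imports are not shown in the excerpt, but something like the following is assumed (the file names in the commented call are purely illustrative):

from Bio import SeqIO
from Bio.SeqIO.QualityIO import PairedFastaQualIterator

# e.g. merge 454-style .fna/.qual pairs into a single gzipped fastq
# combined = combine_fasta_qual("reads.fna", "reads.qual", "reads.fastq")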
Example 5
def index_bam(bam_file):
    """
    Build an index for a bam file.
    parameters
        bam_file : alignment file path
    returns
        index file name : string
    """
    bam_index = bam_file + '.bai'
    if not file_exists(bam_index):
        with file_transaction(bam_index) as tx_out_file:
            run('samtools index %s %s' % (bam_file, tx_out_file))
    return bam_index
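file_exists appears throughout these examples as a guard against redoing finished work. A minimal version could be the following (an assumption; the real helper may do more, such as checking index or partner files):

import os


def file_exists(fname):
    """Treat a path as 'done' only if it exists and is non-empty."""
    return os.path.exists(fname) and os.path.getsize(fname) > 0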
Example 6
def sampe_aln(ref, reads, bam_sorted, tmp="/tmp", threads=1, threshold=0.05):
    r1, r2 = tmp_split_reads(reads, tmp)
    with bwa_index(ref) as bwaidx:
        r1_sai = _aln(bwaidx, r1, tmp, threads, threshold)
        r2_sai = _aln(bwaidx, r2, tmp, threads, threshold)
        with file_transaction(bam_sorted) as tx:
            sampe = ("bwa sampe {ref} {r1_sai} {r2_sai} {r1} {r2} "
                     "| samtools view -bSF0x0004 - "
                     "| samtools sort -f -m 8 - {out}").format(ref=bwaidx,
                                                               r1_sai=r1_sai,
                                                               r2_sai=r2_sai,
                                                               r1=r1,
                                                               r2=r2,
                                                               out=tx)
            run(sampe)
        return bam_sorted
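tmp_split_reads is not shown; from its use here it is assumed to de-interleave a paired fastq into temporary R1/R2 files before bwa sampe. A rough sketch of that idea (hypothetical implementation, uncompressed input only):

import itertools
import os


def tmp_split_reads(reads, tmp="/tmp"):
    """Write alternating fastq records of an interleaved file to R1/R2 files."""
    base = os.path.join(tmp, os.path.basename(reads))
    r1_path, r2_path = base + ".r1.fastq", base + ".r2.fastq"
    with open(reads) as fh, open(r1_path, "w") as r1, open(r2_path, "w") as r2:
        out = itertools.cycle([r1, r2])
        while True:
            record = [fh.readline() for _ in range(4)]
            if not record[0]:
                break
            next(out).writelines(record)
    return r1_path, r2_path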
Example 7
def get_coverage(bam_file, bedout=None):
    '''
    create per-base coverage from a sorted bam using bedtools genomecov
    '''
    filename, ext = op.splitext(bam_file)
    if bedout is None:
        bedout = filename + ".genomecoverage"

    if op.exists(bedout):
        return bedout

    with file_transaction(bedout) as tx_oh:
        cmd = ("bedtools genomecov -dz -ibam {bam_file} > {tx_oh}").format(**locals())
        subprocess.check_call(cmd, shell=True)
    return bedout
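bedtools genomecov -dz emits tab-separated rows of contig, 0-based position, and depth, so the output file can be pulled straight into pandas. For example (the bam path is illustrative):

import pandas as pd

cov = pd.read_csv(get_coverage("sample.bam"), sep="\t",
                  names=["contig", "pos", "depth"])
print(cov.groupby("contig")["depth"].mean())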
Example 8
def extract_fastq(bam, out_fastq):
    ''' Uses bedtools bamtofastq function to extract reads from bam
    Args:
        bam (string): path to bam alignment file
        out_fastq (string): output fastq to write to
    Returns:
        out_fastq (string): path to written output

    >> bam = 'Tara_test1_vs_Simons_LoCos_Conc.pctid95.overlap0.minlen100.bam'
    >> out_fastq = 'testout.fastq'
    >> extract_fastq(bam, out_fastq) == out_fastq
    '''
    with file_transaction(out_fastq) as temp_oh:
        cmd = "bedtools bamtofastq -i {bam} -fq {fastq}".format(bam=bam,
                                                                fastq=temp_oh)
        run(cmd)
    return out_fastq
Example 9
def run_seqtk_sample(fastq, outfile, n, seed=37):
    """Subsample incoming paired-end fastqs to `n` reads (serially).

    Args:
        fastqs (str): path to fastq
        outfile (str): path of output fastq paths; output files are always gzipped
        n (int): number of subsampled reads
        seed (int): for random selection of reads

    Returns:
        str: subsampled reads file path
    """
    if file_exists(outfile):
        return outfile

    logger.info("Subsampling to %d reads" % n)
    with file_transaction(outfile) as tx:
        cmd = "seqtk sample -s {seed} {fastq} {number} | gzip > {out}".format(
            seed=seed, fastq=fastq, number=n, out=tx)
        run(cmd)
    print("%s created" % outfile)
    return outfile
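For paired-end data, seqtk sample keeps mates in sync only when both files are subsampled with the same seed, so a caller would typically run the function twice with identical seed and n (file names here are illustrative):

r1_sub = run_seqtk_sample("sample_R1.fastq.gz", "sub_R1.fastq.gz", 100000, seed=37)
r2_sub = run_seqtk_sample("sample_R2.fastq.gz", "sub_R2.fastq.gz", 100000, seed=37)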
Example 10
def bwa_mem(fastq, out_file, reference, options, cores=1):
    """
    align reads using bwa mem.
    parameters
        fastq : path to reads
        out_file : path to aligned reads bam
        reference : path to reference fasta (a bwa index is built if missing)
        options : bwa mem options
        cores : int
    returns
        output file path : string
    """
    if file_exists(out_file):
        return out_file
    predefined_options = [('-t', False)]
    
    if options is not None:
        options = filter_options(options, predefined_options)
        opts = " ".join(options)
    else:
        opts = ""
    
    logger.info("Mapping %s to %s using bwa mem" % (fastq, reference))
    
    reference = bwa_index(reference)

    with file_transaction(out_file) as tx_out_file:
        cmd = ("bwa mem -t {cores} {options} {index} {fastq} | samtools view "
               "-ShuF4q2 - | samtools sort -o -m 8G - tmp > {result}"
              ).format(cores=cores,
                       options=opts,
                       index=reference,
                       fastq=fastq,
                       result=tx_out_file)
        run(cmd)
        index_bam(tx_out_file)

    return out_file
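Taken together, a typical call might look like the following; the paths and extra bwa mem options are illustrative, and -t is left out because the function sets the thread count itself via `cores`:

bam = bwa_mem("sample.fastq.gz", "sample.bam", "reference.fasta",
              options=["-M"], cores=8)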