Ejemplo n.º 1
0
def _aln(ref, fastq, tmp="/tmp", threads=8, threshold=0.05):
    sai = os.path.join(tmp, '%09d.sai' % random.randrange(0, 1e10))
    with file_transaction(sai) as tx:
        cmd = ("bwa aln -n {threshold} -t {threads} "
               "{ref} {fastq} > {tx}").format(**locals())
        run(cmd)
    return sai
Ejemplo n.º 2
0
def samse_aln(ref, reads, bamout, tmp="/tmp", threads=8, threshold=0.05):
    
    with bwa_index(ref) as bwaidx:
        r_sai = _aln(bwaidx, reads, tmp, threads, threshold)
        samse = ("bwa samse {ref} {r_sai} {reads} | samtools view -bSF0x0004 - "
                 "| samtools sort -f -m 8 - {bam_sorted}")
        with file_transaction(bam_sorted) as tx:
            run(samse)
        return bam_sorted
Ejemplo n.º 3
0
def index_bam(bam_file):
    """
    Build an index for a bam file.
    parameters
        bam_file : alignment file path
    returns
        index file name : string
    """
    bam_index = bam_file + '.bai'
    if not file_exists(bam_index):
        with file_transaction(bam_index) as tx_out_file:
            run('samtools index %s %s' % (bam_file, tx_out_file))
    return bam_index
Ejemplo n.º 4
0
def sampe_aln(ref, reads, bam_sorted, tmp="/tmp", threads=1, threshold=0.05):
    r1, r2 = tmp_split_reads(reads, tmp)
    with bwa_index(ref) as bwaidx:
        r1_sai = _aln(bwaidx, r1, tmp, threads, threshold)
        r2_sai = _aln(bwaidx, r2, tmp, threads, threshold)
        sampe = ("bwa sampe {ref} {r1_sai} {r2_sai} {r1} {r2} "
                 "| samtools view -bSF0x0004 - "
                 "| samtools sort -f -m 8 - {bam_sorted}").format(ref=bwaidx,
                                                       r1_sai=r1_sai,
                                                       r2_sai=r2_sai,
                                                       r1=r1,
                                                       r2=r2,
                                                       bam_sorted=bam_sorted)
        with file_transaction(bam_sorted) as tx:
            run(sampe)
        return bam_sorted
Ejemplo n.º 5
0
def extract_fastq(bam, out_fastq):
    ''' Uses bedtools bamtofastq function to extract reads from bam
    Args:
        bam (string): path to bam alignment file
        out_fastq (string): output fastq to write to
    Returns:
        out_fastq (string): path to written output

    >> bam = 'Tara_test1_vs_Simons_LoCos_Conc.pctid95.overlap0.minlen100.bam'
    >> outfastq = 'testout.fastq'
    >> extract_fastq(bam, outfastq) == out_fastq
    '''
    with file_transaction(out_fastq) as temp_oh:
        cmd = "bedtools bamtofastq -i {bam} -fq {fastq}".format(bam=bam,
                                                                fastq=temp_oh)
        run(cmd)
    return out_fastq
Ejemplo n.º 6
0
def sag_checkm_completeness(fasta,  cores):
    '''run checkm lineage_wf on SAG to get completeness values

    Args:
        fasta (str): full path to SAG genomic contigs in fasta format
        cores (int): number of cores to use to run checkm
    Returns:
        "completeness" statistics as a pandas dataframe
    '''
    logger.info("Running checkm on %s " % fasta)

    fasta = op.abspath(fasta)
    if op.isdir == True or op.exists == False:
        return None
    
    with tmp_dir() as tdir:
        bindir = op.join(tdir, "bindir")
        safe_makedir(bindir)
        outdir = op.join(tdir, "outdir")
        safe_makedir(outdir)
        
        tmp_fasta = op.join(bindir, op.basename(fasta))
        
        try:
            shutil.copy(fasta, tmp_fasta)
            assert op.exists(tmp_fasta)
            print(tmp_fasta, "created")
        except Exception, e:
            print("copying %s to the temporary directory failed, %s" % (fasta, e))
            return None

        logger.info("Running lineage workflow on %s" % fasta)
        
        completeness_tsv = op.join(outdir, "completeness.tsv")
        
        cmd = "checkm lineage_wf -f {outfile} --tab_table -q -x fasta -t {cores} {binpath} {outdir}".format(outfile=completeness_tsv, outdir=outdir, cores=cores, binpath=bindir)
        
        logger.info("running checkm lineage, command is: {cmd}".format(**locals()))
        run(cmd)
        completeness = pd.read_csv(completeness_tsv, sep="\t", header=0)
Ejemplo n.º 7
0
def run_seqtk_sample(fastq, outfile, n, seed=37):
    """Subsample incoming paired-end fastqs to `n` reads (serially).

    Args:
        fastqs (str): path to fastq
        outfile (str): path of output fastq paths; output files are always gzipped
        n (int): number of subsampled reads
        seed (int): for random selection of reads

    Returns:
        str: subsampled reads file path
    """
    if file_exists(outfile):
        return outfile

    logger.info("Subsampling to %d reads" % n)
    with file_transaction(outfile) as tx:
        cmd = "seqtk sample -s {seed} {fastq} {number} | gzip > {out}".format(
            seed=seed, fastq=fastq, number=n, out=tx)
        run(cmd)
    print("%s created" % outfile)
    return outfile
Ejemplo n.º 8
0
def bwa_mem(fastq, out_file, reference, options, cores=1):
    """
    align reads using bwa mem.
    parameters
        fastq : path to reads
        out_file : path to aligned reads bam
        index : path to bwa index
        options : bwa mem options
        cores : int
    returns
        output file path : string
    """
    if file_exists(out_file):
        return out_file
    predefined_options = [('-t', False)]
    
    if options is not None:
        options = filter_options(options, predefined_options)
        opts = " ".join(options)
    else:
        opts = ""
    
    logger.info("Mapping %s to %s using bwa mem" % (fastq, reference))
    
    reference = bwa_index(reference)

    with file_transaction(out_file) as tx_out_file:
        cmd = ("bwa mem -t {cores} {options} {index} {fastq} | samtools view "
               "-ShuF4q2 - | samtools sort -o -m 8G - tmp > {result}"
              ).format(cores=cores,
                       options=opts,
                       index=reference,
                       fastq=fastq,
                       result=tx_out_file)
        run(cmd)
        index_bam(tx_out_file)

    return out_file