Beispiel #1
0
 def rmdup(self):
     """Replace ``self.bamfn`` with the output of ``pysam.rmdup``.

     ``pysam.rmdup`` is called with ``-s``, ``self.bamfn`` and a temporary
     path stem inside ``self.basedir``. Exactly one file whose name starts
     with that stem must exist afterwards; it is then renamed over
     ``self.bamfn``.
     """
     print("Removing duplicates in %s" % self.bamfn)

     stem = os.path.join(self.basedir, temp_filename())
     pysam.rmdup("-s", self.bamfn, stem)

     # The output name is discovered by globbing on the stem rather than
     # assumed, so anything other than a single match is an error.
     produced = glob.glob(stem + '*')
     assert len(produced) == 1, "Unexpected number of temporary output files: %r" % produced

     # Move the de-duplicated file into the place of the original BAM.
     os.rename(produced[0], self.bamfn)
Beispiel #2
0
def proc_sam(arg):
    """Filter, optionally de-duplicate, sort and index one SAM file.

    ``arg`` is an indexable pair: ``arg[0]`` is the path of the input SAM
    file and ``arg[1]`` is the *string* ``"True"`` to request duplicate
    removal (any other value skips it).

    Intermediate files ``<prefix>_mapped.sam`` and ``<prefix>_rmdup.sam`` are
    written next to the input and deleted once consumed. Flagstat output is
    written to ``<samfile>_stat`` and ``<sorted file>_stat``. Nothing is
    returned.
    """
    samfile = arg[0]
    rmdup = arg[1]
    print(samfile)
    print(rmdup)

    # os.path.dirname returns "" for a bare filename, so outputs stay in the
    # current directory instead of being sent to "/".
    sam_dir = os.path.dirname(samfile)
    sam_prefix = os.path.basename(samfile).split(".sam")[0]
    mapped_sam = os.path.join(sam_dir, sam_prefix + "_mapped.sam")
    rmdup_sam = os.path.join(sam_dir, sam_prefix + "_rmdup.sam")
    sort_sam = os.path.join(sam_dir, sam_prefix + "_sort")

    if not os.path.exists(mapped_sam):
        print("Removing unmapped...")
        sam = pysam.Samfile(samfile, 'r')
        mb = pysam.Samfile(mapped_sam, 'w', template=sam)
        try:
            for read in sam:
                if not read.is_unmapped:
                    mb.write(read)
        finally:
            # Close both handles even if iteration or writing fails.
            mb.close()
            sam.close()
        print("Finished removing unmapped.")

    # NOTE(review): when rmdup == "True" but rmdup_sam already exists, control
    # falls through to the else branch and the mapped file is sorted without
    # duplicate removal -- confirm this is intended.
    if not os.path.exists(rmdup_sam) and rmdup == "True":
        print("Removing duplicates...")
        pysam.rmdup("-S", mapped_sam, rmdup_sam)
        os.remove(mapped_sam)
        print("Sorting...")
        pysam.sort(rmdup_sam, sort_sam)
        os.remove(rmdup_sam)
    else:
        print("Sorting...")
        pysam.sort(mapped_sam, sort_sam)
        os.remove(mapped_sam)

    print("Indexing...")
    # NOTE(review): assumes pysam.sort wrote "<sort_sam>.sam" -- verify
    # against the pysam/samtools version in use.
    sort_sam = sort_sam + ".sam"
    pysam.index(sort_sam)

    # Context managers guarantee that both stat files are flushed and closed.
    with open(samfile + "_stat", 'w') as samfile_fs:
        samfile_fs.writelines(pysam.flagstat(samfile))
    with open(sort_sam + "_stat", 'w') as sort_sam_fs:
        sort_sam_fs.writelines(pysam.flagstat(sort_sam))
Beispiel #3
0
def proc_sam(arg):
    """Filter, optionally de-duplicate, sort and index one SAM file.

    ``arg`` is an indexable pair: ``arg[0]`` is the path of the input SAM
    file and ``arg[1]`` is the *string* ``"True"`` to request duplicate
    removal (any other value skips it).

    Intermediate files ``<prefix>_mapped.sam`` and ``<prefix>_rmdup.sam`` are
    written next to the input and deleted once consumed. Flagstat output is
    written to ``<samfile>_stat`` and ``<sorted file>_stat``. Nothing is
    returned.
    """
    samfile = arg[0]
    rmdup = arg[1]
    print(samfile)
    print(rmdup)

    # os.path.dirname returns "" for a bare filename, so outputs stay in the
    # current directory instead of being sent to "/".
    sam_dir = os.path.dirname(samfile)
    sam_prefix = os.path.basename(samfile).split(".sam")[0]
    mapped_sam = os.path.join(sam_dir, sam_prefix + "_mapped.sam")
    rmdup_sam = os.path.join(sam_dir, sam_prefix + "_rmdup.sam")
    sort_sam = os.path.join(sam_dir, sam_prefix + "_sort")

    if not os.path.exists(mapped_sam):
        print("Removing unmapped...")
        sam = pysam.Samfile(samfile, 'r')
        mb = pysam.Samfile(mapped_sam, 'w', template=sam)
        try:
            for read in sam:
                if not read.is_unmapped:
                    mb.write(read)
        finally:
            # Close both handles even if iteration or writing fails.
            mb.close()
            sam.close()
        print("Finished removing unmapped.")

    # NOTE(review): when rmdup == "True" but rmdup_sam already exists, control
    # falls through to the else branch and the mapped file is sorted without
    # duplicate removal -- confirm this is intended.
    if not os.path.exists(rmdup_sam) and rmdup == "True":
        print("Removing duplicates...")
        pysam.rmdup("-S", mapped_sam, rmdup_sam)
        os.remove(mapped_sam)
        print("Sorting...")
        pysam.sort(rmdup_sam, sort_sam)
        os.remove(rmdup_sam)
    else:
        print("Sorting...")
        pysam.sort(mapped_sam, sort_sam)
        os.remove(mapped_sam)

    print("Indexing...")
    # NOTE(review): assumes pysam.sort wrote "<sort_sam>.sam" -- verify
    # against the pysam/samtools version in use.
    sort_sam = sort_sam + ".sam"
    pysam.index(sort_sam)

    # Context managers guarantee that both stat files are flushed and closed.
    with open(samfile + "_stat", 'w') as samfile_fs:
        samfile_fs.writelines(pysam.flagstat(samfile))
    with open(sort_sam + "_stat", 'w') as sort_sam_fs:
        sort_sam_fs.writelines(pysam.flagstat(sort_sam))
Beispiel #4
0
def trim_reads(bamfile):
    """
    Wrapper that runs ``rmdup`` (with ``-S``) on a BAM file.

    Input
    bamfile -- location of the BAM file on disk

    Returns the path of the output file: ``bamfile`` with its final
    extension replaced by ``.rmdup.bam``. A path without an extension simply
    gets ``.rmdup.bam`` appended.

    Raises NameError if ``bamfile`` does not exist.
    """

    if not os.path.exists(bamfile):
        raise NameError("file %s does not exist" % (bamfile))

    # splitext only inspects the last path component, so dots in directory
    # names and extension-less file names are left intact.
    stem = os.path.splitext(bamfile)[0]
    outfile = stem + ".rmdup.bam"
    rmdup("-S", bamfile, outfile)
    return outfile
Beispiel #5
0
def trim_reads(bamfile):
    """
    Wrapper that runs ``rmdup`` (with ``-S``) on a BAM file.

    Input
    bamfile -- location of the BAM file on disk

    Returns the path of the output file: ``bamfile`` with its final
    extension replaced by ``.rmdup.bam``. A path without an extension simply
    gets ``.rmdup.bam`` appended.

    Raises NameError if ``bamfile`` does not exist.
    """

    if not os.path.exists(bamfile):
        raise NameError("file %s does not exist" % (bamfile))

    # splitext only inspects the last path component, so dots in directory
    # names and extension-less file names are left intact.
    stem = os.path.splitext(bamfile)[0]
    outfile = stem + ".rmdup.bam"
    rmdup("-S", bamfile, outfile)
    return outfile
Beispiel #6
0
    def rm_dup(self, inbam, outbam):
        """Remove PCR duplicates by delegating to ``pysam.rmdup``.

        ``inbam`` and ``outbam`` are forwarded unchanged as the two
        positional arguments of ``pysam.rmdup``. Returns None.
        """
        pysam.rmdup(inbam, outbam)
Beispiel #7
0
	def rm_dup(self, inbam, outbam):
		"""Remove PCR duplicates by delegating to ``pysam.rmdup``.

		``inbam`` and ``outbam`` are forwarded unchanged as the two
		positional arguments of ``pysam.rmdup``. Returns None.
		"""
		pysam.rmdup(inbam, outbam)
Beispiel #8
0
def filter_main(fastq1,
                fastq2,
                bwa_index,
                mapq,
                outdir,
                prefix,
                threads,
                to_file=False):
    """Align, pair, filter and de-duplicate two read files; write a QC report.

    Steps, in order: align FASTQ inputs with ``bwa_mem``, queryname-sort (or
    copy) each alignment file, merge them, call ``filter_pair_reads``,
    coordinate-sort, run ``pysam.rmdup``, collect ``samtools flagstat``
    counts, queryname-sort the de-duplicated BAM, and write
    ``<outdir>/<prefix>.feather.qc``.

    Side effect: ``sys.stdout`` is replaced by a ``logger.Logger`` writing to
    ``<outdir>/<prefix>.feather.log`` and is not restored.

    Args:
        fastq1, fastq2: input paths. Names ending in ``.fastq`` or
            ``fastq.gz`` are aligned; ``.sam``/``.bam`` are accepted without
            alignment; any other ending terminates the program via ``exit``.
        bwa_index: forwarded to ``check_arguments`` and ``bwa_mem``.
        mapq: forwarded to ``filter_pair_reads`` and echoed in the QC report.
        outdir, prefix: location and name stem of the log and QC files; also
            forwarded to ``set_filenames``.
        threads: thread count handed to ``bwa_mem`` and ``samtools sort``.
        to_file: not used by this function.

    Returns:
        ``paired_filename + ".srtn.rmdup.bam"``, the queryname-sorted,
        duplicate-removed BAM.
    """
    sys.stdout = logger.Logger(outdir + "/" + prefix + ".feather.log")
    print(time.ctime() + " starting mapping and filtering operation")
    check_arguments(fastq1, fastq2, bwa_index, mapq, threads)
    (paired_filename, bwa1_filename, bwa2_filename, bwa1_sorted_filename,
     bwa2_sorted_filename, combined_bwa_filename,
     qc_filename) = set_filenames(fastq1, fastq2, outdir, prefix)

    # Run bwa mem on FASTQ inputs; SAM/BAM inputs skip alignment.
    for fastq, bwa_filename in [(fastq1, bwa1_filename),
                                (fastq2, bwa2_filename)]:
        if fastq.endswith(".fastq") or fastq.endswith("fastq.gz"):
            bwa_mem(fastq, bwa_index, threads, bwa_filename)
        elif not (fastq.endswith(".sam") or fastq.endswith(".bam")):
            exit(
                "Error: Input file for filtering should be of type fastq, fastq.gz, sam, or bam. Exiting!"
            )

    # Raw `wc -l` output (trailing newline included); the QC report below
    # relies on that newline to terminate its first line.
    read_count = _feather_count_read_names(bwa1_filename)

    # Bring both alignment files into queryname order before merging.
    for bwa_filename, bwa_sorted_filename in ([
            bwa1_filename, bwa1_sorted_filename
    ], [bwa2_filename, bwa2_sorted_filename]):
        # Only the header is needed, so the handle is closed right away.
        with pysam.AlignmentFile(bwa_filename) as bwa:
            already_sorted = is_sorted_queryname(bwa.header)
        if not already_sorted:
            print(time.ctime() + " calling samtools sort for " + bwa_filename +
                  " storing in " + bwa_sorted_filename)
            pysam.sort("-o", bwa_sorted_filename, "-n", "-@", str(threads),
                       bwa_filename)
        else:
            copyfile(bwa_filename, bwa_sorted_filename)

    print(time.ctime() + " merging " + bwa1_sorted_filename + " and " +
          bwa2_sorted_filename)
    pysam.merge("-n", "-f", combined_bwa_filename, bwa1_sorted_filename,
                bwa2_sorted_filename)
    print(time.ctime() + " filtering and pairing reads")
    filter_pair_reads(combined_bwa_filename, mapq, paired_filename,
                      qc_filename)
    print(time.ctime() + " paired bam file generated. Sorting by coordinates.")
    pysam.sort("-o", paired_filename + ".srt.bam", "-@", str(threads),
               paired_filename + ".bam")
    print(time.ctime() + " calling samtools rmdup")
    pysam.rmdup(paired_filename + ".srt.bam", paired_filename + ".rmdup.bam")

    # NOTE(review): index 7 is presumably the "read2" line of the flagstat
    # report (i.e. the number of pairs) -- confirm against the samtools
    # version in use, since the report layout differs between releases.
    print(time.ctime() + " calling samtools flagstat on mapped file")
    lines = _feather_flagstat_lines(paired_filename + ".srt.bam",
                                    paired_filename + ".srt.bam.flagstat")
    uniquely_mapped_count = lines[7].split()[0]
    print(time.ctime() +
          " calling samtools flagstat on mapped and duplicate-removed file")
    lines = _feather_flagstat_lines(paired_filename + ".rmdup.bam",
                                    paired_filename + ".rmdup.flagstat")
    duprmd_count = lines[7].split()[0]

    print(time.ctime() + " calling samtools sort for sorting by query names")
    pysam.sort("-o", paired_filename + ".srtn.rmdup.bam", "-@", str(threads),
               "-n", paired_filename + ".rmdup.bam")
    print(time.ctime() + " finishing filtering")

    total_pairs = int(float(read_count))

    def percent_of_total(count):
        # An empty input must not abort the run after all the work is done.
        if not total_pairs:
            return 0.0
        return 100 * (int(float(count)) / total_pairs)

    qc_filename = outdir + "/" + prefix + ".feather.qc"
    with open(qc_filename, 'w') as outfile:
        outfile.write("{0:70} {1}".format("number of sequencing pairs",
                                          str(read_count)))
        outfile.write("{0:70} {1} ".format(
            "number of unqiuely mapped pairs (MAPQ >= " + str(mapq) + ")",
            str(uniquely_mapped_count)))
        outfile.write("\t({0:.2f}%)\n".format(
            percent_of_total(uniquely_mapped_count)))
        outfile.write("{0:70} {1} ".format(
            "number of pairs after duplicate removal", str(duprmd_count)))
        outfile.write("\t({0:.2f}%)\n".format(percent_of_total(duprmd_count)))
    return (paired_filename + ".srtn.rmdup.bam")


def _feather_count_read_names(alignment_filename):
    """Return the raw ``wc -l`` output counting runs of identical read names."""
    import shlex

    # Quote the path so spaces or shell metacharacters cannot alter the
    # pipeline; ordinary paths are passed through unchanged.
    quoted = shlex.quote(alignment_filename)
    name_column = "awk ' $1 !~ /@/ {print $1}' "
    if alignment_filename.endswith(".bam"):
        command = ("samtools view " + quoted + " | " + name_column +
                   "| uniq -c|wc -l")
    else:
        command = name_column + quoted + "| uniq -c|wc -l"
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
    # communicate() drains the pipe and reaps the child process.
    output, _ = proc.communicate()
    return output.decode("utf-8")


def _feather_flagstat_lines(bam_filename, report_filename):
    """Write ``samtools flagstat`` output to a file and return its lines."""
    # An argument list with a redirected stdout needs no shell at all.
    with open(report_filename, "w") as report:
        subprocess.Popen(["samtools", "flagstat", bam_filename],
                         stdout=report).communicate()
    with open(report_filename) as report:
        return report.readlines()