コード例 #1
0
def multi_to_fastq(sample_name, multi_bam, paired_end, alignment_file,
                   threads):
    algn_parent = Path(alignment_file).parent
    srt_multi_bam = Path(algn_parent, "_sorted.multi.bam")
    out_SE = Path(algn_parent, sample_name + "_multimap.fastq")
    out_R1 = Path(algn_parent, sample_name + "_multimap_R1.fastq")
    out_R2 = Path(algn_parent, sample_name + "_multimap_R2.fastq")

    if not paired_end:
        print(f"Writing multi-mapped reads to {out_SE}...")
        bamfile = BedTool(multi_bam)
        BedTool.bam_to_fastq(bamfile, fq=out_SE)
    else:
        print("Sorting multi-mapped reads...")
        pysam.sort("--threads", f"{threads}", "-n", "-o", f"{srt_multi_bam}",
                   f"{multi_bam}")
        sorted_bamfile = BedTool(srt_multi_bam)
        print(f"Writing multi-mapped reads to {out_R1} {out_R2}...")
        BedTool.bam_to_fastq(sorted_bamfile, fq=out_R1, fq2=out_R2)
コード例 #2
0
ファイル: tRNAmap.py プロジェクト: krdav/mim-tRNAseq
def remap(fq, genome_index_path, genome_index_name, snp_index_path, \
 snp_index_name, threads, out_dir, snp_tolerance, keep_temp, mismatches):
    # remapping of multi and unmapped reads from round 1

    # generate fastq from unmaped reads
    nomap = out_dir + fq.split("/")[-1].split(".")[0] + ".nomapping.bam"
    nomap_bed = BedTool(nomap)
    nomap_fastq = "".join(nomap.split(".bam")[:-1]) + ".fastq"
    nomap_bed.bam_to_fastq(fq=nomap_fastq)

    # generate fastq from multi-mapping reads
    # first generate sam of primary alignments from multi-mappers
    multi = out_dir + fq.split("/")[-1].split(".")[0] + ".unpaired_mult.bam"
    cmd = "samtools view -@ " + str(
        threads) + " -F 0x904 -o " + out_dir + "temp_multi.bam " + multi
    subprocess.call(cmd, shell=True)
    multi_bed = BedTool(out_dir + "temp_multi.bam")
    multi_fastq = "".join(multi.split(".bam")[:-1]) + ".fastq"
    multi_bed.bam_to_fastq(fq=multi_fastq)
    os.remove(out_dir + "temp_multi.bam")

    if not keep_temp:
        os.remove(nomap)
        os.remove(multi)

    if not mismatches == None:
        mismatch_string = "--max-mismatches " + str(mismatches) + " "
    elif mismatches == None:
        mismatch_string = ""

    output_prefix = fq.split("/")[-1].split(".")[0] + "_remap"
    if snp_tolerance:
        map_cmd = "gsnap -D " + genome_index_path + " -d " + genome_index_name + " -V " + snp_index_path + " -v " \
        + snp_index_name + " -t " + str(threads) + " --split-output " + out_dir + output_prefix + \
        " --format=sam --genome-unk-mismatch=0 --md-lowercase-snp  --ignore-trim-in-filtering 1 --force-single-end " + mismatch_string +\
        nomap_fastq + " " + multi_fastq + " &>> " + out_dir + "remap_align.log"
    else:
        map_cmd = "gsnap -D " + genome_index_path + " -d " + genome_index_name + " -t " + str(threads) + \
        " --split-output " + out_dir + output_prefix + " --format=sam --genome-unk-mismatch=0 --md-lowercase-snp --ignore-trim-in-filtering 1 --force-single-end " + mismatch_string + " " + \
        nomap_fastq + " " + multi_fastq + " &>> " + out_dir + "remap_align.log"

    log.info(
        "Realigning multi-mapped and unmapped reads to {} with updated SNP index..."
        .format(genome_index_name))
    subprocess.call(map_cmd, shell=True)

    os.remove(multi_fastq)
    os.remove(nomap_fastq)

    # remove transloc sam output if no reads present (often the case)
    cmd = "samtools view -c " + out_dir + output_prefix + ".unpaired_transloc"
    readcount = int(subprocess.check_output(cmd, shell=True))
    if readcount == 0:
        os.remove(out_dir + output_prefix + ".unpaired_transloc")

    # write mapping stats and compress to bam - remove mutlimapping and unmapped unless keep_temp = True
    log.info("Compressing SAM files, sorting, and computing mapping stats...")
    with open(out_dir + "mapping_stats.txt", "a") as stats_out:
        align_pathlist = Path(out_dir).glob(output_prefix + "*")
        for file in align_pathlist:
            if re.search("mult",
                         file.name) and not re.search("bam", file.name):
                cmd = "samtools view -@ " + str(
                    threads) + " -F 0x904 -c " + out_dir + file.name
                multi_count = int(subprocess.check_output(cmd, shell=True))
                if keep_temp:
                    cmd = "samtools view -@ " + str(
                        threads
                    ) + " -bh -o " + out_dir + file.name + ".bam " + out_dir + file.name
                    subprocess.call(cmd, shell=True)
                    os.remove(out_dir + file.name)
                else:
                    os.remove(out_dir + file.name)
            elif re.search("uniq",
                           file.name) and not re.search("bam", file.name):
                cmd = "samtools view -@ " + str(
                    threads
                ) + " -bh " + out_dir + file.name + " | samtools sort -@ " + str(
                    threads) + " -o " + out_dir + file.name + ".bam" + " - "
                subprocess.call(cmd, shell=True)
                os.remove(out_dir + file.name)
                # merge 1st run bam and remapped bam
                merged_bam = out_dir + fq.split("/")[-1].split(
                    ".")[0] + ".unpaired_uniq_remapMerge.bam"
                cmd = "samtools merge " + merged_bam + " " + out_dir + file.name + ".bam " + out_dir + fq.split(
                    "/")[-1].split(".")[0] + ".unpaired_uniq.bam"
                subprocess.call(cmd, shell=True)
                #os.remove(out_dir + file.name + ".bam")
                cmd = "samtools view -@ " + str(threads) + " -c " + merged_bam
                unique_count = int(subprocess.check_output(cmd, shell=True))
                unique_bam = merged_bam

            elif re.search("nomapping",
                           file.name) and not re.search("bam", file.name):
                cmd = "samtools view -@ " + str(
                    threads) + " -c " + out_dir + file.name
                unmapped_count = int(subprocess.check_output(cmd, shell=True))
                if keep_temp:
                    cmd = "samtools view -@ " + str(
                        threads
                    ) + " -bh -o " + out_dir + file.name + ".bam " + out_dir + file.name
                    subprocess.call(cmd, shell=True)
                    os.remove(out_dir + file.name)
                else:
                    os.remove(out_dir + file.name)

        total_count = unique_count + multi_count + unmapped_count

        stats_out.write("{}\nUniquely mapped reads: {:d} ({:.0%}) \nMulti-mapping reads: {:d} ({:.0%}) \nUnmapped reads: {:d} ({:.0%}) \nTotal: {:d}\n\n"\
         .format(fq.split("/")[-1], unique_count, (unique_count/total_count),multi_count, (multi_count/total_count), unmapped_count, (unmapped_count/total_count), total_count))

    alignstats_dict = defaultdict(list)
    type_list = ["Uniquely mapped", "Multi-mapped", "Unmapped"]
    for i, count in enumerate([unique_count, multi_count, unmapped_count]):
        alignstats_dict["Lib"].append(fq.split("/")[-1].split(".")[0])
        alignstats_dict["Type"].append(type_list[i])
        alignstats_dict["Count"].append(count)

    return (unique_bam, unique_count, alignstats_dict)