def reads_replace(bam_file, total_chosen_reads, seqer, flow_order, lib_key, barcode, tag, out_dir, aligner,
                  aligner_index, is_single):
    """Split ``bam_file`` into edited and untouched reads, then remap the edited ones.

    Every alignment whose name is a key of ``total_chosen_reads`` is written
    to ``<out_dir>/edit.bam``. When its strand key (as returned by
    ``getReadStrand``) is also present for that name, the sequence and
    qualities are first replaced by those of the stored edited read. All
    remaining alignments are written to ``<out_dir>/exclude.bam``. The first
    ``@RG`` header line of ``bam_file`` (if any) is passed to ``remap`` and
    the remapped BAM is sorted with ``bamSort``.

    Args:
        bam_file: path of the input BAM.
        total_chosen_reads: mapping ``read name -> {strand: edited read}``;
            the edited reads must expose ``query_sequence`` and
            ``query_qualities``.
        seqer: sequencer name; ``"life"`` routes edited reads through
            ``deal_life_reads``.
        flow_order, lib_key, barcode: forwarded to ``deal_life_reads``.
        tag: when truthy, edited reads are passed through ``add_tag``.
        out_dir: directory receiving every file written here.
        aligner, aligner_index, is_single: forwarded to ``remap``.

    Returns:
        Tuple ``(edit_remap_bam_sorted_file, exclude_bam_file)``. The first
        path is ``<out_dir>/edit.remap.sort.bam``, presumably the file that
        ``bamSort`` creates from the prefix it is given (confirm against
        ``bamSort``).
    """
    try:
        from shlex import quote as shell_quote
    except ImportError:  # Python 2 ships the same helper in ``pipes``
        from pipes import quote as shell_quote

    edit_bam_file = os.path.join(out_dir, "edit.bam")
    exclude_bam_file = os.path.join(out_dir, "exclude.bam")

    bam = pysam.AlignmentFile(bam_file)
    try:
        edit_bam_reads = {}
        for read in bam.fetch():
            read_name = read.query_name
            if read_name not in total_chosen_reads:
                continue
            strand = getReadStrand(read)
            chosen_by_strand = total_chosen_reads[read_name]
            if strand in chosen_by_strand:
                my_read = chosen_by_strand[strand]
                # Sequence is assigned before qualities, as in the original code.
                read.query_sequence = my_read.query_sequence
                read.query_qualities = my_read.query_qualities
                if seqer == "life":
                    read = deal_life_reads(read, flow_order, lib_key, barcode)
                if tag:
                    read = add_tag(read)
            # Reads of a chosen name whose strand was not edited are kept
            # unchanged so they still travel with the edited read.
            edit_bam_reads.setdefault(read_name, {})[strand] = read

        # write edited reads into edit.bam
        edit_bam = pysam.AlignmentFile(edit_bam_file, 'wb', template=bam)
        try:
            for strand_reads in edit_bam_reads.values():
                for read in strand_reads.values():
                    edit_bam.write(read)
        finally:
            edit_bam.close()

        # write not edited reads into exclude.bam
        exclude_bam = pysam.AlignmentFile(exclude_bam_file, 'wb', template=bam)
        try:
            for read in bam.fetch():
                if read.query_name not in edit_bam_reads:
                    exclude_bam.write(read)
        finally:
            exclude_bam.close()
    finally:
        # The input handle was previously never closed.
        bam.close()

    # remap the edited reads
    header = os.path.join(out_dir, 'bam.header')
    # Paths are quoted so spaces or shell metacharacters cannot break the pipeline.
    os.system('samtools view -H %s|grep "^@RG" > %s' % (shell_quote(bam_file), shell_quote(header)))
    with open(header, 'r') as header_handle:
        head = header_handle.readline().rstrip()
    if not head:
        # No @RG line was found: let remap() run without a read-group header.
        head = None

    edit_remap_bam_file = os.path.join(out_dir, "edit.remap.bam")
    remap(aligner_index, edit_bam_file, edit_remap_bam_file, aligner, is_single, header=head)

    edit_remap_bam_sorted_prefix = os.path.join(out_dir, "edit.remap.sort")
    edit_remap_bam_sorted_file = os.path.join(out_dir, "edit.remap.sort.bam")
    bamSort(edit_remap_bam_file, edit_remap_bam_sorted_prefix)
    return edit_remap_bam_sorted_file, exclude_bam_file
def reads_replace(bam_file, total_chosen_reads, seqer, flow_order, lib_key, barcode, tag, out_dir, aligner,
                  aligner_index, is_single):
    """Split ``bam_file`` into edited and untouched reads, then remap the edited ones.

    Every alignment whose name is a key of ``total_chosen_reads`` is written
    to ``<out_dir>/edit.bam``. When its strand key (as returned by
    ``getReadStrand``) is also present for that name, the sequence and
    qualities are first replaced by those of the stored edited read. All
    remaining alignments are written to ``<out_dir>/exclude.bam``.
    ``edit.bam`` is remapped with ``remap``; for paired data the remapped
    file is additionally passed through ``bamAddRG``.

    Args:
        bam_file: path of the input BAM.
        total_chosen_reads: mapping ``read name -> {strand: edited read}``;
            the edited reads must expose ``query_sequence`` and
            ``query_qualities``.
        seqer: sequencer name; ``"life"`` routes edited reads through
            ``deal_life_reads``.
        flow_order, lib_key, barcode: forwarded to ``deal_life_reads``.
        tag: when truthy, edited reads are passed through ``add_tag``.
        out_dir: directory receiving every file written here.
        aligner, aligner_index: forwarded to ``remap``.
        is_single: forwarded to ``remap``; when falsy the ``bamAddRG`` step
            runs on the remapped file.

    Returns:
        Tuple ``(remapped_bam, exclude_bam_file)``. ``remapped_bam`` is
        ``<out_dir>/edit.remap.bam`` for single-end input and
        ``<out_dir>/edit.remap.addRG.bam`` otherwise.
    """
    edit_bam_file = os.path.join(out_dir, "edit.bam")
    exclude_bam_file = os.path.join(out_dir, "exclude.bam")
    edit_remap_bam_file = os.path.join(out_dir, "edit.remap.bam")

    bam = pysam.AlignmentFile(bam_file)
    try:
        edit_bam_reads = {}
        for read in bam.fetch():
            read_name = read.query_name
            if read_name not in total_chosen_reads:
                continue
            strand = getReadStrand(read)
            chosen_by_strand = total_chosen_reads[read_name]
            if strand in chosen_by_strand:
                my_read = chosen_by_strand[strand]
                # Sequence is assigned before qualities, as in the original code.
                read.query_sequence = my_read.query_sequence
                read.query_qualities = my_read.query_qualities
                if seqer == "life":
                    read = deal_life_reads(read, flow_order, lib_key, barcode)
                if tag:
                    read = add_tag(read)
            # Reads of a chosen name whose strand was not edited are kept
            # unchanged so they still travel with the edited read.
            edit_bam_reads.setdefault(read_name, {})[strand] = read

        # write edited reads into edit.bam
        edit_bam = pysam.AlignmentFile(edit_bam_file, 'wb', template=bam)
        try:
            for strand_reads in edit_bam_reads.values():
                for read in strand_reads.values():
                    edit_bam.write(read)
        finally:
            edit_bam.close()

        # write not edited reads into exclude.bam
        exclude_bam = pysam.AlignmentFile(exclude_bam_file, 'wb', template=bam)
        try:
            for read in bam.fetch():
                if read.query_name not in edit_bam_reads:
                    exclude_bam.write(read)
        finally:
            exclude_bam.close()

        # remap the edited reads
        remap(aligner_index, edit_bam_file, edit_remap_bam_file, aligner, is_single)
        if is_single:
            return edit_remap_bam_file, exclude_bam_file

        editRemapBam_addRG_File = os.path.join(out_dir, "edit.remap.addRG.bam")
        edit_remap = pysam.AlignmentFile(edit_remap_bam_file, 'rb')
        try:
            # bamAddRG reads the template header from the still-open input BAM.
            bamAddRG(edit_remap, edit_bam_reads, bam, editRemapBam_addRG_File)
        finally:
            edit_remap.close()
        return editRemapBam_addRG_File, exclude_bam_file
    finally:
        # The input handle was previously never closed.
        bam.close()
def bamAddRG(editRemap, editBamReads, templateBamFile, outBamFile):
    """Copy ``editRemap`` to ``outBamFile`` with the template's read groups.

    The output header is the header of ``editRemap`` with its ``RG`` section
    replaced by the one from ``templateBamFile``. Each alignment is passed
    through ``readAddRG`` together with its pre-remap counterpart (looked up
    in ``editBamReads`` by read name and strand) before being written.

    Args:
        editRemap: open ``pysam.AlignmentFile`` holding the remapped reads.
        editBamReads: mapping ``read name -> {strand: original read}``.
        templateBamFile: open ``pysam.AlignmentFile`` providing the ``RG``
            header section and the input of ``_getRGs``.
        outBamFile: path of the BAM file to create.

    Raises:
        KeyError: if the template header has no ``RG`` section, or if a read
            name is known but its strand is missing from ``editBamReads``.
    """
    remap_header = editRemap.header
    # pysam >= 0.14 returns an immutable AlignmentHeader that rejects item
    # assignment; older releases return a plain dict. Work on a dict copy so
    # the RG section can be replaced on either.
    if hasattr(remap_header, "to_dict"):
        head = remap_header.to_dict()
    else:
        head = dict(remap_header)
    head["RG"] = templateBamFile.header["RG"]

    addRGBam = pysam.AlignmentFile(outBamFile, 'wb', header=head)
    try:
        RG = _getRGs(templateBamFile)
        for read in editRemap.fetch():
            readName = read.query_name
            strand = getReadStrand(read)
            # Reads unknown to editBamReads are handed over without an original.
            orig = editBamReads[readName][strand] if readName in editBamReads else None
            addRGBam.write(readAddRG(read, orig, RG))
    finally:
        # Close the output even when a read cannot be processed.
        addRGBam.close()
def deal_haplotype(bam_file, haplotype, reffasta, haplotype_prefix, mindepth, minmutreads, minmapq, diffcover,
                   is_single, is_multmapfilter, aligner, aligner_index, **kwargs):
    """Pick reads covering ``haplotype``, edit them and check the remapped coverage.

    Steps:
      1. Fetch every alignment overlapping the haplotype; their count is the
         depth. Alignments that have a reference start and are neither
         secondary nor flagged 2048 are candidates for editing.
      2. Fail when the depth is below ``mindepth`` or when
         ``int(depth * haplotype.freq)`` is below ``minmutreads``.
      3. Let ``pick_reads`` choose reads, apply ``editRead`` to each and write
         the edited reads to ``<haplotype_prefix>.chosen.edited.bam``.
      4. Remap that file to ``<haplotype_prefix>.chosen.remap.bam`` and let
         ``judge_coverdiff`` accept or reject the result.

    Args:
        bam_file: path of the input BAM.
        haplotype: object exposing ``chrom``, ``start``, ``end``, ``freq``
            and ``mutList``.
        reffasta: forwarded to ``editRead``.
        haplotype_prefix: path prefix of the intermediate BAM files.
        mindepth: minimum depth (converted with ``int``).
        minmutreads: minimum number of mutated reads (converted with ``int``).
        minmapq, is_multmapfilter: forwarded to ``pick_reads``.
        diffcover: forwarded to ``judge_coverdiff`` as ``float``.
        is_single: selects single-end handling; also forwarded to
            ``pick_reads`` and ``remap``.
        aligner, aligner_index: forwarded to ``remap``.
        **kwargs: accepted and ignored.

    Returns:
        On success ``(my_chosen_reads, my_mate_reads, real_mut_reads_num,
        depth)``, where ``my_chosen_reads`` maps read name to
        ``{strand: Read}`` and ``my_mate_reads`` maps read name to the
        ``Read`` of an unedited mate. On failure ``(False, message)``.
    """
    position = (haplotype.chrom, haplotype.start, haplotype.end)

    reads_dict = OrderedDict()
    # NOTE(review): `bam` is left open on purpose, as before; the returned
    # Read objects wrap alignments fetched from it and may still need the
    # handle. Confirm before closing it here.
    bam = pysam.AlignmentFile(bam_file, 'rb')
    depth = 0
    for read in bam.fetch(reference=haplotype.chrom, start=haplotype.start, end=haplotype.end + 1):
        depth += 1
        # Flag bit 2048 is tested on the raw flag, as in the original check.
        if read.reference_start is None or read.is_secondary or read.flag & 2048:
            continue
        reads_dict.setdefault(read.query_name, {})[getReadStrand(read)] = read

    # judge depth and mut reads whether qualified
    # Single-argument print(...) prints the same text on Python 2 and 3.
    if depth < int(mindepth):
        print("depth less than min depth!")
        return False, "haplotype in position %s:%s-%s: depth less than min depth(%s)" % (
            position + (mindepth,))
    mut_reads_num = int(depth * haplotype.freq)
    if mut_reads_num < int(minmutreads):
        print("mutation reads num less than minmutreads!")
        return False, "haplotype in position %s:%s-%s: mut reads less than min mut reads(%s)" % (
            position + (minmutreads,))

    print("start pick reads")
    picked = pick_reads(bam, reads_dict, mut_reads_num, is_single, minmapq, is_multmapfilter)
    if picked[0] is False:
        return False, "haplotype in position %s:%s-%s: %s" % (position + (picked[1],))
    chosen_reads, mate_reads = picked
    print("end pick reads")

    # edit
    my_chosen_reads = {}
    my_mate_reads = {}
    chosen_reads_num = 0
    real_mut_reads_num = 0
    tmp_bam_file = haplotype_prefix + ".chosen.edited.bam"
    tmp_bam = pysam.AlignmentFile(tmp_bam_file, 'wb', template=bam)
    try:
        for readName, readInfo in chosen_reads.items():
            my_chosen_reads[readName] = {}
            edited_wrappers = {}    # strand -> Read wrapper handed to editRead
            edited_alignments = {}  # strand -> alignment carrying the new bases
            for strand, read in readInfo.items():
                my_read = Read(read)
                edited = editRead(my_read, reffasta, haplotype.mutList)
                if edited is False:
                    continue
                real_mut_reads_num += 1
                # The third element (the shift) is not needed here.
                sequence, quality, _shift = edited
                read.query_sequence = sequence
                read.query_qualities = quality
                edited_wrappers[strand] = my_read
                edited_alignments[strand] = read

            # Single-end: keep whatever was edited. Paired-end: keep the pair
            # when both ends were edited, or one edited end plus its mate.
            needs_mate = (not is_single) and len(edited_wrappers) == 1 and readName in mate_reads
            if not (is_single or needs_mate or len(edited_wrappers) == 2):
                continue
            for strand in edited_wrappers:
                my_chosen_reads[readName][strand] = edited_wrappers[strand]
                tmp_bam.write(edited_alignments[strand])
                chosen_reads_num += 1
            if needs_mate:
                mate_read = mate_reads[readName]
                my_mate_reads[readName] = Read(mate_read)
                tmp_bam.write(mate_read)
    finally:
        tmp_bam.close()

    # alignment and judge coverdiff whether qualified
    chosen_bam_file = haplotype_prefix + ".chosen.remap.bam"
    remap(aligner_index, tmp_bam_file, chosen_bam_file, aligner, is_single)
    chosen_bam = pysam.AlignmentFile(chosen_bam_file)
    try:
        qualified = judge_coverdiff(bam, depth, chosen_bam, chosen_reads_num, haplotype, float(diffcover))
    finally:
        # The remapped BAM is only needed for the coverage check.
        chosen_bam.close()
    if qualified:
        return my_chosen_reads, my_mate_reads, real_mut_reads_num, depth
    return False, "haplotype in position %s:%s-%s: coverdiff is less than minDiffCover" % position
def _merge_edit_emit_single(edit_bam, fout_convert, edit_bam_reads, new_name, orig_read, new_read):
    """Write one single-end read, log its name mapping and register it by name and strand."""
    edit_bam.write(new_read)
    fout_convert.write(
        "%s: %s, %s, %s-%s\n" % (new_name, orig_read.query_name, new_read.is_read1,
                                 new_read.reference_start, new_read.reference_end))
    strand = getReadStrand(new_read)
    edit_bam_reads.setdefault(new_name, dict())[strand] = new_read


def _merge_edit_write_reads(edit_bam, fout_convert, is_single, total_modify_reads, total_add_reads, used_reads,
                            seqer, flow_order, lib_key, barcode, tag):
    """Write every renamed read to ``edit_bam`` and return them as ``{name: {strand: read}}``."""
    edit_bam_reads = {}
    if is_single:
        # Modified reads: copy the original alignment and give it the edited bases.
        for read_pair in total_modify_reads:
            read1 = read_pair[0]
            orig_read1 = used_reads[getKeyName(read1)]
            new_read1 = copy.deepcopy(orig_read1)
            new_read1.query_sequence = read1.query_sequence
            new_read1.query_qualities = read1.query_qualities
            new_name = read1.query_name.split(":")[0] + ":" + get_new_readname()
            new_read1.query_name = new_name
            if seqer == "life":
                new_read1 = deal_life_reads(new_read1, flow_order, lib_key, barcode)
            if tag:
                new_read1 = add_tag(new_read1)
            _merge_edit_emit_single(edit_bam, fout_convert, edit_bam_reads, new_name, orig_read1, new_read1)

        # Added reads: an unchanged copy of the original under a fresh name.
        for read_pair in total_add_reads:
            orig_read1 = used_reads[getKeyName(read_pair[0])]
            new_read1 = copy.deepcopy(orig_read1)
            new_name = get_new_readname()
            new_read1.query_name = new_name
            _merge_edit_emit_single(edit_bam, fout_convert, edit_bam_reads, new_name, orig_read1, new_read1)
        return edit_bam_reads

    for read_pair in total_modify_reads + total_add_reads:
        read1 = read_pair[0]
        read2 = read_pair[1]
        keyname_read1 = getKeyName(read1)
        keyname_read2 = getKeyName(read2)
        orig_read1 = used_reads[keyname_read1]
        orig_read2 = used_reads[keyname_read2]
        orig_read1_name = orig_read1.query_name
        orig_read2_name = orig_read2.query_name
        new_read1 = copy.deepcopy(orig_read1)
        new_read2 = copy.deepcopy(orig_read2)
        new_read1.query_sequence = read1.query_sequence
        new_read1.query_qualities = read1.query_qualities
        new_read2.query_sequence = read2.query_sequence
        new_read2.query_qualities = read2.query_qualities
        # Both mates share one fresh name.
        new_name = get_new_readname()
        new_read1.query_name = new_name
        new_read2.query_name = new_name

        strand1 = getReadStrand(new_read1)
        strand2 = getReadStrand(new_read2)
        # Registration happens before tagging, exactly as in the original code.
        pair_entry = edit_bam_reads.setdefault(new_name, dict())
        pair_entry[strand1] = new_read1
        pair_entry[strand2] = new_read2

        if tag:
            new_read1 = add_tag(new_read1)
            new_read2 = add_tag(new_read2)
        fout_convert.write("%s: %s, %s, %s, %s, %s-%s, %s-%s\n" % (
            new_name, orig_read1_name, orig_read2_name, new_read1.is_read1, new_read2.is_read2,
            new_read1.reference_start, new_read1.reference_end,
            new_read2.reference_start, new_read2.reference_end,
        ))
        edit_bam.write(new_read1)
        edit_bam.write(new_read2)
    return edit_bam_reads


def merge_edit_bam(bam_file, out_dir, is_single, total_modify_reads, total_add_reads, used_reads, seqer, aligner,
                   aligner_index, flow_order, lib_key, barcode, tag):
    """Write modified and added reads under new names, remap them and index the result.

    Each entry of ``total_modify_reads`` / ``total_add_reads`` is indexable;
    element 0 (and element 1 for paired data) is a read whose ``getKeyName``
    key selects the original alignment in ``used_reads``. A deep copy of that
    original is renamed with ``get_new_readname`` and written to
    ``<out_dir>/edit.bam``. Modified reads, and in paired mode added reads
    too, receive the sequence and qualities of the supplied read; single-end
    added reads are written as unchanged copies. The old-to-new name mapping
    is logged to ``<out_dir>/readname_convert.txt``. ``edit.bam`` is then
    remapped; paired data is additionally passed through ``bamAddRG``.
    ``bamIndex`` is finally called on the returned file.

    Args:
        bam_file: path of the input BAM, used as header template.
        out_dir: directory receiving every file written here.
        is_single: selects single-end or paired handling; forwarded to ``remap``.
        total_modify_reads, total_add_reads: lists of reads or read pairs.
        used_reads: mapping ``getKeyName(read) -> original alignment``.
        seqer: sequencer name; ``"life"`` routes single-end modified reads
            through ``deal_life_reads``.
        aligner, aligner_index: forwarded to ``remap``.
        flow_order, lib_key, barcode: forwarded to ``deal_life_reads``.
        tag: when truthy, modified single-end reads and all paired reads are
            passed through ``add_tag``.

    Returns:
        Path of the remapped BAM: ``<out_dir>/edit.remap.bam`` for single-end
        data, ``<out_dir>/edit.remap.addRG.bam`` otherwise.
    """
    edit_bam_file = os.path.join(out_dir, "edit.bam")
    readname_convert_file = os.path.join(out_dir, "readname_convert.txt")
    edit_remap_bam_file = os.path.join(out_dir, "edit.remap.bam")

    bam = pysam.AlignmentFile(bam_file, 'rb')
    try:
        edit_bam = pysam.AlignmentFile(edit_bam_file, 'wb', template=bam)
        try:
            # Both outputs are now closed even when a read cannot be processed.
            with open(readname_convert_file, 'w') as fout_convert:
                edit_bam_reads = _merge_edit_write_reads(
                    edit_bam, fout_convert, is_single, total_modify_reads, total_add_reads, used_reads,
                    seqer, flow_order, lib_key, barcode, tag)
        finally:
            edit_bam.close()

        remap(aligner_index, edit_bam_file, edit_remap_bam_file, aligner, is_single)

        if is_single:
            editRemapBam_addRG_File = edit_remap_bam_file
        else:
            editRemapBam_addRG_File = os.path.join(out_dir, "edit.remap.addRG.bam")
            editRemap = pysam.AlignmentFile(edit_remap_bam_file, 'rb')
            try:
                # bamAddRG reads the template header from the still-open input BAM.
                bamAddRG(editRemap, edit_bam_reads, bam, editRemapBam_addRG_File)
            finally:
                editRemap.close()
    finally:
        # The input handle was previously never closed.
        bam.close()

    bamIndex(editRemapBam_addRG_File)
    return editRemapBam_addRG_File