def read_bedpe_line(self, line, chrname2tid_dict=None):
    """Fill this SV call from one tab-separated BEDPE record.

    Columns used (0-based): 0-1 -> chrm1, pos1; 3-4 -> chrm2, pos2;
    6-11 -> sv_type, sv_id, sv_size, score, filter, aux_info.
    Columns 2 and 5 are not read.

    Args:
        line: one text line of the BEDPE file.
        chrname2tid_dict: optional mapping from chromosome name to tid. When
            given, tid1 / tid2 are set for chromosome names found in it.

    The program exits (sys.exit) when the record has fewer than 12 columns.
    """
    n_required_columns = 12
    fields = line.strip().split(tab)
    # Column 11 (aux_info) is unpacked below, so 12 columns are required.
    # The former check for 10 columns let 10/11-column records crash with
    # a ValueError during unpacking instead of reporting the bad line.
    if len(fields) < n_required_columns:
        my_utils.myprint(
            'ERROR! number of columns is less than %d. The line is:' %
            n_required_columns)
        my_utils.myprint(tab.join(fields))
        sys.exit()

    self.chrm1, self.pos1 = fields[0:2]
    self.chrm2, self.pos2 = fields[3:5]
    self.pos1 = int(self.pos1)
    self.pos2 = int(self.pos2)

    (self.sv_type, self.sv_id, self.sv_size, self.score, self.filter,
     self.aux_info) = fields[6:12]
    self.sv_size = int(self.sv_size)
    self.score = float(self.score)

    if chrname2tid_dict is not None:
        if self.chrm1 in chrname2tid_dict:
            self.tid1 = chrname2tid_dict[self.chrm1]
        if self.chrm2 in chrname2tid_dict:
            self.tid2 = chrname2tid_dict[self.chrm2]

    # NOTE(review): the record was already split on tabs, so aux_info holds
    # no tab and this split yields a single element -- confirm whether a
    # different separator was intended.
    for aux in self.aux_info.split(tab):
        if aux == 'SVMETHOD=local_assembly':
            self.assembled = True
        if aux == 'PRECISE':
            self.is_precise = True
        if aux == 'IMPRECISE':
            self.is_precise = False
def get_bin_size_from_bcd13_file(bcd13_file):
    """Return the bin size of a bcd13 file.

    The bin size is the median of the positive differences between the
    positions (second column) of consecutive non-comment records. Reading
    stops once more than 1,000,000 positions were collected. The program
    exits with status 1 when no positive difference is found.
    """
    max_n_positions = 1000000
    positions = []

    in_fp = my_utils.gzopen(bcd13_file, 'r')
    while True:
        record = in_fp.readline()
        if not record:
            break
        if record[0] == '#':
            continue
        positions.append(int(record.strip().split(tab)[1]))
        if len(positions) > max_n_positions:
            break
    in_fp.close()

    # Only strictly increasing neighbours contribute an interval.
    gaps = [
        cur - prev for prev, cur in zip(positions, positions[1:])
        if cur - prev > 0
    ]
    if not gaps:
        my_utils.myprint('Failed to get bin size from file: %s' % bcd13_file)
        sys.exit(1)

    return int(np.median(gaps))
def detect_small_deletions(input_bam_file, out_dir, out_del_call_file, n_threads, ref_fasta_file, fermikit_dir, samtools, bedtools, in_weird_reads_file, weird_reads_cluster_file, call_small_deletions, cal_hap_read_depth_from_bcd21, bcd21_file, bcd22_file, hap_type_read_depth_file, gap_region_bed_file, rm_temp_files=1):
    """Call small deletions by local assembly and by discordant read pairs.

    Steps, in order: local assembly calling, clustering of weird reads,
    the external haplotype read depth command, the external small-deletion
    caller, and finally merging both call sets into out_del_call_file.
    The two intermediate BEDPE files are deleted when rm_temp_files is true.
    Exit statuses of the external commands are not checked.
    """
    # Fixed parameters of this step.
    window_size = int(2e5)
    max_depth = 500
    bin_size = 100
    mapq_cutoff = 20

    faidx_file = ref_fasta_file + '.fai'
    assembly_calls = os.path.join(out_dir, 'local_assembly.del.bedpe')
    read_pair_calls = os.path.join(out_dir,
                                   'discordant_read_pairs.del.bedpe')

    local_assembly.small_deletion_dection_by_local_assembly(
        samtools, bedtools, fermikit_dir, input_bam_file, ref_fasta_file,
        faidx_file, out_dir, assembly_calls, n_threads, window_size,
        max_depth, rm_temp_files)

    cluster_weird_reads.cluster_weird_reads(
        in_weird_reads_file, weird_reads_cluster_file, faidx_file)

    depth_cmd = '%s %s %s %s %d %d' % (
        cal_hap_read_depth_from_bcd21, bcd21_file, hap_type_read_depth_file,
        faidx_file, bin_size, mapq_cutoff)
    my_utils.myprint(depth_cmd)
    os.system(depth_cmd)

    call_cmd = '%s %s %s %s %s %s %s' % (
        call_small_deletions, hap_type_read_depth_file,
        weird_reads_cluster_file, bcd22_file, faidx_file,
        gap_region_bed_file, read_pair_calls)
    my_utils.myprint(call_cmd)
    os.system(call_cmd)

    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(faidx_file)
    merge_sv_calls(assembly_calls, read_pair_calls, out_del_call_file,
                   tid2chrname_list, chrname2tid_dict)

    if rm_temp_files:
        for temp_file in (assembly_calls, read_pair_calls):
            os.remove(temp_file)
    return
def filter_calls_2d(svcall_list, black_list_file, out_file, remove_chr_prefix=False):
    """Mark SV calls that fall into dense regions of the 2D blacklist.

    For every call whose filter is still '.', a box of 20000 bp is placed
    at each breakpoint (ending at the position for 'R_end', starting at the
    position for 'L_end'). The call gets ft = '2D_BLACKLIST' when at least
    20 blacklist points lie inside the box pair.

    out_file is accepted but not used here. Returns svcall_list (modified
    in place).
    """
    blacklist_by_chr_pair, bin_size = read_2d_blacklist_file(
        black_list_file, remove_chr_prefix)

    mean_fragment_length = 20000
    box_length = mean_fragment_length
    min_n_points = 20

    for svcall in svcall_list:
        if svcall.ft != '.':
            continue

        chr1, pos1 = svcall.chrm1, svcall.start1
        chr2, pos2 = svcall.chrm2, svcall.start2

        # NOTE(review): an end type other than 'R_end'/'L_end' leaves the
        # start untouched (value of a previous call, or unbound).
        if svcall.endtype1 == 'R_end':
            start1 = pos1 - box_length
        elif svcall.endtype1 == 'L_end':
            start1 = pos1

        if svcall.endtype2 == 'R_end':
            start2 = pos2 - box_length
        elif svcall.endtype2 == 'L_end':
            start2 = pos2

        end1 = start1 + box_length
        end2 = start2 + box_length

        forward_key = two_chr_to_key(chr1, chr2)
        reverse_key = two_chr_to_key(chr2, chr1)

        n_points = 0
        if forward_key in blacklist_by_chr_pair:
            pos1_list, pos2_list_list = blacklist_by_chr_pair[forward_key]
            n_points = get_number_of_points_from_black_list_file(
                start1, end1, start2, end2, pos1_list, pos2_list_list,
                bin_size)
        elif reverse_key in blacklist_by_chr_pair:
            # The blacklist stores this pair in the opposite order.
            my_utils.myprint('switch chr1 and chr2')
            pos1_list, pos2_list_list = blacklist_by_chr_pair[reverse_key]
            n_points = get_number_of_points_from_black_list_file(
                start2, end2, start1, end1, pos1_list, pos2_list_list,
                bin_size)

        if n_points >= min_n_points:
            svcall.ft = '2D_BLACKLIST'

    return svcall_list
def quantify_sv_candidates(args, dbo_args, endpoint_args):
    """Quantify SV candidates unless the result file can be reused.

    The step is skipped when args.run_from_begining is False and
    args.quantified_bk_pair_file already exists.
    """
    ## quantification ##
    task = 'quantifying SV candidates'

    reuse_existing = (
        args.run_from_begining == False
        and my_utils.check_file_exists(args.quantified_bk_pair_file) == True)
    if reuse_existing:
        my_utils.myprint('quantified SV file existed, skipped %s' % (task))
        return

    my_utils.myprint(task)
    quantify2bkcand.quantify2bkcand(args, dbo_args, endpoint_args)
    gc.collect()
def merge_sv_calls(args, dbo_args, endpoint_args):
    """Merge quantified SV candidates unless the merged file can be reused.

    The step is skipped when args.run_from_begining is False and
    args.merged_bedpe_file already exists.
    """
    ## merge calls ##
    task = 'merging SV candidates'

    reuse_existing = (
        args.run_from_begining == False
        and my_utils.check_file_exists(args.merged_bedpe_file) == True)
    if reuse_existing:
        my_utils.myprint('merged bedpe file existed, skipped %s' % (task))
        return

    my_utils.myprint(task)
    merge_quantified_calls.merge_quantified_calls(args, dbo_args,
                                                  endpoint_args)
    gc.collect()
def read_2d_blacklist_file(black_list_file, remove_chr_prefix):
    """Read a gzip-compressed 2D blacklist file.

    File layout as parsed here:
      * '#...=<int>'   -> bin size (text after the first '=')
      * '>chrA,chrB'   -> starts the section of one chromosome pair
      * 'pos1<TAB>p,p,...' -> one pos1 with its comma-separated pos2 values

    Args:
        black_list_file: path of the gzip file.
        remove_chr_prefix: when true, a leading 'chr' is removed from both
            chromosome names before the pair key is built.

    Returns:
        (black_list_2array_dict, bin_size), where the dict maps a pair key
        to (list of pos1 sorted ascending, list of pos2 lists in the same
        order).

    Raises:
        ValueError: if a data row appears before any '>' header.
    """

    def _strip_chr(name):
        # Only drop the prefix when it is really there; slicing blindly
        # would mangle names such as '1' or 'MT'.
        return name[3:] if name.startswith('chr') else name

    black_list_2d_dict = dict()
    bin_size = 0
    key = None

    # The context manager closes the file even if a row fails to parse.
    with gzip.open(black_list_file, 'rt') as black_list_fp:
        for line in black_list_fp:
            if line[0] == '#':
                bin_size = int(line.strip().split('=')[1])
                my_utils.myprint('bin_size = %d' % bin_size)
                continue

            if line[0] == '>':
                chr_list = line[1:].strip().split(',')
                chr1 = chr_list[0]
                chr2 = chr_list[1]
                if remove_chr_prefix:
                    chr1 = _strip_chr(chr1)
                    chr2 = _strip_chr(chr2)
                key = two_chr_to_key(chr1, chr2)
                black_list_2d_dict[key] = list()
                continue

            if key is None:
                raise ValueError(
                    'data row found before any chromosome pair header '
                    'in file: %s' % black_list_file)

            fields = line.strip().split(tab)
            pos1 = int(fields[0])
            pos2_list = [int(pos2) for pos2 in fields[1].split(',')]
            black_list_2d_dict[key].append(D2(pos1, pos2_list))

    # Convert every section into two parallel lists ordered by pos1.
    black_list_2array_dict = dict()
    for key, d2_list in black_list_2d_dict.items():
        d2_list.sort(key=lambda d2: d2.pos1)
        black_list_2array_dict[key] = (
            [d2.pos1 for d2 in d2_list],
            [d2.pos2_list for d2 in d2_list],
        )

    return black_list_2array_dict, bin_size
def plot_read_depth_for1region(chrom, tid, bk_pos1, bk_pos2, out_file, figure_title, wg_high_mapq_depth_list, wg_total_depth_list, chr_len_list, bin_size, wg_avg_depth):
    """Plot the read depth around one SV and save the figure to out_file.

    The window spans one SV length on each side of the two breakpoints,
    clipped to [0, chr_len_list[tid]]. Total depth is drawn in grey, the
    high-mapq depth in black, the breakpoints as red dashed vertical lines
    and wg_avg_depth as a blue dashed horizontal line.

    Args:
        chrom: chromosome name, used for the x-axis label.
        tid: index into the depth lists and chr_len_list.
        bk_pos1, bk_pos2: breakpoint positions (swapped if out of order).
        out_file: path of the image to write.
        figure_title: title of the figure.
        wg_high_mapq_depth_list, wg_total_depth_list: per-tid lists of
            per-bin depth values.
        chr_len_list: chromosome lengths indexed by tid.
        bin_size: width of one depth bin.
        wg_avg_depth: average depth drawn as the reference line.
    """
    if bk_pos2 < bk_pos1:
        bk_pos1, bk_pos2 = bk_pos2, bk_pos1

    sv_len = bk_pos2 - bk_pos1
    win_start = max(0, bk_pos1 - sv_len)
    win_end = min(chr_len_list[tid], bk_pos2 + sv_len)
    win_start_idx = int(win_start / bin_size)
    win_end_idx = int(win_end / bin_size) + 1

    y1 = wg_high_mapq_depth_list[tid][win_start_idx:win_end_idx]
    y2 = wg_total_depth_list[tid][win_start_idx:win_end_idx]

    # The depth lists may end before the window does; x must have exactly
    # as many points as y, otherwise plt.plot raises a ValueError.
    n_bins = min(len(y1), len(y2))
    if n_bins == 0:
        my_utils.myprint(
            'WARNING! no read depth data in the window, skipped figure: %s'
            % out_file)
        return
    y1 = y1[:n_bins]
    y2 = y2[:n_bins]
    x = [(win_start_idx + i) * bin_size for i in range(n_bins)]

    ymax = max(np.mean(y2) * 3, wg_avg_depth * 2)

    plt.figure(figsize=(10, 5))
    plt.title(figure_title)
    plt.xlabel('%s position' % chrom)
    plt.ylabel('Read depth')
    plt.plot(x, y2, '-', color='grey')
    plt.plot(x, y1, '-', color='black')
    plt.axis([win_start, win_end, 0, ymax])
    plt.axvline(x=bk_pos1, color='r', linestyle='--')
    plt.axvline(x=bk_pos2, color='r', linestyle='--')
    plt.axhline(y=wg_avg_depth, color='b', linestyle='--')
    plt.ticklabel_format(axis='both', style='plain')
    # A zero-length SV would give np.arange a step of 0; keep the default
    # ticks in that case.
    if sv_len > 0:
        plt.xticks(np.arange(min(x), max(x) + 1, sv_len))
    plt.rcParams.update({'font.size': 16})

    # Save before show(): on interactive backends show() can leave the
    # current figure empty, which would write a blank image.
    plt.savefig(out_file, dpi=200)
    plt.show()
    plt.close('all')

    my_utils.myprint('saved figure: %s' % out_file)
    return
def fermikit_variant_calling(fermikit_dir, samtools, n_threads_for_one_process, region_fasta_file, window_size, input_fastq_file, curr_out_dir, out_prefix):
    """Run the fermikit assembly and variant calling commands for one region.

    Four shell commands are logged and executed in order: bwa index of the
    region fasta, fermi2.pl unitig (writes the makefile), make on that
    makefile, and run-calling piped to sh. Exit statuses are not checked.
    The samtools argument is not used here.
    """

    def _log_and_run(shell_cmd):
        my_utils.myprint(shell_cmd)
        os.system(shell_cmd)

    mak_file = os.path.join(curr_out_dir, '%s.mak' % out_prefix)
    contigs_file = os.path.join(curr_out_dir, '%s.mag.gz' % out_prefix)

    _log_and_run('cd %s && %s/bwa index %s' %
                 (curr_out_dir, fermikit_dir, region_fasta_file))

    _log_and_run(
        'cd %s && perl %s/fermi2.pl unitig -s %s -l 151 -t %d -p %s %s > %s\n\n'
        % (curr_out_dir, fermikit_dir, window_size,
           n_threads_for_one_process, out_prefix, input_fastq_file,
           mak_file))

    _log_and_run('make -f %s\n\n' % mak_file)

    _log_and_run('cd %s && perl %s/run-calling -t %d %s %s | sh \n\n' %
                 (curr_out_dir, fermikit_dir, n_threads_for_one_process,
                  region_fasta_file, contigs_file))
    return
def plot_depth(cal_read_depth_from_bcd21_binary, bcd21_file, in_svcalls_list, faidx_file, chr_len_list, tid2chrname_list, chrname2tid_dict, out_dir, out_prefix):
    """Compute the read depth and plot it for every intra-chromosomal SV.

    The external binary writes a read depth file (bin size 500, mapq
    cutoff 20) into <out_dir>/read_depth, one PNG is produced per call
    whose two chromosomes are equal, and the depth file is removed
    afterwards. Nothing is plotted when the binary, the bcd21 file or the
    fasta index is missing, or when no depth bin could be read.
    """
    # The messages used to contain the invalid escape '\F'; a newline
    # before 'Failed' was clearly intended.
    required_files = (
        ('ERROR! The binary file doesn\'t exist:%s\nFailed to plot read depth',
         cal_read_depth_from_bcd21_binary),
        ('ERROR! The bcd21 file doesn\'t exist:%s\nFailed to plot read depth',
         bcd21_file),
        ('ERROR! The fasta index file doesn\'t exist:%s\nFailed to plot read depth',
         faidx_file),
    )
    for message, path in required_files:
        if not os.path.exists(path):
            my_utils.myprint(message % path)
            return

    out_dir = os.path.join(out_dir, 'read_depth')
    my_utils.make_dir(out_dir)

    bin_size = 500
    read_depth_file = os.path.join(out_dir,
                                   '%s.read_depth.txt' % out_prefix)
    cmd_args_list = [
        cal_read_depth_from_bcd21_binary, bcd21_file, read_depth_file,
        faidx_file, str(bin_size), '20'
    ]
    my_utils.myprint('calculating read depth from file: %s' % bcd21_file)
    subprocess.call(cmd_args_list)

    my_utils.myprint('plotting read depth')
    wg_high_mapq_depth_list, wg_total_depth_list, bin_size = (
        plot_read_depth.get_wg_depth_list(read_depth_file, chr_len_list))

    # Genome-wide average of the high-mapq depth over all bins.
    wg_n_bin = sum(len(depths) for depths in wg_high_mapq_depth_list)
    if wg_n_bin == 0:
        # Without any bin the average is undefined (division by zero).
        my_utils.myprint(
            'ERROR! no read depth data in file: %s\nFailed to plot read depth'
            % read_depth_file)
        if os.path.exists(read_depth_file):
            os.remove(read_depth_file)
        return
    wg_total_depth = sum(depth for depths in wg_high_mapq_depth_list
                         for depth in depths)
    wg_avg_depth = float(wg_total_depth) / wg_n_bin

    for svcall in in_svcalls_list:
        if svcall.chrm1 != svcall.chrm2:
            continue
        out_file = os.path.join(
            out_dir, '%s.%s.read_depth.png' % (out_prefix, svcall.sv_id))
        figure_title = 'Read depth (%s, %d bp %s)' % (
            svcall.sv_id, svcall.end2 - svcall.start1, svcall.svtype)
        plot_read_depth.plot_read_depth_for1region(
            svcall.chrm1, svcall.tid1, svcall.start1, svcall.end2, out_file,
            figure_title, wg_high_mapq_depth_list, wg_total_depth_list,
            chr_len_list, bin_size, wg_avg_depth)

    os.remove(read_depth_file)
    return
def get_wg_depth_list(in_depth_file, chr_len_list):
    """Load per-chromosome depth lists from a read depth file.

    The bin size is taken from the first non-comment record as
    column 2 minus column 1 and must be at least 10, otherwise the program
    exits. The file is then re-read from the start: column 0 is the tid,
    column 3 the high-mapq depth and column 4 the total depth.

    Returns:
        (wg_high_mapq_depth_list, wg_total_depth_list, bin_size); both
        lists hold one list of floats per entry of chr_len_list.
    """
    in_depth_fp = my_utils.gzopen(in_depth_file, 'r')

    # Pass 1: only the first data record is needed to derive the bin size.
    bin_size = 1
    while True:
        record = in_depth_fp.readline()
        if not record:
            break
        if record[0] == '#':
            continue
        fields = record.strip().split(tab)
        bin_size = int(fields[2]) - int(fields[1])
        if bin_size < 10:
            # NOTE(review): the message says '< 1' while the check is
            # '>= 10'; left as is, confirm which one is intended.
            my_utils.myprint('ERROR! bin_size < 1 in depth file: %s ' %
                             in_depth_file)
            sys.exit()
        break

    n_chr = len(chr_len_list)
    wg_high_mapq_depth_list = [list() for _ in range(n_chr)]
    wg_total_depth_list = [list() for _ in range(n_chr)]

    # Pass 2: collect the depth values of every record.
    in_depth_fp.seek(0, 0)
    while True:
        record = in_depth_fp.readline()
        if not record:
            break
        if record[0] == '#':
            continue
        fields = record.strip().split(tab)
        tid = int(fields[0])
        wg_high_mapq_depth_list[tid].append(float(fields[3]))
        wg_total_depth_list[tid].append(float(fields[4]))
    in_depth_fp.close()

    my_utils.myprint('finished reading file: %s' % in_depth_file)
    return wg_high_mapq_depth_list, wg_total_depth_list, bin_size
def plot_heatmap(in_svcall_list, bcd21_file, faidx_file, out_dir, flank_dist, chr_len_list, tid2chrname_list, chrname2tid_dict, cal_2d_overlapping_barcodes_binary, out_prefix):
    """Plot heat maps of overlapping barcodes around the given SV calls.

    Target regions are written to <out_dir>/2D_heatmap/target_region.bedpe,
    the external binary computes the 2D barcode overlap (bin size 1000)
    and the result is passed to plot_2d_barcodes. The step is skipped when
    the binary does not exist.
    """
    if not os.path.exists(cal_2d_overlapping_barcodes_binary):
        my_utils.myprint('ERROR! The binary file doesn\'t exist: %s' %
                         cal_2d_overlapping_barcodes_binary)
        my_utils.myprint('Skipped plotting the heat maps')
        return

    heatmap_dir = os.path.join(out_dir, '2D_heatmap')
    my_utils.make_dir(heatmap_dir)
    my_utils.myprint('plotting heat maps of overlapping barcodes')

    region_list = generate_target_region_bedpe_list(
        in_svcall_list, chr_len_list, flank_dist, chrname2tid_dict)

    region_file = os.path.join(heatmap_dir, 'target_region.bedpe')
    region_fp = my_utils.gzopen(region_file, 'w')
    for region in region_list:
        region_fp.write(region.output_svcall() + endl)
    region_fp.close()

    overlap_file = os.path.join(
        heatmap_dir, '%s.2d_heatmap.with_low_mapq_reads.txt' % out_prefix)

    bin_size = 1000
    max_ovl_num = 100
    subprocess.call([
        cal_2d_overlapping_barcodes_binary, bcd21_file, region_file,
        overlap_file, faidx_file, str(bin_size), '1'
    ])

    plot_2d_barcodes.plot_2d_overlapping_barcodes(
        overlap_file, region_list, bin_size, max_ovl_num, heatmap_dir,
        out_prefix)
    return
def plot_twin_window_barcode_similarity_for1region(chrom, tid, reg_start, reg_end, bk_pos1, bk_pos2, out_file, figure_title, wg_pvalue_list, bin_size):
    """Plot the twin-window -log10(P-value) track of one region.

    Args:
        chrom: chromosome name, used for the x-axis label.
        tid: index into wg_pvalue_list.
        reg_start, reg_end: region to plot.
        bk_pos1, bk_pos2: breakpoints, drawn as red dashed vertical lines.
        out_file: path of the image to write.
        figure_title: title of the figure.
        wg_pvalue_list: per-tid list of per-bin values.
        bin_size: width of one bin.

    Nothing is written when the region holds no bin with a value.
    """
    pvalue_list = wg_pvalue_list[tid]
    min_x_idx = int(reg_start / bin_size)
    max_x_idx = int(reg_end / bin_size) + 1

    # Stop at the end of the value list. Previously the x value of the
    # first missing bin was still appended, which left x one element longer
    # than y and made plt.plot fail.
    last_idx = min(max_x_idx, len(pvalue_list))
    idx_range = range(min_x_idx, last_idx)
    x_list = [idx * bin_size for idx in idx_range]
    y_list = [pvalue_list[idx] for idx in idx_range]

    if not x_list:
        my_utils.myprint('WARNING! no data to plot, skipped figure: %s' %
                         out_file)
        return

    xmin = min(x_list)
    xmax = max(x_list)
    x_range = xmax - xmin
    ymax = max(y_list)

    plt.figure(figsize=(10, 5))
    plt.title(figure_title)
    plt.xlabel('%s position' % chrom)
    plt.ylabel('-log10(P-value)')
    plt.plot(x_list, y_list, '-', color='black')
    plt.axis([xmin, xmax, 0, ymax])
    plt.axvline(x=bk_pos1, color='r', linestyle='--')
    plt.axvline(x=bk_pos2, color='r', linestyle='--')
    # With a single bin the tick step would be 0, which np.arange rejects.
    if x_range > 0:
        plt.xticks(np.arange(xmin, xmax + 1, x_range / 4))

    # Save before show(): on interactive backends show() can leave the
    # current figure empty, which would write a blank image.
    plt.savefig(out_file, dpi=200)
    plt.show()
    plt.close('all')

    my_utils.myprint('saved figure: %s' % out_file)
    return
def cluster_weird_reads(in_weird_reads_file, out_file, faidx_file):
    """Cluster discordant ("weird") read pairs and write the clusters.

    The reads are loaded with a minimum SV length of 1000, out_file is
    truncated, and clustering is delegated to cluster_weird_reads1type
    with max distance 300 and a support range of 2..1000 read pairs.
    """
    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(faidx_file)

    # Fixed clustering parameters.
    max_distance = 300
    min_n_short_read_supp = 2
    max_n_short_read_supp = 1000
    min_sv_length = 1000

    my_utils.myprint('reading file: %s' % in_weird_reads_file)
    read_support_list = read_weird_reads_file(
        in_weird_reads_file, chrname2tid_dict, min_sv_length)
    my_utils.myprint('finished reading file: %s' % in_weird_reads_file)

    # Start from an empty output file.
    with open(out_file, 'w') as out_fp:
        out_fp.write('')

    my_utils.myprint('clustering discordant reads')
    cluster_weird_reads1type(
        read_support_list, out_file, min_n_short_read_supp, max_distance,
        tid2chrname_list, chrname2tid_dict, max_n_short_read_supp)
    return
def size(self):
    """Return pos2 - pos1; exit the program if the chromosomes differ."""
    if self.chrm1 != self.chrm2:
        my_utils.myprint('ERROR! chrm1 != chrm2!')
        sys.exit()
    return self.pos2 - self.pos1
def key2(self):
    """Return tid2 * FIX_LENGTH + pos2; exit the program if tid2 < 0."""
    tid = self.tid2
    if tid < 0:
        my_utils.myprint('ERROR! tid2 < 0')
        sys.exit()
    offset = tid * my_utils.FIX_LENGTH
    return offset + self.pos2
def small_deletion_dection_by_local_assembly(samtools, bedtools, fermikit_dir, input_bam_file, ref_fasta_file, faidx_file, out_dir, out_del_call_file, n_threads, window_size, max_depth, rm_temp_files=1):
    """Detect deletions by local assembly, one worker process per thread.

    The reference is indexed with samtools if the index is missing (the
    program exits when it still does not exist). The genome is cut into
    windows overlapping by window_size / 10; n_threads processes run
    small_deletion_dection_from_interval_list, each with its own raw
    variant file. Those files are concatenated with `cat` and deletions
    are extracted into out_del_call_file. The raw files are deleted when
    rm_temp_files is true.
    """
    if not os.path.exists(faidx_file):
        index_cmd = '%s faidx %s' % (samtools, ref_fasta_file)
        my_utils.myprint(index_cmd)
        os.system(index_cmd)

    if not os.path.exists(faidx_file):
        my_utils.myprint(
            'ERROR! The index file of the reference fasta file does not exist!'
        )
        sys.exit()

    mkdir_cmd = 'mkdir -p %s' % out_dir
    my_utils.myprint(mkdir_cmd)
    os.system(mkdir_cmd)

    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(faidx_file)
    chr_len_list = my_utils.get_chr_length(faidx_file)
    overlap_length = int(window_size / 10)
    interval_list = generate_interval_list(
        chr_len_list, tid2chrname_list, chrname2tid_dict, window_size,
        overlap_length)

    # Start one worker per thread; worker i writes its own raw file.
    workers = []
    per_worker_files = []
    for worker_id in range(n_threads):
        worker_file = os.path.join(
            out_dir, 'assembly_raw_variants.%d.txt' % worker_id)
        per_worker_files.append(worker_file)
        worker = multiprocessing.Process(
            target=small_deletion_dection_from_interval_list,
            args=(worker_id, n_threads, samtools, bedtools, fermikit_dir,
                  input_bam_file, ref_fasta_file, out_dir, window_size,
                  max_depth, interval_list, worker_file))
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()

    combined_file = os.path.join(out_dir, 'local_assembly_raw_variants.txt')
    cat_cmd = 'cat ' + ''.join(' %s ' % path for path in per_worker_files)
    cat_cmd += ' > %s ' % combined_file
    my_utils.myprint(cat_cmd)
    os.system(cat_cmd)

    extract_del_from_vcf_file(combined_file, out_del_call_file)

    if rm_temp_files:
        for path in per_worker_files:
            os.remove(path)
        os.remove(combined_file)
    return
def extract_del_from_vcf_file(in_vcf_file, out_file):
    """Extract deletion calls from a VCF of region-local variants as BEDPE.

    The CHROM column must look like '<chrom>:<start>-<end>'; POS and the
    INFO END value are shifted by <start>. A record is reported as a
    deletion when REF is longer than 50 bp and more than 50 bp longer than
    ALT (size and end are then derived from the allele lengths), or when
    INFO carries SVTYPE=DEL (size from SVLEN, end from END). Records whose
    ALT contains '[' or ']' are skipped.

    Every output line has filter 'PASS', score 30, id '.' and the aux field
    'SVMETHOD=local_assembly'.

    Args:
        in_vcf_file: path of the input VCF (plain text).
        out_file: path of the BEDPE file to write.
    """
    min_del_size = 50
    n_required_columns = 8  # INFO (column 7) is read below

    # Both files are closed even if a record raises.
    with open(in_vcf_file, 'r') as in_vcf_fp, open(out_file, 'w') as out_fp:
        for raw_line in in_vcf_fp:
            line = raw_line.strip()
            # Blank lines are skipped; they used to end the parsing early
            # and silently drop all following records.
            if not line or line[0] == '#':
                continue

            items = line.split('\t')
            if len(items) < n_required_columns:
                my_utils.myprint('ERROR! invalid VCF record: %s' % line)
                continue

            try:
                pos1 = int(items[1])
            except ValueError:
                my_utils.myprint('ERROR! invalid VCF record: %s' % line)
                continue

            ref_allele = items[3]
            alt_allele = items[4]
            info = items[7]

            if '[' in alt_allele or ']' in alt_allele:
                continue

            # Split at the last ':' so that contig names which contain a
            # colon themselves are still parsed.
            ref_chr, ref_start_end = items[0].rsplit(':', 1)
            ref_start, ref_end = ref_start_end.split('-')
            ref_start = int(ref_start)
            chrom1 = ref_chr
            pos1 += ref_start

            sv_type = ''
            sv_size = 0
            pos2 = -1

            allele_len_diff = len(ref_allele) - len(alt_allele)
            if len(ref_allele) > min_del_size and allele_len_diff > min_del_size:
                sv_type = 'DEL'
                sv_size = allele_len_diff
                pos2 = pos1 + sv_size
            else:
                for ele in info.split(';'):
                    key = ele.split('=')[0]
                    if key == 'SVTYPE':
                        sv_type = ele.split('=')[1]
                    elif key == 'SVLEN':
                        sv_size = abs(int(ele.split('=')[1]))
                    elif key == 'END' and pos2 == -1:
                        pos2 = int(ele.split('=')[1]) + ref_start

            if sv_type != 'DEL':
                continue

            chrom2 = chrom1
            flt = 'PASS'
            score = 30
            sv_id = '.'
            out_item = '%s\t%d\t%d\t%s\t%d\t%d\t' % (
                chrom1, pos1, pos1 + 1, chrom2, pos2, pos2 + 1)
            out_item += '%s\t%s\t%d\t%d\t%s\tSVMETHOD=local_assembly\n' % (
                sv_type, sv_id, sv_size, score, flt)
            out_fp.write(out_item)
    return
def filter_low_mapq_gaps(input_sv_list, endpoint_args, chrname2tid_dict):
    """Filter SV calls that are supported mainly through low-mapq reads.

    Low-mapq reads of all support barcodes of unfiltered calls ('.') are
    loaded from endpoint_args.low_mapq_bcd21_file. For each unfiltered
    call, the support barcodes having at least one such read strictly
    inside one of the two regions of the call are counted. The call gets
    ft = 'LOW_MAPQ_BETWEEN_BK' unless the ratio of that count to
    num_fragment_support is below 0.2 and score * (1 - ratio) exceeds 20.

    Returns input_sv_list (modified in place); it is returned unchanged
    when the low-mapq file does not exist.
    """
    # Every support barcode of an unfiltered call gets an (empty) read list.
    reads_by_barcode = dict()
    for svcall in input_sv_list:
        if svcall.ft != '.':
            continue
        for bcd in svcall.support_barcodes.rstrip(',').split(','):
            reads_by_barcode[bcd] = list()

    low_mapq_file = endpoint_args.low_mapq_bcd21_file
    if not os.path.exists(low_mapq_file):
        my_utils.myprint('WARNING! low mapq bcd21 file does not exist.')
        return input_sv_list

    my_utils.myprint('reading low mapq bcd21 file: %s' % low_mapq_file)
    in_fp = my_utils.gzopen(low_mapq_file, 'rt')
    n_reads = 0
    while True:
        record = in_fp.readline()
        if not record:
            break
        if record[0] == '#':
            continue
        n_reads += 1
        read = Bcd21Core(record.strip().split(tab))
        if read.bcd in reads_by_barcode:
            reads_by_barcode[read.bcd].append(read)
        if n_reads % 10000000 == 0:
            my_utils.myprint('processed %d reads' % n_reads)
    in_fp.close()

    for reads in reads_by_barcode.values():
        reads.sort(key=lambda one_read: one_read.key_start())

    my_utils.myprint('finished reading low mapq bcd21 file: %s' %
                     low_mapq_file)

    # for deletion, the region is the deletion region, for other type of
    # svs, the region is 10 kb of either breakpoint
    region_size = 10 * 1000
    fix_length = my_utils.FIX_LENGTH

    for svcall in input_sv_list:
        if svcall.ft != '.':
            continue

        barcodes = svcall.support_barcodes.rstrip(',').split(',')
        tid1 = chrname2tid_dict[svcall.chrm1]
        tid2 = chrname2tid_dict[svcall.chrm2]
        bk_key1 = tid1 * fix_length + svcall.start1
        bk_key2 = tid2 * fix_length + svcall.start2

        if svcall.svtype == 'DEL':
            # Both regions are the interval between the two breakpoints.
            start1, end1 = bk_key1, bk_key2
            start2, end2 = start1, end1
        else:
            start1 = bk_key1 - region_size if svcall.endtype1 == 'L_end' else bk_key1
            start2 = bk_key2 - region_size if svcall.endtype2 == 'L_end' else bk_key2
            end1 = start1 + region_size
            end2 = start2 + region_size

        # A barcode counts once, as soon as one of its reads is inside.
        n_low_mapq_bcd = 0
        for bcd in barcodes:
            if bcd not in reads_by_barcode:
                continue
            if any((one_read.key_start() > start1 and one_read.key_end() < end1)
                   or (one_read.key_start() > start2 and one_read.key_end() < end2)
                   for one_read in reads_by_barcode[bcd]):
                n_low_mapq_bcd += 1

        n_supp_bcd = svcall.num_fragment_support
        ratio_low_mapq_bcd = float(n_low_mapq_bcd) / float(n_supp_bcd)
        passes = (ratio_low_mapq_bcd < 0.2
                  and svcall.score * (1 - ratio_low_mapq_bcd) > 20)
        if not passes:
            svcall.ft = 'LOW_MAPQ_BETWEEN_BK'

    return input_sv_list
def process1region(samtools, bedtools, fermikit_dir, ref_fasta_file, input_bam_file, out_dir, itv, region_id, window_size, max_depth, n_threads_for_one_process, out_combined_vcf_file):
    """Assemble and call variants in one interval, appending the calls.

    A working directory region_<id> is created under out_dir. The reads of
    the interval are extracted to BAM and FASTQ; the region is dropped
    when the FASTQ is larger than window_size * max_depth * 2 bytes or
    smaller than 20000 bytes. Otherwise the reference sequence is
    extracted, fermikit is run, and the resulting indel and SV VCFs are
    appended to out_combined_vcf_file. The working directory is removed at
    the end. Exit statuses of the shell commands are not checked.
    """

    def _log_and_run(shell_cmd):
        my_utils.myprint(shell_cmd)
        os.system(shell_cmd)

    region_name = 'region_%06d' % (region_id)
    curr_out_dir = os.path.join(out_dir, region_name)
    out_bam_file = os.path.join(curr_out_dir, region_name + '.bam')
    out_all_fastq_file = os.path.join(curr_out_dir,
                                      region_name + '.all.fastq')
    region_bed_file = os.path.join(curr_out_dir, region_name + '.bed')
    region_fasta_file = os.path.join(curr_out_dir, region_name + '.fasta')
    interval = '%s:%d-%d' % (itv.chrom, itv.start_pos + 1, itv.end_pos)

    # Create the working directory; retry once before giving up.
    mkdir_cmd = 'mkdir -p %s' % curr_out_dir
    _log_and_run(mkdir_cmd)
    time.sleep(0.05)
    if not os.path.exists(curr_out_dir):
        os.system(mkdir_cmd)
        time.sleep(1)
    if not os.path.exists(curr_out_dir):
        my_utils.myprint('Failed to creat directory: %s' % curr_out_dir)
        _log_and_run('rm -rf %s' % curr_out_dir)
        return

    _log_and_run(
        extract_bam_region(samtools, input_bam_file, interval, out_bam_file,
                           n_threads_for_one_process))
    _log_and_run(index_bam(samtools, out_bam_file))
    _log_and_run(bam_to_1fastq(samtools, out_bam_file, out_all_fastq_file))

    # Skip regions with too much or too little read data.
    fastq_file_size = os.path.getsize(out_all_fastq_file)
    too_large = fastq_file_size > window_size * max_depth * 2
    if too_large or fastq_file_size < 20000:
        _log_and_run('rm -r %s' % curr_out_dir)
        return

    with open(region_bed_file, 'w') as region_bed_fp:
        region_bed_fp.write('%s\t%d\t%d\n' %
                            (itv.chrom, itv.start_pos, itv.end_pos))

    _log_and_run(
        extract_ref_region(bedtools, ref_fasta_file, region_bed_file,
                           region_fasta_file))

    out_prefix = os.path.join(curr_out_dir, region_name + '.all_hap')
    fermikit_variant_calling(fermikit_dir, samtools,
                             n_threads_for_one_process, region_fasta_file,
                             window_size, out_all_fastq_file, curr_out_dir,
                             out_prefix)

    indel_call_file = out_prefix + '.flt.vcf'
    sv_call_file = out_prefix + '.sv.vcf'

    # These last commands are run without being logged.
    os.system('gunzip --force %s.gz' % indel_call_file)
    os.system('gunzip --force %s.gz' % sv_call_file)
    os.system('cat %s %s >> %s' %
              (indel_call_file, sv_call_file, out_combined_vcf_file))
    os.system('rm -r %s' % curr_out_dir)
    return
def filter_calls(args, dbo_args, endpoint_args):
    """Run all SV filters and write the final BEDPE file.

    The merged calls are read from args.merged_bedpe_file, every call
    starts with ft = '.', and the filters are applied in this order:
    1D blacklist / gaps, low-mapq reads, 2D blacklist, DBO score, read
    depth, SV length. Calls still at '.' become 'PASS' and receive a
    zero-padded id 'ID<n>'. All calls, passing or not, are written to
    args.filter_bedpe_file below a header line.
    """

    def _n_retained(sv_list):
        # Calls that no filter has flagged so far.
        return sum(1 for svcall in sv_list if svcall.ft == '.')

    my_utils.myprint('filtering SV calls')

    bin_size = 100
    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(
        args.faidx_file)
    alt_chr_name_set = my_utils.read_alternative_contig_file(
        args.alt_ctg_file)

    # Missing annotation files are only reported for these references.
    is_known_ref = args.ref_version in ('hg19', 'hg38', 'b37')

    if os.path.exists(args.black_region_bed_file):
        my_utils.myprint('reading black region bed file: %s' %
                         args.black_region_bed_file)
        black_reg_dict = read_black_reg_bed_file(args.black_region_bed_file,
                                                 bin_size)
    else:
        if is_known_ref:
            # The message used the undefined name black_region_bed_file,
            # which raised a NameError instead of reporting the problem.
            my_utils.myprint('ERROR! black list file is missing: %s' %
                             args.black_region_bed_file)
        black_reg_dict = dict()

    if os.path.exists(args.gap_region_bed_file):
        my_utils.myprint('reading gap region bed file: %s' %
                         args.gap_region_bed_file)
        gap_left_region_dict, gap_right_region_dict = read_gap_region_file(
            args.gap_region_bed_file, bin_size)
    else:
        if is_known_ref:
            my_utils.myprint('ERROR! gap region file is missing: %s' %
                             args.gap_region_bed_file)
        gap_left_region_dict = dict()
        gap_right_region_dict = dict()

    raw_svcall_list = my_utils.read_object_file(args.merged_bedpe_file,
                                                bedpe.QuantifiedBKCandCore)
    for svcall in raw_svcall_list:
        svcall.ft = '.'

    round1_retained_sv_list = filter_1d_blacklist(
        raw_svcall_list, black_reg_dict, alt_chr_name_set,
        gap_left_region_dict, gap_right_region_dict, bin_size)
    my_utils.myprint('number of retained SVs: %d' %
                     _n_retained(round1_retained_sv_list))

    round2_retained_sv_list = filter_low_mapq_gaps(
        round1_retained_sv_list, endpoint_args, chrname2tid_dict)

    remove_chr_prefix = (args.ref_version == 'b37')
    round3_retained_sv_list = filter_calls_2d(
        round2_retained_sv_list, args.black_region_2d_file,
        args.filter_bedpe_file, remove_chr_prefix)

    round4_retained_sv_list = filter_dbo_score(round3_retained_sv_list, args)
    round5_retained_sv_list = filter_read_depth(round4_retained_sv_list,
                                                args)
    round6_retained_sv_list = filter_sv_length(round5_retained_sv_list, args)

    final_retained_sv_list = round6_retained_sv_list
    my_utils.myprint('number of retained SVs: %d' %
                     _n_retained(final_retained_sv_list))

    header = '#chrom1\tstart1\tstop1\tchrom2\tstart2\tstop2\t'
    header += 'sv_type\tsv_id\tsv_length\tqual_score\tfilter\tinfo\n'

    n_svcall = len(final_retained_sv_list)
    # log10(0) is undefined; with no calls the id width is never used.
    n_digit = int(math.log10(n_svcall) + 2) if n_svcall > 0 else 1

    sv_id = 0
    # The context manager closes the file even if writing a call fails.
    with open(args.filter_bedpe_file, 'w') as out_fp:
        out_fp.write(header)
        for svcall in final_retained_sv_list:
            if svcall.ft == '.':
                svcall.ft = 'PASS'
                sv_id += 1
                svcall.sv_id = 'ID%s' % str(sv_id).zfill(n_digit)
            out_fp.write(svcall.output_core2() + endl)
    return
def _run_dbo_command(task, cmd_args_list):
    """Log and run one external command (its exit status is not checked)."""
    my_utils.myprint(task)
    my_utils.myprint('running command: %s' % (' '.join(cmd_args_list)))
    subprocess.call(cmd_args_list)
    my_utils.myprint('finished %s' % task)


def detect_decreased_barcode_overlap(args, dbo_args, endpoint_args):
    """Run the four steps that detect decreased barcode overlap.

    Steps: read depth, twin-window barcode counts (bcd11), centroids
    (bcd12) and expected overlap / p-values (bcd13). A step is skipped
    when args.run_from_begining is False and its output file already
    exists. The twin window size is 10000 for WGS data and 40000
    otherwise.
    """
    win_size = 10000 if args.is_wgs else 40000

    def _can_skip(out_file):
        return (args.run_from_begining == False
                and my_utils.check_file_exists(out_file) == True)

    ### calculating read depth | output file: args.read_depth_file
    task = 'calculating read depth'
    if _can_skip(args.read_depth_file):
        my_utils.myprint('read depth file existed, skipped %s' % task)
    else:
        _run_dbo_command(task, [
            args.cal_read_depth_from_bcd21, endpoint_args.bcd21_file,
            args.read_depth_file, args.faidx_file,
            str(dbo_args.bin_size),
            str(args.min_mapq)
        ])

    ### counting overlapping barcodes | output files dbo_args.bcd11_file
    task = 'counting overlapping barcodes between twin windows'
    if _can_skip(dbo_args.bcd11_file):
        my_utils.myprint('bcd11 files existed, skipped %s' % task)
    else:
        _run_dbo_command(task, [
            args.cal_twin_win_bcd_cnt, endpoint_args.bcd21_file,
            dbo_args.bcd11_file, args.faidx_file,
            str(dbo_args.bin_size),
            str(win_size),
            str(args.min_mapq)
        ])

    ### calculating centroid | output file: dbo_args.bcd12_file
    task = 'calculating centroid'
    if _can_skip(dbo_args.bcd12_file):
        my_utils.myprint('bcd12 files existed, skipped %s' % task)
    else:
        _run_dbo_command(task, [
            args.cal_centroid_from_read_depth, args.read_depth_file,
            dbo_args.bcd11_file, dbo_args.bcd12_file, args.faidx_file
        ])

    ### calculating expected overlap | output file: dbo_args.bcd13_file
    task = 'calculating barcode similarity and p-value'
    if _can_skip(dbo_args.bcd13_file):
        # This message named bcd12 (copy-paste) although the bcd13 file is
        # the one being checked.
        my_utils.myprint('bcd13 file existed, skipped %s' % task)
    else:
        my_utils.myprint(task)
        is_wgs = 1 if args.is_wgs else 0
        cal_expected_overlap_value.cal_expected_overlap_bcd_cnt(
            dbo_args.bcd12_file, dbo_args.bcd13_file, is_wgs)
        my_utils.myprint('finished %s' % task)
    return
def cluster_one_region(short_read_support_list, coord_list, out_file, min_n_short_read_supp, max_distance, tid2chrname_list, chrname2tid_dict, max_n_short_read_supp):
    """Cluster discordant read pairs and write one deletion per cluster.

    Two read pairs are connected when both their key1 and their key2
    differ by less than max_distance. Candidates come from a KD-tree ball
    query with radius max_distance * 1.415; read pairs with more than
    max_n_short_read_supp candidates are not used as a query point.
    Connected components holding between min_n_short_read_supp and
    max_n_short_read_supp read pairs are written to out_file (opened in
    'w' mode), labelled 'PASS' from 5 read pairs on and 'LowQual' below.

    Nothing is written, and out_file is not touched, when coord_list is
    empty.
    """
    if len(coord_list) < 1:
        return

    distance_buffer = max_distance * 1.415
    tree = cKDTree(coord_list, leafsize=10000)

    edge_list = list()
    for i, read_support in enumerate(short_read_support_list):
        if i > 0 and i % 100000 == 0:
            my_utils.myprint('finished searching for %d weird reads' % i)
        node1 = (read_support.key1(), read_support.key2())
        index_list = tree.query_ball_point(node1, distance_buffer)
        if len(index_list) > max_n_short_read_supp:
            continue
        for j in index_list:
            if i == j:
                continue
            node2 = (short_read_support_list[j].key1(),
                     short_read_support_list[j].key2())
            # The ball query is only a pre-filter; require closeness in
            # both coordinates separately.
            if (abs(node1[0] - node2[0]) < max_distance
                    and abs(node1[1] - node2[1]) < max_distance):
                edge_list.append((i, j))

    row = [edge[0] for edge in edge_list]
    col = [edge[1] for edge in edge_list]
    data = [1] * len(edge_list)

    n_node = len(short_read_support_list)
    my_utils.myprint('get connected components')
    n_components, label_list, component_node_index_db = (
        get_connected_components(n_node, row, col, data, False, 'weak'))

    node_cluster_list = [
        [short_read_support_list[index]
         for index in component_node_index_db[i]]
        for i in range(n_components)
    ]

    my_utils.myprint('output clusters of weird reads')
    # The file was previously opened and never closed; the context manager
    # makes sure the records are flushed and the handle is released.
    with open(out_file, 'w') as out_fp:
        for node_cluster in node_cluster_list:
            num_pe_supp = len(node_cluster)
            if num_pe_supp < min_n_short_read_supp:
                continue
            if num_pe_supp > max_n_short_read_supp:
                continue

            mean_start_pos = mean_end_pos = 0
            hap_type_cnt = [0] * 3
            output_info_string = 'SVTYPE=DEL'
            for short_read_support in node_cluster:
                output_info_string += ';' + short_read_support.output_info()
                mean_start_pos += short_read_support.pos1()
                mean_end_pos += short_read_support.pos2()
                hap_type_cnt[short_read_support.hap_type] += 1

            # Mean positions, rounded to the nearest integer.
            mean_start_pos = int(0.5 + (float(mean_start_pos)) / num_pe_supp)
            mean_end_pos = int(0.5 + (float(mean_end_pos)) / num_pe_supp)

            tid = node_cluster[0].tid1
            chrom = tid2chrname_list[tid]
            flt = 'PASS' if num_pe_supp >= 5 else 'LowQual'
            sv_size = mean_end_pos - mean_start_pos
            sv_type = 'DEL'

            out_fp.write(
                '%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%s\n' %
                (chrom, mean_start_pos, mean_start_pos + 1, chrom,
                 mean_end_pos, mean_end_pos + 1, sv_type, flt, sv_size,
                 num_pe_supp, hap_type_cnt[0], hap_type_cnt[1],
                 hap_type_cnt[2], output_info_string))

    del edge_list, row, col, data, component_node_index_db, label_list, node_cluster_list
    gc.collect()
    return
def extract_barcode_from_bam(args, endpoint_args):
    """Sort the BAM by barcode and derive the bcd21 / low-mapq bcd21 files.

    Each of the three steps is skipped when `args.run_from_begining` is
    False and the step's output file(s) already exist.  The external
    commands are run through the shell with os.system.  When
    `args.rm_temp_files` is set and the bcd21 file exists, the
    barcode-sorted BAM is removed at the end.
    """
    ## step 1: sort bam by barcode ##
    sort_cmd = '%s %s | %s sort -l 1 -m 1G -@ %d -t BX -o %s -' % (
        args.output_bam_coreinfo, args.bam, args.samtools, args.n_thread,
        args.sortbx_bam)
    have_sorted_bam = (args.run_from_begining == False
                       ) and my_utils.check_file_exists(args.sortbx_bam)
    if have_sorted_bam:
        my_utils.myprint('File: %s existed, skipped sorting bam by barcode' %
                         args.sortbx_bam)
    else:
        my_utils.myprint('sorting bam file by barcode')
        my_utils.myprint('running command: %s' % sort_cmd)
        os.system(sort_cmd)

    ## step 2: extract barcode info ##
    # Leave one thread for the extractor itself, but always give the
    # compressor at least one.
    n_compress_threads = max(1, args.n_thread - 1)
    extract_cmd = '%s %s __STDOUT__ %s | %s --fast --processes %d - > %s' % (
        args.extract_barcode, args.sortbx_bam, args.stat_file, args.pigz,
        n_compress_threads, endpoint_args.bcd21_file)
    have_bcd21 = (args.run_from_begining == False
                  and my_utils.check_file_exists(args.stat_file)
                  and my_utils.check_file_exists(endpoint_args.bcd21_file))
    if have_bcd21:
        my_utils.myprint(
            'File: %s existed, skipped extracting barcode from bam' %
            endpoint_args.bcd21_file)
    else:
        my_utils.myprint('extracting barcode info from bam file')
        my_utils.myprint('running command: %s' % extract_cmd)
        os.system(extract_cmd)

    ## step 3: extract low mapq records ##
    task = 'extracting low mapq bcd21'
    have_low_mapq = (args.run_from_begining == False
                     and my_utils.check_file_exists(
                         endpoint_args.low_mapq_bcd21_file) == True)
    if have_low_mapq:
        my_utils.myprint('%s existed, skipped %s' %
                         (endpoint_args.low_mapq_bcd21_file, task))
    else:
        my_utils.myprint(task)
        get_low_mapq_bcd21_file(endpoint_args.bcd21_file,
                                endpoint_args.low_mapq_bcd21_file,
                                args.min_mapq)

    ## clean up the intermediate barcode-sorted bam ##
    if (args.rm_temp_files
            and my_utils.check_file_exists(endpoint_args.bcd21_file)
            and os.path.exists(args.sortbx_bam)):
        os.remove(args.sortbx_bam)

    return
def detect_increased_fragment_ends(args, dbo_args, endpoint_args):
    """Run the read-clustering, coverage, distribution and breakpoint steps.

    Steps, in order:
      1. cluster reads (writes the bcd22 file) via the external
         `args.cluster_reads` program;
      2. search for extremely high coverage regions;
      3. estimate global distribution parameters (only when
         `args.global_distribution_calculated` is False) and write the
         arguments to file;
      4. search for paired breakpoints.

    Steps 1, 2 and 4 are skipped when `args.run_from_begining` is False and
    their output file already exists.  gc.collect() is called after every
    step.
    """
    gc.enable()

    ### clustering reads | output file: bcd22 file
    task = 'clustering reads'
    is_wgs = 1 if args.is_wgs else 0

    bcd22_ready = (args.run_from_begining == False
                   and my_utils.check_file_exists(endpoint_args.bcd22_file))
    if bcd22_ready:
        my_utils.myprint('bcd22 file existed, skipped %s' % (task))
    else:
        my_utils.myprint(task)
        cmd = '%s %s %s %s %d %d %d %d' % (
            args.cluster_reads, endpoint_args.bcd21_file,
            endpoint_args.bcd22_file, args.weird_reads_file, is_wgs,
            args.user_defined_min_reads_in_fragment, args.min_mapq,
            args.n_thread)
        my_utils.myprint(cmd)
        os.system(cmd)
    gc.collect()

    ### searching for extremely high coverage region
    task = 'searching for extremely high coverage region'
    coverage_ready = (args.run_from_begining == False
                      and my_utils.check_file_exists(
                          endpoint_args.barcode_cov_file))
    if coverage_ready:
        my_utils.myprint('high coverage region file existed, skipped %s' %
                         (task))
    else:
        my_utils.myprint(task)
        get_high_coverage_regions.get_high_coverage_regions(
            args, dbo_args, endpoint_args)
    gc.collect()

    ### estimating distribution parameters
    if args.global_distribution_calculated == False:
        global_distribution.estimate_global_distribution(
            args, dbo_args, endpoint_args, endpoint_args.bcd22_file)
    arguments.output_arguments2file(args, dbo_args, endpoint_args)
    gc.collect()

    ## find paired breakpoints ##
    task = 'searching for paired breakpoints'
    bk_pairs_ready = (args.run_from_begining == False
                      and my_utils.check_file_exists(
                          args.bk_cand_pair_file) == True)
    if bk_pairs_ready:
        my_utils.myprint('paired breakpoint file existed, skipped %s' %
                         (task))
    else:
        my_utils.myprint(task)
        find_paired_bk.find_paired_bk(args, dbo_args, endpoint_args)
    gc.collect()

    return
def plot_one_bedpe(out_dir, target_region_bedpe_list, out_prefix,
                   region_title, ovl_2d_array, xmin, xmax, ymin, ymax,
                   bin_size):
    """Draw a heat map of overlapping barcodes for one region pair.

    Parameters:
        out_dir: directory in which the PNG is written.
        target_region_bedpe_list: passed to find_sv_id() to look up the
            SV id and type of the region pair.
        out_prefix: prefix of the output file name.
        region_title: string of the form 'chrX:start-end; chrY:start-end'.
        ovl_2d_array: 2D array of overlap counts; it is transposed before
            plotting (presumably a numpy array indexed [x][y] -- TODO
            confirm against the caller).
        xmin, xmax, ymin, ymax: genomic range covered by each axis.
        bin_size: genomic width of one heat-map cell.

    The figure is saved to '<out_dir>/<out_prefix>.<sv_id>.heatmap.png'.
    Returns None.
    """
    tp_ovl_2d_array = ovl_2d_array.transpose()

    # Parse 'chrX:start-end; chrY:start-end'.
    xlab, ylab = region_title.split(';')
    xlab = xlab.strip()
    ylab = ylab.strip()
    xchr, x_start_end = xlab.split(':')
    ychr, y_start_end = ylab.split(':')
    xstart, xend = x_start_end.split('-')
    ystart, yend = y_start_end.split('-')

    sv_id, svtype = find_sv_id(xchr, xstart, xend, ychr, ystart, yend,
                               target_region_bedpe_list)
    if sv_id == 'UNK' or svtype == 'UNK':
        # Fall back to an id built from the coordinates.
        sv_id = '%s_%s_%s.%s_%s_%s' % (xchr, xstart, xend, ychr, ystart, yend)
        svtype = 'unknown_sv_type'

    xsize = int((xmax - xmin) / bin_size)
    ysize = int((ymax - ymin) / bin_size)

    # Relabel bin indices with the genomic coordinate of each bin start.
    xticks_dict = dict((i, xmin + i * bin_size) for i in range(0, xsize))
    yticks_dict = dict((i, ymin + i * bin_size) for i in range(0, ysize))
    pd_ovl_2d_array = pd.DataFrame(tp_ovl_2d_array).rename(
        columns=xticks_dict, index=yticks_dict)

    # Custom 1000-entry colormap: the first half fades from white to the
    # midpoint colour of 'brg_r', the second half from that colour to black.
    # plt.get_cmap is used instead of matplotlib.cm.get_cmap because the
    # latter was removed from recent matplotlib releases.
    base_cmap = plt.get_cmap('brg_r', 1000)
    base_colors = base_cmap(np.linspace(0, 1, 1000))
    r50, g50, b50, a50 = base_colors[500]
    my_colors1 = np.array([
        np.linspace(1.0, r50, 500),
        np.linspace(1.0, g50, 500),
        np.linspace(1.0, b50, 500),
        np.linspace(1.0, a50, 500),
    ]).transpose()
    my_colors2 = np.array([
        np.linspace(r50, 0.0, 500),
        np.linspace(g50, 0.0, 500),
        np.linspace(b50, 0.0, 500),
        np.linspace(a50, 1.0, 500),
    ]).transpose()
    my_cmap = ListedColormap(np.vstack((my_colors1, my_colors2)))

    out_file = os.path.join(out_dir, '%s.%s.heatmap.png' % (out_prefix, sv_id))

    plt.figure(figsize=(10, 10))
    ax = sns.heatmap(pd_ovl_2d_array,
                     cmap=my_cmap,
                     square=True,
                     xticklabels=int(xsize / 10),
                     yticklabels=int(ysize / 10))
    ax.invert_yaxis()

    # Black frame around the heat map.
    ax.axhline(y=0, color='k', linewidth=2)
    ax.axhline(y=ysize, color='k', linewidth=2)
    ax.axvline(x=0, color='k', linewidth=2)
    ax.axvline(x=xsize, color='k', linewidth=2)

    plt.axis([0, xsize, 0, ysize])
    plt.xlabel(xchr)
    plt.ylabel(ychr)
    plt.xticks(rotation='vertical')
    plt.yticks(rotation='horizontal')
    plt.title('Number of overlapping barcodes (%s, %s)' % (sv_id, svtype))
    # NOTE(review): this runs after the text above has been created, so it
    # probably only affects figures drawn later -- confirm intent.
    plt.rcParams.update({'font.size': 12})

    # Save BEFORE show(): with an interactive backend show() may block and
    # release the figure, so a later savefig() can write an empty image.
    plt.savefig(out_file, dpi=200)
    plt.show()
    plt.close('all')
    my_utils.myprint('saved figure: %s' % out_file)
    return
def check_arguments(args):
    """Validate input paths in `args` and set the black-list file paths.

    Checks, in order: the input BAM exists; the output directory exists or
    can be created; the reference FASTA exists; the FASTA index exists or
    can be generated with `samtools faidx`; the extract_barcode program
    exists; and, when `args.is_wgs` is False, the target region BED exists.
    Any failed check prints an error and exits with status 1.

    When `args.ref_version` is 'hg19', 'b37' or 'hg38', the attributes
    gap_region_bed_file, black_region_bed_file, black_region_2d_file and
    low_mapq_region_bed_file are set to files under
    '<args.root_dir>/black_lists/'.  Other versions leave them untouched.

    Returns None.
    """
    if not os.path.exists(args.input_bam):
        # Report the path that was actually checked.
        my_utils.myprint("ERROR! input bam file (%s) does not exist!" %
                         (args.input_bam))
        sys.exit(1)

    if not os.path.exists(args.out_dir):
        # os.makedirs avoids the shell, so paths with spaces or shell
        # metacharacters work.  Failure is reported by the check below.
        try:
            os.makedirs(args.out_dir)
        except OSError:
            pass
        if not os.path.exists(args.out_dir):
            my_utils.myprint("ERROR! can not create output directory: %s" %
                             (args.out_dir))
            sys.exit(1)

    if not os.path.exists(args.ref_fa):
        my_utils.myprint(
            "ERROR! reference FASTA file (%s) does not exist!" %
            (args.ref_fa))
        sys.exit(1)

    if not os.path.exists(args.faidx_file):
        my_utils.myprint(
            "Index file of reference FASTA file does not exist, creating index using samtools ..."
        )
        cmd = args.samtools + ' faidx ' + args.ref_fa
        os.system(cmd)
        if not os.path.exists(args.faidx_file):
            my_utils.myprint("ERROR! Cannot generate reference index file!")
            sys.exit(1)

    if not os.path.exists(args.extract_barcode):
        my_utils.myprint("ERROR! extract_barcode does not exist!")
        sys.exit(1)

    if args.is_wgs == False and not os.path.exists(args.target_region_bed):
        my_utils.myprint(
            "ERROR! target region bed is required if --targeted is specified. "
            "If you don't have this file, please specify --wgs instead")
        sys.exit(1)

    # All supported reference versions share one file naming scheme.
    if args.ref_version in ('hg19', 'b37', 'hg38'):
        version = args.ref_version
        args.gap_region_bed_file = os.path.join(
            args.root_dir, 'black_lists/%s_gap.bed' % version)
        args.black_region_bed_file = os.path.join(
            args.root_dir, 'black_lists/%s_black_list.bed' % version)
        args.black_region_2d_file = os.path.join(
            args.root_dir, 'black_lists/%s.2D.blacklist.gz' % version)
        args.low_mapq_region_bed_file = os.path.join(
            args.root_dir, 'black_lists/%s_low_mapq_regions.bed' % version)

    return
def detect_sv_from_short_reads(args, dbo_args, endpoint_args):
    """Call SVs from short reads and merge them into one call file.

    Combines local assembly, discordant read pairs and large CNV.  Steps:
      1. compute haplotype read depth (only if the depth file is missing
         or empty);
      2. run CNV detection;
      3. cluster weird reads and call small deletions from paired-end
         reads;
      4. call small deletions by local assembly;
      5. merge both deletion call files into
         `args.short_reads_sv_call_file` and delete the two intermediates.

    External programs are run through the shell with os.system.
    """
    # Fixed parameters used by the steps below.
    bin_size = 100
    mapq_cutoff = 20
    window_size = int(2e5)
    max_depth = 500
    rm_temp_files = 1

    out_short_read_sv_file = args.short_reads_sv_call_file
    depth_file = args.hap_type_read_depth_file

    ### calculate read depth ###
    if not os.path.exists(depth_file) or os.path.getsize(depth_file) == 0:
        cmd = '%s %s %s %s %d %d' % (args.cal_hap_read_depth_from_bcd21,
                                     endpoint_args.bcd21_file,
                                     args.hap_type_read_depth_file,
                                     args.faidx_file, bin_size, mapq_cutoff)
        my_utils.myprint('running command:' + cmd)
        os.system(cmd)

    ### CNV detection ###
    cmd = '%s %s %s %s %s %d %d %d' % (
        args.cnv_detection, args.hap_type_read_depth_file, args.faidx_file,
        args.gap_region_bed_file, args.cnv_call_file, 40, 200, 500000)
    my_utils.myprint('running command:' + cmd)
    os.system(cmd)

    ### small deletion detection from paired-end reads ###
    local_assembly_out_file = os.path.join(args.out_dir,
                                           'local_assembly.del.bedpe')
    short_reads_del_call_file = os.path.join(
        args.out_dir, 'discordant_read_pairs.del.bedpe')

    cluster_weird_reads.cluster_weird_reads(args.weird_reads_file,
                                            args.weird_reads_cluster_file,
                                            args.faidx_file)

    cmd = '%s %s %s %s %s %s %s' % (
        args.small_deletion_detection, args.hap_type_read_depth_file,
        args.weird_reads_cluster_file, endpoint_args.bcd22_file,
        args.faidx_file, args.gap_region_bed_file, short_reads_del_call_file)
    my_utils.myprint('running command:' + cmd)
    os.system(cmd)

    ### small deletion detection by local assembly ###
    local_assembly.small_deletion_dection_by_local_assembly(
        args.samtools, args.bedtools, args.fermikit_dir, args.input_bam,
        args.ref_fa, args.faidx_file, args.out_dir, local_assembly_out_file,
        args.n_thread, window_size, max_depth, rm_temp_files)

    ### merge call files ###
    merge_sv_calls(local_assembly_out_file, short_reads_del_call_file,
                   out_short_read_sv_file, args.tid2chrname,
                   args.chrname2tid)

    # The two intermediate call files are always removed after merging.
    if rm_temp_files:
        for temp_file in (local_assembly_out_file, short_reads_del_call_file):
            os.remove(temp_file)

    return
def init_from_two_lines(self, line1, line2):
    """Fill this object from the two tab-separated lines of one read pair.

    Each line must have at least 13 columns; otherwise an error is printed
    and the method returns without setting any attribute.  Both lines must
    share the same read id (column 7) and tid (column 1); a mismatch prints
    an error and terminates the program via sys.exit().

    The line with the smaller start position (column 2) becomes mate 1.
    Attributes set: tid1/start1/end1/mapq1, tid2/start2/end2/mapq2, flag1,
    flag2 (all int), read_id, bcd, and hap_type (int).  read_id, bcd and
    hap_type are taken from mate 1.
    """
    mate1 = line1.strip().split(tab)
    mate2 = line2.strip().split(tab)

    for fields in (mate1, mate2):
        if len(fields) < 13:
            my_utils.myprint('ERROR! This line is less than 13 coloumns: %s' %
                             tab.join(fields))
            return

    # Column 6 holds the read id and column 0 the tid; both must agree.
    for column, what in ((6, 'read id'), (0, 'tid')):
        if mate1[column] != mate2[column]:
            my_utils.myprint('ERROR! line1 and line2 have different %s!' %
                             what)
            my_utils.myprint('line1: %s' % tab.join(mate1))
            my_utils.myprint('line2: %s' % tab.join(mate2))
            sys.exit()

    # Order the mates by start position.
    if int(mate1[1]) > int(mate2[1]):
        mate1, mate2 = mate2, mate1

    self.tid1, self.start1, self.end1, self.mapq1 = [
        int(value) for value in mate1[0:4]
    ]
    self.tid2, self.start2, self.end2, self.mapq2 = [
        int(value) for value in mate2[0:4]
    ]

    self.flag1 = int(mate1[7])
    self.flag2 = int(mate2[7])
    self.read_id = mate1[6]
    self.bcd = mate1[4]
    self.hap_type = int(mate1[5])

    # Sanity check on the ordering established above.
    if self.start1 > self.start2:
        my_utils.myprint('ERROR! start1 > start2')
        sys.exit()

    return