def visualize_sv_calls(args, dbo_args, endpoint_args):
    """Produce the diagnostic plots for the filtered SV calls.

    The SV calls are read from ``args.filter_bedpe_file``; ``format()`` is
    invoked on every call, and the same list is then handed to the three
    plotting routines (read depth, twin-window barcode similarity, heatmap).
    All plotting routines receive ``args.image_out_dir`` as output directory
    and ``args.bam_name`` as output prefix.

    Args:
        args: global parameter object; the attributes read here are
            ``filter_bedpe_file``, ``faidx_file``, ``image_out_dir``,
            ``bam_name``, ``cal_2d_overlapping_barcodes`` and
            ``cal_read_depth_from_bcd21``.
        dbo_args: parameter object providing ``bcd13_file``.
        endpoint_args: parameter object providing ``bcd21_file``.

    Returns:
        None.
    """
    # Pull every setting out of the parameter objects before doing any work.
    svcall_bedpe = args.filter_bedpe_file
    bcd13 = dbo_args.bcd13_file
    bcd21 = endpoint_args.bcd21_file
    fai = args.faidx_file
    overlap_binary = args.cal_2d_overlapping_barcodes
    depth_binary = args.cal_read_depth_from_bcd21
    image_dir = args.image_out_dir
    prefix = args.bam_name

    my_utils.make_dir(image_dir)

    chrom_lengths = my_utils.get_chr_length(fai)
    tid2chrname, chrname2tid = my_utils.get_chrnames(fai)

    sv_calls = read_svcall_bedpe_file(svcall_bedpe, chrname2tid)
    for sv_call in sv_calls:
        sv_call.format()

    plot_depth(depth_binary, bcd21, sv_calls, fai, chrom_lengths,
               tid2chrname, chrname2tid, image_dir, prefix)

    plot_twin_window_barcode_similarity(sv_calls, bcd13, fai, image_dir,
                                        chrom_lengths, tid2chrname,
                                        chrname2tid, prefix)

    # The heatmap is drawn with a flank distance of 100 kb.
    heatmap_flank = 100 * 1000
    plot_heatmap(sv_calls, bcd21, fai, image_dir, heatmap_flank,
                 chrom_lengths, tid2chrname, chrname2tid, overlap_binary,
                 prefix)

    return
def detect_small_deletions(input_bam_file, out_dir, out_del_call_file,
                           n_threads, ref_fasta_file, fermikit_dir, samtools,
                           bedtools, in_weird_reads_file,
                           weird_reads_cluster_file, call_small_deletions,
                           cal_hap_read_depth_from_bcd21, bcd21_file,
                           bcd22_file, hap_type_read_depth_file,
                           gap_region_bed_file, rm_temp_files=1):
    """Call small deletions with two approaches and merge the results.

    Steps, in order:
      1. local-assembly based detection, written to
         ``<out_dir>/local_assembly.del.bedpe``;
      2. clustering of the reads in ``in_weird_reads_file`` into
         ``weird_reads_cluster_file``;
      3. the external ``cal_hap_read_depth_from_bcd21`` program, run through
         the shell, producing ``hap_type_read_depth_file``;
      4. the external ``call_small_deletions`` program, run through the
         shell, producing ``<out_dir>/discordant_read_pairs.del.bedpe``;
      5. ``merge_sv_calls`` combining both bedpe files into
         ``out_del_call_file``.

    The exit status of the external programs is not inspected.  When
    ``rm_temp_files`` is truthy the two intermediate bedpe files are deleted
    afterwards.

    Returns:
        None.
    """
    # Fixed settings for the helper programs.
    assembly_window_size = int(2e5)
    assembly_max_depth = 500
    depth_bin_size = 100
    min_mapq = 20

    fai = ref_fasta_file + '.fai'
    assembly_bedpe = os.path.join(out_dir, 'local_assembly.del.bedpe')
    read_pair_bedpe = os.path.join(out_dir, 'discordant_read_pairs.del.bedpe')

    local_assembly.small_deletion_dection_by_local_assembly(
        samtools, bedtools, fermikit_dir, input_bam_file, ref_fasta_file,
        fai, out_dir, assembly_bedpe, n_threads, assembly_window_size,
        assembly_max_depth, rm_temp_files)

    cluster_weird_reads.cluster_weird_reads(
        in_weird_reads_file, weird_reads_cluster_file, fai)

    depth_cmd = '%s %s %s %s %d %d' % (
        cal_hap_read_depth_from_bcd21, bcd21_file, hap_type_read_depth_file,
        fai, depth_bin_size, min_mapq)
    del_call_cmd = '%s %s %s %s %s %s %s' % (
        call_small_deletions, hap_type_read_depth_file,
        weird_reads_cluster_file, bcd22_file, fai, gap_region_bed_file,
        read_pair_bedpe)

    # Log each command line, then execute it; the depth file must exist
    # before the deletion caller starts, so the order matters.
    for shell_cmd in (depth_cmd, del_call_cmd):
        my_utils.myprint(shell_cmd)
        os.system(shell_cmd)

    tid2chrname, chrname2tid = my_utils.get_chrnames(fai)
    merge_sv_calls(assembly_bedpe, read_pair_bedpe, out_del_call_file,
                   tid2chrname, chrname2tid)

    if rm_temp_files:
        for temp_path in (assembly_bedpe, read_pair_bedpe):
            os.remove(temp_path)

    return
def _build_argument_parser():
    """Create the command-line parser; no arguments are parsed here."""
    parser = argparse.ArgumentParser(
        description='Detection of SVs from linked-read sequencing data')

    ### required arguments ###
    parser.add_argument(
        '-i', '--bam', required=True,
        metavar='input.phased_possorted_bam.bam', type=str,
        help='input bam file (should be the phased_possorted_bam.bam '
             'generated by Longranger)')
    parser.add_argument(
        '-d', '--out_dir', required=True, metavar='output_directory',
        type=str, help='output directory')
    parser.add_argument(
        '-r', '--ref', required=True, metavar='ref.fa', type=str,
        help='reference FASTA file')

    ### optional arguments ###
    parser.add_argument(
        '-v', '--ref_version', required=False, metavar='version', type=str,
        default='',
        help='version of reference fasta file. Current supported versions '
             'are: hg19, b37, hg38')
    parser.add_argument(
        '--gap_region_bed', required=False, metavar='BED', type=str,
        default='',
        help='reference gap region in bed format, required if '
             '--ref_version is not specified')
    parser.add_argument(
        '--black_region_bed', required=False, metavar='BED', type=str,
        default='',
        help='black region in bed format, required if --ref_version is not '
             'specified')
    # The help text previously advertised a default of 4 although the real
    # default is 1.
    parser.add_argument(
        '-t', '--n_thread', required=False, metavar='num_thread', type=int,
        default=1, help='number of threads (default: 1)')
    parser.add_argument(
        '--min_fragment_length', metavar='INT', required=False, type=int,
        default=-1,
        help='minimal fragment length considered for SV calling')
    parser.add_argument(
        '--min_reads_in_fragment', metavar='INT', required=False, type=int,
        default=-1,
        help='minimal number of confidently mapped reads in one fragment')
    parser.add_argument(
        '--min_supp_barcodes', metavar='INT', required=False, type=int,
        default=10,
        help='minimal number of shared barcodes between two SV breakpoints '
             '(default: 10)')
    parser.add_argument(
        '--samtools', required=False, metavar='path/to/samtools', type=str,
        default='samtools',
        help='path to samtools (default: find in environmental path)')
    parser.add_argument(
        '--bedtools', required=False, metavar='path/to/bedtools', type=str,
        default='bedtools',
        help='path to bedtools (default: find in environmental path)')

    # --wgs/--targeted and --germline_mode/--somatic_mode are pairs of
    # flags writing opposite booleans to the same destination.
    parser.add_argument(
        '--wgs', dest='is_wgs', action='store_true',
        help='the input is whole-genome sequencing data')
    parser.add_argument(
        '--targeted', dest='is_wgs', action='store_false',
        help='the input is targeted region sequencing data (such as WES)')
    parser.add_argument(
        '--germline_mode', dest='germline_mode', action='store_true',
        help='detect germline SVs')
    parser.add_argument(
        '--somatic_mode', dest='germline_mode', action='store_false',
        help='detect somatic SVs (with low variant allele frequency)')
    parser.set_defaults(germline_mode=True)
    parser.set_defaults(is_wgs=True)

    parser.add_argument(
        '--target_region', required=False, metavar='BED', type=str,
        default='',
        help='bed file of target regions (required if --targeted is '
             'specified)')
    parser.add_argument(
        '--gap_distance_cut_off', required=False, metavar='INT', type=int,
        default=-1,
        help='max distance between two reads in a HMW DNA molecule '
             '(default: automatically determined)')
    parser.add_argument(
        '--save_temp_files', dest='save_temp_files', action='store_true',
        help='Do not remove intermediate files after the run. Use in debug '
             'mode. Default: False')
    parser.add_argument(
        '--version', action='version', version='%(prog)s 1.0.1')

    return parser


def parse_user_arguments():
    """Parse the command line and build the three parameter objects.

    The parsed namespace is wrapped by ``global_parameter``; chromosome name
    tables and the alternative-contig sets are then attached to the result
    before ``check_arguments`` is called on it.  ``dbo_parameter`` and
    ``endpoint_parameter`` are derived from the checked object.

    Returns:
        tuple: ``(args, dbo_args, endpoint_args)``.
    """
    input_args = _build_argument_parser().parse_args()

    args = global_parameter(input_args)

    args.tid2chrname, args.chrname2tid = my_utils.get_chrnames(
        args.faidx_file)
    args.alt_chr_name_set = my_utils.read_alternative_contig_file(
        args.alt_ctg_file)
    args.alt_tid_set = my_utils.get_alternative_tid_set(
        args.alt_ctg_file, args.faidx_file)

    check_arguments(args)

    dbo_args = dbo_parameter(args)
    endpoint_args = endpoint_parameter(args)

    return args, dbo_args, endpoint_args
def filter_read_depth(input_sv_list, args):
    """Flag same-chromosome SV calls whose read depth does not support them.

    Nothing is done (the list is returned untouched) unless both
    ``args.is_wgs`` and ``args.germline_mode`` are true.

    Calls of type ``INV`` or ``TRA``, calls whose two breakpoints lie on
    different chromosomes, and calls whose region yields fewer than two depth
    values are skipped.  For every other call the total-depth values between
    ``start1`` and ``end2`` are summarised by their median and quartiles and
    compared with the genome-wide mean depth (computed by
    ``get_mean_std_depth`` from the high-mapq depth list):

    * ``DEL`` is kept when median <= 0.667 * mean and Q3 < mean;
    * ``DUP`` is kept when median >= 1.25 * mean and Q1 > mean;
    * anything else gets ``ft = 'DEPTH_FILTER'`` (overwriting the previous
      value of ``ft``).

    Args:
        input_sv_list: list of SV call objects; modified in place.
        args: parameter object providing ``is_wgs``, ``germline_mode``,
            ``faidx_file`` and ``read_depth_file``.

    Returns:
        The same ``input_sv_list`` object.
    """
    if not args.is_wgs or not args.germline_mode:
        return input_sv_list

    chr_len_list = my_utils.get_chr_length(args.faidx_file)
    _tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(
        args.faidx_file)

    wg_high_mapq_depth_list, wg_total_depth_list, bin_size = \
        plot_read_depth.get_wg_depth_list(args.read_depth_file, chr_len_list)
    wg_depth_mean, _wg_depth_std = get_mean_std_depth(wg_high_mapq_depth_list)

    for svcall in input_sv_list:
        if svcall.svtype in ('INV', 'TRA') or svcall.chrm1 != svcall.chrm2:
            continue

        tid = chrname2tid_dict[svcall.chrm1]
        region_depth_list = get_region_depth_list(
            wg_total_depth_list, tid, svcall.start1, svcall.end2, bin_size)
        if len(region_depth_list) < 2:
            continue

        median_depth = np.median(region_depth_list)
        # np.percentile takes q on a 0-100 scale; passing 0.25 / 0.75 would
        # return (almost) the minimum instead of the quartiles.
        q1_depth = np.percentile(region_depth_list, 25)
        q3_depth = np.percentile(region_depth_list, 75)

        depth_supports_del = (
            svcall.svtype == 'DEL'
            and median_depth <= wg_depth_mean * 0.667
            and q3_depth < wg_depth_mean)
        depth_supports_dup = (
            svcall.svtype == 'DUP'
            and median_depth >= wg_depth_mean * 1.25
            and q1_depth > wg_depth_mean)

        if not (depth_supports_del or depth_supports_dup):
            svcall.ft = 'DEPTH_FILTER'

    return input_sv_list
def cluster_weird_reads(in_weird_reads_file, out_file, faidx_file):
    """Read the weird-reads file and cluster its entries into ``out_file``.

    ``out_file`` is truncated to an empty file first; the clustering itself
    is delegated to ``cluster_weird_reads1type``, which is given the output
    path rather than an open handle.

    Args:
        in_weird_reads_file: path of the input file, parsed by
            ``read_weird_reads_file``.
        out_file: path of the output file.
        faidx_file: FASTA index used to obtain the chromosome name tables.

    Returns:
        None.
    """
    # Thresholds passed on to the reader / clustering routine.
    cluster_max_distance = 300
    min_support = 2
    max_support = 1000
    sv_length_floor = 1000

    tid2chrname, chrname2tid = my_utils.get_chrnames(faidx_file)

    my_utils.myprint('reading file: %s' % in_weird_reads_file)
    read_support_list = read_weird_reads_file(
        in_weird_reads_file, chrname2tid, sv_length_floor)
    my_utils.myprint('finished reading file: %s' % in_weird_reads_file)

    # Start from an empty output file.
    with open(out_file, 'w') as empty_fp:
        empty_fp.write('')

    my_utils.myprint('clustering discordant reads')
    cluster_weird_reads1type(read_support_list, out_file, min_support,
                             cluster_max_distance, tid2chrname, chrname2tid,
                             max_support)

    return
def small_deletion_dection_by_local_assembly(samtools, bedtools, fermikit_dir,
                                             input_bam_file, ref_fasta_file,
                                             faidx_file, out_dir,
                                             out_del_call_file, n_threads,
                                             window_size, max_depth,
                                             rm_temp_files=1):
    """Detect small deletions by local assembly over genome intervals.

    The genome is cut into intervals by ``generate_interval_list`` (window
    ``window_size``, overlap ``window_size / 10``).  ``n_threads`` worker
    processes are started, each running
    ``small_deletion_dection_from_interval_list`` and writing to
    ``<out_dir>/assembly_raw_variants.<i>.txt``.  The per-worker files are
    concatenated with ``cat`` into ``<out_dir>/local_assembly_raw_variants.txt``
    and passed to ``extract_del_from_vcf_file``, which writes
    ``out_del_call_file``.

    If ``faidx_file`` does not exist, ``samtools faidx`` is run on the
    reference; if it is still missing the process exits with status 1.

    Args:
        n_threads: number of worker processes; values below 1 are treated
            as 1.
        rm_temp_files: when truthy, the per-worker files and the
            concatenated file are removed at the end.

    Returns:
        None.

    Raises:
        SystemExit: when the FASTA index cannot be found or created.
    """
    if not os.path.exists(faidx_file):
        cmd = '%s faidx %s' % (samtools, ref_fasta_file)
        my_utils.myprint(cmd)
        os.system(cmd)

    if not os.path.exists(faidx_file):
        my_utils.myprint(
            'ERROR! The index file of the reference fasta file does not exist!'
        )
        # Exit with a non-zero status so that callers see the failure.
        sys.exit(1)

    cmd = 'mkdir -p %s' % out_dir
    my_utils.myprint(cmd)
    os.system(cmd)

    # With no workers the `cat` below would have no input files and block
    # on stdin, so always run at least one worker.
    n_threads = max(1, int(n_threads))

    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(faidx_file)
    chr_len_list = my_utils.get_chr_length(faidx_file)

    overlap_length = int(window_size / 10)
    interval_list = generate_interval_list(chr_len_list, tid2chrname_list,
                                           chrname2tid_dict, window_size,
                                           overlap_length)

    process_list = list()
    out_combined_vcf_file_list = list()
    for i in range(0, n_threads):
        out_combined_vcf_file = os.path.join(
            out_dir, 'assembly_raw_variants.%d.txt' % i)
        out_combined_vcf_file_list.append(out_combined_vcf_file)
        t = multiprocessing.Process(
            target=small_deletion_dection_from_interval_list,
            args=(i, n_threads, samtools, bedtools, fermikit_dir,
                  input_bam_file, ref_fasta_file, out_dir, window_size,
                  max_depth, interval_list, out_combined_vcf_file))
        process_list.append(t)
        t.start()

    # Wait for every worker before touching its output file.
    for t in process_list:
        t.join()

    all_processes_out_combined_vcf_file = os.path.join(
        out_dir, 'local_assembly_raw_variants.txt')

    cmd = 'cat '
    cmd += ''.join(' %s ' % path for path in out_combined_vcf_file_list)
    cmd += ' > %s ' % all_processes_out_combined_vcf_file
    my_utils.myprint(cmd)
    os.system(cmd)

    extract_del_from_vcf_file(all_processes_out_combined_vcf_file,
                              out_del_call_file)

    if rm_temp_files:
        for out_combined_vcf_file in out_combined_vcf_file_list:
            os.remove(out_combined_vcf_file)
        os.remove(all_processes_out_combined_vcf_file)

    return
def _count_unfiltered_calls(sv_list):
    """Return how many calls in ``sv_list`` still have ``ft == '.'``."""
    return sum(1 for svcall in sv_list if svcall.ft == '.')


def filter_calls(args, dbo_args, endpoint_args):
    """Run the SV call filters in sequence and write the result as bedpe.

    The calls are read from ``args.merged_bedpe_file`` and every call's
    ``ft`` field is reset to ``'.'``.  The list then passes, in order,
    through ``filter_1d_blacklist``, ``filter_low_mapq_gaps``,
    ``filter_calls_2d``, ``filter_dbo_score``, ``filter_read_depth`` and
    ``filter_sv_length``.  Every call in the final list is written to
    ``args.filter_bedpe_file``; calls whose ``ft`` is still ``'.'`` are
    marked ``'PASS'``, and each call receives a zero-padded ``sv_id`` of the
    form ``ID<number>``.

    A missing black-region or gap-region bed file is not fatal: an empty
    dictionary is used instead, and an error message is printed when
    ``args.ref_version`` is one of hg19, hg38 or b37.

    Args:
        args: global parameter object.
        dbo_args: not used inside this function.
        endpoint_args: passed on to ``filter_low_mapq_gaps``.

    Returns:
        None.
    """
    my_utils.myprint('filtering SV calls')

    bin_size = 100
    known_ref_versions = ('hg19', 'hg38', 'b37')

    _tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(
        args.faidx_file)
    alt_chr_name_set = my_utils.read_alternative_contig_file(
        args.alt_ctg_file)

    if os.path.exists(args.black_region_bed_file):
        my_utils.myprint('reading black region bed file: %s'
                         % args.black_region_bed_file)
        black_reg_dict = read_black_reg_bed_file(args.black_region_bed_file,
                                                 bin_size)
    else:
        if args.ref_version in known_ref_versions:
            # The message previously used an undefined bare name and raised
            # NameError instead of reporting the missing file.
            my_utils.myprint('ERROR! black list file is missing: %s'
                             % args.black_region_bed_file)
        black_reg_dict = dict()

    if os.path.exists(args.gap_region_bed_file):
        my_utils.myprint('reading gap region bed file: %s'
                         % args.gap_region_bed_file)
        gap_left_region_dict, gap_right_region_dict = read_gap_region_file(
            args.gap_region_bed_file, bin_size)
    else:
        if args.ref_version in known_ref_versions:
            my_utils.myprint('ERROR! gap region file is missing: %s'
                             % args.gap_region_bed_file)
        gap_left_region_dict = dict()
        gap_right_region_dict = dict()

    raw_svcall_list = my_utils.read_object_file(args.merged_bedpe_file,
                                                bedpe.QuantifiedBKCandCore)
    for svcall in raw_svcall_list:
        svcall.ft = '.'

    round1_retained_sv_list = filter_1d_blacklist(
        raw_svcall_list, black_reg_dict, alt_chr_name_set,
        gap_left_region_dict, gap_right_region_dict, bin_size)
    my_utils.myprint('number of retained SVs: %d'
                     % _count_unfiltered_calls(round1_retained_sv_list))

    round2_retained_sv_list = filter_low_mapq_gaps(
        round1_retained_sv_list, endpoint_args, chrname2tid_dict)

    remove_chr_prefix = (args.ref_version == 'b37')
    round3_retained_sv_list = filter_calls_2d(
        round2_retained_sv_list, args.black_region_2d_file,
        args.filter_bedpe_file, remove_chr_prefix)

    round4_retained_sv_list = filter_dbo_score(round3_retained_sv_list, args)
    round5_retained_sv_list = filter_read_depth(round4_retained_sv_list, args)
    final_retained_sv_list = filter_sv_length(round5_retained_sv_list, args)

    my_utils.myprint('number of retained SVs: %d'
                     % _count_unfiltered_calls(final_retained_sv_list))

    header = '#chrom1\tstart1\tstop1\tchrom2\tstart2\tstop2\t'
    header += 'sv_type\tsv_id\tsv_length\tqual_score\tfilter\tinfo\n'

    n_svcall = len(final_retained_sv_list)
    # log10(0) is undefined; with no calls the width is never used.
    n_digit = int(math.log10(n_svcall) + 2) if n_svcall > 0 else 2

    with open(args.filter_bedpe_file, 'w') as out_fp:
        out_fp.write(header)
        for sv_id, svcall in enumerate(final_retained_sv_list, 1):
            if svcall.ft == '.':
                svcall.ft = 'PASS'
            svcall.sv_id = 'ID%s' % str(sv_id).zfill(n_digit)
            out_fp.write(svcall.output_core2() + endl)

    return