def visualize_sv_calls(args, dbo_args, endpoint_args):
    """Draw the per-call diagnostic images for the filtered SV calls.

    The calls are read from ``args.filter_bedpe_file`` and handed, in order,
    to ``plot_depth``, ``plot_twin_window_barcode_similarity`` and
    ``plot_heatmap``; all images go to ``args.image_out_dir`` (created if
    necessary) using ``args.bam_name`` as the output prefix.

    Args:
        args: provides ``filter_bedpe_file``, ``faidx_file``,
            ``cal_2d_overlapping_barcodes``, ``cal_read_depth_from_bcd21``,
            ``image_out_dir`` and ``bam_name``.
        dbo_args: provides ``bcd13_file``.
        endpoint_args: provides ``bcd21_file``.

    Returns:
        None.
    """
    # Pull every setting up front so a missing attribute fails before any
    # directory or image is created.
    bedpe_path = args.filter_bedpe_file
    bcd13_path = dbo_args.bcd13_file
    bcd21_path = endpoint_args.bcd21_file
    fai_path = args.faidx_file
    overlap_binary = args.cal_2d_overlapping_barcodes
    depth_binary = args.cal_read_depth_from_bcd21
    image_dir = args.image_out_dir
    prefix = args.bam_name

    my_utils.make_dir(image_dir)

    chr_lengths = my_utils.get_chr_length(fai_path)
    tid2chrname, chrname2tid = my_utils.get_chrnames(fai_path)

    sv_calls = read_svcall_bedpe_file(bedpe_path, chrname2tid)
    for sv_call in sv_calls:
        sv_call.format()

    plot_depth(depth_binary, bcd21_path, sv_calls, fai_path, chr_lengths,
               tid2chrname, chrname2tid, image_dir, prefix)

    plot_twin_window_barcode_similarity(sv_calls, bcd13_path, fai_path,
                                        image_dir, chr_lengths, tid2chrname,
                                        chrname2tid, prefix)

    # Flank distance passed to the heat-map plots: 100 kb.
    heatmap_flank = 100 * 1000
    plot_heatmap(sv_calls, bcd21_path, fai_path, image_dir, heatmap_flank,
                 chr_lengths, tid2chrname, chrname2tid, overlap_binary,
                 prefix)
def detect_small_deletions(input_bam_file,
                           out_dir,
                           out_del_call_file,
                           n_threads,
                           ref_fasta_file,
                           fermikit_dir,
                           samtools,
                           bedtools,
                           in_weird_reads_file,
                           weird_reads_cluster_file,
                           call_small_deletions,
                           cal_hap_read_depth_from_bcd21,
                           bcd21_file,
                           bcd22_file,
                           hap_type_read_depth_file,
                           gap_region_bed_file,
                           rm_temp_files=1):
    """Call small deletions with two methods and merge the results.

    Steps, in order:
      1. ``local_assembly.small_deletion_dection_by_local_assembly`` writes
         ``<out_dir>/local_assembly.del.bedpe``.
      2. ``cluster_weird_reads.cluster_weird_reads`` writes
         ``weird_reads_cluster_file`` from ``in_weird_reads_file``.
      3. The external program ``cal_hap_read_depth_from_bcd21`` is run to
         produce ``hap_type_read_depth_file``.
      4. The external program ``call_small_deletions`` is run to produce
         ``<out_dir>/discordant_read_pairs.del.bedpe``.
      5. ``merge_sv_calls`` combines both BEDPE files into
         ``out_del_call_file``.

    The reference index is assumed to live at ``ref_fasta_file + '.fai'``.
    When ``rm_temp_files`` is truthy the two intermediate BEDPE files are
    deleted afterwards. Exit statuses of the external commands are not
    checked.

    Returns:
        None.
    """
    fai_path = ref_fasta_file + '.fai'

    # Fixed parameters handed to the sub-steps.
    assembly_window = int(2e5)
    assembly_max_depth = 500
    depth_bin_size = 100
    min_mapq = 20

    assembly_calls = os.path.join(out_dir, 'local_assembly.del.bedpe')
    read_pair_calls = os.path.join(out_dir,
                                   'discordant_read_pairs.del.bedpe')

    local_assembly.small_deletion_dection_by_local_assembly(
        samtools, bedtools, fermikit_dir, input_bam_file, ref_fasta_file,
        fai_path, out_dir, assembly_calls, n_threads, assembly_window,
        assembly_max_depth, rm_temp_files)

    cluster_weird_reads.cluster_weird_reads(
        in_weird_reads_file, weird_reads_cluster_file, fai_path)

    depth_cmd = '%s %s %s %s %d %d' % (
        cal_hap_read_depth_from_bcd21, bcd21_file, hap_type_read_depth_file,
        fai_path, depth_bin_size, min_mapq)
    call_cmd = '%s %s %s %s %s %s %s' % (
        call_small_deletions, hap_type_read_depth_file,
        weird_reads_cluster_file, bcd22_file, fai_path, gap_region_bed_file,
        read_pair_calls)

    # The second command consumes the depth file written by the first, so
    # the order matters.
    for shell_cmd in (depth_cmd, call_cmd):
        my_utils.myprint(shell_cmd)
        os.system(shell_cmd)

    tid2chrname, chrname2tid = my_utils.get_chrnames(fai_path)
    merge_sv_calls(assembly_calls, read_pair_calls, out_del_call_file,
                   tid2chrname, chrname2tid)

    if not rm_temp_files:
        return

    for intermediate in (assembly_calls, read_pair_calls):
        os.remove(intermediate)
Exemple #3
0
def parse_user_arguments():
    """Parse the command line and build the three parameter objects.

    The parsed namespace is wrapped by ``global_parameter``; chromosome-name
    lookups and the alternative-contig sets are attached to the result, which
    is then validated with ``check_arguments`` and used to derive the
    ``dbo_parameter`` and ``endpoint_parameter`` objects.

    Returns:
        tuple: ``(args, dbo_args, endpoint_args)``.
    """

    parser = argparse.ArgumentParser(description='Detection of SVs from linked-read sequencing data')
    ### required arguments ###
    parser.add_argument('-i', '--bam', required = True, metavar = 'input.phased_possorted_bam.bam', type = str, help = 'input bam file (should be the phased_possorted_bam.bam generated by Longranger)')
    parser.add_argument('-d', '--out_dir', required = True, metavar = 'output_directory', type = str, help = 'output directory')
    parser.add_argument('-r', '--ref', required = True, metavar = 'ref.fa', type = str, help ='reference FASTA file')

    ### optional arguments ###
    parser.add_argument('-v', '--ref_version', required = False, metavar = 'version', type = str, default = '', help ='version of reference fasta file. Current supported versions are: hg19, b37, hg38')
    parser.add_argument('--gap_region_bed', required = False, metavar = 'BED', type = str, default = '', help ='reference gap region in bed format, required if --ref_version is not specified')
    parser.add_argument('--black_region_bed', required = False, metavar = 'BED', type = str, default = '', help ='black region in bed format, required if --ref_version is not specified')
    # The help text must state the real default (1); it previously claimed 4.
    parser.add_argument('-t', '--n_thread', required = False, metavar = 'num_thread', type = int, default = 1, help ='number of threads (default: 1)')
    parser.add_argument('--min_fragment_length', metavar = 'INT', required = False, type = int, default = -1, help ='minimal fragment length considered for SV calling')
    parser.add_argument('--min_reads_in_fragment', metavar = 'INT', required = False, type = int, default = -1, help ='minimal number of confidently mapped reads in one fragment')
    parser.add_argument('--min_supp_barcodes', metavar = 'INT', required = False, type = int, default = 10, help ='minimal number of shared barcodes between two SV breakpoints (default: 10)')
    parser.add_argument('--samtools', required = False, metavar = 'path/to/samtools', type = str, default = 'samtools', help ='path to samtools (default: find in environmental path)')
    parser.add_argument('--bedtools', required = False, metavar = 'path/to/bedtools', type = str, default = 'bedtools', help ='path to bedtools (default: find in environmental path)')
    # --wgs/--targeted and --germline_mode/--somatic_mode are paired flags
    # writing to one destination each; the defaults are set just below.
    parser.add_argument('--wgs', dest='is_wgs', action='store_true', help='the input is whole-genome sequencing data')
    parser.add_argument('--targeted', dest='is_wgs', action='store_false', help='the input is targeted region sequencing data (such as WES)')
    parser.add_argument('--germline_mode', dest='germline_mode', action='store_true', help='detect germline SVs')
    parser.add_argument('--somatic_mode', dest='germline_mode', action='store_false', help='detect somatic SVs (with low variant allele frequency)')
    parser.set_defaults(germline_mode = True)
    parser.set_defaults(is_wgs = True)
    parser.add_argument('--target_region', required = False, metavar = 'BED', type = str, default = '', help ='bed file of target regions (required if --targeted is specified)')
    parser.add_argument('--gap_distance_cut_off', required = False, metavar = 'INT', type = int, default = -1, help ='max distance between two reads in a HMW DNA molecule (default: automatically determined)')
    parser.add_argument('--save_temp_files', dest='save_temp_files', action='store_true', help='Do not remove intermediate files after the run. Use in debug mode. Default: False')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0.1')

    input_args = parser.parse_args()

    args = global_parameter(input_args)

    args.tid2chrname, args.chrname2tid = my_utils.get_chrnames(args.faidx_file)

    args.alt_chr_name_set = my_utils.read_alternative_contig_file(args.alt_ctg_file)

    args.alt_tid_set = my_utils.get_alternative_tid_set(args.alt_ctg_file, args.faidx_file)

    check_arguments(args)

    dbo_args = dbo_parameter(args)

    endpoint_args = endpoint_parameter(args)

    return args, dbo_args, endpoint_args
Exemple #4
0
def filter_read_depth(input_sv_list, args):
    """Mark intra-chromosomal calls whose read depth does not back them up.

    Only applied to whole-genome, germline-mode runs; otherwise the list is
    returned untouched. INV and TRA calls, calls spanning two chromosomes,
    and calls whose region yields fewer than two depth values are skipped.

    For every remaining call the total read depth between ``start1`` and
    ``end2`` is summarised by its median and quartiles and compared with the
    whole-genome mean of the high-MAPQ depth:

    * DEL is kept when median <= 0.667 * mean and Q3 < mean.
    * DUP is kept when median >= 1.25 * mean and Q1 > mean.
    * Anything else gets ``ft = 'DEPTH_FILTER'``.

    Args:
        input_sv_list: SV call objects exposing ``svtype``, ``chrm1``,
            ``chrm2``, ``start1``, ``end2`` and a writable ``ft``.
        args: provides ``is_wgs``, ``germline_mode``, ``faidx_file`` and
            ``read_depth_file``.

    Returns:
        The same list object, with ``ft`` updated in place on failing calls.
    """
    # Explicit comparison kept on purpose: only a value equal to False
    # disables the filter (``None`` does not).
    if args.is_wgs == False or args.germline_mode == False:
        return input_sv_list

    chr_len_list = my_utils.get_chr_length(args.faidx_file)
    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(args.faidx_file)

    wg_high_mapq_depth_list, wg_total_depth_list, bin_size = plot_read_depth.get_wg_depth_list(
        args.read_depth_file, chr_len_list)

    wg_depth_mean, wg_depth_std = get_mean_std_depth(wg_high_mapq_depth_list)

    for svcall in input_sv_list:
        if svcall.svtype in ('INV', 'TRA') or svcall.chrm1 != svcall.chrm2:
            continue

        tid = chrname2tid_dict[svcall.chrm1]
        sv_region_total_depth_list = get_region_depth_list(
            wg_total_depth_list, tid, svcall.start1, svcall.end2, bin_size)
        if len(sv_region_total_depth_list) < 2:
            continue

        median_reg_total_depth = np.median(sv_region_total_depth_list)
        # np.percentile expects q on a 0-100 scale: 25/75 are the quartiles.
        # (0.25/0.75 would be the 0.25th/0.75th percentiles, i.e. ~minimum.)
        q1_reg_total_depth = np.percentile(sv_region_total_depth_list, 25)
        q3_reg_total_depth = np.percentile(sv_region_total_depth_list, 75)

        if svcall.svtype == 'DEL':
            depth_supports_call = (
                median_reg_total_depth <= wg_depth_mean * 0.667
                and q3_reg_total_depth < wg_depth_mean)
        elif svcall.svtype == 'DUP':
            depth_supports_call = (
                median_reg_total_depth >= wg_depth_mean * 1.25
                and q1_reg_total_depth > wg_depth_mean)
        else:
            depth_supports_call = False

        if not depth_supports_call:
            svcall.ft = 'DEPTH_FILTER'

    return input_sv_list
Exemple #5
0
def cluster_weird_reads(in_weird_reads_file, out_file, faidx_file):
    """Cluster the discordant ("weird") reads and write them to ``out_file``.

    Loads the reads with ``read_weird_reads_file``, empties ``out_file`` and
    then delegates to ``cluster_weird_reads1type``, which is given the output
    path rather than an open handle.

    Args:
        in_weird_reads_file: path of the input weird-reads file.
        out_file: path of the cluster output file (truncated first).
        faidx_file: path of the reference ``.fai`` index, used for the
            chromosome-name lookups.

    Returns:
        None.
    """
    tid2chrname, chrname2tid = my_utils.get_chrnames(faidx_file)

    # Fixed thresholds handed to the reader and the clustering routine.
    cluster_max_distance = 300
    min_supporting_reads = 2
    max_supporting_reads = 1000
    min_sv_len = 1000

    my_utils.myprint('reading file: %s' % in_weird_reads_file)
    read_support_list = read_weird_reads_file(in_weird_reads_file,
                                              chrname2tid, min_sv_len)
    my_utils.myprint('finished reading file: %s' % in_weird_reads_file)

    # Create the output file, or truncate it if it already exists.
    with open(out_file, 'w'):
        pass

    my_utils.myprint('clustering discordant reads')
    cluster_weird_reads1type(read_support_list, out_file,
                             min_supporting_reads, cluster_max_distance,
                             tid2chrname, chrname2tid, max_supporting_reads)
Exemple #6
0
def small_deletion_dection_by_local_assembly(samtools,
                                             bedtools,
                                             fermikit_dir,
                                             input_bam_file,
                                             ref_fasta_file,
                                             faidx_file,
                                             out_dir,
                                             out_del_call_file,
                                             n_threads,
                                             window_size,
                                             max_depth,
                                             rm_temp_files=1):
    """Detect small deletions by local assembly over genome intervals.

    The genome is cut into intervals by ``generate_interval_list`` (overlap
    is one tenth of ``window_size``). ``n_threads`` worker processes run
    ``small_deletion_dection_from_interval_list``, each writing
    ``<out_dir>/assembly_raw_variants.<i>.txt``. The per-worker files are
    concatenated into ``<out_dir>/local_assembly_raw_variants.txt``, from
    which ``extract_del_from_vcf_file`` writes ``out_del_call_file``.

    Args:
        samtools: samtools executable; also used to build ``faidx_file``
            when it is missing.
        bedtools, fermikit_dir, input_bam_file, max_depth: forwarded to the
            workers unchanged.
        ref_fasta_file: reference FASTA path.
        faidx_file: path of the FASTA index for ``ref_fasta_file``.
        out_dir: working/output directory (created if needed).
        out_del_call_file: path of the final deletion call file.
        n_threads: number of worker processes to start.
        window_size: interval size handed to ``generate_interval_list``.
        rm_temp_files: when truthy, the per-worker files and the
            concatenated file are removed at the end.

    Returns:
        None.

    Raises:
        SystemExit: with status 1 when the FASTA index is still missing
            after attempting to create it.
    """
    import shutil  # local import: only needed for the concatenation step

    if not os.path.exists(faidx_file):
        cmd = '%s faidx %s' % (samtools, ref_fasta_file)
        my_utils.myprint(cmd)
        os.system(cmd)

    if not os.path.exists(faidx_file):
        my_utils.myprint(
            'ERROR! The index file of the reference fasta file does not exist!'
        )
        # Non-zero status so calling scripts can see the failure.
        sys.exit(1)

    # Create the directory in-process instead of via `mkdir -p`, so paths
    # containing spaces or shell metacharacters work.
    if out_dir and not os.path.isdir(out_dir):
        my_utils.myprint('creating directory: %s' % out_dir)
        try:
            os.makedirs(out_dir)
        except OSError:
            # Tolerate a concurrent creation; re-raise real failures.
            if not os.path.isdir(out_dir):
                raise

    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(faidx_file)
    chr_len_list = my_utils.get_chr_length(faidx_file)

    overlap_length = int(window_size / 10)
    interval_list = generate_interval_list(chr_len_list, tid2chrname_list,
                                           chrname2tid_dict, window_size,
                                           overlap_length)

    process_list = list()
    out_combined_vcf_file_list = list()
    for i in range(0, n_threads):
        out_combined_vcf_file = os.path.join(
            out_dir, 'assembly_raw_variants.%d.txt' % i)
        out_combined_vcf_file_list.append(out_combined_vcf_file)
        t = multiprocessing.Process(
            target=small_deletion_dection_from_interval_list,
            args=(i, n_threads, samtools, bedtools, fermikit_dir,
                  input_bam_file, ref_fasta_file, out_dir, window_size,
                  max_depth, interval_list, out_combined_vcf_file))
        process_list.append(t)
        t.start()

    for t in process_list:
        t.join()

    all_processes_out_combined_vcf_file = os.path.join(
        out_dir, 'local_assembly_raw_variants.txt')

    # Concatenate the per-worker outputs without a shell. Unlike `cat`, this
    # does not block on stdin when there are no input files and raises if a
    # worker's output is missing instead of silently producing partial data.
    my_utils.myprint('concatenating %d files into: %s' %
                     (len(out_combined_vcf_file_list),
                      all_processes_out_combined_vcf_file))
    with open(all_processes_out_combined_vcf_file, 'wb') as merged_fp:
        for out_combined_vcf_file in out_combined_vcf_file_list:
            with open(out_combined_vcf_file, 'rb') as part_fp:
                shutil.copyfileobj(part_fp, merged_fp)

    extract_del_from_vcf_file(all_processes_out_combined_vcf_file,
                              out_del_call_file)

    if rm_temp_files:
        for out_combined_vcf_file in out_combined_vcf_file_list:
            os.remove(out_combined_vcf_file)
        os.remove(all_processes_out_combined_vcf_file)

    return
Exemple #7
0
def filter_calls(args, dbo_args, endpoint_args):
    """Run all SV filters and write the passing calls to the filtered BEDPE.

    Candidate calls are read from ``args.merged_bedpe_file`` and every call
    starts with ``ft = '.'``. The filters are applied in this order:
    ``filter_1d_blacklist``, ``filter_low_mapq_gaps``, ``filter_calls_2d``,
    ``filter_dbo_score``, ``filter_read_depth``, ``filter_sv_length``.
    Calls whose ``ft`` is still ``'.'`` afterwards are marked ``'PASS'``,
    given a zero-padded id (``ID001``, ...) and written, below a header
    line, to ``args.filter_bedpe_file``.

    A missing blacklist or gap-region file is tolerated (an empty lookup is
    used); an error message is printed if the reference version is one of
    hg19, hg38 or b37.

    Args:
        args: global parameter object (file paths, ``ref_version``, ...).
        dbo_args: not used by this function.
        endpoint_args: forwarded to ``filter_low_mapq_gaps``.

    Returns:
        None.
    """

    my_utils.myprint('filtering SV calls')

    bin_size = 100
    known_ref_versions = ('hg19', 'hg38', 'b37')
    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(args.faidx_file)
    alt_chr_name_set = my_utils.read_alternative_contig_file(args.alt_ctg_file)

    if os.path.exists(args.black_region_bed_file):
        my_utils.myprint('reading black region bed file: %s' %
                         args.black_region_bed_file)
        black_reg_dict = read_black_reg_bed_file(args.black_region_bed_file,
                                                 bin_size)
    else:
        if args.ref_version in known_ref_versions:
            # The path lives on ``args``; the bare name used previously was
            # undefined and raised NameError instead of printing this.
            my_utils.myprint('ERROR! black list file is missing: %s' %
                             args.black_region_bed_file)
        black_reg_dict = dict()

    if os.path.exists(args.gap_region_bed_file):
        my_utils.myprint('reading gap region bed file: %s' %
                         args.gap_region_bed_file)
        gap_left_region_dict, gap_right_region_dict = read_gap_region_file(
            args.gap_region_bed_file, bin_size)
    else:
        if args.ref_version in known_ref_versions:
            my_utils.myprint('ERROR! gap region file is missing: %s' %
                             args.gap_region_bed_file)
        gap_left_region_dict = dict()
        gap_right_region_dict = dict()

    raw_svcall_list = my_utils.read_object_file(args.merged_bedpe_file,
                                                bedpe.QuantifiedBKCandCore)
    for svcall in raw_svcall_list:
        svcall.ft = '.'

    round1_retained_sv_list = filter_1d_blacklist(
        raw_svcall_list, black_reg_dict, alt_chr_name_set,
        gap_left_region_dict, gap_right_region_dict, bin_size)

    n_retained_sv = sum(1 for svcall in round1_retained_sv_list
                        if svcall.ft == '.')
    my_utils.myprint('number of retained SVs: %d' % n_retained_sv)

    round2_retained_sv_list = filter_low_mapq_gaps(round1_retained_sv_list,
                                                   endpoint_args,
                                                   chrname2tid_dict)

    # The 2D filter is told to strip the chromosome prefix for b37 only.
    remove_chr_prefix = (args.ref_version == 'b37')

    round3_retained_sv_list = filter_calls_2d(round2_retained_sv_list,
                                              args.black_region_2d_file,
                                              args.filter_bedpe_file,
                                              remove_chr_prefix)

    round4_retained_sv_list = filter_dbo_score(round3_retained_sv_list, args)

    round5_retained_sv_list = filter_read_depth(round4_retained_sv_list, args)

    final_retained_sv_list = filter_sv_length(round5_retained_sv_list, args)

    n_retained_sv = sum(1 for svcall in final_retained_sv_list
                        if svcall.ft == '.')
    my_utils.myprint('number of retained SVs: %d' % n_retained_sv)

    header = '#chrom1\tstart1\tstop1\tchrom2\tstart2\tstop2\t'
    header += 'sv_type\tsv_id\tsv_length\tqual_score\tfilter\tinfo\n'

    # Width of the numeric part of the id. log10(0) is undefined, so an
    # empty call list gets a fixed width (no ids are written in that case,
    # but the header-only output file must still be produced).
    n_svcall = len(final_retained_sv_list)
    if n_svcall > 0:
        n_digit = int(math.log10(n_svcall) + 2)
    else:
        n_digit = 2

    sv_id = 0
    with open(args.filter_bedpe_file, 'w') as out_fp:
        out_fp.write(header)
        for svcall in final_retained_sv_list:
            if svcall.ft != '.':
                continue
            svcall.ft = 'PASS'
            sv_id += 1
            svcall.sv_id = 'ID%s' % str(sv_id).zfill(n_digit)
            out_fp.write(svcall.output_core2() + endl)

    return