Example #1
0
    def read_bedpe_line(self, line, chrname2tid_dict=None):
        """Populate this record from one tab-separated BEDPE line.

        Columns read (0-based): 0-1 -> chrm1/pos1, 3-4 -> chrm2/pos2,
        6-11 -> sv_type, sv_id, sv_size, score, filter, aux_info.

        Args:
            line: One raw text line of the BEDPE file.
            chrname2tid_dict: Optional mapping from chromosome name to tid.
                When given, ``tid1``/``tid2`` are set for chromosome names
                found in the mapping and left untouched otherwise.

        Prints an error and exits the program with status 1 when the line has
        fewer than the 12 columns that are read below.
        """
        fields = line.strip().split(tab)
        # Columns 6..11 are unpacked further down, so 12 columns are required;
        # a shorter line would otherwise fail with an unpacking ValueError.
        if len(fields) < 12:
            my_utils.myprint(
                'ERROR! number of columns is less than 12. The line is:')
            my_utils.myprint(tab.join(fields))
            sys.exit(1)

        self.chrm1 = fields[0]
        self.pos1 = int(fields[1])
        self.chrm2 = fields[3]
        self.pos2 = int(fields[4])

        (self.sv_type, self.sv_id, self.sv_size, self.score, self.filter,
         self.aux_info) = fields[6:12]
        self.sv_size = int(self.sv_size)
        self.score = float(self.score)

        if chrname2tid_dict is not None:
            if self.chrm1 in chrname2tid_dict:
                self.tid1 = chrname2tid_dict[self.chrm1]
            if self.chrm2 in chrname2tid_dict:
                self.tid2 = chrname2tid_dict[self.chrm2]

        # aux_info is one column of a tab-split line, so this split yields a
        # single element; the flags below only match when the whole column
        # equals the flag.
        # NOTE(review): if several flags can share the column, a different
        # separator is probably intended here - confirm against the writer.
        for aux in self.aux_info.split(tab):
            if aux == 'SVMETHOD=local_assembly':
                self.assembled = True
            if aux == 'PRECISE':
                self.is_precise = True
            if aux == 'IMPRECISE':
                self.is_precise = False
def get_bin_size_from_bcd13_file(bcd13_file):
    """Estimate the bin size used in a bcd13 file.

    The second tab-separated column of every non-comment line is read as an
    integer position (reading stops once more than 1,000,000 positions have
    been collected). The result is the median of the positive differences
    between consecutive positions, converted with ``int``.

    Prints an error and exits with status 1 when no positive difference is
    found.
    """
    max_positions = 1000000
    positions = []

    reader = my_utils.gzopen(bcd13_file, 'r')
    while True:
        record = reader.readline()
        if not record:
            break
        if record[0] == '#':
            continue

        positions.append(int(record.strip().split(tab)[1]))
        if len(positions) > max_positions:
            break
    reader.close()

    # Only strictly increasing neighbours contribute to the estimate.
    steps = [
        current - previous
        for previous, current in zip(positions, positions[1:])
        if current - previous > 0
    ]

    if not steps:
        my_utils.myprint('Failed to get bin size from file: %s' % bcd13_file)
        sys.exit(1)

    return int(np.median(steps))
def detect_small_deletions(input_bam_file,
                           out_dir,
                           out_del_call_file,
                           n_threads,
                           ref_fasta_file,
                           fermikit_dir,
                           samtools,
                           bedtools,
                           in_weird_reads_file,
                           weird_reads_cluster_file,
                           call_small_deletions,
                           cal_hap_read_depth_from_bcd21,
                           bcd21_file,
                           bcd22_file,
                           hap_type_read_depth_file,
                           gap_region_bed_file,
                           rm_temp_files=1):
    """Run the small-deletion steps in sequence and merge their calls.

    Steps, in order:
      1. ``local_assembly.small_deletion_dection_by_local_assembly`` is given
         ``<out_dir>/local_assembly.del.bedpe`` as its output path.
      2. ``cluster_weird_reads.cluster_weird_reads`` is called with
         ``in_weird_reads_file`` and ``weird_reads_cluster_file``.
      3. The ``cal_hap_read_depth_from_bcd21`` command is run through the shell.
      4. The ``call_small_deletions`` command is run through the shell and is
         given ``<out_dir>/discordant_read_pairs.del.bedpe`` as last argument.
      5. ``merge_sv_calls`` is called with both BEDPE paths and
         ``out_del_call_file``.

    When ``rm_temp_files`` is truthy the two intermediate BEDPE files are
    removed afterwards. The ``<ref_fasta_file>.fai`` path is used as the
    FASTA index throughout.
    """
    # Fixed settings of this step.
    assembly_window_size = int(2e5)
    assembly_max_depth = 500
    depth_bin_size = 100
    min_mapq = 20

    fai_path = ref_fasta_file + '.fai'
    assembly_bedpe = os.path.join(out_dir, 'local_assembly.del.bedpe')
    read_pair_bedpe = os.path.join(out_dir,
                                   'discordant_read_pairs.del.bedpe')

    local_assembly.small_deletion_dection_by_local_assembly(
        samtools, bedtools, fermikit_dir, input_bam_file, ref_fasta_file,
        fai_path, out_dir, assembly_bedpe, n_threads, assembly_window_size,
        assembly_max_depth, rm_temp_files)

    cluster_weird_reads.cluster_weird_reads(in_weird_reads_file,
                                            weird_reads_cluster_file,
                                            fai_path)

    depth_args = (cal_hap_read_depth_from_bcd21, bcd21_file,
                  hap_type_read_depth_file, fai_path, depth_bin_size, min_mapq)
    depth_cmd = '%s %s %s %s %d %d' % depth_args
    my_utils.myprint(depth_cmd)
    os.system(depth_cmd)

    call_args = (call_small_deletions, hap_type_read_depth_file,
                 weird_reads_cluster_file, bcd22_file, fai_path,
                 gap_region_bed_file, read_pair_bedpe)
    call_cmd = '%s %s %s %s %s %s %s' % call_args
    my_utils.myprint(call_cmd)
    os.system(call_cmd)

    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(fai_path)
    merge_sv_calls(assembly_bedpe, read_pair_bedpe, out_del_call_file,
                   tid2chrname_list, chrname2tid_dict)

    if rm_temp_files:
        for temp_path in (assembly_bedpe, read_pair_bedpe):
            os.remove(temp_path)
Example #4
0
def filter_calls_2d(svcall_list,
                    black_list_file,
                    out_file,
                    remove_chr_prefix=False):
    """Mark SV calls whose breakpoint box contains many 2D-blacklist points.

    For every call whose ``ft`` is still ``'.'`` a square box of side
    ``box_length`` is placed at each breakpoint (ending at the position for
    an ``'R_end'`` endpoint, starting at it for an ``'L_end'`` endpoint).
    The number of blacklist points in the box pair is obtained from
    ``get_number_of_points_from_black_list_file``; when it is 20 or more the
    call's ``ft`` is set to ``'2D_BLACKLIST'``.

    Args:
        svcall_list: Calls to examine; modified in place.
        black_list_file: Path passed to ``read_2d_blacklist_file``.
        out_file: Not used by this function; kept for interface
            compatibility.
        remove_chr_prefix: Passed through to ``read_2d_blacklist_file``.

    Returns:
        The same ``svcall_list`` object.
    """
    black_list_2array_dict, bin_size = read_2d_blacklist_file(
        black_list_file, remove_chr_prefix)

    mean_fragment_length = 20000
    box_length = mean_fragment_length
    min_black_list_points = 20

    # Distance from the breakpoint back to the start of its box.
    box_offset_by_end_type = {'R_end': box_length, 'L_end': 0}

    for svcall in svcall_list:
        if svcall.ft != '.':
            continue

        end_type1 = svcall.endtype1
        end_type2 = svcall.endtype2
        if (end_type1 not in box_offset_by_end_type
                or end_type2 not in box_offset_by_end_type):
            # Previously an unknown end type left the box start undefined
            # (or silently reused the value of the previous call).
            my_utils.myprint(
                'WARNING! unknown endpoint type (%s, %s), skipped 2D '
                'blacklist filtering for this call' % (end_type1, end_type2))
            continue

        chr1 = svcall.chrm1
        chr2 = svcall.chrm2
        start1 = svcall.start1 - box_offset_by_end_type[end_type1]
        start2 = svcall.start2 - box_offset_by_end_type[end_type2]
        end1 = start1 + box_length
        end2 = start2 + box_length

        key1 = two_chr_to_key(chr1, chr2)
        key2 = two_chr_to_key(chr2, chr1)

        if key1 in black_list_2array_dict:
            pos1_list, pos2_list_list = black_list_2array_dict[key1]
            number_of_points = get_number_of_points_from_black_list_file(
                start1, end1, start2, end2, pos1_list, pos2_list_list,
                bin_size)
        elif key2 in black_list_2array_dict:
            # The blacklist stores this pair in the opposite order, so the
            # two boxes are swapped for the lookup.
            my_utils.myprint('switch chr1 and chr2')
            pos1_list, pos2_list_list = black_list_2array_dict[key2]
            number_of_points = get_number_of_points_from_black_list_file(
                start2, end2, start1, end1, pos1_list, pos2_list_list,
                bin_size)
        else:
            number_of_points = 0

        if number_of_points >= min_black_list_points:
            svcall.ft = '2D_BLACKLIST'

    return svcall_list
Example #5
0
def quantify_sv_candidates(args, dbo_args, endpoint_args):
    """Run the SV-candidate quantification step unless its output is reusable.

    The step is skipped only when ``args.run_from_begining`` equals False and
    ``my_utils.check_file_exists(args.quantified_bk_pair_file)`` equals True;
    otherwise ``quantify2bkcand.quantify2bkcand`` is called. A garbage
    collection pass is triggered at the end in both cases.
    """
    task = 'quantifying SV candidates'

    # Explicit comparisons with False/True are kept on purpose so that the
    # decision is the exact negation of the skip condition.
    must_run = (args.run_from_begining != False
                or my_utils.check_file_exists(
                    args.quantified_bk_pair_file) != True)

    if must_run:
        my_utils.myprint(task)
        quantify2bkcand.quantify2bkcand(args, dbo_args, endpoint_args)
    else:
        my_utils.myprint('quantified SV file existed, skipped %s' % (task))

    gc.collect()
Example #6
0
def merge_sv_calls(args, dbo_args, endpoint_args):
    """Run the call-merging step unless its output is reusable.

    The step is skipped only when ``args.run_from_begining`` equals False and
    ``my_utils.check_file_exists(args.merged_bedpe_file)`` equals True;
    otherwise ``merge_quantified_calls.merge_quantified_calls`` is called.
    A garbage collection pass is triggered at the end in both cases.
    """
    task = 'merging SV candidates'

    # Explicit comparisons with False/True are kept on purpose so that the
    # decision is the exact negation of the skip condition.
    must_run = (args.run_from_begining != False
                or my_utils.check_file_exists(args.merged_bedpe_file) != True)

    if must_run:
        my_utils.myprint(task)
        merge_quantified_calls.merge_quantified_calls(args, dbo_args,
                                                      endpoint_args)
    else:
        my_utils.myprint('merged bedpe file existed, skipped %s' % (task))

    gc.collect()
Example #7
0
def read_2d_blacklist_file(black_list_file, remove_chr_prefix):
    """Load a gzip-compressed 2D blacklist file.

    Line types, as parsed here:
      * ``#...=<int>``: the integer after ``=`` is the bin size;
      * ``>chrA,chrB``: opens the section of a chromosome pair (a repeated
        pair restarts its section);
      * anything else: ``pos1<TAB>pos2,pos2,...`` belonging to the section
        opened last.
    Blank lines are ignored.

    Args:
        black_list_file: Path of the gzip file.
        remove_chr_prefix: When true, a leading ``chr`` is removed from both
            chromosome names before the section key is built. Names without
            that prefix are left unchanged.

    Returns:
        ``(black_list_2array_dict, bin_size)``. The dict maps
        ``two_chr_to_key(chr1, chr2)`` to a ``(pos1_list, pos2_list_list)``
        pair of parallel lists ordered by ``pos1``. ``bin_size`` is 0 when
        the file has no ``#`` line.

    Raises:
        ValueError: If a data line appears before any ``>`` header.
    """
    entries_by_key = dict()
    bin_size = 0
    key = None

    # The context manager closes the file even when a line fails to parse.
    with gzip.open(black_list_file, 'rt') as black_list_fp:
        for line in black_list_fp:
            if not line.strip():
                continue

            if line[0] == '#':
                bin_size = int(line.strip().split('=')[1])
                my_utils.myprint('bin_size = %d' % bin_size)
                continue

            if line[0] == '>':
                chr_list = line[1:].strip().split(',')
                chr1 = chr_list[0]
                chr2 = chr_list[1]

                if remove_chr_prefix:
                    # Only strip a real 'chr' prefix; slicing blindly would
                    # mangle names such as '1' or 'MT'.
                    if chr1.startswith('chr'):
                        chr1 = chr1[3:]
                    if chr2.startswith('chr'):
                        chr2 = chr2[3:]

                key = two_chr_to_key(chr1, chr2)
                entries_by_key[key] = list()
                continue

            if key is None:
                raise ValueError(
                    'data line found before any ">" header in file: %s' %
                    black_list_file)

            fields = line.strip().split(tab)
            pos1 = int(fields[0])
            pos2_list = [int(pos2) for pos2 in fields[1].split(',')]
            entries_by_key[key].append((pos1, pos2_list))

    black_list_2array_dict = dict()
    for key, entries in entries_by_key.items():
        # Sorting on pos1 only keeps the file order of equal positions.
        entries.sort(key=lambda entry: entry[0])
        black_list_2array_dict[key] = ([entry[0] for entry in entries],
                                       [entry[1] for entry in entries])

    return black_list_2array_dict, bin_size
Example #8
0
def plot_read_depth_for1region(chrom, tid, bk_pos1, bk_pos2, out_file,
                               figure_title, wg_high_mapq_depth_list,
                               wg_total_depth_list, chr_len_list, bin_size,
                               wg_avg_depth):
    """Plot binned read depth around one pair of breakpoints and save it.

    The window spans one SV length on each side of the breakpoints, clipped
    to ``[0, chr_len_list[tid]]``. Total depth is drawn in grey, high-mapq
    depth in black, the breakpoints as red dashed vertical lines and
    ``wg_avg_depth`` as a blue dashed horizontal line. The y axis goes up to
    three times the mean total depth of the window, but at least
    ``2 * wg_avg_depth``.

    Args:
        chrom: Chromosome name, used in the x-axis label.
        tid: Index into the per-chromosome depth and length lists.
        bk_pos1, bk_pos2: Breakpoint positions (swapped if out of order).
        out_file: Path of the image to write.
        figure_title: Title of the figure.
        wg_high_mapq_depth_list, wg_total_depth_list: Per-chromosome lists of
            per-bin depth values.
        chr_len_list: Per-chromosome lengths.
        bin_size: Width of one depth bin.
        wg_avg_depth: Depth drawn as the horizontal reference line.

    Nothing is drawn (a warning is printed) when the depth lists hold no bin
    inside the window.
    """
    if bk_pos2 < bk_pos1:
        bk_pos1, bk_pos2 = bk_pos2, bk_pos1

    sv_len = bk_pos2 - bk_pos1

    win_start = max(0, bk_pos1 - sv_len)
    win_end = min(chr_len_list[tid], bk_pos2 + sv_len)

    win_start_idx = int(win_start / bin_size)
    win_end_idx = int(win_end / bin_size) + 1

    y1 = wg_high_mapq_depth_list[tid][win_start_idx:win_end_idx]
    y2 = wg_total_depth_list[tid][win_start_idx:win_end_idx]

    # The depth lists may end before the window does; x must have exactly
    # as many points as the y series or plotting fails.
    n_bins = min(len(y1), len(y2))
    if n_bins == 0:
        my_utils.myprint(
            'WARNING! no read depth data in the plotting window, '
            'skipped figure: %s' % out_file)
        return
    y1 = y1[:n_bins]
    y2 = y2[:n_bins]
    x = [(win_start_idx + i) * bin_size for i in range(n_bins)]

    ymax = np.mean(y2) * 3
    if ymax < wg_avg_depth * 2:
        ymax = wg_avg_depth * 2

    plt.figure(figsize=(10, 5))
    plt.title(figure_title)

    plt.xlabel('%s position' % chrom)
    plt.ylabel('Read depth')

    plt.plot(x, y2, '-', color='grey')
    plt.plot(x, y1, '-', color='black')

    plt.axis([win_start, win_end, 0, ymax])
    plt.axvline(x=bk_pos1, color='r', linestyle='--')
    plt.axvline(x=bk_pos2, color='r', linestyle='--')
    plt.axhline(y=wg_avg_depth, color='b', linestyle='--')
    plt.ticklabel_format(axis='both', style='plain')
    if sv_len > 0:
        # A tick step of 0 (identical breakpoints) is not a valid arange step.
        plt.xticks(np.arange(min(x), max(x) + 1, sv_len))
    plt.rcParams.update({'font.size': 16})
    # Save before show(): once show() returns the figure may already be
    # closed, and saving afterwards can write an empty image.
    plt.savefig(out_file, dpi=200)
    plt.show()
    plt.close('all')
    my_utils.myprint('saved figure: %s' % out_file)

    return
Example #9
0
def fermikit_variant_calling(fermikit_dir, samtools, n_threads_for_one_process,
                             region_fasta_file, window_size, input_fastq_file,
                             curr_out_dir, out_prefix):
    """Run the fermikit assembly and calling commands for one region.

    Four shell commands are printed and executed in order: ``bwa index`` on
    the region FASTA, ``fermi2.pl unitig`` (its output is redirected to the
    ``.mak`` file), ``make -f`` on that file, and ``run-calling`` piped into
    ``sh``. Exit statuses of the commands are not inspected. ``samtools`` is
    accepted but not used here.
    """
    mak_path = os.path.join(curr_out_dir, '%s.mak' % out_prefix)
    contigs_path = os.path.join(curr_out_dir, '%s.mag.gz' % out_prefix)

    index_cmd = 'cd %s && %s/bwa index %s' % (curr_out_dir, fermikit_dir,
                                              region_fasta_file)
    unitig_cmd = 'cd %s && perl %s/fermi2.pl unitig -s %s -l 151 -t %d -p %s %s > %s\n\n' % (
        curr_out_dir, fermikit_dir, window_size, n_threads_for_one_process,
        out_prefix, input_fastq_file, mak_path)
    make_cmd = 'make -f %s\n\n' % mak_path
    calling_cmd = 'cd %s && perl %s/run-calling -t %d %s %s | sh \n\n' % (
        curr_out_dir, fermikit_dir, n_threads_for_one_process,
        region_fasta_file, contigs_path)

    for shell_cmd in (index_cmd, unitig_cmd, make_cmd, calling_cmd):
        my_utils.myprint(shell_cmd)
        os.system(shell_cmd)
Example #10
0
def plot_depth(cal_read_depth_from_bcd21_binary, bcd21_file, in_svcalls_list,
               faidx_file, chr_len_list, tid2chrname_list, chrname2tid_dict,
               out_dir, out_prefix):
    """Compute binned read depth and plot it for every intra-chromosomal call.

    The external binary is run on ``bcd21_file`` to produce a depth file in
    ``<out_dir>/read_depth``; the whole-genome average of the high-mapq depth
    is computed from it, and one PNG per call with ``chrm1 == chrm2`` is
    written through ``plot_read_depth.plot_read_depth_for1region``. The depth
    file is removed at the end.

    Returns without plotting (after printing an error) when the binary, the
    bcd21 file or the FASTA index is missing, or when the depth file holds no
    bins. ``tid2chrname_list`` and ``chrname2tid_dict`` are accepted but not
    used here.
    """
    required_files = (
        ('binary', cal_read_depth_from_bcd21_binary),
        ('bcd21', bcd21_file),
        ('fasta index', faidx_file),
    )
    for label, path in required_files:
        if not os.path.exists(path):
            # '\n' replaces the former invalid escape '\F' in this message.
            my_utils.myprint(
                'ERROR! The %s file doesn\'t exist:%s\nFailed to plot read depth'
                % (label, path))
            return

    out_dir = os.path.join(out_dir, 'read_depth')
    my_utils.make_dir(out_dir)

    bin_size = 500
    read_depth_file = os.path.join(out_dir, '%s.read_depth.txt' % out_prefix)
    cmd_args_list = [
        cal_read_depth_from_bcd21_binary, bcd21_file, read_depth_file,
        faidx_file, str(bin_size), '20'
    ]
    my_utils.myprint('calculating read depth from file: %s' % bcd21_file)
    subprocess.call(cmd_args_list)

    my_utils.myprint('plotting read depth')
    # bin_size is replaced by the value found in the depth file.
    wg_high_mapq_depth_list, wg_total_depth_list, bin_size = plot_read_depth.get_wg_depth_list(
        read_depth_file, chr_len_list)

    wg_n_bin = sum(len(depth_list) for depth_list in wg_high_mapq_depth_list)
    if wg_n_bin == 0:
        # Without any bin the average depth is undefined.
        my_utils.myprint(
            'ERROR! no read depth data in file: %s\nFailed to plot read depth'
            % read_depth_file)
        os.remove(read_depth_file)
        return

    wg_total_depth = sum(depth for depth_list in wg_high_mapq_depth_list
                         for depth in depth_list)
    wg_avg_depth = float(wg_total_depth) / wg_n_bin

    for svcall in in_svcalls_list:
        if svcall.chrm1 != svcall.chrm2:
            continue
        out_file = os.path.join(
            out_dir, '%s.%s.read_depth.png' % (out_prefix, svcall.sv_id))
        figure_title = 'Read depth (%s, %d bp %s)' % (
            svcall.sv_id, svcall.end2 - svcall.start1, svcall.svtype)
        plot_read_depth.plot_read_depth_for1region(
            svcall.chrm1, svcall.tid1, svcall.start1, svcall.end2, out_file,
            figure_title, wg_high_mapq_depth_list, wg_total_depth_list,
            chr_len_list, bin_size, wg_avg_depth)

    os.remove(read_depth_file)

    return
Example #11
0
def get_wg_depth_list(in_depth_file, chr_len_list):
    """Read a binned depth file into per-chromosome depth lists.

    Each non-comment line is split on tabs: column 0 is the chromosome index
    (tid), columns 1 and 2 are the bin start and end, column 3 is appended to
    the high-mapq list and column 4 to the total-depth list of that tid.

    The bin size is ``end - start`` of the first data line. When it is
    smaller than 10 an error is printed and the program exits with status 1.
    If the file has no data line the bin size is 1.

    Args:
        in_depth_file: Path opened with ``my_utils.gzopen``.
        chr_len_list: Only its length is used, as the number of chromosomes.

    Returns:
        ``(wg_high_mapq_depth_list, wg_total_depth_list, bin_size)``.
    """
    # Threshold enforced below. NOTE(review): the former error text said
    # "< 1" while the check was "< 10"; the text now matches the check -
    # confirm that 10 is the intended minimum.
    min_bin_size = 10

    n_chr = len(chr_len_list)
    wg_high_mapq_depth_list = [list() for _ in range(n_chr)]
    wg_total_depth_list = [list() for _ in range(n_chr)]

    bin_size = 1
    bin_size_known = False

    in_depth_fp = my_utils.gzopen(in_depth_file, 'r')
    try:
        # Single pass: the bin size comes from the first data line, so the
        # file does not need to be rewound and read a second time.
        while True:
            line = in_depth_fp.readline()
            if not line:
                break
            if line[0] == '#':
                continue

            fields = line.strip().split(tab)

            if not bin_size_known:
                bin_size = int(fields[2]) - int(fields[1])
                if bin_size < min_bin_size:
                    my_utils.myprint(
                        'ERROR! bin_size < 10 in depth file: %s ' %
                        in_depth_file)
                    sys.exit(1)
                bin_size_known = True

            tid = int(fields[0])
            wg_high_mapq_depth_list[tid].append(float(fields[3]))
            wg_total_depth_list[tid].append(float(fields[4]))
    finally:
        # Also runs on the sys.exit() path above.
        in_depth_fp.close()

    my_utils.myprint('finished reading file: %s' % in_depth_file)
    return wg_high_mapq_depth_list, wg_total_depth_list, bin_size
Example #12
0
def plot_heatmap(in_svcall_list, bcd21_file, faidx_file, out_dir, flank_dist,
                 chr_len_list, tid2chrname_list, chrname2tid_dict,
                 cal_2d_overlapping_barcodes_binary, out_prefix):
    """Plot 2D heat maps of overlapping barcodes for the given SV calls.

    Target regions are generated from the calls and written to
    ``<out_dir>/2D_heatmap/target_region.bedpe``; the external binary is run
    on them, and its output file is handed to
    ``plot_2d_barcodes.plot_2d_overlapping_barcodes``.

    Returns early, after printing an error, when the binary does not exist.
    ``tid2chrname_list`` is accepted but not used here.
    """
    binary = cal_2d_overlapping_barcodes_binary
    if not os.path.exists(binary):
        my_utils.myprint('ERROR! The binary file doesn\'t exist: %s' % binary)
        my_utils.myprint('Skipped plotting the heat maps')
        return

    heatmap_dir = os.path.join(out_dir, '2D_heatmap')
    my_utils.make_dir(heatmap_dir)

    my_utils.myprint('plotting heat maps of overlapping barcodes')

    # Settings passed to the binary and to the plotting routine.
    bin_size = 1000
    max_ovl_num = 100

    regions = generate_target_region_bedpe_list(in_svcall_list, chr_len_list,
                                                flank_dist, chrname2tid_dict)
    regions_path = os.path.join(heatmap_dir, 'target_region.bedpe')
    regions_fp = my_utils.gzopen(regions_path, 'w')
    for region in regions:
        regions_fp.write(region.output_svcall() + endl)
    regions_fp.close()

    overlap_path = os.path.join(
        heatmap_dir, '%s.2d_heatmap.with_low_mapq_reads.txt' % out_prefix)

    subprocess.call([
        binary, bcd21_file, regions_path, overlap_path, faidx_file,
        str(bin_size), '1'
    ])
    plot_2d_barcodes.plot_2d_overlapping_barcodes(overlap_path, regions,
                                                  bin_size, max_ovl_num,
                                                  heatmap_dir, out_prefix)
Example #13
0
def plot_twin_window_barcode_similarity_for1region(chrom, tid, reg_start,
                                                   reg_end, bk_pos1, bk_pos2,
                                                   out_file, figure_title,
                                                   wg_pvalue_list, bin_size):
    """Plot the per-bin values of ``wg_pvalue_list[tid]`` over one region.

    Bins from ``int(reg_start / bin_size)`` to ``int(reg_end / bin_size)``
    (inclusive) are drawn as a black line, limited to the bins that exist in
    ``wg_pvalue_list[tid]``. The two breakpoints are drawn as red dashed
    vertical lines and the figure is saved to ``out_file``.

    Nothing is drawn (a warning is printed) when no bin of the region is
    available.
    """
    pvalue_list = wg_pvalue_list[tid]

    min_x_idx = int(reg_start / bin_size)
    # Stop at the end of the available data so that x and y always have the
    # same number of points (an extra x used to be added before stopping).
    max_x_idx = min(int(reg_end / bin_size) + 1, len(pvalue_list))

    x_list = list()
    y_list = list()
    for idx in range(min_x_idx, max_x_idx):
        x_list.append(idx * bin_size)
        y_list.append(pvalue_list[idx])

    if not x_list:
        my_utils.myprint(
            'WARNING! no data in the plotting region, skipped figure: %s' %
            out_file)
        return

    xmin = min(x_list)
    xmax = max(x_list)
    x_range = xmax - xmin
    ymax = max(y_list)

    plt.figure(figsize=(10, 5))
    plt.title(figure_title)
    plt.xlabel('%s position' % chrom)
    plt.ylabel('-log10(P-value)')
    plt.plot(x_list, y_list, '-', color='black')
    plt.axis([xmin, xmax, 0, ymax])
    plt.axvline(x=bk_pos1, color='r', linestyle='--')
    plt.axvline(x=bk_pos2, color='r', linestyle='--')
    if x_range > 0:
        # A single bin gives a zero range, which is not a valid tick step.
        plt.xticks(np.arange(xmin, xmax + 1, x_range / 4))
    # Save before show(): once show() returns the figure may already be
    # closed, and saving afterwards can write an empty image.
    plt.savefig(out_file, dpi=200)
    plt.show()
    plt.close('all')
    my_utils.myprint('saved figure: %s' % out_file)

    return
Example #14
0
def cluster_weird_reads(in_weird_reads_file, out_file, faidx_file):
    """Read the weird-reads file and cluster its reads into ``out_file``.

    Chromosome names come from ``faidx_file``. ``out_file`` is created (or
    emptied) before ``cluster_weird_reads1type`` is called with it.
    """
    # Fixed clustering settings.
    min_sv_length = 1000
    max_distance = 300
    min_n_short_read_supp = 2
    max_n_short_read_supp = 1000

    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(faidx_file)

    my_utils.myprint('reading file: %s' % in_weird_reads_file)
    weird_reads = read_weird_reads_file(in_weird_reads_file, chrname2tid_dict,
                                        min_sv_length)
    my_utils.myprint('finished reading file: %s' % in_weird_reads_file)

    # Start from an empty output file.
    with open(out_file, 'w') as empty_fp:
        empty_fp.write('')

    my_utils.myprint('clustering discordant reads')
    cluster_weird_reads1type(weird_reads, out_file, min_n_short_read_supp,
                             max_distance, tid2chrname_list, chrname2tid_dict,
                             max_n_short_read_supp)
Example #15
0
 def size(self):
     """Return ``pos2 - pos1`` when both ends are on the same chromosome.

     Prints an error and exits the program when ``chrm1`` and ``chrm2``
     differ.
     """
     if self.chrm1 != self.chrm2:
         my_utils.myprint('ERROR! chrm1 != chrm2!')
         sys.exit()
     return self.pos2 - self.pos1
Example #16
0
 def key2(self):
     """Return ``tid2 * my_utils.FIX_LENGTH + pos2``.

     Prints an error and exits the program when ``tid2`` is negative.
     """
     if self.tid2 >= 0:
         return self.tid2 * my_utils.FIX_LENGTH + self.pos2
     my_utils.myprint('ERROR! tid2 < 0')
     sys.exit()
Example #17
0
def small_deletion_dection_by_local_assembly(samtools,
                                             bedtools,
                                             fermikit_dir,
                                             input_bam_file,
                                             ref_fasta_file,
                                             faidx_file,
                                             out_dir,
                                             out_del_call_file,
                                             n_threads,
                                             window_size,
                                             max_depth,
                                             rm_temp_files=1):
    """Detect small deletions by local assembly, using several processes.

    Steps:
      1. Create the FASTA index with ``samtools faidx`` if ``faidx_file`` is
         missing; exit the program if it is still missing afterwards.
      2. Create ``out_dir`` and build the interval list (the overlap between
         intervals is one tenth of ``window_size``).
      3. Start ``n_threads`` processes running
         ``small_deletion_dection_from_interval_list``, each with its own
         ``assembly_raw_variants.<i>.txt`` output path, and wait for all.
      4. Concatenate the per-process files with ``cat`` and pass the result
         to ``extract_del_from_vcf_file`` together with ``out_del_call_file``.
      5. Remove the intermediate files when ``rm_temp_files`` is truthy.
    """
    if not os.path.exists(faidx_file):
        index_cmd = '%s faidx %s' % (samtools, ref_fasta_file)
        my_utils.myprint(index_cmd)
        os.system(index_cmd)

    if not os.path.exists(faidx_file):
        my_utils.myprint(
            'ERROR! The index file of the reference fasta file does not exist!'
        )
        sys.exit()

    mkdir_cmd = 'mkdir -p %s' % out_dir
    my_utils.myprint(mkdir_cmd)
    os.system(mkdir_cmd)

    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(faidx_file)
    chr_len_list = my_utils.get_chr_length(faidx_file)

    overlap_length = int(window_size / 10)
    interval_list = generate_interval_list(chr_len_list, tid2chrname_list,
                                           chrname2tid_dict, window_size,
                                           overlap_length)

    # One raw-variant file per worker process.
    worker_out_files = [
        os.path.join(out_dir, 'assembly_raw_variants.%d.txt' % worker_id)
        for worker_id in range(0, n_threads)
    ]

    workers = list()
    for worker_id, worker_out_file in enumerate(worker_out_files):
        worker = multiprocessing.Process(
            target=small_deletion_dection_from_interval_list,
            args=(worker_id, n_threads, samtools, bedtools, fermikit_dir,
                  input_bam_file, ref_fasta_file, out_dir, window_size,
                  max_depth, interval_list, worker_out_file))
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()

    combined_file = os.path.join(out_dir, 'local_assembly_raw_variants.txt')
    cat_cmd = 'cat ' + ''.join(' %s ' % path for path in worker_out_files)
    cat_cmd += ' > %s ' % combined_file
    my_utils.myprint(cat_cmd)
    os.system(cat_cmd)

    extract_del_from_vcf_file(combined_file, out_del_call_file)

    if rm_temp_files:
        for path in worker_out_files:
            os.remove(path)
        os.remove(combined_file)
Example #18
0
def extract_del_from_vcf_file(in_vcf_file, out_file):
    """Extract deletion calls from a region-based VCF file into BEDPE lines.

    The CHROM column of every record is expected to look like
    ``<chrom>:<start>-<end>``; ``<start>`` is added to POS (and to the INFO
    ``END`` value) to obtain the output coordinates, and ``<chrom>`` becomes
    the output chromosome. The chromosome name itself may contain ``:``.

    A record is reported as a deletion when
      * REF is longer than 50 bases and more than 50 bases longer than ALT
        (size = length difference, end = start + size), or
      * its INFO column has ``SVTYPE=DEL`` (size from ``SVLEN``, end from the
        first ``END`` entry; the end stays -1 if there is no ``END``).
    Records whose ALT contains ``[`` or ``]`` are skipped, as are header
    lines and blank lines.

    Every output line has 12 tab-separated columns:
    ``chrom, pos1, pos1+1, chrom, pos2, pos2+1, DEL, ., size, 30, PASS,
    SVMETHOD=local_assembly``.

    Records that cannot be parsed (too few columns, non-integer POS, CHROM
    not in the expected form) are reported with ``my_utils.myprint`` and
    skipped.
    """
    min_del_size = 50
    score = 30
    sv_id = '.'
    flt = 'PASS'

    with open(in_vcf_file, 'r') as in_vcf_fp, open(out_file, 'w') as out_fp:
        for raw_line in in_vcf_fp:
            line = raw_line.strip()
            # A blank line must not end the parsing: the input is a
            # concatenation of many per-region files.
            if not line or line[0] == '#':
                continue

            items = line.split('\t')
            if len(items) < 8:
                my_utils.myprint('ERROR! invalid VCF record: %s' % line)
                continue

            try:
                pos1 = int(items[1])
            except ValueError:
                my_utils.myprint('ERROR! invalid VCF record: %s' % line)
                continue

            ref_allele = items[3]
            alt_allele = items[4]
            info = items[7]

            if '[' in alt_allele or ']' in alt_allele:
                continue

            try:
                # Split on the last ':' so that contig names containing ':'
                # are kept intact.
                chrom1, ref_start_end = items[0].rsplit(':', 1)
                ref_start, _ref_end = ref_start_end.split('-')
                ref_start = int(ref_start)
            except ValueError:
                my_utils.myprint('ERROR! invalid VCF record: %s' % line)
                continue

            pos1 += ref_start

            sv_type = ''
            sv_size = 0
            pos2 = -1

            if (len(ref_allele) > min_del_size
                    and len(ref_allele) - len(alt_allele) > min_del_size):
                sv_type = 'DEL'
                sv_size = len(ref_allele) - len(alt_allele)
                pos2 = pos1 + sv_size
            else:
                for ele in info.split(';'):
                    key, _, value = ele.partition('=')
                    if key == 'SVTYPE':
                        sv_type = value
                    elif key == 'SVLEN':
                        sv_size = abs(int(value))
                    elif key == 'END' and pos2 == -1:
                        pos2 = int(value) + ref_start

            if sv_type != 'DEL':
                continue

            chrom2 = chrom1
            out_item = '%s\t%d\t%d\t%s\t%d\t%d\t' % (chrom1, pos1, pos1 + 1,
                                                     chrom2, pos2, pos2 + 1)
            out_item += '%s\t%s\t%d\t%d\t%s\tSVMETHOD=local_assembly\n' % (
                sv_type, sv_id, sv_size, score, flt)

            out_fp.write(out_item)

    return
Example #19
0
def filter_low_mapq_gaps(input_sv_list, endpoint_args, chrname2tid_dict):
    """Mark calls whose supporting barcodes have many low-mapq reads nearby.

    Only calls whose ``ft`` is ``'.'`` are examined. The low-mapq bcd21 file
    is read and its records are grouped by barcode, restricted to barcodes
    that support at least one examined call. For every examined call, a
    supporting barcode is counted when one of its records lies strictly
    inside one of the two regions of the call:

      * for ``svtype == 'DEL'`` both regions are the interval between the two
        breakpoints;
      * otherwise each region is 10 kb long and ends at the breakpoint for an
        ``'L_end'`` endpoint, or starts at it for any other endpoint type.

    With ``ratio = counted barcodes / num_fragment_support``, the call keeps
    ``ft == '.'`` only if ``ratio < 0.2`` and ``score * (1 - ratio) > 20``;
    otherwise ``ft`` becomes ``'LOW_MAPQ_BETWEEN_BK'``.

    Returns ``input_sv_list`` (modified in place). The list is returned
    unchanged, after a warning, when the low-mapq bcd21 file does not exist.
    """
    # Barcodes supporting any unfiltered call -> their low-mapq records.
    records_by_barcode = dict()
    for svcall in input_sv_list:
        if svcall.ft != '.':
            continue
        for bcd in svcall.support_barcodes.rstrip(',').split(','):
            records_by_barcode[bcd] = list()

    bcd21_path = endpoint_args.low_mapq_bcd21_file
    if not os.path.exists(bcd21_path):
        my_utils.myprint('WARNING! low mapq bcd21 file does not exist.')
        return input_sv_list

    my_utils.myprint('reading low mapq bcd21 file: %s' % bcd21_path)
    bcd21_fp = my_utils.gzopen(bcd21_path, 'rt')
    n_records = 0
    while True:
        line = bcd21_fp.readline()
        if not line:
            break
        if line[0] == '#':
            continue
        n_records += 1

        record = Bcd21Core(line.strip().split(tab))
        if record.bcd in records_by_barcode:
            records_by_barcode[record.bcd].append(record)
        if n_records % 10000000 == 0:
            my_utils.myprint('processed %d reads' % n_records)
    bcd21_fp.close()

    for records in records_by_barcode.values():
        records.sort(key=lambda record: record.key_start())

    my_utils.myprint('finished reading low mapq bcd21 file: %s' % bcd21_path)

    # For deletions the region is the deleted interval; for other SV types
    # it is a 10 kb window at either breakpoint.
    region_size = 10 * 1000

    for svcall in input_sv_list:
        if svcall.ft != '.':
            continue

        barcodes = svcall.support_barcodes.rstrip(',').split(',')
        tid1 = chrname2tid_dict[svcall.chrm1]
        tid2 = chrname2tid_dict[svcall.chrm2]

        shift1 = region_size if svcall.endtype1 == 'L_end' else 0
        shift2 = region_size if svcall.endtype2 == 'L_end' else 0
        start_key1 = tid1 * my_utils.FIX_LENGTH + svcall.start1 - shift1
        start_key2 = tid2 * my_utils.FIX_LENGTH + svcall.start2 - shift2
        end_key1 = start_key1 + region_size
        end_key2 = start_key2 + region_size

        if svcall.svtype == 'DEL':
            start_key1 = tid1 * my_utils.FIX_LENGTH + svcall.start1
            end_key1 = tid2 * my_utils.FIX_LENGTH + svcall.start2
            start_key2 = start_key1
            end_key2 = end_key1

        # A barcode counts once, as soon as one of its records is in range.
        n_low_mapq_bcd = 0
        for bcd in barcodes:
            if bcd not in records_by_barcode:
                continue
            if any((record.key_start() > start_key1
                    and record.key_end() < end_key1) or
                   (record.key_start() > start_key2
                    and record.key_end() < end_key2)
                   for record in records_by_barcode[bcd]):
                n_low_mapq_bcd += 1

        ratio_low_mapq_bcd = float(n_low_mapq_bcd) / float(
            svcall.num_fragment_support)
        passes = (ratio_low_mapq_bcd < 0.2
                  and svcall.score * (1 - ratio_low_mapq_bcd) > 20)
        if not passes:
            svcall.ft = 'LOW_MAPQ_BETWEEN_BK'

    return input_sv_list
Example #20
0
def process1region(samtools, bedtools, fermikit_dir, ref_fasta_file,
                   input_bam_file, out_dir, itv, region_id, window_size,
                   max_depth, n_threads_for_one_process,
                   out_combined_vcf_file):
    """Assemble and call variants for one interval, appending to a shared file.

    A working directory ``<out_dir>/region_<id>`` is created. The reads of
    the interval are extracted from the BAM file, indexed and converted to
    FASTQ. The region is dropped (directory removed, nothing appended) when
    the FASTQ file is larger than ``window_size * max_depth * 2`` bytes or
    smaller than 20000 bytes. Otherwise the reference sequence of the
    interval is extracted, ``fermikit_variant_calling`` is run, the two
    resulting VCF files are decompressed and appended to
    ``out_combined_vcf_file``, and the working directory is removed.

    Returns early when the working directory cannot be created.
    """

    def run_logged(shell_cmd):
        # Print the command, then execute it through the shell.
        my_utils.myprint(shell_cmd)
        os.system(shell_cmd)

    curr_out_dir = os.path.join(out_dir, 'region_%06d' % (region_id))
    out_bam_file = os.path.join(curr_out_dir, 'region_%06d.bam' % region_id)
    out_all_fastq_file = os.path.join(curr_out_dir,
                                      'region_%06d.all.fastq' % region_id)
    region_bed_file = os.path.join(curr_out_dir, 'region_%06d.bed' % region_id)
    region_fasta_file = os.path.join(curr_out_dir,
                                     'region_%06d.fasta' % region_id)

    # The interval string uses start_pos + 1, the BED line below start_pos.
    interval = '%s:%d-%d' % (itv.chrom, itv.start_pos + 1, itv.end_pos)

    mkdir_cmd = 'mkdir -p %s' % curr_out_dir
    run_logged(mkdir_cmd)
    time.sleep(0.05)

    # One silent retry if the directory is not there yet.
    if not os.path.exists(curr_out_dir):
        os.system(mkdir_cmd)
        time.sleep(1)

    if not os.path.exists(curr_out_dir):
        my_utils.myprint('Failed to creat directory: %s' % curr_out_dir)
        run_logged('rm -rf %s' % curr_out_dir)
        return

    run_logged(
        extract_bam_region(samtools, input_bam_file, interval, out_bam_file,
                           n_threads_for_one_process))
    run_logged(index_bam(samtools, out_bam_file))
    run_logged(bam_to_1fastq(samtools, out_bam_file, out_all_fastq_file))

    fastq_file_size = os.path.getsize(out_all_fastq_file)
    too_large = fastq_file_size > window_size * max_depth * 2
    if too_large or fastq_file_size < 20000:
        run_logged('rm -r %s' % curr_out_dir)
        return

    with open(region_bed_file, 'w') as region_bed_fp:
        region_bed_fp.write('%s\t%d\t%d\n' %
                            (itv.chrom, itv.start_pos, itv.end_pos))

    run_logged(
        extract_ref_region(bedtools, ref_fasta_file, region_bed_file,
                           region_fasta_file))

    out_prefix = os.path.join(curr_out_dir, 'region_%06d.all_hap' % region_id)

    fermikit_variant_calling(fermikit_dir, samtools, n_threads_for_one_process,
                             region_fasta_file, window_size,
                             out_all_fastq_file, curr_out_dir, out_prefix)

    indel_call_file = out_prefix + '.flt.vcf'
    sv_call_file = out_prefix + '.sv.vcf'

    # The remaining commands are executed without being printed.
    for vcf_file in (indel_call_file, sv_call_file):
        os.system('gunzip --force %s.gz' % vcf_file)

    os.system('cat %s %s >> %s' % (indel_call_file, sv_call_file,
                                   out_combined_vcf_file))

    os.system('rm -r %s' % curr_out_dir)
Example #21
0
def filter_calls(args, dbo_args, endpoint_args):
    """Run the SV filter chain and write the passing calls to a BEDPE file.

    Candidate calls are loaded from ``args.merged_bedpe_file``, the ``ft``
    field of every call is reset to '.', and the list is passed, in order,
    through filter_1d_blacklist, filter_low_mapq_gaps, filter_calls_2d,
    filter_dbo_score, filter_read_depth and filter_sv_length.  Calls whose
    ``ft`` is still '.' afterwards are marked 'PASS', given a zero-padded
    ``ID...`` identifier and written to ``args.filter_bedpe_file``.

    Parameters:
        args: run settings; the attributes read here are faidx_file,
            alt_ctg_file, black_region_bed_file, gap_region_bed_file,
            ref_version, merged_bedpe_file, black_region_2d_file and
            filter_bedpe_file.
        dbo_args: not used by this function; kept so the call signature
            stays unchanged.
        endpoint_args: forwarded to filter_low_mapq_gaps.

    Returns:
        None.  The result is the file written to ``args.filter_bedpe_file``.
    """

    def count_unfiltered(sv_list):
        # A call that no filter has tagged still carries ft == '.'.
        return sum(1 for svcall in sv_list if svcall.ft == '.')

    my_utils.myprint('filtering SV calls')

    bin_size = 100
    known_ref_versions = ('hg19', 'hg38', 'b37')
    tid2chrname_list, chrname2tid_dict = my_utils.get_chrnames(args.faidx_file)
    alt_chr_name_set = my_utils.read_alternative_contig_file(args.alt_ctg_file)

    if os.path.exists(args.black_region_bed_file):
        my_utils.myprint('reading black region bed file: %s' %
                         args.black_region_bed_file)
        black_reg_dict = read_black_reg_bed_file(args.black_region_bed_file,
                                                 bin_size)
    else:
        # The file is only reported as missing for these reference versions;
        # either way filtering continues with an empty black list.
        if args.ref_version in known_ref_versions:
            my_utils.myprint('ERROR! black list file is missing: %s' %
                             args.black_region_bed_file)
        black_reg_dict = dict()

    if os.path.exists(args.gap_region_bed_file):
        my_utils.myprint('reading gap region bed file: %s' %
                         args.gap_region_bed_file)
        gap_left_region_dict, gap_right_region_dict = read_gap_region_file(
            args.gap_region_bed_file, bin_size)
    else:
        if args.ref_version in known_ref_versions:
            my_utils.myprint('ERROR! gap region file is missing: %s' %
                             args.gap_region_bed_file)
        gap_left_region_dict = dict()
        gap_right_region_dict = dict()

    raw_svcall_list = my_utils.read_object_file(args.merged_bedpe_file,
                                                bedpe.QuantifiedBKCandCore)
    for svcall in raw_svcall_list:
        svcall.ft = '.'

    round1_retained_sv_list = filter_1d_blacklist(
        raw_svcall_list, black_reg_dict, alt_chr_name_set,
        gap_left_region_dict, gap_right_region_dict, bin_size)
    my_utils.myprint('number of retained SVs: %d' %
                     count_unfiltered(round1_retained_sv_list))

    round2_retained_sv_list = filter_low_mapq_gaps(round1_retained_sv_list,
                                                   endpoint_args,
                                                   chrname2tid_dict)

    remove_chr_prefix = (args.ref_version == 'b37')
    round3_retained_sv_list = filter_calls_2d(round2_retained_sv_list,
                                              args.black_region_2d_file,
                                              args.filter_bedpe_file,
                                              remove_chr_prefix)

    round4_retained_sv_list = filter_dbo_score(round3_retained_sv_list, args)
    round5_retained_sv_list = filter_read_depth(round4_retained_sv_list, args)
    final_retained_sv_list = filter_sv_length(round5_retained_sv_list, args)

    my_utils.myprint('number of retained SVs: %d' %
                     count_unfiltered(final_retained_sv_list))

    header = '#chrom1\tstart1\tstop1\tchrom2\tstart2\tstop2\t'
    header += 'sv_type\tsv_id\tsv_length\tqual_score\tfilter\tinfo\n'

    # Width of the zero-padded id.  log10(0) is undefined, so an empty call
    # list gets a fixed width (nothing is written in that case anyway).
    n_svcall = len(final_retained_sv_list)
    if n_svcall > 0:
        n_digit = int(math.log10(n_svcall) + 2)
    else:
        n_digit = 1

    sv_id = 0
    with open(args.filter_bedpe_file, 'w') as out_fp:
        out_fp.write(header)
        for svcall in final_retained_sv_list:
            if svcall.ft != '.':
                continue
            svcall.ft = 'PASS'
            sv_id += 1
            svcall.sv_id = 'ID%s' % str(sv_id).zfill(n_digit)
            out_fp.write(svcall.output_core2() + endl)

    return
Example #22
0
def detect_decreased_barcode_overlap(args, dbo_args, endpoint_args):
    """Produce the read-depth, bcd11, bcd12 and bcd13 files, in that order.

    Each of the four steps is skipped when ``args.run_from_begining`` is
    False and its output file already exists (according to
    my_utils.check_file_exists).  The first three steps run external
    programs through subprocess.call; the last one calls
    cal_expected_overlap_value.cal_expected_overlap_bcd_cnt.

    Parameters:
        args: run settings; supplies the program paths, read_depth_file,
            faidx_file, min_mapq, is_wgs and run_from_begining.
        dbo_args: supplies bin_size and the bcd11/bcd12/bcd13 file paths.
        endpoint_args: supplies bcd21_file, the input of the first two steps.

    Returns:
        None.  Exit codes of the external programs are not checked.
    """

    def output_exists(path):
        # The explicit comparisons are kept as in the rest of the file so
        # that a non-boolean flag value is treated exactly as before.
        return (args.run_from_begining == False
                and my_utils.check_file_exists(path) == True)

    def run_program(task, cmd_args_list):
        my_utils.myprint(task)
        my_utils.myprint('running command: %s' % (' '.join(cmd_args_list)))
        subprocess.call(cmd_args_list)
        my_utils.myprint('finished %s' % task)

    if args.is_wgs:
        win_size = 10000
    else:
        win_size = 40000

    ### calculating read depth | output file: args.read_depth_file
    task = 'calculating read depth'
    if output_exists(args.read_depth_file):
        my_utils.myprint('read depth file existed, skipped %s' % task)
    else:
        run_program(task, [
            args.cal_read_depth_from_bcd21, endpoint_args.bcd21_file,
            args.read_depth_file, args.faidx_file,
            str(dbo_args.bin_size),
            str(args.min_mapq)
        ])

    ### counting overlapping barcodes | output file: dbo_args.bcd11_file
    task = 'counting overlapping barcodes between twin windows'
    if output_exists(dbo_args.bcd11_file):
        my_utils.myprint('bcd11 files existed, skipped %s' % task)
    else:
        run_program(task, [
            args.cal_twin_win_bcd_cnt, endpoint_args.bcd21_file,
            dbo_args.bcd11_file, args.faidx_file,
            str(dbo_args.bin_size),
            str(win_size),
            str(args.min_mapq)
        ])

    ### calculating centroid | output file: dbo_args.bcd12_file
    task = 'calculating centroid'
    if output_exists(dbo_args.bcd12_file):
        my_utils.myprint('bcd12 files existed, skipped %s' % task)
    else:
        run_program(task, [
            args.cal_centroid_from_read_depth, args.read_depth_file,
            dbo_args.bcd11_file, dbo_args.bcd12_file, args.faidx_file
        ])

    ### calculating expected overlap | output file: dbo_args.bcd13_file
    task = 'calculating barcode similarity and p-value'
    if output_exists(dbo_args.bcd13_file):
        # The file checked here is bcd13, so the message names bcd13.
        my_utils.myprint('bcd13 file existed, skipped %s' % task)
    else:
        my_utils.myprint(task)
        is_wgs = 1 if args.is_wgs else 0
        cal_expected_overlap_value.cal_expected_overlap_bcd_cnt(
            dbo_args.bcd12_file, dbo_args.bcd13_file, is_wgs)
        my_utils.myprint('finished %s' % task)

    return
Example #23
0
def cluster_one_region(short_read_support_list, coord_list, out_file,
                       min_n_short_read_supp, max_distance, tid2chrname_list,
                       chrname2tid_dict, max_n_short_read_supp):
    """Cluster nearby short-read supports and write one line per cluster.

    Two supports are linked when both their key1() and key2() values differ
    by less than ``max_distance``.  The connected components of the resulting
    graph (from get_connected_components) are the clusters.  Clusters with
    fewer than ``min_n_short_read_supp`` or more than
    ``max_n_short_read_supp`` members are dropped; each remaining cluster is
    written to ``out_file`` as a tab-separated 'DEL' record.

    Parameters:
        short_read_support_list: objects providing key1(), key2(), pos1(),
            pos2(), output_info(), hap_type and tid1.
        coord_list: points handed to cKDTree; presumably the (key1, key2)
            pair of every support, in the same order -- the indices returned
            by the tree are used to index short_read_support_list.
        out_file: path of the output file (overwritten).
        min_n_short_read_supp: smallest cluster size that is reported.
        max_distance: per-axis distance threshold for linking two supports.
        tid2chrname_list: maps a tid to its chromosome name.
        chrname2tid_dict: not used by this function.
        max_n_short_read_supp: largest cluster size that is reported; also a
            cap on the neighbour count above which a support gets no edges.

    Returns:
        None.  Nothing is written when ``coord_list`` is empty.
    """

    if len(coord_list) < 1:
        return

    # The ball query uses Euclidean distance, so a radius of about
    # sqrt(2) * max_distance covers the square that is tested exactly below.
    distance_buffer = max_distance * 1.415
    tree = cKDTree(coord_list, leafsize=10000)

    row = list()
    col = list()
    for i, support in enumerate(short_read_support_list):

        if i > 0 and i % 100000 == 0:
            my_utils.myprint('finished searching for %d weird reads' % i)

        node1 = (support.key1(), support.key2())
        index_list = tree.query_ball_point(node1, distance_buffer)

        if len(index_list) > max_n_short_read_supp:
            continue

        for j in index_list:
            if i == j:
                continue
            neighbour = short_read_support_list[j]
            if (abs(node1[0] - neighbour.key1()) < max_distance
                    and abs(node1[1] - neighbour.key2()) < max_distance):
                row.append(i)
                col.append(j)

    data = [1] * len(row)
    n_node = len(short_read_support_list)

    my_utils.myprint('get connected components')
    n_components, label_list, component_node_index_db = get_connected_components(
        n_node, row, col, data, False, 'weak')
    node_cluster_list = [
        [short_read_support_list[index] for index in component_node_index_db[i]]
        for i in range(0, n_components)
    ]

    my_utils.myprint('output clusters of weird reads')
    sv_type = 'DEL'
    # The context manager closes (and flushes) the file even on an error.
    with open(out_file, 'w') as out_fp:
        for node_cluster in node_cluster_list:
            num_pe_supp = len(node_cluster)
            if num_pe_supp < min_n_short_read_supp:
                continue
            if num_pe_supp > max_n_short_read_supp:
                continue

            sum_start_pos = sum_end_pos = 0
            hap_type_cnt = [0] * 3
            info_fields = ['SVTYPE=DEL']
            for short_read_support in node_cluster:
                info_fields.append(short_read_support.output_info())
                sum_start_pos += short_read_support.pos1()
                sum_end_pos += short_read_support.pos2()
                hap_type_cnt[short_read_support.hap_type] += 1
            output_info_string = ';'.join(info_fields)

            # Mean positions, rounded to the nearest integer.
            mean_start_pos = int(0.5 + float(sum_start_pos) / num_pe_supp)
            mean_end_pos = int(0.5 + float(sum_end_pos) / num_pe_supp)

            chrom = tid2chrname_list[node_cluster[0].tid1]
            if num_pe_supp >= 5:
                flt = 'PASS'
            else:
                flt = 'LowQual'
            sv_size = mean_end_pos - mean_start_pos

            out_fp.write(
                '%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%s\n' %
                (chrom, mean_start_pos, mean_start_pos + 1, chrom,
                 mean_end_pos, mean_end_pos + 1, sv_type, flt, sv_size,
                 num_pe_supp, hap_type_cnt[0], hap_type_cnt[1],
                 hap_type_cnt[2], output_info_string))

    # Drop the large intermediates before returning to the caller.
    del row, col, data, component_node_index_db, label_list, node_cluster_list
    gc.collect()

    return
Example #24
0
def extract_barcode_from_bam(args, endpoint_args):
    """Sort the bam by barcode, then derive the bcd21 and low-mapq bcd21 files.

    Every step is skipped when ``args.run_from_begining`` is False and the
    step's output already exists.  The two shell pipelines are run through
    os.system.  When ``args.rm_temp_files`` is set and the bcd21 file exists,
    the barcode-sorted bam is deleted at the end.
    """

    def resuming():
        return args.run_from_begining == False

    def run_shell(command):
        my_utils.myprint('running command: %s' % command)
        os.system(command)

    sorted_bam = args.sortbx_bam
    bcd21_file = endpoint_args.bcd21_file

    ## step 1: sort bam by the BX tag ##
    sort_cmd = '%s %s | %s sort -l 1 -m 1G -@ %d -t BX -o %s -' % (
        args.output_bam_coreinfo, args.bam, args.samtools, args.n_thread,
        sorted_bam)

    if resuming() and my_utils.check_file_exists(sorted_bam):
        my_utils.myprint('File: %s existed, skipped sorting bam by barcode' %
                         sorted_bam)
    else:
        my_utils.myprint('sorting bam file by barcode')
        run_shell(sort_cmd)

    ## step 2: extract barcode info, compressed through pigz ##
    # One thread is left for the extractor; pigz always gets at least one.
    n_compress_threads = max(args.n_thread - 1, 1)
    extract_cmd = '%s %s __STDOUT__ %s | %s --fast --processes %d - > %s' % (
        args.extract_barcode, sorted_bam, args.stat_file, args.pigz,
        n_compress_threads, bcd21_file)

    if (resuming() and my_utils.check_file_exists(args.stat_file)
            and my_utils.check_file_exists(bcd21_file)):
        my_utils.myprint(
            'File: %s existed, skipped extracting barcode from bam' %
            bcd21_file)
    else:
        my_utils.myprint('extracting barcode info from bam file')
        run_shell(extract_cmd)

    ## step 3: low mapq subset of the bcd21 file ##
    task = 'extracting low mapq bcd21'
    low_mapq_file = endpoint_args.low_mapq_bcd21_file
    if resuming() and my_utils.check_file_exists(low_mapq_file) == True:
        my_utils.myprint('%s existed, skipped %s' % (low_mapq_file, task))
    else:
        my_utils.myprint(task)
        get_low_mapq_bcd21_file(bcd21_file, low_mapq_file, args.min_mapq)

    ## cleanup: the sorted bam is only needed to build the bcd21 file ##
    if (args.rm_temp_files and my_utils.check_file_exists(bcd21_file)
            and os.path.exists(sorted_bam)):
        os.remove(sorted_bam)

    return
Example #25
0
def detect_increased_fragment_ends(args, dbo_args, endpoint_args):
    """Run the read-clustering, coverage, distribution and breakpoint steps.

    Steps, in order: cluster reads into the bcd22 file (external command),
    search for extremely high coverage regions, estimate the global
    distribution (only if not done yet), dump the arguments to file, and
    search for paired breakpoints.  A step with an output file is skipped
    when ``args.run_from_begining`` is False and that file already exists.
    The garbage collector is triggered after each stage.
    """

    def resuming():
        return args.run_from_begining == False

    gc.enable()

    is_wgs = 1 if args.is_wgs else 0

    ### stage 1: clustering reads | output file: bcd22 file
    task = 'clustering reads'
    if resuming() and my_utils.check_file_exists(endpoint_args.bcd22_file):
        my_utils.myprint('bcd22 file existed, skipped %s' % (task))
    else:
        my_utils.myprint(task)
        cluster_cmd = '%s %s %s %s %d %d %d %d' % (
            args.cluster_reads, endpoint_args.bcd21_file,
            endpoint_args.bcd22_file, args.weird_reads_file, is_wgs,
            args.user_defined_min_reads_in_fragment, args.min_mapq,
            args.n_thread)
        my_utils.myprint(cluster_cmd)
        os.system(cluster_cmd)
    gc.collect()

    ### stage 2: searching for extremely high coverage region
    task = 'searching for extremely high coverage region'
    if resuming() and my_utils.check_file_exists(
            endpoint_args.barcode_cov_file):
        my_utils.myprint('high coverage region file existed, skipped %s' %
                         (task))
    else:
        my_utils.myprint(task)
        get_high_coverage_regions.get_high_coverage_regions(
            args, dbo_args, endpoint_args)
    gc.collect()

    ### stage 3: estimating distribution parameters
    if args.global_distribution_calculated == False:
        global_distribution.estimate_global_distribution(
            args, dbo_args, endpoint_args, endpoint_args.bcd22_file)
    arguments.output_arguments2file(args, dbo_args, endpoint_args)
    gc.collect()

    ### stage 4: find paired breakpoints
    task = 'searching for paired breakpoints'
    if resuming() and my_utils.check_file_exists(
            args.bk_cand_pair_file) == True:
        my_utils.myprint('paired breakpoint file existed, skipped %s' % (task))
    else:
        my_utils.myprint(task)
        find_paired_bk.find_paired_bk(args, dbo_args, endpoint_args)
    gc.collect()

    return
Example #26
0
def plot_one_bedpe(out_dir, target_region_bedpe_list, out_prefix, region_title,
                   ovl_2d_array, xmin, xmax, ymin, ymax, bin_size):
    """Draw the barcode-overlap heatmap of one region pair and save it as PNG.

    Parameters:
        out_dir: directory the PNG is written to.
        target_region_bedpe_list: passed to find_sv_id to look up the SV id
            and type of the region pair.
        out_prefix: prefix of the output file name.
        region_title: two regions in the form 'chrX:start-end;chrY:start-end'
            (split on ';', ':' and '-').
        ovl_2d_array: 2D array of overlap counts; it is transposed before
            plotting, so it must provide ``transpose()``.
        xmin, xmax, ymin, ymax: coordinate range of the two axes.
        bin_size: bin width used to convert the ranges into bin counts.

    Returns:
        None.  The figure is saved to
        ``<out_dir>/<out_prefix>.<sv_id>.heatmap.png``.
    """

    tp_ovl_2d_array = ovl_2d_array.transpose()

    xlab, ylab = [label.strip() for label in region_title.split(';')]
    xchr, x_start_end = xlab.split(':')
    ychr, y_start_end = ylab.split(':')
    xstart, xend = x_start_end.split('-')
    ystart, yend = y_start_end.split('-')

    sv_id, svtype = find_sv_id(xchr, xstart, xend, ychr, ystart, yend,
                               target_region_bedpe_list)
    if sv_id == 'UNK' or svtype == 'UNK':
        # No matching call: fall back to an id built from the coordinates.
        sv_id = '%s_%s_%s.%s_%s_%s' % (xchr, xstart, xend, ychr, ystart, yend)
        svtype = 'unknown_sv_type'

    xsize = int((xmax - xmin) / bin_size)
    ysize = int((ymax - ymin) / bin_size)

    # Label columns / rows with the genomic coordinate of each bin.
    xticks_dict = {i: xmin + i * bin_size for i in range(0, xsize)}
    yticks_dict = {i: ymin + i * bin_size for i in range(0, ysize)}
    pd_ovl_2d_array = pd.DataFrame(tp_ovl_2d_array).rename(
        columns=xticks_dict, index=yticks_dict)

    # Colour map: white -> midpoint colour of 'brg_r' -> opaque black,
    # 500 steps for each half.  plt.get_cmap is used because
    # matplotlib.cm.get_cmap is no longer available in recent Matplotlib.
    brg_r_colors = plt.get_cmap('brg_r', 1000)(np.linspace(0, 1, 1000))
    mid_rgba = brg_r_colors[500]
    end_rgba = (0.0, 0.0, 0.0, 1.0)
    my_colors1 = np.array(
        [np.linspace(1.0, channel, 500) for channel in mid_rgba]).transpose()
    my_colors2 = np.array(
        [np.linspace(channel, end, 500)
         for channel, end in zip(mid_rgba, end_rgba)]).transpose()
    my_cmap = ListedColormap(np.vstack((my_colors1, my_colors2)))

    out_file = os.path.join(out_dir, '%s.%s.heatmap.png' % (out_prefix, sv_id))

    # rcParams only affect artists created afterwards, so the font size is
    # set before the figure and its labels are built.
    plt.rcParams.update({'font.size': 12})
    plt.figure(figsize=(10, 10))
    ax = sns.heatmap(pd_ovl_2d_array,
                     cmap=my_cmap,
                     square=True,
                     xticklabels=int(xsize / 10),
                     yticklabels=int(ysize / 10))
    ax.invert_yaxis()

    # Black frame around the heatmap.
    ax.axhline(y=0, color='k', linewidth=2)
    ax.axhline(y=ysize, color='k', linewidth=2)
    ax.axvline(x=0, color='k', linewidth=2)
    ax.axvline(x=xsize, color='k', linewidth=2)

    plt.axis([0, xsize, 0, ysize])
    plt.xlabel(xchr)
    plt.ylabel(ychr)
    plt.xticks(rotation='vertical')
    plt.yticks(rotation='horizontal')
    plt.title('Number of overlapping barcodes (%s, %s)' % (sv_id, svtype))

    # Save before show(): with an interactive backend the figure may be
    # gone once show() returns, which would leave an empty image on disk.
    plt.savefig(out_file, dpi=200)
    plt.show()
    plt.close('all')
    my_utils.myprint('saved figure: %s' % out_file)

    return
Example #27
0
def check_arguments(args):
    """Validate the input paths in ``args`` and fill in the black-list paths.

    Checks, in order: the input bam, the output directory (created if
    missing), the reference FASTA, its index (created with
    ``samtools faidx`` if missing), the extract_barcode program and, when
    ``args.is_wgs`` is False, the target region bed.  On the first failed
    check an error is printed through my_utils.myprint and the process exits
    with status 1.

    For ref_version 'hg19', 'b37' or 'hg38' the attributes
    gap_region_bed_file, black_region_bed_file, black_region_2d_file and
    low_mapq_region_bed_file are set to files under
    ``<args.root_dir>/black_lists``; for any other version they are left
    untouched.

    Parameters:
        args: argument namespace; modified in place.

    Returns:
        None.

    Raises:
        SystemExit: when a required file is missing or cannot be created.
    """

    def fail(message):
        my_utils.myprint(message)
        sys.exit(1)

    if not os.path.exists(args.input_bam):
        # Report the path that was actually checked.
        fail("ERROR! input bam file (%s) does not exist!" % (args.input_bam))

    if not os.path.exists(args.out_dir):
        try:
            # Unlike a shell 'mkdir -p' string, this is safe for paths that
            # contain spaces or shell metacharacters.
            os.makedirs(args.out_dir)
        except OSError:
            pass  # reported by the existence check below
        if not os.path.exists(args.out_dir):
            fail("ERROR! can not create output directory: %s" % (args.out_dir))

    if not os.path.exists(args.ref_fa):
        fail("ERROR! reference FASTA file (%s) does not exist!" % (args.ref_fa))

    if not os.path.exists(args.faidx_file):
        my_utils.myprint("Index file of reference FASTA file does not exist, creating index using samtools ...")
        cmd = args.samtools + ' faidx ' + args.ref_fa
        os.system(cmd)
        if not os.path.exists(args.faidx_file):
            fail("ERROR! Cannot generate reference index file!")

    if not os.path.exists(args.extract_barcode):
        fail("ERROR! extract_barcode does not exist!")

    if args.is_wgs == False and not os.path.exists(args.target_region_bed):
        fail("ERROR! target region bed is required if --targeted is specified. If you don't have this file, please specify --wgs instead")

    # The bundled black-list files share one naming scheme per reference.
    if args.ref_version in ('hg19', 'b37', 'hg38'):
        version = args.ref_version
        args.gap_region_bed_file = os.path.join(
            args.root_dir, 'black_lists/%s_gap.bed' % version)
        args.black_region_bed_file = os.path.join(
            args.root_dir, 'black_lists/%s_black_list.bed' % version)
        args.black_region_2d_file = os.path.join(
            args.root_dir, 'black_lists/%s.2D.blacklist.gz' % version)
        args.low_mapq_region_bed_file = os.path.join(
            args.root_dir, 'black_lists/%s_low_mapq_regions.bed' % version)

    return
def detect_sv_from_short_reads(args, dbo_args, endpoint_args):
    """Call SVs from short reads and merge the results into one file.

    Steps: compute the per-haplotype read depth file (only when it is
    missing or empty), run CNV detection, cluster weird reads, run the
    discordant-read-pair deletion caller, run local-assembly deletion
    calling, merge the two deletion call files into
    ``args.short_reads_sv_call_file`` and delete the two intermediate files.
    """

    def run_shell(command):
        my_utils.myprint('running command:' + command)
        os.system(command)

    # combination of local assembly, discordant read pairs and large CNV
    merged_call_file = args.short_reads_sv_call_file
    depth_file = args.hap_type_read_depth_file

    # Fixed settings of this pipeline stage.
    bin_size = 100
    mapq_cutoff = 20
    window_size = int(2e5)
    max_depth = 500
    rm_temp_files = 1

    ### read depth: recomputed only if the file is absent or empty ###
    if not os.path.exists(depth_file) or os.path.getsize(depth_file) == 0:
        run_shell('%s %s %s %s %d %d' % (args.cal_hap_read_depth_from_bcd21,
                                         endpoint_args.bcd21_file,
                                         depth_file, args.faidx_file,
                                         bin_size, mapq_cutoff))

    ### CNV detection ###
    run_shell('%s %s %s %s %s %d %d %d' % (
        args.cnv_detection, depth_file, args.faidx_file,
        args.gap_region_bed_file, args.cnv_call_file, 40, 200, 500000))

    ### small deletions from paired-end reads ###
    assembly_call_file = os.path.join(args.out_dir,
                                      'local_assembly.del.bedpe')
    read_pair_call_file = os.path.join(args.out_dir,
                                       'discordant_read_pairs.del.bedpe')

    cluster_weird_reads.cluster_weird_reads(args.weird_reads_file,
                                            args.weird_reads_cluster_file,
                                            args.faidx_file)

    run_shell('%s %s %s %s %s %s %s' % (
        args.small_deletion_detection, depth_file,
        args.weird_reads_cluster_file, endpoint_args.bcd22_file,
        args.faidx_file, args.gap_region_bed_file, read_pair_call_file))

    ### small deletions from local assembly ###
    local_assembly.small_deletion_dection_by_local_assembly(
        args.samtools, args.bedtools, args.fermikit_dir, args.input_bam,
        args.ref_fa, args.faidx_file, args.out_dir, assembly_call_file,
        args.n_thread, window_size, max_depth, rm_temp_files)

    ### merge both deletion call sets ###
    merge_sv_calls(assembly_call_file, read_pair_call_file,
                   merged_call_file, args.tid2chrname, args.chrname2tid)

    if rm_temp_files:
        for temp_file in (assembly_call_file, read_pair_call_file):
            os.remove(temp_file)

    return
Example #29
0
    def init_from_two_lines(self, line1, line2):
        """Fill this object from the two tab-separated lines of a read pair.

        Columns used (0-based): 0 tid, 1 start, 2 end, 3 mapq, 4 barcode,
        5 hap_type, 6 read id, 7 flag.  The line with the smaller start
        becomes mate 1, so ``start1 <= start2`` always holds afterwards.

        Parameters:
            line1, line2: the two raw text lines of the same read pair.

        Returns:
            None.  If either line has fewer than 13 columns an error is
            printed and the object is left unmodified.

        Raises:
            SystemExit: (status 1) when the two lines differ in read id or
                in tid.
        """
        line1 = line1.strip().split(tab)
        line2 = line2.strip().split(tab)

        for fields in (line1, line2):
            if len(fields) < 13:
                my_utils.myprint('ERROR! This line is less than 13 coloumns: %s' % tab.join(fields))
                return

        # Both lines must describe the same read on the same contig.
        for column, what in ((6, 'read id'), (0, 'tid')):
            if line1[column] != line2[column]:
                my_utils.myprint('ERROR! line1 and line2 have different %s!' % what)
                my_utils.myprint('line1: %s' % tab.join(line1))
                my_utils.myprint('line2: %s' % tab.join(line2))
                sys.exit(1)

        # Order the mates by start position.
        if int(line1[1]) > int(line2[1]):
            line1, line2 = line2, line1

        self.tid1, self.start1, self.end1, self.mapq1 = [int(value) for value in line1[0:4]]
        self.tid2, self.start2, self.end2, self.mapq2 = [int(value) for value in line2[0:4]]

        self.flag1 = int(line1[7])
        self.flag2 = int(line2[7])
        self.read_id = line1[6]
        self.bcd = line1[4]
        self.hap_type = int(line1[5])

        return