Exemple #1
0
def format_main(args):
    """Write deletion / tandem-duplication calls as VCF-style records.

    Reads the tab-separated SV result file ``args.result_file`` and writes one
    line per retained call to ``args.output`` with eight tab-separated
    columns: chromosome, position, '.', reference allele, alternative allele,
    '.', "PASS", '.'.

    Lines starting with '#' are ignored.  The line accepted by
    ``utils.header_check`` is only used to configure ``header_info`` (column
    indices) and is not written out.

    A call is dropped when
      * its variant type is "inversion" or "translocation",
      * the distance between its two positions exceeds
        ``args.max_size_thres``, or
      * its variant type is neither "deletion" nor "tandem_duplication"
        (previously such a row reused the alleles of the preceding row, or
        raised ``NameError`` when it was the first row).

    Args:
        args: parsed command-line namespace providing ``result_file``,
            ``output``, ``reference`` and ``max_size_thres``.
    """
    # The output is opened first so that, as before, it is created/truncated
    # even when the result file cannot be opened.  `with` guarantees both
    # handles are closed if anything below raises.
    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        for line in hin:
            if line.startswith("#"):
                continue

            line = line.rstrip('\n')
            if utils.header_check(line):
                header_info.read(line)
                continue

            F = line.split('\t')
            variant_type = F[header_info.variant_type]
            if variant_type in ("inversion", "translocation"):
                continue

            pos_1 = int(F[header_info.pos_1])
            pos_2 = int(F[header_info.pos_2])
            if abs(pos_1 - pos_2) > int(args.max_size_thres):
                continue

            chrom = F[header_info.chr_1]
            # "---" is the placeholder for "no inserted sequence".
            inserted_seq = F[header_info.inserted_seq]
            if inserted_seq == "---":
                inserted_seq = ""

            if variant_type == "deletion":
                # REF is the fetched span; ALT keeps only its first base
                # (plus any inserted sequence).
                ref_seq = my_seq.get_seq(args.reference, chrom, pos_1, pos_2 - 1)
                alt_seq = ref_seq[0] + inserted_seq
                pos = F[header_info.pos_1]
            elif variant_type == "tandem_duplication":
                # ALT is the fetched span starting one base before pos_1
                # (plus any inserted sequence); REF is its first base.
                alt_seq = my_seq.get_seq(args.reference, chrom, pos_1 - 1, pos_2) + inserted_seq
                ref_seq = alt_seq[0]
                pos = str(pos_1 - 1)
            else:
                # Unknown variant type: there are no alleles to report.
                continue

            print('\t'.join([chrom, pos, '.', ref_seq, alt_seq, '.', "PASS", '.']), file = hout)
Exemple #2
0
def nonB_DB_main(args):
    """Append non-B DB annotation columns to every SV call.

    For each record of ``args.result_file`` and each of the seven non-B DB
    motif types, ``nonB_DB.nonB_DB_dist_check`` is evaluated once per
    breakpoint and the two results are appended as the columns
    ``<type>_dist1`` and ``<type>_dist2``.  The extended table is written to
    ``args.output``.

    The header line (as recognised by ``utils.header_check``) is extended
    with the new column names.  Rows whose first field starts with '#' are
    copied through unchanged.  Calls for which
    ``utils.check_atypical_chromosomes`` is true are reported on stderr and
    omitted from the output.

    Args:
        args: parsed command-line namespace providing ``result_file``,
            ``output`` and ``nonB_DB`` (path opened with
            ``pysam.TabixFile``).

    Raises:
        ValueError: if ``args.result_file`` does not exist.
    """
    from . import nonB_DB

    all_nonB_DB_type = ["A_Phased_Repeat", "Direct_Repeat", "G_Quadruplex_Motif", "Inverted_Repeat",
                        "Mirror_Repeat", "Short_Tandem_Repeat", "Z_DNA_Motif"]

    if not os.path.exists(args.result_file):
        raise ValueError("file not exists: " + args.result_file)

    nonB_DB_tb = pysam.TabixFile(args.nonB_DB)
    try:
        with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
            for line in hin:
                line = line.rstrip('\n')

                if utils.header_check(line):
                    header_info.read(line)
                    extra_columns = []
                    for nonB_DB_type in all_nonB_DB_type:
                        extra_columns.append(nonB_DB_type + "_dist1")
                        extra_columns.append(nonB_DB_type + "_dist2")
                    print('\t'.join([line] + extra_columns), file = hout)
                    continue

                F = line.split('\t')

                # meta information lines are passed through untouched
                if F[0].startswith("#"):
                    print('\t'.join(F), file = hout)
                    continue

                chr_1, chr_2 = F[header_info.chr_1], F[header_info.chr_2]
                if utils.check_atypical_chromosomes(chr_1, chr_2):
                    print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                      (chr_1, F[header_info.pos_1], F[header_info.dir_1], \
                       chr_2, F[header_info.pos_2], F[header_info.dir_2]), file = sys.stderr)
                    continue

                # the lookup is done with "chr"-prefixed chromosome names
                chr_ucsc1 = chr_1 if chr_1.startswith("chr") else "chr" + chr_1
                chr_ucsc2 = chr_2 if chr_2.startswith("chr") else "chr" + chr_2

                pos_1, dir_1 = int(F[header_info.pos_1]), F[header_info.dir_1]
                pos_2, dir_2 = int(F[header_info.pos_2]), F[header_info.dir_2]

                dist_columns = []
                for nonB_DB_type in all_nonB_DB_type:
                    dist_columns.append(str(nonB_DB.nonB_DB_dist_check(chr_ucsc1, pos_1, dir_1, nonB_DB_tb, nonB_DB_type)))
                    dist_columns.append(str(nonB_DB.nonB_DB_dist_check(chr_ucsc2, pos_2, dir_2, nonB_DB_tb, nonB_DB_type)))

                print('\t'.join(F + dist_columns), file = hout)
    finally:
        # release the tabix handle even when annotation fails half-way
        nonB_DB_tb.close()
Exemple #3
0
def _motif_offsets(pattern, seq, center):
    """Return the comma-joined start offsets of `pattern` in `seq`, relative to index `center`, or "---" if there is no match."""
    offsets = [str(match.start() - center) for match in re.finditer(pattern, seq)]
    return ','.join(offsets) if offsets else "---"


def AID_main(args):
    """Annotate each SV breakpoint with nearby CG and WGCW motif positions.

    For both breakpoints of every call in ``args.result_file`` the reference
    sequence between ``pos - args.check_size`` and ``pos + args.check_size``
    is fetched with ``my_seq.get_seq`` and scanned for the patterns ``CG``
    and ``[AT]GC[AT]``.  The match start positions are appended as four
    comma-separated columns (``CG_motif_info_1/2``, ``WGCW_motif_info_1/2``);
    "---" marks "no match".  The result is written to ``args.output``, whose
    parent directory is created when missing.

    Offsets are expressed relative to index ``args.check_size`` of the
    fetched window.  The previous code subtracted a constant 10, which only
    matches the window when ``args.check_size`` is 10.
    NOTE(review): this assumes ``my_seq.get_seq`` returns the window starting
    at the requested start coordinate, so that index ``check_size`` is the
    breakpoint itself — confirm against ``my_seq``.

    Lines starting with '#' are dropped.  Calls for which
    ``utils.check_atypical_chromosomes`` is true are reported on stderr and
    omitted.

    Args:
        args: parsed command-line namespace providing ``result_file``,
            ``output``, ``reference`` and the integer ``check_size``.
    """
    # make directory for output if necessary
    out_dir = os.path.dirname(args.output)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    check_size = args.check_size

    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        for line in hin:

            if line.startswith("#"):
                continue

            line = line.rstrip('\n')
            if utils.header_check(line):
                header_info.read(line)
                print('\t'.join([line, "CG_motif_info_1", "CG_motif_info_2",
                                 "WGCW_motif_info_1", "WGCW_motif_info_2"]), file = hout)
                continue

            F = line.split('\t')

            if utils.check_atypical_chromosomes(F[header_info.chr_1], F[header_info.chr_2]):
                print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                   (F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1], \
                    F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]), file = sys.stderr)
                continue

            pos_1 = int(F[header_info.pos_1])
            pos_2 = int(F[header_info.pos_2])
            seq1 = my_seq.get_seq(args.reference, F[header_info.chr_1], pos_1 - check_size, pos_1 + check_size)
            seq2 = my_seq.get_seq(args.reference, F[header_info.chr_2], pos_2 - check_size, pos_2 + check_size)

            motif_columns = [_motif_offsets(r'CG', seq1, check_size),
                             _motif_offsets(r'CG', seq2, check_size),
                             _motif_offsets(r'[AT]GC[AT]', seq1, check_size),
                             _motif_offsets(r'[AT]GC[AT]', seq2, check_size)]

            print('\t'.join(F + motif_columns), file = hout)
Exemple #4
0
def homology_main(args):
    """Append a ``Homology_Match`` column to every SV call.

    Each call of ``args.result_file`` is passed to
    ``homology.check_homology`` together with the reference genome and a
    size limit, and the returned value is appended as the last column of the
    row written to ``args.output``.

    The size limit is ``min(var_size, 100)``.  ``var_size`` is
    ``pos_2 - pos_1 - 1`` for deletions, ``pos_2 - pos_1 + 1`` for tandem
    duplications and 500000 for every other variant type (so the limit is
    100 for those).

    Lines starting with '#' are dropped and do not appear in the output.
    The original code also carried a later "meta info" pass-through branch
    for such lines, but it could never run because they were already skipped;
    that dead branch has been removed without changing the output.  Calls for
    which ``utils.check_atypical_chromosomes`` is true are reported on stderr
    and omitted.

    Args:
        args: parsed command-line namespace providing ``result_file``,
            ``output`` and ``reference``.
    """
    from . import homology

    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        for line in hin:

            if line.startswith("#"):
                continue

            line = line.rstrip('\n')
            if utils.header_check(line):
                header_info.read(line)
                print(line + '\t' + "Homology_Match", file = hout)
                continue

            F = line.split('\t')

            chr_1, pos_1, dir_1 = F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1]
            chr_2, pos_2, dir_2 = F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]

            if utils.check_atypical_chromosomes(chr_1, chr_2):
                print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                   (chr_1, pos_1, dir_1, chr_2, pos_2, dir_2), file = sys.stderr)
                continue

            variant_type = F[header_info.variant_type]
            if variant_type == "deletion":
                var_size = int(pos_2) - int(pos_1) - 1
            elif variant_type == "tandem_duplication":
                var_size = int(pos_2) - int(pos_1) + 1
            else:
                # other variant types get a large default, so the limit below is 100
                var_size = 500000

            # positions are handed over as the original strings, as before
            homology_match = homology.check_homology(chr_1, pos_1, dir_1,
                                                     chr_2, pos_2, dir_2,
                                                     args.reference, min(var_size, 100))

            print('\t'.join(F) + '\t' + str(homology_match), file = hout)
Exemple #5
0
def _format_rss_columns(rss_info):
    """Turn one ``my_seq.get_max_rss_score`` result into its two output columns (score, ';'-joined info)."""
    # NOTE(review): the constant 50 presumably re-centres element 3 on the
    # breakpoint and would then correspond to a check_size of 50 — confirm
    # against my_seq.get_max_rss_score before tying it to args.check_size.
    info = ';'.join([rss_info[1], rss_info[2], str(int(rss_info[3] - 50)),
                     str(rss_info[4]), str(rss_info[5])])
    return [str(round(rss_info[0], 3)), info]


def RSS_main(args):
    """Append RSS score columns for both breakpoints of every SV call.

    For each call in ``args.result_file`` the reference sequence between
    ``pos - args.check_size`` and ``pos + args.check_size`` is fetched with
    ``my_seq.get_seq`` for both breakpoints and scored with
    ``my_seq.get_max_rss_score`` using the two matrices returned by
    ``my_seq.generate_rss_pwm``.  Four columns are appended
    (``RSS_score_1``, ``RSS_info_1``, ``RSS_score_2``, ``RSS_info_2``) and
    the table is written to ``args.output``, whose parent directory is
    created when missing.

    Lines starting with '#' are dropped.  Calls for which
    ``utils.check_atypical_chromosomes`` is true are reported on stderr and
    omitted.

    Args:
        args: parsed command-line namespace providing ``result_file``,
            ``output``, ``reference`` and the integer ``check_size``.
    """
    # make directory for output if necessary
    out_dir = os.path.dirname(args.output)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    rss_pwm = my_seq.generate_rss_pwm()
    check_size = args.check_size

    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        for line in hin:

            if line.startswith("#"):
                continue

            line = line.rstrip('\n')
            if utils.header_check(line):
                header_info.read(line)
                print('\t'.join([line, "RSS_score_1", "RSS_info_1", "RSS_score_2", "RSS_info_2"]), file = hout)
                continue

            F = line.split('\t')

            if utils.check_atypical_chromosomes(F[header_info.chr_1], F[header_info.chr_2]):
                print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                   (F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1], \
                    F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]), file = sys.stderr)
                continue

            pos_1 = int(F[header_info.pos_1])
            pos_2 = int(F[header_info.pos_2])
            seq1 = my_seq.get_seq(args.reference, F[header_info.chr_1], pos_1 - check_size, pos_1 + check_size)
            seq2 = my_seq.get_seq(args.reference, F[header_info.chr_2], pos_2 - check_size, pos_2 + check_size)

            rss_info_1 = my_seq.get_max_rss_score(seq1, rss_pwm[0], rss_pwm[1])
            rss_info_2 = my_seq.get_max_rss_score(seq2, rss_pwm[0], rss_pwm[1])

            print('\t'.join(F + _format_rss_columns(rss_info_1) + _format_rss_columns(rss_info_2)), file = hout)
Exemple #6
0
def primer_main(args):
    """Design up to five primer pairs for every SV call.

    For each call in ``args.result_file``,
    ``realignmentFunction.getRefAltForSV`` writes contig sequences to the
    temporary FASTA ``<args.output>.contig.tmp.fa``.  For every record of
    that file whose header line starts with '>' and ends with "alt", the
    following line is used as the template for
    ``primer3.bindings.designPrimers``.  One output row is written per such
    record: the original columns followed by ``Primer1`` .. ``Primer5``.
    Each primer column holds
    ``left_seq;right_seq;left_tm;right_tm;product_size`` or "---" when
    primer3 did not return that rank.

    The output's parent directory is created when missing.  Lines starting
    with '#' are dropped.  Calls for which
    ``utils.check_atypical_chromosomes`` is true are reported on stderr and
    omitted.  The temporary FASTA is removed when the function finishes,
    including when it fails.

    Args:
        args: parsed command-line namespace providing ``result_file``,
            ``output`` and ``reference``.
    """
    from genomon_sv import realignmentFunction
    from primer3 import bindings

    # make directory for output if necessary
    out_dir = os.path.dirname(args.output)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    contig_file = args.output + ".contig.tmp.fa"

    try:
        with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
            for line in hin:

                if line.startswith("#"):
                    continue

                line = line.rstrip('\n')
                if utils.header_check(line):
                    header_info.read(line)
                    print('\t'.join([line, "Primer1", "Primer2", "Primer3", "Primer4", "Primer5"]), file = hout)
                    continue

                F = line.split('\t')
                chr1, pos1, dir1 = F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1]
                chr2, pos2, dir2 = F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]
                junc_seq = F[header_info.inserted_seq]

                if utils.check_atypical_chromosomes(chr1, chr2):
                    print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                          (chr1, pos1, dir1, chr2, pos2, dir2), file = sys.stderr)
                    continue

                # "---" is the placeholder for "no inserted sequence"
                junc_seq_len = 0 if junc_seq == "---" else len(junc_seq)

                # NOTE(review): 1000 and 250 are presumably the split-reference
                # threshold and the validation sequence length — confirm
                # against realignmentFunction.getRefAltForSV.
                realignmentFunction.getRefAltForSV(contig_file, chr1, pos1, dir1, chr2, pos2, dir2,
                                                   junc_seq, args.reference, 1000, 250)

                with open(contig_file) as hin2:
                    contig_lines = [contig_line.rstrip('\n') for contig_line in hin2]

                for idx, contig_line in enumerate(contig_lines):
                    if not (contig_line.startswith('>') and contig_line.endswith("alt")):
                        continue

                    # the sequence is expected on the line right after its header
                    seq = contig_lines[idx + 1]

                    primer = bindings.designPrimers(
                        {
                            'SEQUENCE_ID': 'MH1000',
                            'SEQUENCE_TEMPLATE': seq,
                            'SEQUENCE_TARGET': [225,50 + junc_seq_len],
                            'SEQUENCE_INCLUDED_REGION': [10, len(seq) - 20]
                        },
                        {
                            'PRIMER_PRODUCT_SIZE_RANGE': [[150,250],[100,300],[301,400],[401,500]],
                        })

                    primer_left_right = ["---"] * 5
                    for rank in range(5):
                        left_seq_key = "PRIMER_LEFT_" + str(rank) + "_SEQUENCE"
                        right_seq_key = "PRIMER_RIGHT_" + str(rank) + "_SEQUENCE"
                        left_tm_key = "PRIMER_LEFT_" + str(rank) + "_TM"
                        right_tm_key = "PRIMER_RIGHT_" + str(rank) + "_TM"
                        size_key = "PRIMER_PAIR_" + str(rank) + "_PRODUCT_SIZE"

                        # report a rank only when primer3 returned every field for it
                        if all(key in primer for key in (left_seq_key, right_seq_key, left_tm_key, right_tm_key, size_key)):
                            primer_left_right[rank] = ';'.join([primer[left_seq_key], primer[right_seq_key],
                                                                str(round(primer[left_tm_key], 3)),
                                                                str(round(primer[right_tm_key], 3)),
                                                                str(primer[size_key])])

                    print('\t'.join(F + primer_left_right), file = hout)
    finally:
        # remove the intermediate FASTA whether or not the run succeeded
        subprocess.check_call(["rm", "-rf", contig_file])
Exemple #7
0
def _variant_allele_freq(num_ref, num_var):
    """Return ``num_var / (num_ref + num_var)`` rounded to 4 digits as a string ("0.0" when both counts are zero)."""
    num_ref, num_var = float(num_ref), float(num_var)
    total = num_ref + num_var
    freq = num_var / total if total > 0 else 0.0
    return str(round(freq, 4))


def realign_main(args):
    """Re-count supporting read pairs of every SV call by realignment.

    The function works in three steps:

    1. Every call of ``args.result_file`` (except those flagged by
       ``utils.check_atypical_chromosomes``, which are reported on stderr)
       is written to ``<args.output>.tmp1.bedpe``.
    2. ``filterFunction.validateByRealignment`` processes that file with
       ``args.tumor_bam``, ``args.control_bam`` and ``args.reference`` and
       writes ``<args.output>.tmp2.bedpe``.
    3. The first seven columns of each tmp2 row are used as a key.  The
       result file is read again and every row whose first seven columns
       match a key is written to ``args.output`` with the extra columns
       appended: tumor ref/variant counts and VAF, plus (when a control BAM
       was given) control ref/variant counts, control VAF and tmp2 column 12.
       Rows without a match are dropped.

    The output's parent directory is created when missing, lines starting
    with '#' are dropped, and both temporary files are removed when the
    function finishes, including when it fails.

    Side effect: ``args.control_bam`` is replaced by "" when it is ``None``.

    Args:
        args: parsed command-line namespace providing ``result_file``,
            ``output``, ``reference``, ``tumor_bam`` and ``control_bam``.

    Exits with status 1 when ``args.tumor_bam`` is ``None``.
    """
    from genomon_sv import filterFunction

    if args.tumor_bam is None:
        print("tumor_bam file should be input", file = sys.stderr)
        sys.exit(1)

    # make directory for output if necessary
    out_dir = os.path.dirname(args.output)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    matchedControlFlag = args.control_bam is not None
    if not matchedControlFlag:
        args.control_bam = ""

    tmp1_bedpe = args.output + ".tmp1.bedpe"
    tmp2_bedpe = args.output + ".tmp2.bedpe"

    try:
        # generate bedpe file
        with open(tmp1_bedpe, 'w') as hout, open(args.result_file, 'r') as hin:
            # Running ID of the written records.  It is deliberately not named
            # `i`: the old `["---" for i in range(14)]` padding overwrote the
            # counter under Python 2 comprehension scoping.
            sv_index = 0
            for line in hin:

                if line.startswith("#"):
                    continue

                line = line.rstrip('\n')
                if utils.header_check(line):
                    header_info.read(line)
                    continue

                F = line.split('\t')

                if utils.check_atypical_chromosomes(F[header_info.chr_1], F[header_info.chr_2]):
                    print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                       (F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1], \
                        F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]), file = sys.stderr)
                    continue

                record = [F[header_info.chr_1], str(int(F[header_info.pos_1]) - 1), F[header_info.pos_1],
                          F[header_info.chr_2], str(int(F[header_info.pos_2]) - 1), F[header_info.pos_2],
                          "genoemonSV_" + str(sv_index), F[header_info.inserted_seq],
                          F[header_info.dir_1], F[header_info.dir_2]]
                # pad with 14 placeholder columns
                print('\t'.join(record + ["---"] * 14), file = hout)
                sv_index += 1

        filterFunction.validateByRealignment(tmp1_bedpe,
                        tmp2_bedpe,
                        args.tumor_bam,
                        args.control_bam,
                        args.reference,
                        "-stepSize=5 -repMatch=2253",
                        500,
                        5000,
                        1000,
                        5,
                        1000,
                        1000)

        # collect the realignment counts, keyed by the first seven columns
        key2AF_info = {}
        with open(tmp2_bedpe, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')
                key = '\t'.join(F[:7])

                af_columns = [F[7], F[8], _variant_allele_freq(F[7], F[8])]
                if matchedControlFlag:
                    af_columns += [F[9], F[10], _variant_allele_freq(F[9], F[10]), F[11]]

                key2AF_info[key] = '\t'.join(af_columns)

        extra_header = ["Num_Tumor_Ref_Read_Pair_re", "Num_Tumor_Var_Read_Pair_re", "Tumor_VAF_re"]
        if matchedControlFlag:
            extra_header += ["Num_Control_Ref_Read_Pair_re", "Num_Control_Var_Read_Pair_re", "Control_VAF_re",
                             "Minus_Log_Fisher_P_value_re"]

        with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
            for line in hin:

                if line.startswith("#"):
                    continue

                line = line.rstrip('\n')
                if utils.header_check(line):
                    print('\t'.join([line] + extra_header), file = hout)
                    continue

                # rows without a realignment result are dropped
                key = '\t'.join(line.split('\t')[:7])
                if key not in key2AF_info:
                    continue

                print(line + '\t' + key2AF_info[key], file = hout)
    finally:
        # remove intermediate files whether or not the run succeeded
        subprocess.check_call(["rm", "-rf", tmp1_bedpe])
        subprocess.check_call(["rm", "-rf", tmp2_bedpe])
Exemple #8
0
def merge_control_main(args):
    """Merge the SV calls of several samples into one indexed control file.

    ``args.result_list`` is a tab-separated list with exactly three columns
    per line: label, tumor type and path of an SV result file.  Every call of
    every listed result file is written as one record (tagged with the
    sample label and numbered ``junction_<n>`` per result file) to
    ``<args.output_file>.temp``.  That file is then sorted
    (``genomon_sv.utils.sortBedpe``), merged
    (``genomon_sv.mergeFunction.organizeControl``), sorted again, and
    finally written to ``args.output_file`` via
    ``genomon_sv.utils.compress_index_bed``.

    The output's parent directory is created when missing, lines starting
    with '#' in the result files are dropped, and all intermediate files are
    removed when the function finishes, including when it fails.

    Args:
        args: parsed command-line namespace providing ``result_list`` and
            ``output_file``.

    Raises:
        ValueError: if a listed result file does not exist, or if a line of
            ``args.result_list`` does not have exactly three tab-separated
            fields (raised by the tuple unpacking).
    """
    import genomon_sv.mergeFunction
    import genomon_sv.utils

    # make directory for output if necessary
    out_dir = os.path.dirname(args.output_file)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    temp_file = args.output_file + ".temp"
    temp_sort_file = args.output_file + ".temp.sort"
    temp_merged_file = args.output_file + ".temp.merged"
    temp_merged_sort_file = args.output_file + ".temp.merged.sort"

    try:
        with open(temp_file, 'w') as hout, open(args.result_list, 'r') as hin_list:
            for list_line in hin_list:

                # tumor_type is unpacked to enforce the three-column layout
                # but is not used any further
                label, tumor_type, result_file = list_line.rstrip('\n').split('\t')

                if not os.path.exists(result_file):
                    raise ValueError("file not exists: " + result_file)

                num = 1
                # separate names for the inner file and its lines, so the outer
                # loop's handle and line are not shadowed
                with open(result_file, 'r') as hin_result:
                    for line in hin_result:
                        if line.startswith("#"):
                            continue

                        line = line.rstrip('\n')
                        if utils.header_check(line):
                            header_info.read(line)
                            continue

                        F = line.split('\t')
                        inserted_seq = F[header_info.inserted_seq]
                        inseqLen = len(inserted_seq) if inserted_seq != "---" else 0

                        print('\t'.join([F[header_info.chr_1], str(int(F[header_info.pos_1]) - 1), F[header_info.pos_1], \
                                        F[header_info.chr_2], str(int(F[header_info.pos_2]) - 1), F[header_info.pos_2], \
                                        "junction_" + str(num),  str(inseqLen), \
                                        F[header_info.dir_1], F[header_info.dir_2], label, "1"]), file = hout)

                        num = num + 1

        # sort the aggregated junction file
        genomon_sv.utils.sortBedpe(temp_file, temp_sort_file)

        # merge the same junction in the aggregated junction file
        genomon_sv.mergeFunction.organizeControl(temp_sort_file, temp_merged_file, 20)

        # sort the merged junction file
        genomon_sv.utils.sortBedpe(temp_merged_file, temp_merged_sort_file)

        # compress the merged junction file
        genomon_sv.utils.compress_index_bed(temp_merged_sort_file, args.output_file)
    finally:
        # remove intermediate files whether or not the run succeeded
        for intermediate in (temp_file, temp_sort_file, temp_merged_file, temp_merged_sort_file):
            subprocess.check_call(["rm", "-rf", intermediate])