Example #1
0
def filt_main(args):
    """Filter fusion candidates and write the surviving records to a file.

    Every tab-separated line of ``args.fusion`` is copied unchanged to
    ``args.output`` unless one of the following holds:

    * its read number (``process.get_read_num``) is below ``args.thres``;
    * either breakpoint lies on ``MT``, ``chrM``, ``hs37d5`` or on a
      contig whose name starts with ``GL0``;
    * ``args.filter_same_gene`` is set and the gene annotations of the two
      breakpoints share at least one entry;
    * ``args.filter_unspliced`` is set and the junction annotations do not
      pair a "start" on one side with an "end" on the other.

    Args:
        args: namespace providing ``db_dir``, ``fusion``, ``output``,
            ``type``, ``thres``, ``filter_same_gene`` and
            ``filter_unspliced``.
    """
    annotation_dir = args.db_dir
    excluded_contigs = ("MT", "chrM", "hs37d5")

    # Track every opened Tabix handle so that all of them are closed even
    # when a later open (or the filtering itself) raises.
    tabix_handles = []
    try:
        for bed_name in ("/refGene.bed.gz", "/refExon.bed.gz",
                         "/ensGene.bed.gz", "/ensExon.bed.gz"):
            tabix_handles.append(pysam.TabixFile(annotation_dir + bed_name))
        ref_gene_tb, ref_exon_tb, ens_gene_tb, ens_exon_tb = tabix_handles

        # relationship between GRCh and UCSC chromosome names
        grch2ucsc = {}
        with open(annotation_dir + "/grch2ucsc.txt", 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')
                grch2ucsc[F[0]] = F[1]

        with open(args.fusion, 'r') as hIN, open(args.output, 'w') as hOUT:
            for line in hIN:
                F = line.rstrip('\n').split('\t')
                chr1, pos1, dir1, chr2, pos2, dir2 = process.get_position(
                    F, args.type)
                read_num = process.get_read_num(F, args.type)

                if read_num < args.thres:
                    continue
                if chr1 in excluded_contigs or chr2 in excluded_contigs:
                    continue
                if chr1.startswith("GL0") or chr2.startswith("GL0"):
                    continue

                # Names without an entry in the mapping are used as they are.
                chr1_ucsc = grch2ucsc.get(chr1, chr1)
                chr2_ucsc = grch2ucsc.get(chr2, chr2)
                gene1 = annotation.get_gene_info(
                    chr1_ucsc, pos1, ref_gene_tb, ens_gene_tb)
                gene2 = annotation.get_gene_info(
                    chr2_ucsc, pos2, ref_gene_tb, ens_gene_tb)
                # The trailing 5 is passed through unchanged; presumably a
                # margin in bases around exon boundaries -- TODO confirm.
                junction1 = annotation.get_junc_info(
                    chr1_ucsc, pos1, ref_exon_tb, ens_exon_tb, 5)
                junction2 = annotation.get_junc_info(
                    chr2_ucsc, pos2, ref_exon_tb, ens_exon_tb, 5)

                same_gene = any(g1 == g2 for g1 in gene1 for g2 in gene2)
                if args.filter_same_gene and same_gene:
                    continue

                # Join once per side; the substring test runs on the joined
                # text exactly as before.
                junc_text1 = ';'.join(junction1)
                junc_text2 = ';'.join(junction2)
                spliced = (
                    ("start" in junc_text1 and "end" in junc_text2)
                    or ("end" in junc_text1 and "start" in junc_text2))
                if args.filter_unspliced and not spliced:
                    continue

                # write() emits the same bytes as the Python 2 print
                # statement and also works on Python 3.
                hOUT.write('\t'.join(F) + '\n')
    finally:
        for tabix_handle in tabix_handles:
            tabix_handle.close()
Example #2
0
def rmdup_main(args):
    """Write ``args.fusion`` to ``args.output`` without overlapping duplicates.

    The input is converted to BEDPE (``process.convert_to_bedpe``) and
    compared against itself with ``bedtools pairtopair``.  A record is
    dropped when the comparison pairs it with a different record whose
    value in the 8th column is larger.  For ``args.type == "star_fusion"``
    the ``#fusion_name`` header line is dropped as well.

    An empty input file yields an empty output file.  The two intermediate
    files ``<output>.fusion.bedpe`` and ``<output>.fusion_comp.bedpe`` are
    removed on success and on failure.

    Args:
        args: namespace providing ``fusion``, ``output`` and ``type``.

    Raises:
        subprocess.CalledProcessError: if ``bedtools`` exits non-zero.
    """
    if os.path.getsize(args.fusion) == 0:
        # Nothing to deduplicate: just create an empty output file.
        open(args.output, 'w').close()
        return

    bedpe_file = args.output + ".fusion.bedpe"
    comp_file = args.output + ".fusion_comp.bedpe"

    try:
        process.convert_to_bedpe(args.fusion, bedpe_file, 10, 10, args.type)

        with open(comp_file, 'w') as hOUT:
            subprocess.check_call(
                ["bedtools", "pairtopair", "-a", bedpe_file, "-b", bedpe_file],
                stdout=hOUT)

        # create dictionary: ID of a record to drop -> ID of the record kept
        # Columns 6/7 and 16/17 (0-based) are read as name/score of the two
        # paired records, which matches a 10-column BEDPE -- TODO confirm
        # against process.convert_to_bedpe.
        fusion_comp = {}
        with open(comp_file, 'r') as hIN:
            for line in hIN:
                F = line.rstrip('\n').split('\t')
                if F[6] == F[16]:
                    continue  # a record paired with itself
                score_a, score_b = int(F[7]), int(F[17])
                # NOTE(review): on equal scores both orientations of a pair
                # (A-B and B-A) can pass the "not in" test, which drops both
                # records -- confirm whether one of them should survive.
                if score_a < score_b or (score_a == score_b
                                         and F[6] not in fusion_comp):
                    fusion_comp[F[6]] = F[16]

        with open(args.fusion, 'r') as hIN, open(args.output, 'w') as hOUT:
            for line in hIN:
                F = line.rstrip('\n').split('\t')

                # header is removed
                if F[0] == "#fusion_name" and args.type == "star_fusion":
                    continue

                chr1, pos1, dir1, chr2, pos2, dir2 = process.get_position(
                    F, args.type)
                ID = chr1 + ':' + dir1 + pos1 + '-' + chr2 + ':' + dir2 + pos2

                if ID not in fusion_comp:
                    # Same bytes as the Python 2 print statement, but also
                    # valid on Python 3.
                    hOUT.write('\t'.join(F) + '\n')
    finally:
        # remove intermediate files, also when one of the steps failed
        for tmp_file in (bedpe_file, comp_file):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
Example #3
0
def comp_main(args):
    """Append to each ``args.fusion1`` record the ID of a matching record.

    Both inputs are converted to BEDPE (``process.convert_to_bedpe``) and
    compared with ``bedtools pairtopair``.  Every line of ``args.fusion1``
    is written to ``args.output`` with one extra column: the ID of a
    matching ``args.fusion2`` record (the last one reported by bedtools),
    or ``---`` when there is none.

    When one side is a fusion caller result and the other is ``genomonSV``,
    the fusion side is converted with margins ``500000, 10`` and the SV
    side with ``10, 10``; otherwise both sides use ``10, 10``.

    An empty ``args.fusion1`` yields an empty output file.  The three
    intermediate ``<output>.*.bedpe`` files are removed on success and on
    failure.

    Args:
        args: namespace providing ``fusion1``, ``fusion2``, ``type1``,
            ``type2``, ``output`` and ``bedtools_path``.

    Raises:
        subprocess.CalledProcessError: if ``bedtools`` exits non-zero.
    """
    if os.path.getsize(args.fusion1) == 0:
        # Nothing to annotate: just create an empty output file.
        open(args.output, 'w').close()
        return

    fusion_types = ("fusionfusion", "fusionfusion_part", "star_fusion",
                    "genomon_fusion", "mapsplice2", "tophat_fusion")
    bedpe1 = args.output + ".fusion1.bedpe"
    bedpe2 = args.output + ".fusion2.bedpe"
    comp_bedpe = args.output + ".fusion_comp.bedpe"

    try:
        if args.type1 in fusion_types and args.type2 == "genomonSV":
            process.convert_to_bedpe(args.fusion1, bedpe1, 500000, 10,
                                     args.type1)
            process.convert_to_bedpe(args.fusion2, bedpe2, 10, 10,
                                     "genomonSV")
        elif args.type2 in fusion_types and args.type1 == "genomonSV":
            process.convert_to_bedpe(args.fusion1, bedpe1, 10, 10,
                                     "genomonSV")
            process.convert_to_bedpe(args.fusion2, bedpe2, 500000, 10,
                                     args.type2)
        else:
            process.convert_to_bedpe(args.fusion1, bedpe1, 10, 10, args.type1)
            process.convert_to_bedpe(args.fusion2, bedpe2, 10, 10, args.type2)

        with open(comp_bedpe, 'w') as hOUT:
            # check_call: a failing bedtools run must not be mistaken for
            # "no overlaps found".
            subprocess.check_call(
                [args.bedtools_path + "/bedtools", "pairtopair",
                 "-a", bedpe1, "-b", bedpe2],
                stdout=hOUT)

        # create dictionary: fusion1 ID -> fusion2 ID (last match wins)
        fusion_comp = {}
        with open(comp_bedpe, 'r') as hIN:
            for line in hIN:
                F = line.rstrip('\n').split('\t')
                fusion_comp[F[6]] = F[16]

        # add SV annotation to fusion
        with open(args.fusion1, 'r') as hIN, open(args.output, 'w') as hOUT:
            for line in hIN:
                F = line.rstrip('\n').split('\t')
                chr1, pos1, dir1, chr2, pos2, dir2 = process.get_position(
                    F, args.type1)
                ID = chr1 + ':' + dir1 + pos1 + '-' + chr2 + ':' + dir2 + pos2

                SV_info = fusion_comp.get(ID, "---")
                # Same bytes as the Python 2 print statement, but also valid
                # on Python 3.
                hOUT.write('\t'.join(F) + '\t' + SV_info + '\n')
    finally:
        # remove intermediate files, also when one of the steps failed
        for tmp_file in (bedpe1, bedpe2, comp_bedpe):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
Example #4
0
def rmdup_main(args):
    """Write ``args.fusion`` to ``args.output`` without overlapping duplicates.

    The input is converted to BEDPE (``process.convert_to_bedpe``) and
    compared against itself with ``<args.bedtools_path>/bedtools
    pairtopair``.  A record is dropped when the comparison pairs it with a
    different record whose value in the 8th column is larger.  For
    ``args.type == "star_fusion"`` the ``#fusion_name`` header line is
    dropped as well.

    An empty input file yields an empty output file.  The two intermediate
    files ``<output>.fusion.bedpe`` and ``<output>.fusion_comp.bedpe`` are
    removed on success and on failure.

    Args:
        args: namespace providing ``fusion``, ``output``, ``type`` and
            ``bedtools_path``.

    Raises:
        subprocess.CalledProcessError: if ``bedtools`` exits non-zero.
    """
    if os.path.getsize(args.fusion) == 0:
        # Nothing to deduplicate: just create an empty output file.
        open(args.output, 'w').close()
        return

    bedpe_file = args.output + ".fusion.bedpe"
    comp_file = args.output + ".fusion_comp.bedpe"

    try:
        process.convert_to_bedpe(args.fusion, bedpe_file, 10, 10, args.type)

        with open(comp_file, 'w') as hOUT:
            # check_call: a failing bedtools run would otherwise leave an
            # empty comparison file and silently disable deduplication.
            subprocess.check_call(
                [args.bedtools_path + "/bedtools", "pairtopair",
                 "-a", bedpe_file, "-b", bedpe_file],
                stdout=hOUT)

        # create dictionary: ID of a record to drop -> ID of the record kept
        # Columns 6/7 and 16/17 (0-based) are read as name/score of the two
        # paired records, which matches a 10-column BEDPE -- TODO confirm
        # against process.convert_to_bedpe.
        fusion_comp = {}
        with open(comp_file, 'r') as hIN:
            for line in hIN:
                F = line.rstrip('\n').split('\t')
                if F[6] == F[16]:
                    continue  # a record paired with itself
                score_a, score_b = int(F[7]), int(F[17])
                # NOTE(review): on equal scores both orientations of a pair
                # (A-B and B-A) can pass the "not in" test, which drops both
                # records -- confirm whether one of them should survive.
                if score_a < score_b or (score_a == score_b
                                         and F[6] not in fusion_comp):
                    fusion_comp[F[6]] = F[16]

        with open(args.fusion, 'r') as hIN, open(args.output, 'w') as hOUT:
            for line in hIN:
                F = line.rstrip('\n').split('\t')

                # header is removed
                if F[0] == "#fusion_name" and args.type == "star_fusion":
                    continue

                chr1, pos1, dir1, chr2, pos2, dir2 = process.get_position(
                    F, args.type)
                ID = chr1 + ':' + dir1 + pos1 + '-' + chr2 + ':' + dir2 + pos2

                if ID not in fusion_comp:
                    # Same bytes as the Python 2 print statement, but also
                    # valid on Python 3.
                    hOUT.write('\t'.join(F) + '\n')
    finally:
        # remove intermediate files, also when one of the steps failed
        for tmp_file in (bedpe_file, comp_file):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
Example #5
0
def filt_main(args):
    """Filter fusion candidates and write the surviving records to a file.

    Gene and exon annotation files are first generated next to the output
    (``<output>.tmp.{ref,ens}{Gene,Exon}.bed.gz``) with ``annot_utils`` for
    the "refseq" and "gencode" sources, using ``args.genome_id`` and
    ``args.grc``.  Every tab-separated line of ``args.fusion`` is then
    copied unchanged to ``args.output`` unless one of the following holds:

    * its read number (``process.get_read_num``) is below ``args.thres``;
    * either breakpoint lies on ``MT``, ``chrM``, ``hs37d5`` or on a
      contig whose name starts with ``GL0``;
    * ``args.filter_same_gene`` is set and the gene annotations of the two
      breakpoints share at least one entry;
    * ``args.filter_unspliced`` is set and the junction annotations do not
      pair a "start" on one side with an "end" on the other.

    The temporary annotation files and their ``.tbi`` indexes are removed
    on success and on failure.

    Args:
        args: namespace providing ``fusion``, ``output``, ``type``,
            ``thres``, ``genome_id``, ``grc``, ``filter_same_gene`` and
            ``filter_unspliced``.
    """
    # Imported locally because this function is the only user visible here
    # and the module-level import list is not guaranteed to contain it.
    import os

    tmp_prefix = args.output + ".tmp."
    ref_gene_bed = tmp_prefix + "refGene.bed.gz"
    ens_gene_bed = tmp_prefix + "ensGene.bed.gz"
    ref_exon_bed = tmp_prefix + "refExon.bed.gz"
    ens_exon_bed = tmp_prefix + "ensExon.bed.gz"
    tmp_beds = (ref_gene_bed, ens_gene_bed, ref_exon_bed, ens_exon_bed)
    excluded_contigs = ("MT", "chrM", "hs37d5")

    # Track every opened Tabix handle so that all of them are closed before
    # the files underneath them are deleted.
    tabix_handles = []
    try:
        annot_utils.gene.make_gene_info(ref_gene_bed, "refseq",
                                        args.genome_id, args.grc, False)
        annot_utils.gene.make_gene_info(ens_gene_bed, "gencode",
                                        args.genome_id, args.grc, False)
        annot_utils.exon.make_exon_info(ref_exon_bed, "refseq",
                                        args.genome_id, args.grc, False)
        annot_utils.exon.make_exon_info(ens_exon_bed, "gencode",
                                        args.genome_id, args.grc, False)

        for bed_file in tmp_beds:
            tabix_handles.append(pysam.TabixFile(bed_file))
        ref_gene_tb, ens_gene_tb, ref_exon_tb, ens_exon_tb = tabix_handles

        with open(args.fusion, 'r') as hIN, open(args.output, 'w') as hOUT:
            for line in hIN:
                F = line.rstrip('\n').split('\t')
                chr1, pos1, dir1, chr2, pos2, dir2 = process.get_position(
                    F, args.type)
                read_num = process.get_read_num(F, args.type)

                if read_num < args.thres:
                    continue
                if chr1 in excluded_contigs or chr2 in excluded_contigs:
                    continue
                if chr1.startswith("GL0") or chr2.startswith("GL0"):
                    continue

                gene1 = annotation.get_gene_info(chr1, pos1, ref_gene_tb,
                                                 ens_gene_tb)
                gene2 = annotation.get_gene_info(chr2, pos2, ref_gene_tb,
                                                 ens_gene_tb)
                # The trailing 5 is passed through unchanged; presumably a
                # margin in bases around exon boundaries -- TODO confirm.
                junction1 = annotation.get_junc_info(chr1, pos1, ref_exon_tb,
                                                     ens_exon_tb, 5)
                junction2 = annotation.get_junc_info(chr2, pos2, ref_exon_tb,
                                                     ens_exon_tb, 5)

                same_gene = any(g1 == g2 for g1 in gene1 for g2 in gene2)
                if args.filter_same_gene and same_gene:
                    continue

                # Join once per side; the substring test runs on the joined
                # text exactly as before.
                junc_text1 = ';'.join(junction1)
                junc_text2 = ';'.join(junction2)
                spliced = (
                    ("start" in junc_text1 and "end" in junc_text2)
                    or ("end" in junc_text1 and "start" in junc_text2))
                if args.filter_unspliced and not spliced:
                    continue

                # write() emits the same bytes as the Python 2 print
                # statement and also works on Python 3.
                hOUT.write('\t'.join(F) + '\n')
    finally:
        for tabix_handle in tabix_handles:
            tabix_handle.close()
        # remove the temporary annotation files and their indexes
        for bed_file in tmp_beds:
            for tmp_file in (bed_file, bed_file + ".tbi"):
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
Example #6
0
def comp_main(args):
    """Append to each ``args.fusion1`` record the ID of a matching record.

    Both inputs are converted to BEDPE (``process.convert_to_bedpe``) and
    compared with ``bedtools pairtopair``.  Every line of ``args.fusion1``
    is written to ``args.output`` with one extra column: the ID of a
    matching ``args.fusion2`` record (the last one reported by bedtools),
    or ``---`` when there is none.

    When one side is a fusion caller result and the other is ``genomonSV``,
    the fusion side is converted with ``args.sv_margin_major`` and
    ``args.sv_margin_minor`` and the SV side with ``args.margin`` twice;
    otherwise both sides use ``args.margin`` twice.

    An empty ``args.fusion1`` yields an empty output file.  The three
    intermediate ``<output>.*.bedpe`` files are removed on success and on
    failure.

    Args:
        args: namespace providing ``fusion1``, ``fusion2``, ``type1``,
            ``type2``, ``output``, ``margin``, ``sv_margin_major`` and
            ``sv_margin_minor``.

    Raises:
        subprocess.CalledProcessError: if ``bedtools`` exits non-zero.
    """
    if os.path.getsize(args.fusion1) == 0:
        # Nothing to annotate: just create an empty output file.
        open(args.output, 'w').close()
        return

    fusion_types = ("fusionfusion", "fusionfusion_part", "star_fusion",
                    "genomon_fusion", "mapsplice2", "tophat_fusion")
    bedpe1 = args.output + ".fusion1.bedpe"
    bedpe2 = args.output + ".fusion2.bedpe"
    comp_bedpe = args.output + ".fusion_comp.bedpe"

    try:
        if args.type1 in fusion_types and args.type2 == "genomonSV":
            process.convert_to_bedpe(args.fusion1, bedpe1,
                                     args.sv_margin_major,
                                     args.sv_margin_minor, args.type1)
            process.convert_to_bedpe(args.fusion2, bedpe2, args.margin,
                                     args.margin, "genomonSV")
        elif args.type2 in fusion_types and args.type1 == "genomonSV":
            process.convert_to_bedpe(args.fusion1, bedpe1, args.margin,
                                     args.margin, "genomonSV")
            process.convert_to_bedpe(args.fusion2, bedpe2,
                                     args.sv_margin_major,
                                     args.sv_margin_minor, args.type2)
        else:
            process.convert_to_bedpe(args.fusion1, bedpe1, args.margin,
                                     args.margin, args.type1)
            process.convert_to_bedpe(args.fusion2, bedpe2, args.margin,
                                     args.margin, args.type2)

        with open(comp_bedpe, 'w') as hOUT:
            subprocess.check_call(
                ["bedtools", "pairtopair", "-a", bedpe1, "-b", bedpe2],
                stdout=hOUT)

        # create dictionary: fusion1 ID -> fusion2 ID (last match wins)
        fusion_comp = {}
        with open(comp_bedpe, 'r') as hIN:
            for line in hIN:
                F = line.rstrip('\n').split('\t')
                fusion_comp[F[6]] = F[16]

        # add SV annotation to fusion
        with open(args.fusion1, 'r') as hIN, open(args.output, 'w') as hOUT:
            for line in hIN:
                F = line.rstrip('\n').split('\t')
                chr1, pos1, dir1, chr2, pos2, dir2 = process.get_position(
                    F, args.type1)
                ID = chr1 + ':' + dir1 + pos1 + '-' + chr2 + ':' + dir2 + pos2

                SV_info = fusion_comp.get(ID, "---")
                # Same bytes as the Python 2 print statement, but also valid
                # on Python 3.
                hOUT.write('\t'.join(F) + '\t' + SV_info + '\n')
    finally:
        # remove intermediate files, also when one of the steps failed
        for tmp_file in (bedpe1, bedpe2, comp_bedpe):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)