Example #1
0
def find_isoform(gff3, genome_bam, isoform_gff3, tss_tes_stat, genomefa,
                 transcript_fa, downsample_ratio, config_dict,
                 raw_splice_isoform):
    """Find isoforms from a genome alignment and export their sequences.

    Steps, in order:
      1. parse the annotation ``gff3`` with ``parse_gff_tree``;
      2. call ``group_bam2isoform`` on ``genome_bam`` (passing ``isoform_gff3``
         and ``tss_tes_stat`` as its output paths -- presumably it writes them;
         confirm against ``group_bam2isoform``);
      3. parse ``isoform_gff3`` and call ``get_transcript_seq`` with
         ``transcript_fa``.

    Args:
        gff3: path of the gene annotation passed to ``parse_gff_tree``.
        genome_bam: alignment handed to ``group_bam2isoform``.
        isoform_gff3: isoform annotation path; passed to ``group_bam2isoform``
            and parsed again afterwards.
        tss_tes_stat: path forwarded to ``group_bam2isoform``.
        genomefa: genome FASTA path forwarded to both helpers.
        transcript_fa: path forwarded to ``get_transcript_seq``.
        downsample_ratio: forwarded to ``group_bam2isoform``.
        config_dict: configuration; reads ``["isoform_parameters"]`` and
            ``["realign_parameters"]["use_annotation"]``.
        raw_splice_isoform: currently unused -- ``raw_gff3=None`` is always
            passed to ``group_bam2isoform``. Kept for interface compatibility.

    Returns:
        dict with ``"transcript_dict"`` (from the annotation) and
        ``"transcript_dict_i"`` (from ``isoform_gff3``).
    """
    # Single-argument print(...) emits identical text on Python 2 and Python 3,
    # whereas the print statement is a SyntaxError on Python 3.
    print("#### Read genne annotations")
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(
        gff3)

    # Junctions are computed for every parsed transcript *before*
    # remove_similar_tr is called on the gene/exon tables.
    transcript_to_junctions = {
        tr: blocks_to_junctions(exons)
        for tr, exons in transcript_to_exon.items()
    }
    remove_similar_tr(gene_to_transcript, transcript_to_exon)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)

    # finding isoforms is required here (no config switch to skip it)
    print("#### find isoforms")
    group_bam2isoform(genome_bam,
                      isoform_gff3,
                      tss_tes_stat,
                      "",
                      chr_to_blocks,
                      gene_dict,
                      transcript_to_junctions,
                      transcript_dict,
                      genomefa,
                      config=config_dict["isoform_parameters"],
                      downsample_ratio=downsample_ratio,
                      raw_gff3=None)

    # get fasta: re-read the isoform annotation and export transcript sequences
    chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i = parse_gff_tree(
        isoform_gff3)
    if config_dict["realign_parameters"]["use_annotation"]:
        ref_dict = {
            "chr_to_gene": chr_to_gene,
            "transcript_dict": transcript_dict,
            "gene_to_transcript": gene_to_transcript,
            "transcript_to_exon": transcript_to_exon
        }
    else:
        ref_dict = None
    get_transcript_seq(genomefa,
                       transcript_fa,
                       chr_to_gene_i,
                       transcript_dict_i,
                       gene_to_transcript_i,
                       transcript_to_exon_i,
                       ref_dict=ref_dict)

    return {
        "transcript_dict": transcript_dict,
        "transcript_dict_i": transcript_dict_i
    }
Example #2
0
def get_splice_expr(isoform_gff, fsm_annotation, res_csv, tr_kept=10):
    """Classify splicing differences between the top FSM groups of each gene.

    For every gene with more than one FSM group, the groups are ranked by
    summed read count, the ``tr_kept`` most abundant are retained, and every
    ordered pair of distinct groups is compared with ``comp_two_splicing``.
    One ``fsm_i,fsm_j,type`` line per pair is written to ``res_csv`` and the
    five most common types are printed.

    Args:
        isoform_gff: isoform annotation parsed with ``parse_gff_tree``.
        fsm_annotation: comma-separated file with columns
            0:tr_id, 1:gene_id, 2:FSM_id, 3:in_ref, 4:cnt. Lines containing
            ``gene_id`` are treated as header lines and skipped.
        res_csv: output path for the pairwise comparison table.
        tr_kept: maximum number of FSM groups kept per gene.
    """
    typ_cnt = Counter()
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(
        isoform_gff)

    # Flatten each transcript's exon list into [start0, end0, start1, end1, ...].
    transcript_to_splice = {}
    for tr in transcript_to_exon:
        coords = []
        for ex in transcript_to_exon[tr]:
            coords.append(ex[0])
            coords.append(ex[1])
        transcript_to_splice[tr] = coords

    fsm2tr = {}
    fsm_cnt = Counter()
    gene2fsm = {}
    # `with` guarantees the annotation handle is closed, also on a parse error.
    with open(fsm_annotation) as fsm_in:
        for line in fsm_in:
            if "gene_id" in line:
                continue
            its = line.strip().split(
                ",")  # 0:tr_id, 1:gene_id, 2:FSM_id, 3:in_ref, 4:cnt
            # filtered out in gff (low abundance)
            if its[0] not in transcript_to_exon:
                continue
            fsm_cnt[its[2]] += int(its[4])
            fsm2tr.setdefault(its[2], []).append(its[0])
            gene2fsm.setdefault(its[1], []).append(its[2])

    # Replace each gene's FSM id list by (fsm_id, count) pairs, most abundant
    # first, truncated to `tr_kept` entries when the gene has several groups.
    for ge in gene2fsm:
        ranked = [(it, fsm_cnt[it]) for it in set(gene2fsm[ge])]
        if len(ranked) > 1:
            ranked.sort(key=lambda x: x[1], reverse=True)
            ranked = ranked[:tr_kept]
        gene2fsm[ge] = ranked

    with open(res_csv, "w") as out_f:
        for ge in gene2fsm:
            fsm_list = gene2fsm[ge]
            if len(fsm_list) == 1:
                continue
            # Resolve the splice-site list of every FSM group once per gene:
            # use the FSM id itself when it names a transcript, otherwise the
            # first transcript recorded for that group.
            splice_sites = []
            for fsm_id, _ in fsm_list:
                if fsm_id in transcript_to_splice:
                    rep_tr = fsm_id
                else:
                    rep_tr = fsm2tr[fsm_id][0]
                splice_sites.append(transcript_to_splice[rep_tr])
            for i, (fsm_i, _) in enumerate(fsm_list):
                for j, (fsm_j, _) in enumerate(fsm_list):
                    if i == j:
                        continue
                    typ = "::".join(
                        comp_two_splicing(splice_sites[i], splice_sites[j]))
                    typ_cnt[typ] += 1
                    out_f.write("{},{},{}\n".format(fsm_i, fsm_j, typ))

    for i in typ_cnt.most_common(5):
        print("\t", i)
Example #3
0
def _cage_distance(intervals, lefts, tss_pos):
    """Return the distance from ``tss_pos`` to the CAGE interval around it.

    ``intervals`` is a sequence of records whose items 0 and 1 are the left
    and right coordinates; ``lefts`` is the list of their left coordinates.
    The lookup uses ``bisect_right`` on ``lefts``, so it assumes ``lefts`` is
    sorted ascending -- TODO confirm that make_CAGE_dict guarantees this.
    Returns 0 when ``tss_pos`` lies inside the selected interval, otherwise the
    smaller gap to that interval and its neighbour on the side of ``tss_pos``.
    """
    # Last interval whose left edge is <= tss_pos, clamped to the first one.
    idx = max(0, bisect_right(lefts, tss_pos) - 1)
    left = intervals[idx][0]
    right = intervals[idx][1]
    if left <= tss_pos <= right:
        return 0
    if tss_pos < left:
        if idx == 0:
            return left - tss_pos
        return min(tss_pos - intervals[idx - 1][1], left - tss_pos)
    # tss_pos > right
    if idx == len(intervals) - 1:
        return tss_pos - right
    return min(tss_pos - right, intervals[idx + 1][0] - tss_pos)


def get_cage_coverage(isoform_gff, cage_f):
    """Measure how far each transcript start site is from a CAGE interval.

    Args:
        isoform_gff: annotation parsed with ``parse_gff_tree``.
        cage_f: CAGE annotation handed to ``make_CAGE_dict``; the result is
            indexed by chromosome and each entry is read as
            ``(left, right, ...)``.

    Returns:
        dict mapping chromosome -> list of ``(transcript_id, tss_pos, dist)``,
        where ``tss_pos`` is the transcript ``start`` on the "+" strand and
        its ``end`` otherwise, and ``dist`` is 0 when the position falls inside
        a CAGE interval. Chromosomes absent from the CAGE data are reported on
        stdout and omitted from the result.
    """
    cage_dict = make_CAGE_dict(cage_f)
    cage_cov_dict = {}
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(
        isoform_gff)
    for ch in chr_to_gene:
        if ch not in cage_dict:
            print((ch, "not in CAGE annotation file."))
            continue
        intervals = cage_dict[ch]
        cage_left = [it[0] for it in intervals]
        records = []
        cage_cov_dict[ch] = records
        for ge in chr_to_gene[ch]:
            for tr in gene_to_transcript[ge]:
                tr_info = transcript_dict[tr]
                tss_pos = tr_info.start if tr_info.strand == "+" else tr_info.end
                records.append(
                    (tr, tss_pos, _cage_distance(intervals, cage_left, tss_pos)))
    return cage_cov_dict
Example #4
0
        # write header
        al_stat.write(
            "chr,position,REF,ALT,REF_frequency,REF_frequency_in_short_reads,hypergeom_test_p_value,sequence_entrophy,INDEL_frequency,mean_base_quality\n")
        for rec in reporting_summary:
            al_stat.write(",".join(str(it) for it in rec)+"\n")
    pct_bin, pt = np.histogram(acc_pct, bins=500, range=(0, 1))
    with open(os.path.join(out_dir, "mutation", "MT_freq_summary.csv"), "w") as cov_bin_out:
        for ix in range(500):
            cov_bin_out.write("{},{}\n".format(pt[ix], pct_bin[ix]))


if __name__ == "__main__":
    # Ad-hoc driver with hard-coded, site-specific paths: builds the gene
    # blocks from the isoform annotation and runs get_all_SNV_table on the
    # CLL141 capture data.
    known_position_dict = {("chr18", 63318364): 0}
    fa_f = "/stornext/General/data/user_managed/grpu_mritchie_1/LuyiTian/Index/GRCh38.primary_assembly.genome.fa"
    # CLL141 capture
    barcode_f = "/stornext/General/data/user_managed/grpu_mritchie_1/RachelThijssen/sclr_data/Illumina_data/Thijssen_count80/outs/filtered_feature_bc_matrix/barcodes.tsv"
    # Map each barcode (text before the first "-") to itself; `with` closes
    # the barcode file once it has been read.
    cb_seq_dict = {}
    with open(barcode_f) as barcode_in:
        for it in barcode_in:
            cell_bc = it.strip().split("-")[0]
            cb_seq_dict[cell_bc] = cell_bc
    bam_short = "/stornext/General/data/user_managed/grpu_mritchie_1/hongkePeng/Rachel/all_fastq/CLL141-CLL-cells_S8_Rsubread.sorted.bam"
    iso_dir = "/stornext/Genomics/data/CLL_venetoclax/single_cell_data/capture_test/isoform_out"
    bam_in = os.path.join(iso_dir, "align2genome.bam")
    # Only the isoform annotation is parsed: every table passed to
    # get_all_SNV_table below is derived from it.
    gff_f = os.path.join(iso_dir, "isoform_annotated.gff3")
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(
        gff_f)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)
    get_all_SNV_table(bam_in, chr_to_blocks, transcript_to_exon,
                      fa_f, iso_dir, cb_seq_dict, bam_short, known_position_dict)
Example #5
0
def annotate_filter_gff(isoform_gff,ref_gff,isoform_out,anno_out,tr_cnt,min_sup_reads,verbose=True):
    """
    Combine FLAMES output with the reference annotation and filter out
    transcripts by realignment result.

    A transcript is kept when it appears in ``tr_cnt`` with at least
    ``min_sup_reads`` reads. Reference transcripts are added only when their
    id is not already present in the FLAMES annotation. For every gene with
    at least one kept transcript a gene record spanning the kept transcripts
    is emitted, followed by the transcript and exon records.

    Only chromosomes present in ``isoform_gff`` are processed; chromosomes
    that occur in ``ref_gff`` alone are not written.

    Args:
        isoform_gff: FLAMES isoform annotation, parsed with ``parse_gff_tree``.
        ref_gff: reference annotation, parsed with ``parse_gff_tree``.
        isoform_out: path of the filtered GFF3 file to write.
        anno_out: output path forwarded to ``annotate_full_splice_match``.
        tr_cnt: mapping transcript id -> supporting read count.
        min_sup_reads: minimum count for a transcript to be kept.
        verbose: print before/after filtering summaries when true.
    """
    gff3_fmt = "{_ch}\t{_sr}\t{_ty}\t{_st}\t{_en}\t{_sc}\t{_stnd}\t{_ph}\t{_attr}"

    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(isoform_gff)
    chr_to_gene_ref, transcript_dict_ref, gene_to_transcript_ref, transcript_to_exon_ref = parse_gff_tree(ref_gff)
    if verbose:
        print("\tFiltering and combining isoforms from realigned bam file:\n\tBefore filtering: {} isoforms in count matrix. {} isoforms in reference annotation. {} isoforms in FLAMES raw output.".format(
            len(tr_cnt),
            len(transcript_to_exon_ref),
            len(transcript_to_exon)
        ))

    def has_support(tr):
        # enough realigned reads to keep the transcript
        return (tr in tr_cnt) and (tr_cnt[tr] >= min_sup_reads)

    def transcript_records(ch, source, ge, tr, exons, strand):
        # One transcript line followed by one line per exon (rank starts at 1).
        records = [gff3_fmt.format(_ch=ch, _sr=source, _ty="transcript",
            _st=exons[0][0]+1,  # `+1` because gff is 1-based
            _en=exons[-1][1],
            _sc=".", _stnd=strand, _ph=".",
            _attr="ID=transcript:{};transcript_id={};Parent=gene:{};support_count={}".format(tr, tr, ge, tr_cnt[tr]))]
        for rank, ex in enumerate(exons, 1):
            records.append(gff3_fmt.format(_ch=ch, _sr=source, _ty="exon",
                _st=ex[0]+1, _en=ex[1],  # `+1` because gff is 1-based
                _sc=".", _stnd=strand, _ph=".",
                _attr="exon_id=exon:{}_{};Parent=transcript:{};rank={}".format(ex[0]+1, ex[1], tr, rank)))
        return records

    gff_rec = []
    iso_rm = 0
    iso_kp = 0
    for ch in chr_to_gene:
        # Union of FLAMES and reference genes in first-seen order, so the
        # output order is reproducible; a chromosome missing from the
        # reference simply contributes no reference genes.
        genes = []
        seen = set()
        for ge in list(chr_to_gene[ch]) + list(chr_to_gene_ref.get(ch, [])):
            if ge not in seen:
                seen.add(ge)
                genes.append(ge)
        for ge in genes:
            gff_tmp = []
            total_cnt = 0
            mi = 99999999999
            ma = -1
            in_flames = ge in gene_to_transcript
            if in_flames:
                for tr in gene_to_transcript[ge]:
                    if not has_support(tr):
                        iso_rm += 1
                        continue
                    exons = transcript_to_exon[tr]
                    gff_tmp.extend(transcript_records(
                        ch, "FLAMES", ge, tr, exons, transcript_dict[tr].strand))
                    total_cnt += tr_cnt[tr]
                    mi = min(mi, exons[0][0])
                    ma = max(ma, exons[-1][1])
                    iso_kp += 1
            if ge in gene_to_transcript_ref:
                for tr in gene_to_transcript_ref[ge]:
                    # not in FLAMES output but in tr count
                    if tr in transcript_dict or not has_support(tr):
                        continue
                    exons = transcript_to_exon_ref[tr]
                    gff_tmp.extend(transcript_records(
                        ch, "reference", ge, tr, exons, transcript_dict_ref[tr].strand))
                    total_cnt += tr_cnt[tr]
                    mi = min(mi, exons[0][0])
                    ma = max(ma, exons[-1][1])
                    iso_kp += 1
            if not gff_tmp:
                continue
            # The gene strand is taken from the gene's first transcript, using
            # the FLAMES annotation when the gene exists there.
            if in_flames:
                gene_strand = transcript_dict[gene_to_transcript[ge][0]].strand
            else:
                gene_strand = transcript_dict_ref[gene_to_transcript_ref[ge][0]].strand
            gff_rec.append(gff3_fmt.format(_ch=ch, _sr="FLAMES", _ty="gene",
                _st=mi+1,
                _en=ma,
                _sc=".", _stnd=gene_strand, _ph=".",
                _attr="ID=gene:{};gene_id={};support_count={}".format(ge, ge, total_cnt)))
            gff_rec.extend(gff_tmp)

    # `with` closes the output even if a write fails.
    with open(isoform_out, "w") as iso_annotated:
        iso_annotated.write("##gff-version 3\n")
        iso_annotated.write("\n".join(gff_rec))
    if verbose:
        print("\tAfter filtering: kept {} isoforms. removed {} isoforms.".format(
            iso_kp,
            iso_rm
        ))
    annotate_full_splice_match(transcript_to_exon,transcript_to_exon_ref,transcript_dict,transcript_dict_ref,anno_out,tr_cnt,min_sup_reads)
Example #6
0
def sc_long_pipeline(args):
    """Run the single-cell long-read pipeline end to end.

    Stages (each optional stage is switched by
    ``config_dict["pipeline_parameters"]``):
      1. genome alignment with minimap2 (skipped when ``args.inbam`` is given);
      2. isoform identification with ``group_bam2isoform``;
      3. transcript FASTA export with ``get_transcript_seq``;
      4. read realignment to the transcripts;
      5. transcript quantification and annotation filtering.

    Args:
        args: parsed command-line namespace; reads ``config_file``,
            ``downsample_ratio``, ``infq``, ``gff3``, ``genomefa``, ``outdir``,
            ``inbam`` and ``minimap2_dir``.

    Raises:
        SystemExit: with status 1 when the config file or an input file is
            missing, or ``downsample_ratio`` is outside ``(0, 1]``.
    """
    def log_step(message):
        # Single-argument print(...) gives the same output on Python 2 and 3;
        # the print statement is a SyntaxError on Python 3.
        print("{} {}".format(message, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    # parse configuration file: look in the given path first, then next to
    # the running script (sys.path[0])
    script_dir_config = os.path.join(sys.path[0], args.config_file)
    if os.path.isfile(args.config_file):
        config_path = args.config_file
    elif os.path.isfile(script_dir_config):
        config_path = script_dir_config
    else:
        print("Cannot find config file in current directory or script depository: {}".format(args.config_file))
        # non-zero status so that callers can detect the failure
        sys.exit(1)
    print("Use config file: {}".format(config_path))
    config_dict = parse_json_config(config_path)
    print_config(config_dict)

    # check if files exist
    if args.downsample_ratio>1 or args.downsample_ratio<=0:
        print("downsample_ratio shoulw between 0 and 1: {}".format(args.downsample_ratio))
        sys.exit(1)
    if not (os.path.isfile(args.infq) and os.path.isfile(args.gff3) and os.path.isfile(args.genomefa)):
        print("make sure all file exists:")
        print(args.infq)
        print(args.gff3)
        print(args.genomefa)
        sys.exit(1)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
        print("output directory not exist, create one:")
        print(args.outdir)
    if args.inbam != "" and (not os.path.isfile(args.inbam)):
        print("make sure input inbam file exists:")
        print(args.inbam)
        sys.exit(1)

    # output files:
    isoform_gff3 = os.path.join(args.outdir, "isoform_annotated.gff3")
    isoform_gff3_f = os.path.join(args.outdir, "isoform_annotated.filtered.gff3")
    FSM_anno_out = os.path.join(args.outdir, "isoform_FSM_annotation.csv")
    raw_splice_isoform = os.path.join(args.outdir, "splice_raw.gff3")
    tss_tes_stat = os.path.join(args.outdir, "tss_tes.bedgraph")
    transcript_fa = os.path.join(args.outdir, "transcript_assembly.fa")
    transcript_fa_idx = os.path.join(args.outdir, "transcript_assembly.fa.fai")
    tmp_bam = os.path.join(args.outdir, "tmp.align.bam")
    tmp_bed = os.path.join(args.outdir, "tmp.splice_anno.bed12")
    genome_bam = os.path.join(args.outdir, "align2genome.bam")
    realign_bam = os.path.join(args.outdir, "realign2transcript.bam")
    tr_cnt_csv = os.path.join(args.outdir, "transcript_count.csv.gz")
    tr_badcov_cnt_csv = os.path.join(args.outdir, "transcript_count.bad_coverage.csv.gz")
    print("Input parameters:")
    print("\tgene annotation: {}".format(args.gff3))
    print("\tgenome fasta: {}".format(args.genomefa))
    if args.inbam != "":
        print("\tinput bam: {}".format(args.inbam))
        # a user-supplied alignment replaces the one this pipeline would make
        genome_bam = args.inbam
    else:
        print("\tinput fastq: {}".format(args.infq))
    print("\toutput directory: {}".format(args.outdir))
    print("\tdirectory contains minimap2: {}".format(args.minimap2_dir))

    # align reads to genome
    if args.inbam == "" and config_dict["pipeline_parameters"]["do_genome_alignment"]:
        log_step("### align reads to genome using minimap2")
        use_junctions = config_dict["alignment_parameters"]["use_junctions"]
        if use_junctions:
            gff3_to_bed12(args.minimap2_dir, args.gff3, tmp_bed)
        minimap2_align(args.minimap2_dir, args.genomefa, args.infq, tmp_bam,
                       no_flank=config_dict["alignment_parameters"]["no_flank"],
                       bed12_junc=tmp_bed if use_junctions else None)
        samtools_sort_index(tmp_bam, genome_bam)
        os.remove(tmp_bam)
        if config_dict["alignment_parameters"]["use_junctions"]:
            os.remove(tmp_bed)
    else:
        log_step("### skip aligning reads to genome")

    # find isoform
    log_step("### read gene annotation")
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(args.gff3)
    # junctions are computed before remove_similar_tr is applied
    transcript_to_junctions = {tr: blocks_to_junctions(transcript_to_exon[tr]) for tr in transcript_to_exon}
    remove_similar_tr(transcript_dict, gene_to_transcript, transcript_to_exon)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)
    if config_dict["pipeline_parameters"]["do_isoform_identification"]:
        log_step("### find isoforms")
        group_bam2isoform(genome_bam, isoform_gff3, tss_tes_stat, "", chr_to_blocks, gene_dict, transcript_to_junctions, transcript_dict, args.genomefa,
                          config=config_dict["isoform_parameters"],
                          downsample_ratio=args.downsample_ratio,
                          raw_gff3=raw_splice_isoform if config_dict["global_parameters"]["generate_raw_isoform"] else None)
    else:
        log_step("### skip finding isoforms")

    # get fasta (always runs, so isoform_gff3 must exist even when isoform
    # identification was skipped)
    chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i = parse_gff_tree(isoform_gff3)
    ref_dict = {"chr_to_gene":chr_to_gene, "transcript_dict":transcript_dict, "gene_to_transcript":gene_to_transcript, "transcript_to_exon":transcript_to_exon}
    if not config_dict["realign_parameters"]["use_annotation"]:
        ref_dict = None
    get_transcript_seq(args.genomefa, transcript_fa, chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i,ref_dict=ref_dict)

    # realign to transcript using minimap2
    if config_dict["pipeline_parameters"]["do_read_realignment"]:
        log_step("### realign to transcript using minimap2")
        minimap2_tr_align(args.minimap2_dir, transcript_fa, args.infq, tmp_bam)
        samtools_sort_index(tmp_bam, realign_bam)
        os.remove(tmp_bam)
    else:
        log_step("### skip read realignment")

    # quantification
    if config_dict["pipeline_parameters"]["do_transcript_quantification"]:
        log_step("### generate transcript count matrix")
        bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept = parse_realigned_bam(
            realign_bam, transcript_fa_idx,
            config_dict["isoform_parameters"]["Min_sup_cnt"],
            config_dict["transcript_counting"]["min_tr_coverage"],
            config_dict["transcript_counting"]["min_read_coverage"])
        tr_cnt = wrt_tr_to_csv(bc_tr_count_dict, transcript_dict_i, tr_cnt_csv, transcript_dict, config_dict["global_parameters"]["has_UMI"])
        wrt_tr_to_csv(bc_tr_badcov_count_dict, transcript_dict_i, tr_badcov_cnt_csv, transcript_dict, config_dict["global_parameters"]["has_UMI"])
        annotate_filter_gff(isoform_gff3,args.gff3,isoform_gff3_f,FSM_anno_out,tr_cnt,config_dict["isoform_parameters"]["Min_sup_cnt"])
    else:
        log_step("### skip transcript quantification")
Example #7
0
def annotate_filter_gff(isoform_gff,ref_gff,isoform_out,anno_out,tr_cnt,min_sup_reads):
    """
    Combine FLAMES output with the reference annotation and filter out
    transcripts by realignment result.

    For every gene of the FLAMES annotation, transcripts found in ``tr_cnt``
    with at least ``min_sup_reads`` reads are kept. Reference transcripts of
    the same gene are added when their id is absent from the FLAMES
    annotation and they pass the same threshold. Genes without any kept
    transcript are omitted.

    Args:
        isoform_gff: FLAMES isoform annotation, parsed with ``parse_gff_tree``.
        ref_gff: reference annotation, parsed with ``parse_gff_tree``.
        isoform_out: path of the filtered GFF3 file to write.
        anno_out: output path forwarded to ``annotate_full_splice_match``.
        tr_cnt: mapping transcript id -> supporting read count.
        min_sup_reads: minimum count for a transcript to be kept.
    """
    gff3_fmt = "{_ch}\t{_sr}\t{_ty}\t{_st}\t{_en}\t{_sc}\t{_stnd}\t{_ph}\t{_attr}"

    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(isoform_gff)
    _, transcript_dict_ref, gene_to_transcript_ref, transcript_to_exon_ref = parse_gff_tree(ref_gff)

    def has_support(tr):
        # enough realigned reads to keep the transcript
        return (tr in tr_cnt) and (tr_cnt[tr] >= min_sup_reads)

    def transcript_records(ch, source, ge, tr, exons, strand):
        # One transcript line followed by one line per exon (rank starts at 1).
        records = [gff3_fmt.format(_ch=ch, _sr=source, _ty="transcript",
            _st=exons[0][0]+1,  # `+1` because gff is 1-based
            _en=exons[-1][1],
            _sc=".", _stnd=strand, _ph=".",
            _attr="ID=transcript:{};transcript_id={};Parent=gene:{};support_count={}".format(tr, tr, ge, tr_cnt[tr]))]
        for rank, ex in enumerate(exons, 1):
            records.append(gff3_fmt.format(_ch=ch, _sr=source, _ty="exon",
                _st=ex[0]+1, _en=ex[1],  # `+1` because gff is 1-based
                _sc=".", _stnd=strand, _ph=".",
                _attr="exon_id=exon:{}_{};Parent=transcript:{};rank={}".format(ex[0]+1, ex[1], tr, rank)))
        return records

    gff_rec = []
    for ch in chr_to_gene:
        for ge in chr_to_gene[ch]:
            gff_tmp = []
            total_cnt = 0
            mi = 99999999999
            ma = -1
            for tr in gene_to_transcript[ge]:
                if not has_support(tr):
                    continue
                exons = transcript_to_exon[tr]
                gff_tmp.extend(transcript_records(
                    ch, "FLAMES", ge, tr, exons, transcript_dict[tr].strand))
                total_cnt += tr_cnt[tr]
                mi = min(mi, exons[0][0])
                ma = max(ma, exons[-1][1])
            if ge in gene_to_transcript_ref:
                for tr in gene_to_transcript_ref[ge]:
                    # not in FLAMES output but in tr count
                    if tr in transcript_dict or not has_support(tr):
                        continue
                    exons = transcript_to_exon_ref[tr]
                    gff_tmp.extend(transcript_records(
                        ch, "reference", ge, tr, exons, transcript_dict_ref[tr].strand))
                    total_cnt += tr_cnt[tr]
                    mi = min(mi, exons[0][0])
                    ma = max(ma, exons[-1][1])
            if not gff_tmp:
                continue
            # Gene record goes first; its strand is that of the gene's first
            # FLAMES transcript and its span covers all kept transcripts.
            gff_rec.append(gff3_fmt.format(_ch=ch, _sr="FLAMES", _ty="gene",
                _st=mi+1,
                _en=ma,
                _sc=".", _stnd=transcript_dict[gene_to_transcript[ge][0]].strand, _ph=".",
                _attr="ID=gene:{};gene_id={};support_count={}".format(ge, ge, total_cnt)))
            gff_rec.extend(gff_tmp)

    # `with` closes the output even if a write fails.
    with open(isoform_out, "w") as iso_annotated:
        iso_annotated.write("##gff-version 3\n")
        iso_annotated.write("\n".join(gff_rec))
    annotate_full_splice_match(transcript_to_exon,transcript_to_exon_ref,transcript_dict,transcript_dict_ref,anno_out,tr_cnt,min_sup_reads)