Beispiel #1
0
def annotate_fusion(ref_f, junc_bed, secondary_flag=0, denovo_flag=0):
    """
    Align fusion juncrions to gene annotations
    """
    print('Start to annotate fusion junctions...')
    # gene annotations
    genes, novel_genes, gene_info, chrom_info = parse_ref(ref_f, 1)
    fusion_bed = junc_bed
    fusions, fusion_index = parse_bed(fusion_bed)  # fusion junctions
    total = set()
    annotated_fusion_f = 'annotated_fusion.txt.tmp'
    with open(annotated_fusion_f, 'w') as outf:
        for chrom in chrom_info:
            # overlap gene annotations with fusion juncrions
            result = []
            # overlap genes
            if chrom in genes:
                result += Interval.overlapwith(genes[chrom].interval,
                                               fusions[chrom])
            # overlap novel genes in denovo mode
            if denovo_flag and chrom in novel_genes:
                result += Interval.overlapwith(novel_genes[chrom].interval,
                                               fusions[chrom])
            for itl in result:
                # extract gene annotations
                iso = list([x for x in itl[2:] if x.startswith('iso')])
                # for each overlapped fusion junction
                for fus in itl[(2 + len(iso)):]:
                    reads = fus.split()[1]
                    fus_start, fus_end = fusion_index[fus]
                    fus_loc = '%s\t%d\t%d\tFUSIONJUNC/%s' % (chrom, fus_start,
                                                             fus_end, reads)
                    edge_annotations = []  # first or last exon flag
                    secondary_exon = defaultdict(dict)  # secondary exons
                    annotate_flag = 0
                    for iso_id in iso:
                        g, i, c, s = iso_id.split()[1:]
                        start = gene_info[iso_id][0][0]
                        end = gene_info[iso_id][-1][-1]
                        # fusion junction excesses boundaries of gene
                        # annotation
                        if fus_start < start - 10 or fus_end > end + 10:
                            if not secondary_flag:
                                continue
                        (fusion_info,
                         index,
                         edge,
                         secondary) = map_fusion_to_iso(fus_start,
                                                        fus_end, s,
                                                        gene_info[iso_id])
                        if fusion_info:
                            annotate_flag += 1
                            bed_info = '\t'.join([fus_loc, '0', s,
                                                  str(fus_start),
                                                  str(fus_start), '0,0,0'])
                            bed = '\t'.join([bed_info, fusion_info, g, i,
                                             index])
                            if not edge:  # not first or last exon
                                outf.write(bed + '\n')
                                total.add(fus)
                            else:  # first or last exon
                                edge_annotations.append(bed)
                        elif secondary_flag and secondary is not None:
                            li, ri = secondary
                            gene = ':'.join([g, s])
                            if li is not None:
                                li = str(li)
                                secondary_exon['left'][gene] = ':'.join([i,
                                                                         li])
                            if ri is not None:
                                ri = str(ri)
                                secondary_exon['right'][gene] = ':'.join([i,
                                                                          ri])
                    if edge_annotations:
                        for bed in edge_annotations:
                            outf.write(bed + '\n')
                        total.add(fus)
                    if secondary_flag and not annotate_flag:
                        for gene in secondary_exon['left']:
                            if gene in secondary_exon['right']:
                                left = secondary_exon['left'][gene]
                                right = secondary_exon['right'][gene]
                                g, s = gene.split(':')
                                # for avoid dup, use fus_loc_new
                                fus_loc_new = fus_loc + '\t0\t%s' % s
                                outf.write('%s\t%s:%s\t%s:%s\n' % (fus_loc_new,
                                                                   g, left, g,
                                                                   right))
    print('Annotated %d fusion junctions!' % len(total))
Beispiel #2
0
def fix_fusion(ref_f, genome_fa, out_file, no_fix, secondary_flag=0,
               denovo_flag=0):
    """
    Realign fusion juncrions
    """
    print('Start to fix fusion junctions...')
    fa = check_fasta(genome_fa)
    ref = parse_ref(ref_f, 2)
    annotated_fusion_f = 'annotated_fusion.txt.tmp'
    fusions, fusion_names, fixed_flag = fix_bed(annotated_fusion_f, ref, fa,
                                                no_fix, denovo_flag)
    total = 0
    annotations = set()
    fixed_fusion_f = out_file
    if secondary_flag:
        secondary_f = open('low_conf_%s' % out_file, 'w')
    with open(fixed_fusion_f, 'w') as outf:
        for fus in fusion_names:
            reads = str(fusions[fus])
            fixed = fixed_flag[fus]
            if fixed > 0:
                total += 1
            fixed = str(fixed)
            name = 'circular_RNA/' + reads
            if fus.startswith('secondary'):
                _, loc, strand, left_info, right_info = fus.split('|')
                secondary_f.write('\t'.join([loc, name, fixed, strand,
                                             left_info, right_info]))
                secondary_f.write('\n')
                continue
            gene, iso, chrom, strand, index = fus.split()
            starts, ends = ref['\t'.join([gene, iso, chrom, strand])]
            exon_num = len(starts)
            intron_num = exon_num - 1
            if ',' in index:  # back spliced exons
                s, e = [int(x) for x in index.split(',')]
                if strand == '+':
                    index_info = ','.join(str(x + 1) for x in range(s, e + 1))
                else:
                    index_info = ','.join(str(exon_num - x)
                                          for x in range(s, e + 1))
                start = str(starts[s])
                end = str(ends[e])
                length = str(e - s + 1)
                sizes, offsets = generate_bed(int(start), starts[s:(e + 1)],
                                              ends[s:(e + 1)])
                annotation_info = '\t'.join([chrom, start, end, sizes,
                                             offsets])
                # remove circular RNA info duplications in denovo mode
                if denovo_flag and annotation_info in annotations:
                    continue
                if s == 0:
                    left_intron = 'None'
                else:
                    left_intron = '%s:%d-%d' % (chrom, ends[s - 1], starts[s])
                if e == len(ends) - 1:
                    right_intron = 'None'
                else:
                    right_intron = '%s:%d-%d' % (chrom, ends[e], starts[e + 1])
                intron = '|'.join([left_intron, right_intron])
                bed = '\t'.join([chrom, start, end, name, fixed, strand, start,
                                 start, '0,0,0', length, sizes, offsets,
                                 reads, 'circRNA', gene, iso, index_info,
                                 intron])
            else:  # ciRNAs
                index, start, end = index.split('|')
                size = str(int(end) - int(start))
                annotation_info = '\t'.join([chrom, start, end, size, '0'])
                # remove circular RNA info duplications in denovo mode
                if denovo_flag and annotation_info in annotations:
                    continue
                index = int(index)
                if strand == '+':
                    index_info = str(index + 1)
                else:
                    index_info = str(intron_num - index)
                intron = '%s:%d-%d' % (chrom, ends[index], starts[index + 1])
                bed = '\t'.join([chrom, start, end, name, fixed, strand, start,
                                 start, '0,0,0', '1', size, '0',
                                 reads, 'ciRNA', gene, iso, index_info,
                                 intron])
            if denovo_flag:  # in denovo mode
                annotations.add(annotation_info)
            outf.write(bed + '\n')
    if secondary_flag:
        secondary_f.close()

    os.remove('annotated_fusion.txt.tmp')

    print('Fixed %d fusion junctions!' % total)
Beispiel #3
0
def annotate_fusion(ref_f, out_dir, denovo_flag=0):
    """
    Align fusion juncrions to gene annotations
    """
    print('Start to annotate fusion junctions...')
    # gene annotations
    genes, novel_genes, gene_info, chrom_info = parse_ref(ref_f, 1)
    fusion_bed = '%s/../fusion_junction.bed' % out_dir
    fusions, fusion_index = parse_bed(fusion_bed)  # fusion junctions
    total = set()
    annotated_fusion_f = '%s/annotated_fusion.txt' % out_dir
    with open(annotated_fusion_f, 'w') as outf:
        for chrom in chrom_info:
            # overlap gene annotations with fusion juncrions
            result = []
            # overlap genes
            if chrom in genes:
                result += Interval.overlapwith(genes[chrom].interval,
                                               fusions[chrom])
            # overlap novel genes in denovo mode
            if denovo_flag and chrom in novel_genes:
                result += Interval.overlapwith(novel_genes[chrom].interval,
                                               fusions[chrom])
            for itl in result:
                # extract gene annotations
                iso = list(filter(lambda x: x.startswith('iso'), itl[2:]))
                # for each overlapped fusion junction
                for fus in itl[(2 + len(iso)):]:
                    reads = fus.split()[1]
                    fus_start, fus_end = fusion_index[fus]
                    edge_annotations = []  # first or last exon flag
                    for iso_id in iso:
                        g, i, c, s = iso_id.split()[1:]
                        start = gene_info[iso_id][0][0]
                        end = gene_info[iso_id][-1][-1]
                        # fusion junction excesses boundaries of gene
                        # annotation
                        if fus_start < start - 10 or fus_end > end + 10:
                            continue
                        (fusion_info,
                         index,
                         edge) = map_fusion_to_iso(fus_start,
                                                   fus_end, s,
                                                   gene_info[iso_id])
                        if fusion_info:
                            fus_start_str = str(fus_start)
                            fus_end_str = str(fus_end)
                            bed_info = '\t'.join([chrom, fus_start_str,
                                                  fus_end_str,
                                                  'FUSIONJUNC/%s' % reads,
                                                  '0', s, fus_start_str,
                                                  fus_start_str, '0,0,0'])
                            bed = '\t'.join([bed_info, fusion_info, g, i,
                                             index])
                            if not edge:  # not first or last exon
                                outf.write(bed + '\n')
                                total.add(fus)
                            else:  # first or last exon
                                edge_annotations.append(bed)
                    if edge_annotations:
                        for bed in edge_annotations:
                            outf.write(bed + '\n')
                        total.add(fus)
    print('Annotated %d fusion junctions!' % len(total))
def annotate_fusion(ref_f, out_dir, denovo_flag=0):
    """
    Align fusion juncrions to gene annotations
    """
    print('Start to annotate fusion junctions...')
    # gene annotations
    genes, novel_genes, gene_info, chrom_info = parse_ref(ref_f, 1)
    fusion_bed = '%s/../fusion_junction.bed' % out_dir
    fusions, fusion_index = parse_bed(fusion_bed)  # fusion junctions
    total = set()
    annotated_fusion_f = '%s/annotated_fusion.txt' % out_dir
    with open(annotated_fusion_f, 'w') as outf:
        for chrom in chrom_info:
            # overlap gene annotations with fusion juncrions
            result = []
            # overlap genes
            if chrom in genes:
                result += Interval.overlapwith(genes[chrom].interval,
                                               fusions[chrom])
            # overlap novel genes in denovo mode
            if denovo_flag and chrom in novel_genes:
                result += Interval.overlapwith(novel_genes[chrom].interval,
                                               fusions[chrom])
            for itl in result:
                # extract gene annotations
                iso = list(filter(lambda x: x.startswith('iso'), itl[2:]))
                # for each overlapped fusion junction
                for fus in itl[(2 + len(iso)):]:
                    reads = fus.split()[1]
                    fus_start, fus_end = fusion_index[fus]
                    edge_annotations = []  # first or last exon flag
                    for iso_id in iso:
                        g, i, c, s = iso_id.split()[1:]
                        start = gene_info[iso_id][0][0]
                        end = gene_info[iso_id][-1][-1]
                        # fusion junction excesses boundaries of gene
                        # annotation
                        if fus_start < start - 10 or fus_end > end + 10:
                            continue
                        (fusion_info, index,
                         edge) = map_fusion_to_iso(fus_start, fus_end, s,
                                                   gene_info[iso_id])
                        if fusion_info:
                            fus_start_str = str(fus_start)
                            fus_end_str = str(fus_end)
                            bed_info = '\t'.join([
                                chrom, fus_start_str, fus_end_str,
                                'FUSIONJUNC/%s' % reads, '0', s, fus_start_str,
                                fus_start_str, '0,0,0'
                            ])
                            bed = '\t'.join(
                                [bed_info, fusion_info, g, i, index])
                            if not edge:  # not first or last exon
                                outf.write(bed + '\n')
                                total.add(fus)
                            else:  # first or last exon
                                edge_annotations.append(bed)
                    if edge_annotations:
                        for bed in edge_annotations:
                            outf.write(bed + '\n')
                        total.add(fus)
    print('Annotated %d fusion junctions!' % len(total))
Beispiel #5
0
def fix_fusion(ref_f, genome_fa, out_dir, no_fix, denovo_flag=0):
    """
    Realign fusion juncrions
    """
    print('Start to fix fusion junctions...')
    fa = check_fasta(genome_fa)
    ref = parse_ref(ref_f, 2)
    annotated_fusion_f = '%s/annotated_fusion.txt' % out_dir
    fusions, fusion_names, fixed_flag = fix_bed(annotated_fusion_f, ref, fa,
                                                no_fix, denovo_flag)
    total = 0
    annotations = set()
    fixed_fusion_f = '%s/circ_fusion.txt' % out_dir
    with open(fixed_fusion_f, 'w') as outf:
        for fus in fusion_names:
            reads = str(fusions[fus])
            fixed = fixed_flag[fus]
            if fixed > 0:
                total += 1
            fixed = str(fixed)
            name = 'circular_RNA/' + reads
            gene, iso, chrom, strand, index = fus.split()
            starts, ends = ref['\t'.join([gene, iso, chrom, strand])]
            exon_num = len(starts)
            intron_num = exon_num - 1
            if ',' in index:  # back spliced exons
                s, e = [int(x) for x in index.split(',')]
                if strand == '+':
                    index_info = ','.join(str(x + 1) for x in xrange(s, e + 1))
                else:
                    index_info = ','.join(str(exon_num - x)
                                          for x in xrange(s, e + 1))
                start = str(starts[s])
                end = str(ends[e])
                length = str(e - s + 1)
                sizes, offsets = generate_bed(int(start), starts[s:(e + 1)],
                                              ends[s:(e + 1)])
                annotation_info = '\t'.join([chrom, start, end, sizes,
                                             offsets])
                # remove circular RNA info duplications in denovo mode
                if denovo_flag and annotation_info in annotations:
                    continue
                if s == 0:
                    left_intron = 'None'
                else:
                    left_intron = '%s:%d-%d' % (chrom, ends[s - 1], starts[s])
                if e == len(ends) - 1:
                    right_intron = 'None'
                else:
                    right_intron = '%s:%d-%d' % (chrom, ends[e], starts[e + 1])
                intron = '|'.join([left_intron, right_intron])
                bed = '\t'.join([chrom, start, end, name, fixed, strand, start,
                                 start, '0,0,0', length, sizes, offsets,
                                 reads, 'circRNA', gene, iso, index_info,
                                 intron])
            else:  # ciRNAs
                index, start, end = index.split('|')
                size = str(int(end) - int(start))
                annotation_info = '\t'.join([chrom, start, end, size, '0'])
                # remove circular RNA info duplications in denovo mode
                if denovo_flag and annotation_info in annotations:
                    continue
                index = int(index)
                if strand == '+':
                    index_info = str(index + 1)
                else:
                    index_info = str(intron_num - index)
                intron = '%s:%d-%d' % (chrom, ends[index], starts[index + 1])
                bed = '\t'.join([chrom, start, end, name, fixed, strand, start,
                                 start, '0,0,0', '1', size, '0',
                                 reads, 'ciRNA', gene, iso, index_info,
                                 intron])
            if denovo_flag:  # in denovo mode
                annotations.add(annotation_info)
            outf.write(bed + '\n')
    print('Fixed %d fusion junctions!' % total)