Esempio n. 1
0
def main(parser):
    args = parser.parse_args()
    lines = utils.tab_line_gen(args.infile)
    clustered = utils.cluster_gtf(lines)
    reptypes = utils.by_attribute(clustered, 'repType')

    catcount = Counter()
    newlines = []
    for cnum, c in clustered.iteritems():
        c.sort(key=lambda x: int(x[3]))
        cluster_id = '%s_%04d' % (args.prefix, int(cnum))

        # Categorize cluster according to repeat types and orientation
        if reptypes[cnum] == ['ltr', 'internal', 'ltr']:
            category = 'prototype'
        elif reptypes[cnum] == ['ltr', 'internal'
                                ] or reptypes[cnum] == ['internal', 'ltr']:
            category = 'oneside'
        elif reptypes[cnum] == ['internal']:
            category = 'soloint'
        elif reptypes[cnum] == ['ltr']:
            category = 'sololtr'
        else:
            category = 'unusual'
        catcount[category] += 1

        # Create the parent (merged) annotation
        pstart = min(int(l[3]) for l in c)
        pend = max(int(l[4]) for l in c)
        strands = set(l[6] for l in c)
        if len(strands) == 1: pstrand = strands.pop()
        else: pstrand = '.'  # Strand is ambiguous
        pcov = utils.covered_len(c)
        pattr = 'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' % (
            cluster_id, category, len(c), (pend - pstart), pcov)
        pline = [
            c[0][0], 'merged', 'gene',
            str(pstart),
            str(pend), '.', pstrand, '.', pattr
        ]
        newlines.append(pline)

        for l in c:
            l[1] = category
            attr = dict(re.findall('(\S+)\s+"([\s\S]+?)";', l[8]))
            if 'gene_id' in attr: del attr['gene_id']
            if 'transcript_id' in attr: del attr['transcript_id']
            l[8] = 'gene_id "%s"; transcript_id "%s"; ' % (cluster_id,
                                                           cluster_id)
            l[8] = l[8] + ' '.join('%s "%s";' % (k, v)
                                   for k, v in attr.iteritems())
            newlines.append(l[:-1])

    for l in utils.sort_gtf(newlines):
        print >> args.outfile, '\t'.join(l)

    for cat in ['prototype', 'oneside', 'soloint', 'sololtr', 'unusual']:
        print >> sys.stderr, '%s:     %d' % (cat, catcount[cat])
Esempio n. 2
0
def make_mergeline(clust, name, category):
    nm_spos     = min(int(l[3]) for l in clust)
    nm_epos = max(int(l[4]) for l in clust)
    nm_strand = set(l[6] for l in clust)
    nm_strand = nm_strand.pop() if len(nm_strand) == 1 else '.'
    nm_cov = utils.covered_len(clust)
    nm_attr = 'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' % (name, category, len(clust), (nm_epos-nm_spos), nm_cov)
    nm = [clust[0][0], 'merged', 'gene', str(nm_spos), str(nm_epos), '.', nm_strand, '.', nm_attr]
    return nm
Esempio n. 3
0
def make_mergeline(clust, name, category):
    nm_spos = min(int(l[3]) for l in clust)
    nm_epos = max(int(l[4]) for l in clust)
    nm_strand = set(l[6] for l in clust)
    nm_strand = nm_strand.pop() if len(nm_strand) == 1 else '.'
    nm_cov = utils.covered_len(clust)
    nm_attr = 'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' % (
        name, category, len(clust), (nm_epos - nm_spos), nm_cov)
    nm = [
        clust[0][0], 'merged', 'gene',
        str(nm_spos),
        str(nm_epos), '.', nm_strand, '.', nm_attr
    ]
    return nm
Esempio n. 4
0
def main(parser):
    args = parser.parse_args()
    lines = utils.tab_line_gen(args.infile)
    clustered = utils.cluster_gtf(lines)
    reptypes = utils.by_attribute(clustered, 'repType')

    catcount = Counter()
    newlines = []
    for cnum,c in clustered.iteritems():
        c.sort(key=lambda x:int(x[3]))
        cluster_id = '%s_%04d' % (args.prefix,int(cnum))
                
        # Categorize cluster according to repeat types and orientation
        if reptypes[cnum] == ['ltr','internal','ltr']:
            category = 'prototype'
        elif reptypes[cnum] == ['ltr','internal'] or reptypes[cnum] == ['internal','ltr']:
            category = 'oneside'
        elif reptypes[cnum] == ['internal']:
            category = 'soloint'
        elif reptypes[cnum] == ['ltr']:
            category = 'sololtr'
        else:
            category = 'unusual'
        catcount[category] += 1
        
        # Create the parent (merged) annotation
        pstart = min(int(l[3]) for l in c)
        pend   = max(int(l[4]) for l in c)
        strands = set(l[6] for l in c)
        if len(strands)==1: pstrand = strands.pop()
        else: pstrand = '.' # Strand is ambiguous
        pcov = utils.covered_len(c)
        pattr = 'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' % (cluster_id, category, len(c), (pend-pstart), pcov)
        pline = [c[0][0], 'merged', 'gene', str(pstart), str(pend), '.', pstrand, '.', pattr]
        newlines.append(pline)

        for l in c:
            l[1] = category        
            attr = dict(re.findall('(\S+)\s+"([\s\S]+?)";',l[8]))
            if 'gene_id' in attr: del attr['gene_id']
            if 'transcript_id' in attr: del attr['transcript_id']
            l[8] = 'gene_id "%s"; transcript_id "%s"; ' % (cluster_id,cluster_id)
            l[8] = l[8] + ' '.join('%s "%s";' % (k,v) for k,v in attr.iteritems())
            newlines.append(l[:-1])

    for l in utils.sort_gtf(newlines):
        print >>args.outfile, '\t'.join(l)       

    for cat in ['prototype','oneside','soloint','sololtr','unusual']:
        print >>sys.stderr, '%s:     %d' % (cat, catcount[cat])