Code example #1
def main(args):
    if args.mapping:
        tdict = {}
        table = utils.tab_line_gen(args.mapping)
        header = table.next()
        for t in table:
            d = dict(zip(header, t))
            tdict[d[args.key]] = d
    else:
        # Lookup of any key in tdict returns a dictionary where the value of 'name' is ''
        tdict = defaultdict(lambda: {'name': ''})

    gtf = utils.tab_line_gen(args.infile)
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        assert 'locus' not in attrd
        if g[1] == 'merged':
            newname = tdict[attrd['name']][args.value]
            if newname == '':
                n1, n2 = attrd['name'].split('_')
                newname = '%s_%s_%s' % (n1, g[0].strip('chr'), n2)
            g[8] += ' locus "%s";' % newname
            print >> args.outfile, '\t'.join(g)
        else:
            newname = tdict[attrd['gene_id']][args.value]
            if newname == '':
                n1, n2 = attrd['gene_id'].split('_')
                newname = '%s_%s_%s' % (n1, g[0].strip('chr'), n2)
            g[8] += ' locus "%s";' % newname
            print >> args.outfile, '\t'.join(g)
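
All of these listings are Python 2 (note the print >>fh statements, dict.iteritems(), and generator .next() calls) and share a small utils module that the page does not show. A minimal sketch of utils.tab_line_gen, assuming it does no more than yield the fields of each tab-delimited line, might look like:

def tab_line_gen(fh, comment='#'):
    # Sketch only: yield each non-empty, non-comment line as a list of fields
    for line in fh:
        if line.startswith(comment):
            continue
        line = line.rstrip('\n')
        if line:
            yield line.split('\t')

The regex that recurs throughout, (\S+)\s+"([\s\S]+?)";, turns the ninth GTF column into an attribute dict; with a hypothetical attribute string:

import re
attrs = dict(re.findall('(\S+)\s+"([\s\S]+?)";', 'gene_id "HERVH_0001"; name "HERVH_0001";'))
# {'gene_id': 'HERVH_0001', 'name': 'HERVH_0001'}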
Code example #2
def main(args):
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]
    if args.prefix is None:
        prefix = Counter(g.attr['locus'].split('_')[0] for g in combined_gtf).most_common()[0][0]
    else:
        prefix = args.prefix

    namemap = {}
    if args.cytoband:
        byband = defaultdict(list)
        p1 = Popen('bedtools intersect -wo -a - -b %s' % args.cytoband, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        o,e = p1.communicate(input='\n'.join(str(g) for g in combined_gtf if g.feature.startswith('span')))
        for l in utils.tab_line_gen(o.strip('\n').split('\n')):
            g1 = GTFLine(l[:9])
            g2 = GTFLine(l[9:-1])
            band = '%s%s' % (g2.chrom.strip('chr'),g2.attr['gene_id'])
            byband[band].append(g1)

        for band, locs in byband.iteritems():
            if len(locs) == 1:
                namemap[locs[0].attr['locus']] = '%s_%s' % (prefix, band)
            else:
                locs.sort(key=lambda x:x.start)
                for i,loc in enumerate(locs):
                    namemap[loc.attr['locus']] = '%s_%s%s' % (prefix, band, someletters[i])
    else:
        for g in combined_gtf:
            namemap[g.attr['locus']] = g.attr['locus']
    
    for g in combined_gtf:
        g.attr['oLocus'] = g.attr['locus']
        g.attr['locus'] = namemap[g.attr['locus']]
        print g
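
someletters is a module-level constant from the project that is not shown here; a plausible stand-in (an assumption, not the repository's actual definition) is simply the lowercase alphabet, used to suffix loci that share a cytoband:

import string
# Hypothetical stand-in: the i-th locus within a shared cytoband gets
# the suffix 'a', 'b', 'c', ...
someletters = string.ascii_lowercase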
Code example #3
def main(args):
    # Load the internal GTF
    combined_gtf = [GTFLine(l) for l in tab_line_gen(args.internalGTF)]
    combined_gtf = [g for g in combined_gtf if g.source != 'span_internal']
    
    # Load the left GTF
    for l in tab_line_gen(args.ltrGTF):
        l_b = GTFLine(l[10:])
        l_b.attr['locus'] = GTFLine(l[:9]).attr['locus']
        combined_gtf.append(l_b)

    byloc = defaultdict(list)
    for g in combined_gtf:
        byloc[g.attr['locus']].append(g)
    
    #for locid,ilocus in itertools.groupby(combined_gtf, key=lambda x:x.attr['locus']):
    for locid,locus in byloc.iteritems(): 
        locus = utils.remove_dups(locus)
        locus = utils.adjust_overlaps(locus)
        category = locus_category(locus)
        for a in locus:
            a.source = category
        spanning = utils.create_spanning(locus)
        spanning.attr = {'category': category,
                         'locus': locid }
        print >>args.outfile, '### %s ###' % locid
        print >>args.outfile, spanning
        print >>args.outfile, '\n'.join(str(_) for _ in sorted(locus,key=lambda x:x.start))
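
utils.create_spanning is assumed to collapse a locus into a single annotation covering all of its members; a minimal sketch under that assumption (treating GTFLine fields as mutable attributes, as the examples imply):

import copy

def create_spanning(locus):
    # Sketch: copy the first annotation and stretch it over the whole locus
    span = copy.deepcopy(locus[0])
    span.start = min(a.start for a in locus)
    span.end = max(a.end for a in locus)
    strands = set(a.strand for a in locus)
    span.strand = strands.pop() if len(strands) == 1 else '.'
    span.feature = 'span'   # matches the feature.startswith('span') checks
    span.attr = {}          # callers fill in locus, category, etc.
    return span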
Code example #4
def main(args):
    # Load the internal GTF
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.internalGTF)]
    combined_gtf = [g for g in combined_gtf if not g.feature.startswith('span')]
    
    # Load the LTR GTF
    for l in utils.tab_line_gen(args.ltrGTF):
        l_b = GTFLine(l[-10:])
        l_b.attr['locus'] = GTFLine(l[:9]).attr['locus']
        combined_gtf.append(l_b)

    model_lengths = utils.guess_rmsk_model_lengths(combined_gtf)
    
    # Organize by locus    
    byloc = defaultdict(list)
    for g in combined_gtf:
        byloc[g.attr['locus']].append(g)
    
    for locid,locus in byloc.iteritems(): 
        # Remove duplicate annotations
        locus = utils.remove_dups(locus)
        # Adjust overlaps
        locus = utils.adjust_overlaps(locus)
        # Determine category
        category = locus_category(locus)
        # Add information to all annotations
        strand = set([a.strand for a in locus])
        if len(strand) == 1 and '-' in strand:
            locus.sort(key=lambda x:x.end, reverse=True)
        else:
            locus.sort(key=lambda x:x.start)
        for i,a in enumerate(locus):
            a.source = category
            a.attr['exon_number'] = i+1
        
        # Create spanning annotation
        spanning = utils.create_spanning(locus)        
        # Calculate model coverage and percent
        internal_model = get_internal_model(locus)
        model_cov = calculate_internal_coverage(locus)
        model_pct = min(100, (float(model_cov) / model_lengths[internal_model])*100)
        spanning.attr = {'locus': locid ,
                         'category': category,
                         'model_cov': model_cov,
                         'model_pct': '%.1f' % model_pct,
                         'exons':len(locus)
                         }
        
        print >>args.outfile, '### %s ###' % locid
        print >>args.outfile, spanning
        print >>args.outfile, '\n'.join(str(_) for _ in sorted(locus,key=lambda x:x.start))
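
utils.guess_rmsk_model_lengths is also not shown. The assertion in a later example (mlen[repName] + repLeft == repEnd on the plus strand) implies that the model length is repEnd minus the stored remainder, so one plausible sketch votes across hits to smooth out inconsistent records:

from collections import Counter, defaultdict

def guess_rmsk_model_lengths(gtf):
    # Sketch: the UCSC rmsk table stores the unaligned model remainder as a
    # negative number (repLeft on '+', repStart on '-'); repEnd minus that
    # remainder is the model length, and the most common value wins.
    guesses = defaultdict(Counter)
    for g in gtf:
        left = g.attr['repLeft'] if g.strand == '+' else g.attr['repStart']
        guesses[g.attr['repName']][int(g.attr['repEnd']) - int(left)] += 1
    return dict((name, c.most_common(1)[0][0]) for name, c in guesses.iteritems())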
Code example #5
File: gtf2table.py  Project: gwcbi/HERV_annotation
def main(parser):
    args = parser.parse_args()

    rows = []
    base_columns = [
        'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand',
        'frame'
    ]
    attr_columns = set()

    # Parse rows
    gtflines = utils.tab_line_gen(args.infile)
    for l in utils.sort_gtf(gtflines):
        rowd = dict(zip(base_columns, l[:8]))
        for k, v in re.findall('(\S+)\s+"([\s\S]+?)";', l[8]):
            attr_columns.add(k)
            rowd[k] = v
        rows.append(rowd)

    # Set column headers
    columns = base_columns + sorted(attr_columns)  # sorted for a stable column order
    if args.keycol in columns:
        columns.remove(args.keycol)
        columns = [args.keycol] + columns

    # Print the table
    print >> args.outfile, '\t'.join(columns)
    for rowd in rows:
        print >> args.outfile, '\t'.join(
            [rowd[c] if c in rowd else '' for c in columns])
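
For concreteness, with --keycol name and the attribute columns sorted, a single hypothetical input record

chr1	rmsk	gene	1000	5000	.	+	.	name "HERVH_0001"; category "prototype";

comes out as a two-row table with the key column moved to the front:

name	chrom	source	feature	start	end	score	strand	frame	category
HERVH_0001	chr1	rmsk	gene	1000	5000	.	+	.	prototype

(The record and its values are illustrative, not taken from the annotation.)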
Code example #6
def main(args):
    ### Read the GTF file ################################################################
    gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]
    
    bychrom = defaultdict(lambda:{'+':list(), '-':list()})
    for g in gtf:
        bychrom[g.chrom][g.strand].append(g)

    merged_hits = []
    for cchrom, strands in bychrom.iteritems():
        # Plus strand
        if len(strands['+']):
            strands['+'].sort(key=lambda x:x.start)
            cur = [ strands['+'][0] ]
            for g1 in strands['+'][1:]:
                g0 = cur[-1]
                # Genomic distance between hits
                gdist = g1.start - g0.end            
                if gdist <= args.shortdist:
                    domerge = True
                else:
                    domerge = g0.attr['repLeft'] < g1.attr['repLeft']
                    domerge &= gdist < args.longdist
                if domerge:
                    cur.append(g1)
                else:
                    merged_hits.append(cur)
                    cur = [ g1 ]
            merged_hits.append(cur)

        # Minus strand
        if len(strands['-']):
            strands['-'].sort(key=lambda x:x.end, reverse=True)
            cur = [ strands['-'][0] ]
            for g1 in strands['-'][1:]:
                g0 = cur[-1]
                # Genomic distance between hits
                gdist = g0.start - g1.end
                if gdist <= args.shortdist:
                    domerge = True
                else:
                    domerge = g0.attr['repStart'] < g1.attr['repStart']
                    domerge &= gdist < args.longdist
                if domerge:
                    cur.append(g1)
                else:
                    merged_hits.append(cur)
                    cur = [ g1 ]
            merged_hits.append(cur)
    
    for i,cur in enumerate(merged_hits):
        locid = '%s_%04d' % (args.prefix, i+1)
        spanning = utils.create_spanning(cur)
        spanning.attr['locus'] = locid
        for g in cur:
            g.attr['locus'] = locid
        
        print >>args.outfile, '### %s ###' % locid 
        print >>args.outfile, spanning
        print >>args.outfile, '\n'.join(str(_) for _ in sorted(cur,key=lambda x:x.start))
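
The merge test above is two-tiered: hits whose genomic gap is at most shortdist are always joined, while more distant hits are joined only if the downstream hit continues the repeat model (repLeft strictly increasing on the plus strand, repStart on the minus strand; the UCSC rmsk table stores both as negative remainders) and the gap is still under longdist. For example, with shortdist=10 and longdist=1000 (illustrative values), plus-strand hits at 1000-2000 and 2005-2500 merge unconditionally (gap of 5), whereas hits at 1000-2000 and 2600-3000 (gap of 600) merge only if repLeft increases between them.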
Code example #7
def main(args):
    # Load GTF file
    gtf = utils.tab_line_gen(args.infile)

    # Skip lines not being merged
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";',g[8]))
        if 'name' in attrd and attrd['name'] == args.name:
            break
        else:
            print >>args.outfile, '\t'.join(g)

    # Get all lines for locus to be split
    mergelines = [g]
    sublines   = []
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";',g[8]))
        if g[1]=='merged' and attrd['name'] == args.name:
            mergelines.append(g)
        elif 'gene_id' in attrd and attrd['gene_id'] == args.name:
            sublines.append(g)
        else:
            break

    # Set the new category and names
    if args.category:
        category1, category2 = args.category.split(',')
    else:
        # g now holds the first line *after* the locus, so read the category
        # from the original merged line instead
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', mergelines[0][8]))
        category1 = attrd['category']
        category2 = attrd['category']

    if args.newname:
        newname1, newname2 = args.newname.split(',')
    else:
        newname1 = '%s.1' % args.name
        newname2 = '%s.2' % args.name
    
    # Print out the edited lines
    sub1 = sublines[:args.split]
    print >>args.outfile, '\t'.join(make_mergeline(sub1, newname1, category1))
    for l in sub1:
        l[1] = category1
        l[8] = re.sub('gene_id "\S+";', 'gene_id "%s";' % newname1, l[8])
        l[8] = re.sub('transcript_id "\S+";', 'transcript_id "%s";' % newname1, l[8])
        print >>args.outfile, '\t'.join(l)

    sub2 = sublines[args.split:]
    print >>args.outfile, '\t'.join(make_mergeline(sub2, newname2, category2))
    for l in sub2:
        l[1] = category2
        l[8] = re.sub('gene_id "\S+";', 'gene_id "%s";' % newname2, l[8])
        l[8] = re.sub('transcript_id "\S+";', 'transcript_id "%s";' % newname2, l[8])
        print >>args.outfile, '\t'.join(l)

    # Resume printing the file
    print >>args.outfile, '\t'.join(g) 
    for g in gtf:
        print >>args.outfile, '\t'.join(g)
Code example #8
def main(args):
    if args.chroms:
        chroms = [l.strip('\n').split('\t')[0] for l in args.chroms]
    else:
        chroms = None
    
    for l in utils.sort_gtf(utils.tab_line_gen(args.infile), chroms):
        print >>args.outfile, '\t'.join(l)
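
utils.sort_gtf is assumed to order GTF rows by chromosome and then by coordinate, honoring an explicit chromosome list when one is supplied (as above); a minimal sketch under those assumptions:

def sort_gtf(lines, chroms=None):
    # Sketch: sort by chromosome, then start, then end; when an explicit
    # chromosome order is given (e.g. from a .genome file), honor it and
    # push unknown chromosomes to the back.
    lines = list(lines)
    if chroms is None:
        chromkey = lambda c: c
    else:
        order = dict((c, i) for i, c in enumerate(chroms))
        chromkey = lambda c: order.get(c, len(order))
    return sorted(lines, key=lambda l: (chromkey(l[0]), int(l[3]), int(l[4])))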
Code example #9
def main(args):
    gtf = [GTFLine(l).asdict() for l in utils.tab_line_gen(args.infile)]
    allkeys = set(chain.from_iterable(d.keys() for d in gtf))
    columns = GTFLine.GTFCOLS + GTFLine.ATTRORDER
    columns += [k for k in sorted(allkeys) if k not in columns]
    print >>args.outfile, '\t'.join(columns)
    for d in gtf:
        print >>args.outfile, '\t'.join(str(d[c]) if c in d else '' for c in columns)
Code example #10
File: manual_merge.py  Project: gwcbi/HERV_annotation
def main(args):
    names = args.names.split(',')
    category = args.category
    gtf = utils.tab_line_gen(args.infile)

    # Skip lines not being merged
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        if 'name' in attrd and attrd['name'] in names:
            break
        else:
            print >> args.outfile, '\t'.join(g)

    mergelines = [g]
    sublines = []
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        if g[1] == 'merged' and attrd['name'] in names:
            mergelines.append(g)
        elif 'gene_id' in attrd and attrd['gene_id'] in names:
            sublines.append(g)
        else:
            break

    # Merge the merge lines
    new_merge = mergelines[0][:]
    nm_spos = min(int(m[3]) for m in mergelines)
    nm_epos = max(int(m[4]) for m in mergelines)
    nm_strand = set(m[6] for m in mergelines)

    all_attrd = [
        dict(re.findall('(\S+)\s+"([\s\S]+?)";', m[8])) for m in mergelines
    ]
    nm_name = all_attrd[0]['name']
    nm_category = category if category is not None else all_attrd[0]['category']
    nm_nfeats = sum(int(d['nfeats']) for d in all_attrd)
    nm_length = (nm_epos - nm_spos)
    nm_cov = sum(int(d['cov']) for d in all_attrd)

    new_merge[3] = str(nm_spos)
    new_merge[4] = str(nm_epos)
    new_merge[6] = nm_strand.pop() if len(nm_strand) == 1 else '.'
    new_merge[8] = 'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' % (
        nm_name, nm_category, nm_nfeats, nm_length, nm_cov)
    print >> args.outfile, '\t'.join(new_merge)

    for l in sublines:
        l[1] = category if category is not None else all_attrd[0]['category']
        l[8] = re.sub('gene_id "\S+";', 'gene_id "%s";' % nm_name, l[8])
        l[8] = re.sub('transcript_id "\S+";', 'transcript_id "%s";' % nm_name,
                      l[8])
        print >> args.outfile, '\t'.join(l)

    # Resume printing the file
    print >> args.outfile, '\t'.join(g)
    for g in gtf:
        print >> args.outfile, '\t'.join(g)
Code example #11
def main(parser):
    args = parser.parse_args()
    lines = utils.tab_line_gen(args.infile)
    clustered = utils.cluster_gtf(lines)
    reptypes = utils.by_attribute(clustered, 'repType')

    catcount = Counter()
    newlines = []
    for cnum, c in clustered.iteritems():
        c.sort(key=lambda x: int(x[3]))
        cluster_id = '%s_%04d' % (args.prefix, int(cnum))

        # Categorize cluster according to repeat types and orientation
        if reptypes[cnum] == ['ltr', 'internal', 'ltr']:
            category = 'prototype'
        elif reptypes[cnum] == ['ltr', 'internal'] or reptypes[cnum] == ['internal', 'ltr']:
            category = 'oneside'
        elif reptypes[cnum] == ['internal']:
            category = 'soloint'
        elif reptypes[cnum] == ['ltr']:
            category = 'sololtr'
        else:
            category = 'unusual'
        catcount[category] += 1

        # Create the parent (merged) annotation
        pstart = min(int(l[3]) for l in c)
        pend = max(int(l[4]) for l in c)
        strands = set(l[6] for l in c)
        if len(strands) == 1: pstrand = strands.pop()
        else: pstrand = '.'  # Strand is ambiguous
        pcov = utils.covered_len(c)
        pattr = 'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' % (
            cluster_id, category, len(c), (pend - pstart), pcov)
        pline = [
            c[0][0], 'merged', 'gene',
            str(pstart),
            str(pend), '.', pstrand, '.', pattr
        ]
        newlines.append(pline)

        for l in c:
            l[1] = category
            attr = dict(re.findall('(\S+)\s+"([\s\S]+?)";', l[8]))
            if 'gene_id' in attr: del attr['gene_id']
            if 'transcript_id' in attr: del attr['transcript_id']
            l[8] = 'gene_id "%s"; transcript_id "%s"; ' % (cluster_id,
                                                           cluster_id)
            l[8] = l[8] + ' '.join('%s "%s";' % (k, v)
                                   for k, v in attr.iteritems())
            newlines.append(l[:-1])

    for l in utils.sort_gtf(newlines):
        print >> args.outfile, '\t'.join(l)

    for cat in ['prototype', 'oneside', 'soloint', 'sololtr', 'unusual']:
        print >> sys.stderr, '%s:     %d' % (cat, catcount[cat])
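
utils.covered_len supplies the pcov value above; a sketch assuming it returns the number of bases in the union of the cluster's intervals (1-based, inclusive GTF coordinates):

def covered_len(lines):
    # Sketch: sweep the sorted intervals, merging overlapping or bookended
    # ones, and sum the lengths of the merged blocks.
    ivs = sorted((int(l[3]), int(l[4])) for l in lines)
    if not ivs:
        return 0
    total = 0
    cur_s, cur_e = ivs[0]
    for s, e in ivs[1:]:
        if s > cur_e + 1:      # disjoint: flush the current block
            total += cur_e - cur_s + 1
            cur_s, cur_e = s, e
        else:                  # overlapping or bookended: extend it
            cur_e = max(cur_e, e)
    return total + (cur_e - cur_s + 1)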
Code example #12
def main(args):
    ltrtypes = defaultdict(list)
    gtf = utils.tab_line_gen(args.gtf)
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        if attrd['repType'] == 'ltr':
            ltrtypes[attrd['locus']].append(attrd['repName'])

    titer = utils.tab_line_gen(args.infile)
    header = titer.next()
    print >> args.outfile, '\t'.join(header + ['subfamily'])

    for row in titer:
        if row[0] in ltrtypes:
            subfam = ','.join(sorted(set(ltrtypes[row[0]])))
        else:
            subfam = ''

        print >> args.outfile, '\t'.join(row + [subfam])
Code example #13
def main(args):
    # Filehandle for rejected loci
    if args.reject_gtf is None:
        import os
        args.reject_gtf = open(os.devnull,'w')

    # Filtering parameters
    min_internal_pct = args.min_internal_pct * 100
    min_internal_bases = args.min_internal_bases
    
    # Load the internal GTF
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]
    model_lengths = utils.guess_rmsk_model_lengths(combined_gtf)
    
    loccounts = Counter()
    byloc = defaultdict(list)
    for g in combined_gtf:
        byloc[g.attr['locus']].append(g)
    
    if min_internal_pct > 0:
        print >>sys.stderr, "Removing loci matching less than %d percent of internal model..." % int(min_internal_pct)
    if min_internal_bases > 0:
        print >>sys.stderr, "Removing loci matching less than %d internal bases..." % min_internal_bases
    
    rejectflag = False
    for locid,locus in byloc.iteritems():
        spn = utils.get_span(locus)
        locus = [a for a in locus if a != spn]  # drop the spanning annotation
        category = spn.attr['category']
        # Cast defensively; attribute values may come back from the GTF as strings
        model_pct = float(spn.attr['model_pct'])
        model_cov = int(spn.attr['model_cov'])
        
        if model_pct >= min_internal_pct and model_cov >= min_internal_bases:
            loccounts[category] += 1
            outh = args.outfile
        else:
            if not rejectflag:
                print >>sys.stderr, 'Removed loci:'
                print >>sys.stderr, '%-18s%-6s%-6s%s' % ('locus','bp','pct','category')
                rejectflag = True
            loccounts['rejected'] += 1
            print >>sys.stderr, '%-18s%-6d%-6.1f%s' % (locid, model_cov, model_pct, category)
            outh = args.reject_gtf
        
        print >>outh, '### %s ###' % locid
        print >>outh, spn
        print >>outh, '\n'.join(str(_) for _ in sorted(locus,key=lambda x:x.start))            
    
    if not rejectflag:
        print >>sys.stderr, 'All passed filter.'

    print >>sys.stderr, 'Summary:'
    for cat in ['internal','prototype','oneside','unusual','rejected']:
        print >>sys.stderr, '%s%d' % (cat.ljust(20), loccounts[cat])
Code example #14
def main(args):
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]

    # Find overlaps within the span features
    overlap_groups = utils.find_overlaps([g for g in combined_gtf if g.feature.startswith('span')])
    overlap_groups = {k:v for k,v in overlap_groups.iteritems() if len(v) > 1}
    print >>sys.stderr, "Found %d groups with conflict." % len(overlap_groups)    
    
    # Resolve commands
    if args.resolve_file is not None:
        resolve_cmds = json.load(args.resolve_file)
    else:
        if args.resolve is not None:
            # Resolve commands were provided as command-line argument
            resolve_cmds = args.resolve
        else:
            # Prompt user to enter resolve commands
            resolve_cmds = {}
            for groupid in sorted(overlap_groups.keys(), key=lambda x:int(x)):
                ogroup = overlap_groups[groupid]
                print >>sys.stderr, utils.groupstr(groupid, ogroup)                        
                concmd = utils.prompt_cmd()
                resolve_cmds[groupid] = concmd
        
        print >>sys.stderr, 'Commands for resolving conflicts (JSON):\n'
        print >>sys.stderr, json.dumps(resolve_cmds)

    # Resolve the conflicts
    byloc = defaultdict(list)
    for g in combined_gtf:
        byloc[g.attr['locus']].append(g)
    
    for groupid, ogroup in overlap_groups.iteritems():
        print >>sys.stderr, utils.groupstr(groupid, ogroup)    
        locids = [a.attr['locus'] for a in ogroup]
        # Temporarily remove the loci
        fulllocs = {}
        for locid in locids:
            print >>sys.stderr, '\tRemoving %s' % locid      
            fulllocs[locid] = byloc.pop(locid)
        assert groupid in resolve_cmds
        newlocs = resolve(resolve_cmds[groupid], ogroup, fulllocs)
        for newlocid, newlocus in newlocs.iteritems():
            print >>sys.stderr, '\tInserting %s' % newlocid
            # print >>sys.stderr, '\n'.join(str(_) for _ in newlocus)
        byloc.update(newlocs)

    for locid,locus in byloc.iteritems():
        print >>args.outfile, '### %s ###' % locid
        print >>args.outfile, '\n'.join(str(_) for _ in sorted(locus,key=lambda x:x.start))
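
Because the interactively collected commands are echoed to stderr as JSON, a session can be replayed: save that dump to a file and pass it back through the resolve-file argument on the next run. The command grammar itself is defined by prompt_cmd() and resolve(), which are not shown here, so all that can be said about the mapping is its shape: {"<group id>": <command>}.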
Code example #15
def main(args):
    discard = set()
    discard_fh = open(args.discard, 'w')

    lines = utils.tab_line_gen(args.infile)
    for l in lines:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";',l[8]))
        if l[1] == 'merged':
            cov = int(attrd['cov'])
            if cov < args.threshold:
                discard.add(attrd['name'])
                print >>discard_fh, '\t'.join(l)
            else:
                print >>args.outfile, '\t'.join(l)            
        else:
            if attrd['gene_id'] in discard:
                print >>discard_fh, '\t'.join(l)
            else:
                print >>args.outfile, '\t'.join(l)
Code example #16
def main(args):
    ### Read the GTF file ################################################################
    gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]
    
    ### Correct model coordinates ########################################################
    # The repStart, repEnd, and repLeft attributes downloaded from the UCSC rmsk database
    # do not always imply the same model length. Here we guess the correct model
    # length, then correct each record.
    mlen = utils.guess_rmsk_model_lengths(gtf)
    print >>sys.stderr, 'Model lengths:'
    print >>sys.stderr, '\n'.join('%s%d' % (k.ljust(16), mlen[k]) for k in sorted(mlen.keys()))
    correct_rmsk_model_coordinates(gtf,mlen)
    # Check that model coordinates are correct
    for g in gtf:
        if g.strand == '+':
            trueend = mlen[g.attr['repName']] + g.attr['repLeft']
        else:
            trueend = mlen[g.attr['repName']] + g.attr['repStart']
        assert trueend == g.attr['repEnd']
        
        print >>args.outfile, g
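
Concretely (illustrative numbers, not taken from the annotation): a plus-strand hit with repEnd 5452 and repLeft -539 implies a model length of 5452 - (-539) = 5991, and the assertion above then checks that 5991 + (-539) == 5452.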
Code example #17
File: sortgtf.py  Project: gwcbi/HERV_annotation
def main(parser):
    args = parser.parse_args()
    lines = utils.tab_line_gen(args.infile)
    for l in utils.sort_gtf(lines):
        print >>args.outfile, '\t'.join(l)
Code example #18
def main(args):
    ### Read the GTF file ################################################################
    print >> sys.stderr, 'Loading GTF: %s' % args.internal_file
    gtf = [
        GTFLine(l) for l in utils.tab_line_gen(open(args.internal_file, 'rU'))
    ]

    ### Get model lengths
    # mlen = calculate_model_lengths(gtf)
    # print mlen
    mlen = calculate_model_lengths2(gtf)
    print >> sys.stderr, 'Model lengths: %s' % mlen

    ### Correct the model coordinates ####################################################
    correct_model_coordinates(gtf, mlen)
    for g in gtf:
        if g.strand == '+':
            trueend = mlen[g.attr['repName']] + g.attr['repLeft']
        else:
            trueend = mlen[g.attr['repName']] + g.attr['repStart']
        assert trueend == g.attr['repEnd']

    ### Organize hits by chromosome ######################################################
    bychrom = defaultdict(list)
    for g in gtf:
        bychrom[g.chrom].append(g)

    ### List of HERV loci ################################################################
    print >> sys.stderr, 'Assembling HERV loci'
    all_locs = []

    ### Create HERV loci for plus strand #################################################
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            plus = [h for h in bychrom[chrom] if h.strand == '+']
            if not plus: continue
            plus.sort(key=lambda x: x.start)
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1))
            cur.internal.append(plus[0])
            for p1 in plus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p1.start - p0.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10:  ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme
                    insequence = p0.attr['repLeft'] < p1.attr['repLeft']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' %
                                    (args.prefix, len(all_locs) + 1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Create HERV loci for minus strand ################################################
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            minus = [h for h in bychrom[chrom] if h.strand == '-']
            if not minus: continue
            minus.sort(key=lambda x: x.end,
                       reverse=True)  # Sort in reverse order
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1))
            cur.internal.append(minus[0])
            for p1 in minus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p0.start - p1.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10:  ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme
                    insequence = p0.attr['repStart'] < p1.attr['repStart']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' %
                                    (args.prefix, len(all_locs) + 1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Add LTRs to HERV loci ############################################################
    print >> sys.stderr, 'Finding flanking LTRs'
    for loc in all_locs:
        loc.find_ltr(args.ltr_files, args.flank)
        loc.adjust_overlaps()

    print >> sys.stderr, "Initial counts:"
    print >> sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count)
                                   for cat, count in Counter(
                                       c.category()
                                       for c in all_locs).most_common())

    ### Filtering ########################################################################
    reject = set()
    if args.minpct > 0 or args.mincov > 0:
        print >> sys.stderr, "Removing loci with less than %d percent or %dbp model coverage" % (
            int(args.minpct * 100), args.mincov)
        for loc in all_locs:
            if loc.model_cov() < (mlen[loc.internal_name()] * args.minpct
                                  ) or loc.model_cov() < args.mincov:
                print >> sys.stderr, '%s\t%d\t%s' % (loc.id, loc.model_cov(),
                                                     loc.category())
                reject.add(loc)

        for rloc in reject:
            all_locs.remove(rloc)

        print >> sys.stderr, "After filtering:"
        print >> sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count)
                                       for cat, count in Counter(
                                           c.category()
                                           for c in all_locs).most_common())
        print >> sys.stderr, '%s%d' % ('Rejected'.ljust(20, ' '), len(reject))

    ### Deal with overlapping loci #######################################################
    # Create GTF with all_locs
    with open('tmp.gtf', 'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in all_locs):
            print >> outh, '\t'.join(g)

    # Cluster overlapping and bookended using bedtools
    p1 = Popen('bedtools cluster -i tmp.gtf',
               shell=True,
               stdout=PIPE,
               stderr=PIPE)
    out, err = p1.communicate()
    os.remove('tmp.gtf')

    # Parse bedtools output
    overlap_groups = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        overlap_groups[f[-1]].append(GTFLine(f[:9]))

    # Remove clusters with one
    for k in overlap_groups.keys():
        if len(overlap_groups[k]) == 1:
            del overlap_groups[k]

    print >> sys.stderr, "%d overlap groups" % len(overlap_groups)

    if args.igv_preview and len(overlap_groups) > 0:
        print >> sys.stderr, "Loading IGV"
        # Create file for IGV viewing
        with open('tmp.gtf', 'w') as outh:
            liter = utils.sort_gtf(
                chain.from_iterable(loc.each_gtf() for loc in all_locs))
            print >> outh, '\n'.join('\t'.join(_) for _ in liter)
        igv = IGV()
        igv.new()
        igv.genome('hg19')
        igv.load(
            os.path.join(os.getcwd(), '../other_sources/rmsk_LTR.hg19.gtf'))
        igv.load(os.path.join(os.getcwd(), 'tmp.gtf'))

    tandem = []
    for k in sorted(overlap_groups.keys(), key=lambda x: int(x)):
        ogroup = overlap_groups[k]
        if args.igv_preview:
            locus_str = '%s:%s-%s' % (ogroup[0].chrom,
                                      min(gl.start for gl in ogroup) - 5000,
                                      max(gl.end for gl in ogroup) + 5000)
            igv.goto(locus_str)
            igv.expand()

        # Get locus for each member of overlap group
        og_locus = {}
        for o in ogroup:
            tmp = [c for c in all_locs if c.id == o.attr['name']]
            assert len(tmp) == 1
            og_locus[o.attr['name']] = tmp[0]
        # Print out the model coverage
        for n, loc in og_locus.iteritems():
            print >> sys.stderr, '%s\t%d\t%s' % (n, loc.model_cov(),
                                                 loc.category())

        # Parse user input
        z = raw_input('Action to take: ').strip()
        if z == '': continue
        inputcmd = z.strip().split(' ')
        if inputcmd[0] == 'REJECT':
            if len(inputcmd) == 1:
                # Only max will be kept
                st = sorted([loc for n, loc in og_locus.iteritems()],
                            key=lambda x: x.model_cov(),
                            reverse=True)[1:]
                loc_ids = [_.id for _ in st]
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split(',')
            else:
                assert False
            for loc_id in loc_ids:
                reject.add(og_locus[loc_id])
        elif inputcmd[0] == 'TANDEM':
            if len(inputcmd) == 1:
                assert len(og_locus) == 2, 'More than 2 loci are present'
                tandem.append([loc for n, loc in og_locus.iteritems()])
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split('+')
                tandem.append([og_locus[loc_id] for loc_id in loc_ids])
            else:
                assert False
        elif inputcmd[0] == 'DIFF':
            n1, n2 = inputcmd[1].split('-')
            g1 = og_locus[n1]
            g2 = og_locus[n2]
            if g1.span()[0] < g2.span()[1]:
                g1.shorten(g2.span()[1] + 20, g1.span()[1])
            elif g1.span()[1] < g2.span()[0]:
                g1.shorten(g1.span()[0], g2.span()[0] - 20)
            else:
                print "no overlap!"
            print g1
        elif inputcmd[0] == 'IGNORE':
            continue
        else:
            assert False, 'Unknown command: "%s"' % inputcmd[0]

    # Remove rejected annotations
    for rloc in reject:
        if rloc in all_locs:
            all_locs.remove(rloc)

    # Create the tandem annotations
    for tgroup in tandem:
        tandem_loc = HERVLocus(id=tgroup[0].id)
        tandem_loc.internal = list(
            chain.from_iterable(loc.internal for loc in tgroup))
        if tandem_loc.strand() == '+':
            tandem_loc.internal.sort(key=lambda x: x.start)
        else:
            tandem_loc.internal.sort(key=lambda x: x.end, reverse=True)

        tandem_loc.find_ltr(args.ltr_files, 1000)
        tandem_loc.adjust_overlaps()
        tandem_loc.is_tandem = True
        all_locs.append(tandem_loc)
        # Remove from original
        for rloc in tgroup:
            all_locs.remove(rloc)

    print >> sys.stderr, "After overlap removal:"
    print >> sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count)
                                   for cat, count in Counter(
                                       c.category()
                                       for c in all_locs).most_common())
    print >> sys.stderr, '%s%d' % ('Rejected'.ljust(20, ' '), len(reject))
    if args.igv_preview and len(overlap_groups) > 0: os.remove('tmp.gtf')

    ### Sort loci ########################################################################
    bychrom = defaultdict(list)
    for loc in all_locs:
        bychrom[loc.chrom()].append(loc)

    final_locs = []
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            for loc in sorted(bychrom[chrom], key=lambda x: x.span()[0]):
                final_locs.append(loc)

    for i, loc in enumerate(final_locs):
        loc.id = '%s_%04d' % (args.prefix, i + 1)

    ### Rename loci according to cytoband #################################################
    # Create GTF with all_locs
    with open('tmp.gtf', 'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >> outh, '\t'.join(g)

    p1 = Popen(
        'bedtools intersect -wo -a tmp.gtf -b ../other_sources/cytoband.gtf',
        shell=True,
        stdout=PIPE,
        stderr=PIPE)
    out, err = p1.communicate()
    os.remove('tmp.gtf')

    byband = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        g1 = GTFLine(f[:9])
        g2 = GTFLine(f[9:-1])
        band = '%s%s' % (g2.chrom.strip('chr'), g2.attr['gene_id'])
        byband[band].append(g1)

    namemap = {}
    for band, glist in byband.iteritems():
        if len(glist) == 1:
            namemap[glist[0].attr['name']] = '%s_%s' % (args.prefix, band)
        else:
            glist.sort(key=lambda x: x.start)
            for i, gl in enumerate(glist):
                namemap[gl.attr['name']] = '%s_%s%s' % (args.prefix, band,
                                                        someletters[i])

    for loc in final_locs:
        loc.locus_name = namemap[loc.id]

    ### Create annotation files ##########################################################
    print >> sys.stderr, "Writing annotation files"
    with open('%s.gtf' % args.prefix, 'w') as outh:
        liter = utils.sort_gtf(
            chain.from_iterable(loc.each_gtf() for loc in final_locs))
        print >> outh, '\n'.join('\t'.join(_) for _ in liter)
        # for loc in final_locs:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())

    with open('%s_reject.gtf' % args.prefix, 'w') as outh:
        liter = utils.sort_gtf(
            chain.from_iterable(loc.each_gtf() for loc in reject))
        print >> outh, '\n'.join('\t'.join(_) for _ in liter)
        # for loc in reject:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())

    with open('%s_span.gtf' % args.prefix, 'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >> outh, '\t'.join(g)

    with open('%s_table.txt' % args.prefix, 'w') as outh:
        print >> outh, '\t'.join([
            'locus_name', 'id', 'category', 'chrom', 'start', 'end', 'strand',
            'nfeats', 'width', 'model_cov', 'ltr5_model', 'int_model',
            'ltr3_model'
        ])
        for loc in final_locs:
            mgtf = GTFLine(loc.span_gtf())
            row = [
                loc.locus_name,
                loc.id,
                loc.category(),
                mgtf.chrom,
                mgtf.start,
                mgtf.end,
                mgtf.strand,
                mgtf.attr['nfeats'],
                loc.width(),
                loc.model_cov(),
                loc.ltr_up_name(),
                loc.internal_name(),
                loc.ltr_down_name(),
            ]
            print >> outh, '\t'.join(str(_) for _ in row)

    ### Extract sequences ################################################################
    if args.get_sequences:
        print >> sys.stderr, "Extracting sequences"
        genome_fasta = args.genome_fasta  # '/Users/bendall/Projects/References/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa'
        genome = dict((s.id, s) for s in SeqIO.parse(genome_fasta, 'fasta'))

        with open('%s.full.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (loc.chrom(), loc.span()[0],
                                           loc.span()[1], loc.strand())
                print >> outh, '>%s|%s|%s' % (loc.locus_name, loc.category(),
                                              gcoord)
                print >> outh, str(loc.entire_sequence(genome).seq)

        with open('%s.internal.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (
                    loc.chrom(), min(p.start for p in loc.internal),
                    max(p.end for p in loc.internal), loc.strand())
                print >> outh, '>%s_int|%s|%s|%s' % (loc.locus_name,
                                                     loc.category(), gcoord,
                                                     loc.format_print_clust())
                print >> outh, str(loc.internal_sequence(genome).seq)

        with open('%s.5ltr.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_up_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (
                        loc.chrom(), min(p.start for p in loc.ltr_up),
                        max(p.end for p in loc.ltr_up), loc.strand())
                    print >> outh, '>%s_5LTR|%s|%s' % (
                        loc.locus_name, loc.ltr_up_name(), gcoord)
                    print >> outh, str(ltrseq.seq)

        with open('%s.3ltr.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_down_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (
                        loc.chrom(), min(p.start for p in loc.ltr_down),
                        max(p.end for p in loc.ltr_down), loc.strand())
                    print >> outh, '>%s_3LTR|%s|%s' % (
                        loc.locus_name, loc.ltr_down_name(), gcoord)
                    print >> outh, str(ltrseq.seq)

    ### IGV snapshots ####################################################################
    if args.igv_snapshot:
        print >> sys.stderr, "Taking IGV snapshots"
        igv = IGV()
        igv.new()
        igv.genome('hg19')
        igv.load(
            os.path.join(os.getcwd(), '../other_sources/rmsk_LTR.hg19.gtf'))
        if os.path.isdir('tmp'):
            for compare_gtf in glob('tmp/*.gtf'):
                igv.load(os.path.join(os.getcwd(), compare_gtf))

        igv.load(os.path.join(os.getcwd(), '%s.gtf' % args.prefix))
        igv.load(os.path.join(os.getcwd(), '%s_reject.gtf' % args.prefix))

        do_snapshots = True

        if do_snapshots:
            if not os.path.exists(os.path.join(os.getcwd(), 'snapshots')):
                os.mkdir(os.path.join(os.getcwd(), 'snapshots'))
            if not os.path.exists(os.path.join(os.getcwd(), 'reject')):
                os.mkdir(os.path.join(os.getcwd(), 'reject'))

            categories = ['prototype', 'oneside', 'internal']
            for cat in categories:
                if not os.path.exists(
                        os.path.join(os.getcwd(), 'snapshots/%s' % cat)):
                    os.mkdir(os.path.join(os.getcwd(), 'snapshots/%s' % cat))
                if not os.path.exists(
                        os.path.join(os.getcwd(), 'reject/%s' % cat)):
                    os.mkdir(os.path.join(os.getcwd(), 'reject/%s' % cat))

        for loc in final_locs:
            rc, lc = loc.span()
            locus_str = '%s:%d-%d' % (loc.chrom(), rc - 5000, lc + 5000)
            print >> sys.stderr, '%s\t%s\t%s' % (loc.locus_name,
                                                 loc.category(), locus_str)
            igv.goto(locus_str)
            igv.expand()
            if do_snapshots:
                igv.snapshotDirectory(
                    os.path.join(os.getcwd(),
                                 'snapshots/%s' % loc.category().strip('*')))
                igv.snapshot(filename='%s.png' % loc.locus_name)

        for loc in reject:
            rc, lc = loc.span()
            locus_str = '%s:%d-%d' % (loc.chrom(), rc - 5000, lc + 5000)
            print >> sys.stderr, '%s\t%s\t%s' % (loc.id, loc.category(),
                                                 locus_str)
            igv.goto(locus_str)
            igv.expand()
            if do_snapshots:
                igv.snapshotDirectory(
                    os.path.join(os.getcwd(),
                                 'reject/%s' % loc.category().strip('*')))
                igv.snapshot(filename='%s.png' % loc.id)
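
For reference, the interactive prompt in the overlap-resolution loop above accepts four commands: REJECT [id1,id2,...] drops the named loci (with no argument, every locus except the one with the best model coverage); TANDEM [id1+id2] fuses the group into a single tandem locus (the no-argument form is only allowed for two-locus groups); DIFF id1-id2 trims id1 to 20 bp beyond the boundary of id2 so the two no longer overlap; and IGNORE, or an empty line, leaves the group unchanged.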
Code example #19
def main(parser):
    args = parser.parse_args()
    lines = utils.tab_line_gen(args.infile)
    bystrand = {"+": [], "-": []}
    for l in lines:
        bystrand[l[6]].append(l)

    bystrand["+"] = list(utils.sort_gtf(bystrand["+"]))
    bystrand["-"] = list(utils.sort_gtf(bystrand["-"]))

    grouped = {"+": [], "-": []}
    for strand in ["+", "-"]:
        score = None
        chrom = None
        tmp = []
        for l in bystrand[strand]:
            if score is not None:
                if l[5] != score or l[0] != chrom:
                    grouped[strand].append(tmp)
                    tmp = []
            tmp.append(l)
            score = l[5]
            chrom = l[0]
        if tmp:
            grouped[strand].append(tmp)  # flush the final group, which the loop never appends

    gaplens = []
    merged = []
    for g in grouped["+"] + grouped["-"]:
        if len(g) == 1:
            merged.append(g[0])
        else:
            mygaps = []
            s = ""
            for i in range(len(g) - 1):
                gaplen = int(g[i + 1][3]) - int(g[i][4])
                s += "%s:%s-%s(%s)" % (g[i][0], g[i][3], g[i][4], g[i][6])
                s += " --- %d --- " % gaplen
                mygaps.append(gaplen)

            s += "%s:%s-%s(%s)" % (g[-1][0], g[-1][3], g[-1][4], g[-1][6])
            if any(g >= QUESTIONABLE for g in mygaps):
                continue
            else:
                gaplens.extend(mygaps)
            print >>sys.stderr, s
            # spos = min(int(l[3]) for l in g)
            # epos = max(int(l[4]) for l in g)
            # attrs = [dict(re.findall('(\S+)\s+"([\s\S]+?)";',l[8])) for l in g]
            # newline = [g[0][0], 'joined', 'exon', str(spos), str(epos), g[0][5], g[0][6], '.']
            # newattr = {'joined': ','.join(a['id'] for a in attrs),
            #            'repType': attrs[0]['repType'],
            #            }
            # newline.append(' '.join('%s "%s";' % (k,v) for k,v in newattr.iteritems()))
            # merged.append(newline)

    if gaplens:
        print >>sys.stderr, "min gap length:    %d" % min(gaplens)
        print >>sys.stderr, "mean gap length:   %d" % (float(sum(gaplens)) / len(gaplens))
        print >>sys.stderr, "median gap length: %d" % sorted(gaplens)[len(gaplens) / 2]
        print >>sys.stderr, "max gap length:    %d" % max(gaplens)
    else:
        print >>sys.stderr, "No gaps found"

    print >>args.outfile, "%d" % (max(gaplens) if gaplens else 0)
Code example #20
def main(args):
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]
    for g in combined_gtf:
        if args.fromAttr in g.attr:
            g.attr[args.toAttr] = g.attr[args.fromAttr]
        print >>args.outfile, g
Code example #21
File: names_HERV9.py  Project: gwcbi/HERV_annotation
#! /usr/bin/env python

import sys
import re
import utils
from collections import defaultdict

cyto = dict([l.strip('\n').split('\t') for l in open('tmp/cyto_name_map.txt','rU')])
lines = list(utils.tab_line_gen(open('filtered.hg19.gtf','rU')))

### Separate merged and unmerged lines
### Group unmerged lines according to gene
merged = []
unmerged = defaultdict(list)
for l in lines:
    attrs = dict(re.findall('(\S+)\s+"([\s\S]+?)";',l[8]))
    if l[1]=='merged':
        merged.append((attrs['name'],l))
    else:
        unmerged[attrs['gene_id']].append(l)

sdata = defaultdict(dict)
for mname,ml in merged:
    attrs = dict(re.findall('(\S+)\s+"([\s\S]+?)";',ml[8]))
    sdata[mname]['category'] = attrs['category']
    sdata[mname]['band'] = cyto[mname]
    sdata[mname]['id'] = mname
    sdata[mname]['start'] = int(ml[3])
    sdata[mname]['chrom'] = ml[0]
    sdata[mname]['alias'] = ''
コード例 #28
0
def main(args):
    ### Read the GTF file ################################################################
    print >>sys.stderr, 'Loading GTF: %s' % args.internal_file
    gtf = [GTFLine(l) for l in utils.tab_line_gen(open(args.internal_file,'rU'))]
    
    ### Get model lengths
    # mlen = calculate_model_lengths(gtf)
    # print mlen
    mlen = calculate_model_lengths2(gtf)
    print >>sys.stderr, 'Model lengths: %s' %  mlen

    ### Correct the model coordinates ####################################################
    correct_model_coordinates(gtf,mlen)
    for g in gtf:
        if g.strand == '+':
            trueend = mlen[g.attr['repName']] + g.attr['repLeft']
        else:
            trueend = mlen[g.attr['repName']] + g.attr['repStart']
        assert trueend == g.attr['repEnd']

    ### Organize hits by chromosome ######################################################
    bychrom = defaultdict(list)
    for g in gtf:
        bychrom[g.chrom].append(g)

    ### List of HERV loci ################################################################
    print >>sys.stderr, 'Assembling HERV loci'
    all_locs = []

    ### Create HERV loci for plus strand #################################################
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            plus = [h for h in bychrom[chrom] if h.strand == '+']
            if not plus: continue
            plus.sort(key=lambda x: x.start)
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1))
            cur.internal.append(plus[0])
            for p1 in plus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p1.start - p0.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10: ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme 
                    insequence = p0.attr['repLeft'] < p1.attr['repLeft']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Create HERV loci for minus strand ################################################
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            minus = [h for h in bychrom[chrom] if h.strand == '-']
            if not minus: continue
            minus.sort(key=lambda x: x.end, reverse=True) # Sort in reverse order
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1))
            cur.internal.append(minus[0])
            for p1 in minus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p0.start - p1.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10: ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme 
                    insequence = p0.attr['repStart'] < p1.attr['repStart']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Add LTRs to HERV loci ############################################################
    print >>sys.stderr, 'Finding flanking LTRs'    
    for loc in all_locs:
        loc.find_ltr(args.ltr_files, args.flank)
        loc.adjust_overlaps()
    
    print >>sys.stderr, "Initial counts:"
    print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20,' '),count) for cat,count in Counter(c.category() for c in all_locs).most_common())

    ### Filtering ########################################################################
    reject = set()
    if args.minpct > 0 or args.mincov > 0:
        print >>sys.stderr, "Removing loci with less than %d percent or %dbp model coverage" % (int(args.minpct*100), args.mincov)
        for loc in all_locs:
            if loc.model_cov() < (mlen[loc.internal_name()] * args.minpct) or loc.model_cov() < args.mincov:
                print >>sys.stderr, '%s\t%d\t%s' % (loc.id, loc.model_cov(), loc.category())
                reject.add(loc)
        
        for rloc in reject:
            all_locs.remove(rloc)
        
        print >>sys.stderr, "After filtering:"
        print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20,' '),count) for cat,count in Counter(c.category() for c in all_locs).most_common())
        print >>sys.stderr, '%s%d' % ('Rejected'.ljust(20,' '), len(reject))


    ### Deal with overlapping loci #######################################################
    # Create GTF with all_locs
    with open('tmp.gtf','w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in all_locs):
            print >>outh, '\t'.join(g)

    # Cluster overlapping and bookended using bedtools
    p1 = Popen('bedtools cluster -i tmp.gtf', shell=True, stdout=PIPE, stderr=PIPE)
    out,err = p1.communicate()
    os.remove('tmp.gtf')

    # Parse bedtools output
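    # bedtools cluster appends the assigned cluster ID as the final column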
    overlap_groups = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        overlap_groups[f[-1]].append(GTFLine(f[:9]))
    
    # Remove clusters with only one member
    for k in overlap_groups.keys():
        if len(overlap_groups[k]) == 1:
            del overlap_groups[k]
    
    print >>sys.stderr, "%d overlap groups" % len(overlap_groups)
    
    if args.igv_preview and len(overlap_groups)>0:
        print >>sys.stderr, "Loading IGV"
        # Create file for IGV viewing
        with open('tmp.gtf','w') as outh:
            liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in all_locs))
            print >>outh, '\n'.join('\t'.join(_) for _ in liter)
        igv = IGV()
        igv.new()
        igv.genome('hg19')
        igv.load(os.path.join(os.getcwd(),'../other_sources/rmsk_LTR.hg19.gtf'))
        igv.load(os.path.join(os.getcwd(),'tmp.gtf'))

    tandem = []
    for k in sorted(overlap_groups.keys(), key=lambda x:int(x)):
        ogroup = overlap_groups[k]
        if args.igv_preview:
            locus_str = '%s:%s-%s' % (ogroup[0].chrom, min(gl.start for gl in ogroup)-5000, max(gl.end for gl in ogroup)+5000)
            igv.goto(locus_str)
            igv.expand()
        
        # Get locus for each member of overlap group
        og_locus = {}
        for o in ogroup:
            tmp = [c for c in all_locs if c.id == o.attr['name']]
            assert len(tmp)==1
            og_locus[o.attr['name']] = tmp[0]
        # Print out the model coverage
        for n,loc in og_locus.iteritems():
            print >>sys.stderr, '%s\t%d\t%s' % (n, loc.model_cov(), loc.category())

        # Parse user input
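        # Supported commands:
        #   REJECT [id,...]   reject the listed loci (default: keep only the
        #                     locus with the highest model coverage)
        #   TANDEM [id+id]    merge the listed loci into a single tandem locus
        #   DIFF id1-id2      trim id1 so that it no longer overlaps id2
        #   IGNORE (or blank) leave the group unchanged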
        z = raw_input('Action to take: ').strip()
        if z == '': continue
        inputcmd = z.strip().split(' ')
        if inputcmd[0] == 'REJECT':
            if len(inputcmd) == 1:
                # Keep only the locus with the highest model coverage; reject the rest
                st = sorted([loc for n,loc in og_locus.iteritems()], key=lambda x:x.model_cov(), reverse=True)[1:]
                loc_ids = [_.id for _ in st]
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split(',')
            else:
                assert False
            for loc_id in loc_ids:
                reject.add(og_locus[loc_id])
        elif inputcmd[0] == 'TANDEM':
            if len(inputcmd) == 1:
                assert len(og_locus)==2, 'More than 2 loci are present'
                tandem.append([loc for n,loc in og_locus.iteritems()])
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split('+')
                tandem.append([og_locus[loc_id] for loc_id in loc_ids])
            else:
                assert False
        elif inputcmd[0] == 'DIFF':
            n1,n2 = inputcmd[1].split('-')
            g1 = og_locus[n1]
            g2 = og_locus[n2]
            if g1.span()[0] < g2.span()[1]:
                g1.shorten(g2.span()[1]+20, g1.span()[1])
            elif g1.span()[1] < g2.span()[0]:
                g1.shorten(g1.span()[0], g2.span()[0]-20)
            else:
                print "no overlap!"
            print g1
        elif inputcmd[0] == 'IGNORE':
            continue
        else:
            assert False, 'Unknown command: "%s"' % inputcmd[0]

    # Remove rejected annotations
    for rloc in reject:
        if rloc in all_locs:
            all_locs.remove(rloc)
    
    # Create the tandem annotations
    for tgroup in tandem:
        tandem_loc = HERVLocus(id=tgroup[0].id)
        tandem_loc.internal = list(chain.from_iterable(loc.internal for loc in tgroup))
        if tandem_loc.strand() == '+':
            tandem_loc.internal.sort(key=lambda x:x.start)
        else:
            tandem_loc.internal.sort(key=lambda x:x.end, reverse=True)
    
        tandem_loc.find_ltr(args.ltr_files, 1000)
        tandem_loc.adjust_overlaps()
        tandem_loc.is_tandem = True
        all_locs.append(tandem_loc)
        # Remove from original
        for rloc in tgroup:
            all_locs.remove(rloc)
    
    print >>sys.stderr, "After overlap removal:"
    print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20,' '),count) for cat,count in Counter(c.category() for c in all_locs).most_common())
    print >>sys.stderr, '%s%d' % ('Rejected'.ljust(20,' '), len(reject))
    if args.igv_preview and len(overlap_groups)>0: os.remove('tmp.gtf')
    
    ### Sort loci ########################################################################
    bychrom = defaultdict(list)
    for loc in all_locs:
        bychrom[loc.chrom()].append(loc)
    
    final_locs = []
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            for loc in sorted(bychrom[chrom], key=lambda x:x.span()[0]):
                final_locs.append(loc)
    
    for i,loc in enumerate(final_locs):
        loc.id = '%s_%04d' % (args.prefix, i+1)

    ### Rename loci according to cytoband #################################################
    # Create GTF with all_locs
    with open('tmp.gtf','w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >>outh, '\t'.join(g)    

    p1 = Popen('bedtools intersect -wo -a tmp.gtf -b ../other_sources/cytoband.gtf', shell=True, stdout=PIPE, stderr=PIPE)
    out,err = p1.communicate()
    os.remove('tmp.gtf')

    byband = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        g1 = GTFLine(f[:9])
        g2 = GTFLine(f[9:-1])
        band = '%s%s' % (g2.chrom.strip('chr'),g2.attr['gene_id'])
        byband[band].append(g1)

    namemap = {}
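    # A band with a single locus keeps the plain band name; multiple loci on
    # the same band get letter suffixes (someletters is assumed to be a
    # module-level sequence of suffix characters)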
    for band,glist in byband.iteritems():
        if len(glist) == 1:
            namemap[glist[0].attr['name']] = '%s_%s' % (args.prefix, band)
        else:
            glist.sort(key=lambda x:x.start)
            for i,gl in enumerate(glist):
                namemap[gl.attr['name']] = '%s_%s%s' % (args.prefix, band, someletters[i])

    for loc in final_locs:
        loc.locus_name = namemap[loc.id]

    ### Create annotation files ##########################################################
    print >>sys.stderr, "Writing annotation files"
    with open('%s.gtf' % args.prefix,'w') as outh:
        liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in final_locs))
        print >>outh, '\n'.join('\t'.join(_) for _ in liter)
        # for loc in final_locs:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())

    with open('%s_reject.gtf' % args.prefix,'w') as outh:
        liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in reject))
        print >>outh, '\n'.join('\t'.join(_) for _ in liter)        
        # for loc in reject:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())
    
    with open('%s_span.gtf' % args.prefix,'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >>outh, '\t'.join(g) 
    
    with open('%s_table.txt' % args.prefix,'w') as outh:
        print >>outh, '\t'.join(['locus_name','id','category','chrom','start','end','strand','nfeats','width','model_cov','ltr5_model','int_model','ltr3_model'])
        for loc in final_locs:
            mgtf = GTFLine(loc.span_gtf())
            row = [loc.locus_name, loc.id, loc.category(),
                   mgtf.chrom, mgtf.start, mgtf.end, mgtf.strand,
                   mgtf.attr['nfeats'], loc.width(), loc.model_cov(),
                   loc.ltr_up_name(), loc.internal_name(), loc.ltr_down_name(),
                   ]
            print >>outh, '\t'.join(str(_) for _ in row)

    ### Extract sequences ################################################################
    if args.get_sequences:
        print >>sys.stderr, "Extracting sequences"
        genome_fasta = args.genome_fasta # '/Users/bendall/Projects/References/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa'
        genome = dict((s.id,s) for s in SeqIO.parse(genome_fasta,'fasta'))
        
        with open('%s.full.fasta' % args.prefix,'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (loc.chrom(), loc.span()[0], loc.span()[1], loc.strand())
                print >>outh, '>%s|%s|%s' % (loc.locus_name, loc.category(), gcoord)
                print >>outh, str(loc.entire_sequence(genome).seq)
        
        with open('%s.internal.fasta' % args.prefix,'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (loc.chrom(), min(p.start for p in loc.internal), max(p.end for p in loc.internal), loc.strand())
                print >>outh, '>%s_int|%s|%s|%s' % (loc.locus_name, loc.category(), gcoord, loc.format_print_clust())
                print >>outh, str(loc.internal_sequence(genome).seq)
        
        with open('%s.5ltr.fasta' % args.prefix,'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_up_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (loc.chrom(), min(p.start for p in loc.ltr_up), max(p.end for p in loc.ltr_up), loc.strand())
                    print >>outh, '>%s_5LTR|%s|%s' % (loc.locus_name, loc.ltr_up_name(), gcoord)
                    print >>outh, str(ltrseq.seq)
        
        with open('%s.3ltr.fasta' % args.prefix,'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_down_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (loc.chrom(), min(p.start for p in loc.ltr_down), max(p.end for p in loc.ltr_down), loc.strand())
                    print >>outh, '>%s_3LTR|%s|%s' % (loc.locus_name, loc.ltr_down_name(), gcoord)
                    print >>outh, str(ltrseq.seq)

    ### IGV snapshots ####################################################################
    if args.igv_snapshot:
            print >>sys.stderr, "Taking IGV snapshots"
            igv = IGV()
            igv.new()
            igv.genome('hg19')
            igv.load(os.path.join(os.getcwd(),'../other_sources/rmsk_LTR.hg19.gtf'))
            if os.path.isdir('tmp'):
                for compare_gtf in glob('tmp/*.gtf'):
                    igv.load(os.path.join(os.getcwd(), compare_gtf))
            
            igv.load(os.path.join(os.getcwd(),'%s.gtf' % args.prefix))
            igv.load(os.path.join(os.getcwd(),'%s_reject.gtf' % args.prefix))
            
            do_snapshots = True
            
            if do_snapshots:
                if not os.path.exists(os.path.join(os.getcwd(),'snapshots')):
                    os.mkdir(os.path.join(os.getcwd(),'snapshots'))
                if not os.path.exists(os.path.join(os.getcwd(),'reject')):
                    os.mkdir(os.path.join(os.getcwd(),'reject'))
                
                categories = ['prototype', 'oneside', 'internal']
                for cat in categories:
                    if not os.path.exists(os.path.join(os.getcwd(),'snapshots/%s' % cat)):
                        os.mkdir(os.path.join(os.getcwd(),'snapshots/%s' % cat))    
                    if not os.path.exists(os.path.join(os.getcwd(),'reject/%s' % cat)):
                        os.mkdir(os.path.join(os.getcwd(),'reject/%s' % cat))
                        
            for loc in final_locs:
                rc,lc = loc.span()
                locus_str = '%s:%d-%d' % (loc.chrom(), rc-5000, lc+5000)
                print >>sys.stderr, '%s\t%s\t%s' % (loc.locus_name, loc.category(), locus_str)
                igv.goto(locus_str)
                igv.expand()
                if do_snapshots:
                    igv.snapshotDirectory(os.path.join(os.getcwd(),'snapshots/%s' % loc.category().strip('*') ))
                    igv.snapshot(filename='%s.png' % loc.locus_name)
            
            for loc in reject:
                rc,lc = loc.span()
                locus_str = '%s:%d-%d' % (loc.chrom(), rc-5000, lc+5000)
                print >>sys.stderr, '%s\t%s\t%s' % (loc.id, loc.category(), locus_str)
                igv.goto(locus_str)
                igv.expand()
                if do_snapshots:
                    igv.snapshotDirectory(os.path.join(os.getcwd(),'reject/%s' % loc.category().strip('*') ))
                    igv.snapshot(filename='%s.png' % loc.id)    
Code example #29
# Assumed imports for this snippet; utils is the project's own helper module
# and QUESTIONABLE is a gap-length threshold defined elsewhere in the original file.
import sys
import utils


def main(parser):
    args = parser.parse_args()
    lines = utils.tab_line_gen(args.infile)
    bystrand = {'+': [], '-': []}
    for l in lines:
        bystrand[l[6]].append(l)

    bystrand['+'] = list(utils.sort_gtf(bystrand['+']))
    bystrand['-'] = list(utils.sort_gtf(bystrand['-']))

    grouped = {'+': [], '-': []}
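    # Consecutive hits sharing a score on the same chromosome are grouped as
    # fragments of one element (the score column is assumed to carry a shared
    # fragment identifier)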
    for strand in ['+', '-']:
        score = None
        chrom = None
        tmp = []
        for l in bystrand[strand]:
            if score is not None:
                if l[5] != score or l[0] != chrom:
                    grouped[strand].append(tmp)
                    tmp = []
            tmp.append(l)
            score = l[5]
            chrom = l[0]
        # Flush the final group on each strand
        if tmp:
            grouped[strand].append(tmp)

    gaplens = []
    merged = []
    for g in grouped['+'] + grouped['-']:
        if len(g) == 1:
            merged.append(g[0])
        else:
            mygaps = []
            s = ''
            for i in range(len(g) - 1):
                gaplen = int(g[i + 1][3]) - int(g[i][4])
                s += '%s:%s-%s(%s)' % (g[i][0], g[i][3], g[i][4], g[i][6])
                s += ' --- %d --- ' % gaplen
                mygaps.append(gaplen)

            s += '%s:%s-%s(%s)' % (g[-1][0], g[-1][3], g[-1][4], g[-1][6])
            if any(gap >= QUESTIONABLE for gap in mygaps):
                continue
            else:
                gaplens.extend(mygaps)
            print >> sys.stderr, s
            # spos = min(int(l[3]) for l in g)
            # epos = max(int(l[4]) for l in g)
            # attrs = [dict(re.findall('(\S+)\s+"([\s\S]+?)";',l[8])) for l in g]
            # newline = [g[0][0], 'joined', 'exon', str(spos), str(epos), g[0][5], g[0][6], '.']
            # newattr = {'joined': ','.join(a['id'] for a in attrs),
            #            'repType': attrs[0]['repType'],
            #            }
            # newline.append(' '.join('%s "%s";' % (k,v) for k,v in newattr.iteritems()))
            # merged.append(newline)

    if gaplens:
        print >> sys.stderr, 'min gap length:    %d' % min(gaplens)
        print >> sys.stderr, 'mean gap length:   %d' % (float(sum(gaplens)) / len(gaplens))
        print >> sys.stderr, 'median gap length: %d' % sorted(gaplens)[len(gaplens) / 2]
        print >> sys.stderr, 'max gap length:    %d' % max(gaplens)
        # Write the threshold only when gaps were observed; max() on an empty
        # list raises ValueError
        print >> args.outfile, '%d' % max(gaplens)
    else:
        print >> sys.stderr, 'No gaps found'
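
A minimal sketch of how a main(parser) entry point like the one above might be
invoked; the flag names here are illustrative assumptions, not the project's
actual interface:

#! /usr/bin/env python
import sys
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Group RepeatMasker hits and report gap statistics')
    # Hypothetical arguments; the real script may define different flags
    parser.add_argument('--infile', type=argparse.FileType('rU'),
                        default=sys.stdin)
    parser.add_argument('--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    main(parser)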