Ejemplo n.º 1
0
def main(parser):
    """Flatten a GTF file into a tab-delimited table, one column per attribute.

    The eight fixed GTF fields become the leading columns; every attribute
    key found in the ninth field of any record becomes an additional column.
    Records that lack an attribute get an empty cell for it.

    Args:
        parser: argument parser whose ``parse_args()`` result provides
            ``infile`` (GTF input), ``outfile`` (writable file object) and
            ``keycol`` (name of a column to move to the front, if present).
    """
    args = parser.parse_args()

    base_columns = [
        'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand',
        'frame'
    ]
    # key "value"; pairs of the GTF attribute field.
    attr_re = re.compile(r'(\S+)\s+"([\s\S]+?)";')

    # Parse rows
    rows = []
    attr_columns = set()
    gtflines = utils.tab_line_gen(args.infile)
    for l in utils.sort_gtf(gtflines):
        rowd = dict(zip(base_columns, l[:8]))
        for k, v in attr_re.findall(l[8]):
            attr_columns.add(k)
            rowd[k] = v
        rows.append(rowd)

    # Set column headers.  Attribute columns are sorted so the header order
    # is reproducible from run to run (set iteration order is arbitrary).
    columns = base_columns + sorted(attr_columns)
    if args.keycol in columns:
        columns.remove(args.keycol)
        columns = [args.keycol] + columns

    # Print the table
    print >> args.outfile, '\t'.join(columns)
    for rowd in rows:
        print >> args.outfile, '\t'.join(rowd.get(c, '') for c in columns)
Ejemplo n.º 2
0
def main(args):
    """Write the GTF records of ``args.infile`` in sorted order.

    When ``args.chroms`` is truthy it is iterated line by line and the first
    tab-delimited field of each line is collected; the resulting list is
    handed to ``utils.sort_gtf`` as its second argument.  Otherwise ``None``
    is passed instead.  Sorted records are written tab-joined to
    ``args.outfile``.
    """
    chrom_names = None
    if args.chroms:
        chrom_names = []
        for entry in args.chroms:
            chrom_names.append(entry.strip('\n').split('\t')[0])

    records = utils.tab_line_gen(args.infile)
    for fields in utils.sort_gtf(records, chrom_names):
        print >>args.outfile, '\t'.join(fields)
Ejemplo n.º 3
0
def main(parser):
    """Categorize clustered GTF records and emit a parent line per cluster.

    Every cluster returned by ``utils.cluster_gtf`` is sorted by start
    coordinate, given an id of the form ``<prefix>_<NNNN>`` and labelled
    from its ``repType`` sequence (prototype / oneside / soloint / sololtr /
    unusual).  A synthetic ``merged`` / ``gene`` record spanning the cluster
    is produced, and each member record has its source column replaced by
    the category and its ``gene_id`` / ``transcript_id`` attributes replaced
    by the cluster id.  All records are written, sorted, to ``args.outfile``
    and the per-category counts are logged to stderr.
    """
    args = parser.parse_args()
    clustered = utils.cluster_gtf(utils.tab_line_gen(args.infile))
    reptypes = utils.by_attribute(clustered, 'repType')

    attr_pattern = r'(\S+)\s+"([\s\S]+?)";'
    tally = Counter()
    output = []
    for cluster_num, members in clustered.iteritems():
        members.sort(key=lambda rec: int(rec[3]))
        cluster_id = '%s_%04d' % (args.prefix, int(cluster_num))

        # Label the cluster from the ordered list of its repeat types.
        layout = reptypes[cluster_num]
        if layout == ['ltr', 'internal', 'ltr']:
            category = 'prototype'
        elif layout == ['ltr', 'internal'] or layout == ['internal', 'ltr']:
            category = 'oneside'
        elif layout == ['internal']:
            category = 'soloint'
        elif layout == ['ltr']:
            category = 'sololtr'
        else:
            category = 'unusual'
        tally[category] += 1

        # Parent record covering the whole cluster.
        span_start = min(int(rec[3]) for rec in members)
        span_end = max(int(rec[4]) for rec in members)
        strands = set(rec[6] for rec in members)
        # More than one strand among the members means it is ambiguous.
        parent_strand = strands.pop() if len(strands) == 1 else '.'
        covered = utils.covered_len(members)
        parent_attr = (
            'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' %
            (cluster_id, category, len(members), (span_end - span_start),
             covered))
        output.append([
            members[0][0], 'merged', 'gene',
            str(span_start),
            str(span_end), '.', parent_strand, '.', parent_attr
        ])

        # Member records: retag the source column and the id attributes.
        id_attrs = 'gene_id "%s"; transcript_id "%s"; ' % (cluster_id,
                                                           cluster_id)
        for rec in members:
            rec[1] = category
            attr = dict(re.findall(attr_pattern, rec[8]))
            attr.pop('gene_id', None)
            attr.pop('transcript_id', None)
            rec[8] = id_attrs + ' '.join('%s "%s";' % pair
                                         for pair in attr.iteritems())
            # The trailing column is not part of the emitted record.
            output.append(rec[:-1])

    for fields in utils.sort_gtf(output):
        print >> args.outfile, '\t'.join(fields)

    for name in ['prototype', 'oneside', 'soloint', 'sololtr', 'unusual']:
        print >> sys.stderr, '%s:     %d' % (name, tally[name])
Ejemplo n.º 4
0
def main(parser):
    """Assign a category to each GTF cluster and write merged annotations.

    Clusters come from ``utils.cluster_gtf``; each one is sorted by start,
    named ``<prefix>_<NNNN>``, categorized from its ordered ``repType``
    list, and summarized by a ``merged`` / ``gene`` parent record.  Member
    records get the category as their source column and have their
    ``gene_id`` / ``transcript_id`` attributes set to the cluster id.
    Output goes, sorted, to ``args.outfile``; category totals go to stderr.
    """
    def classify(layout):
        """Return the category label for an ordered list of repeat types."""
        if layout == ['ltr','internal','ltr']:
            return 'prototype'
        if layout == ['ltr','internal'] or layout == ['internal','ltr']:
            return 'oneside'
        if layout == ['internal']:
            return 'soloint'
        if layout == ['ltr']:
            return 'sololtr'
        return 'unusual'

    args = parser.parse_args()
    records = utils.tab_line_gen(args.infile)
    clustered = utils.cluster_gtf(records)
    reptypes = utils.by_attribute(clustered, 'repType')

    counts = Counter()
    emitted = []
    for key, group in clustered.iteritems():
        group.sort(key=lambda fields: int(fields[3]))
        cluster_id = '%s_%04d' % (args.prefix, int(key))

        category = classify(reptypes[key])
        counts[category] += 1

        # Build the parent (merged) annotation for the cluster.
        first_pos = min(int(fields[3]) for fields in group)
        last_pos = max(int(fields[4]) for fields in group)
        seen_strands = set(fields[6] for fields in group)
        if len(seen_strands) == 1:
            strand = seen_strands.pop()
        else:
            strand = '.'  # members disagree, so the strand is ambiguous
        coverage = utils.covered_len(group)
        summary = 'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' % (
            cluster_id, category, len(group), (last_pos - first_pos), coverage)
        emitted.append([group[0][0], 'merged', 'gene', str(first_pos),
                        str(last_pos), '.', strand, '.', summary])

        # Rewrite every member of the cluster.
        for fields in group:
            fields[1] = category
            attr = dict(re.findall(r'(\S+)\s+"([\s\S]+?)";', fields[8]))
            for stale in ('gene_id', 'transcript_id'):
                if stale in attr:
                    del attr[stale]
            leading = 'gene_id "%s"; transcript_id "%s"; ' % (cluster_id, cluster_id)
            trailing = ' '.join('%s "%s";' % (k, v) for k, v in attr.iteritems())
            fields[8] = leading + trailing
            # Emit the record without its final column.
            emitted.append(fields[:-1])

    for fields in utils.sort_gtf(emitted):
        print >>args.outfile, '\t'.join(fields)

    for label in ['prototype','oneside','soloint','sololtr','unusual']:
        print >>sys.stderr, '%s:     %d' % (label, counts[label])
Ejemplo n.º 5
0
def main(args):
    """Assemble HERV loci from internal-region hits and write annotations.

    Reads the GTF named by ``args.internal_file``, chains same-strand hits
    on each chromosome into ``HERVLocus`` objects, attaches flanking LTRs,
    optionally filters loci by model coverage, resolves overlapping loci
    interactively, renames loci by cytoband, and writes GTF / table output
    (plus optional FASTA files and IGV snapshots).

    Attributes read from ``args``: ``internal_file``, ``prefix``,
    ``longdist``, ``ltr_files``, ``flank``, ``minpct``, ``mincov``,
    ``igv_preview``, ``get_sequences``, ``genome_fasta``, ``igv_snapshot``.

    Side effects: creates and removes ``tmp.gtf`` in the current directory,
    runs ``bedtools`` through the shell, prompts on stdin (``raw_input``)
    once per overlap group, logs progress to stderr, and writes
    ``<prefix>.gtf``, ``<prefix>_reject.gtf``, ``<prefix>_span.gtf`` and
    ``<prefix>_table.txt``.
    """
    ### Read the GTF file ################################################################
    print >> sys.stderr, 'Loading GTF: %s' % args.internal_file
    gtf = [
        GTFLine(l) for l in utils.tab_line_gen(open(args.internal_file, 'rU'))
    ]

    ### Get model lengths
    # mlen = calculate_model_lengths(gtf)
    # print mlen
    mlen = calculate_model_lengths2(gtf)
    print >> sys.stderr, 'Model lengths: %s' % mlen

    ### Correct the model coordinates ####################################################
    correct_model_coordinates(gtf, mlen)
    # Sanity check: for every hit, repEnd must equal the model length plus
    # repLeft (plus strand) or plus repStart (minus strand).
    for g in gtf:
        if g.strand == '+':
            trueend = mlen[g.attr['repName']] + g.attr['repLeft']
        else:
            trueend = mlen[g.attr['repName']] + g.attr['repStart']
        assert trueend == g.attr['repEnd']

    ### Organize hits by chromosome ######################################################
    bychrom = defaultdict(list)
    for g in gtf:
        bychrom[g.chrom].append(g)

    ### List of HERV loci ################################################################
    print >> sys.stderr, 'Assembling HERV loci'
    all_locs = []

    ### Create HERV loci for plus strand #################################################
    # Hits are walked in ascending start order; a hit joins the current locus
    # when it is within 10 bp of the previous hit, or when repLeft increases
    # and the genomic gap is below args.longdist.  Otherwise a new locus starts.
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            plus = [h for h in bychrom[chrom] if h.strand == '+']
            if not plus: continue
            plus.sort(key=lambda x: x.start)
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1))
            cur.internal.append(plus[0])
            for p1 in plus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p1.start - p0.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10:  ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme
                    insequence = p0.attr['repLeft'] < p1.attr['repLeft']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' %
                                    (args.prefix, len(all_locs) + 1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Create HERV loci for minus strand ################################################
    # Mirror of the plus-strand pass: hits are walked in descending end order
    # and repStart is compared instead of repLeft.
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            minus = [h for h in bychrom[chrom] if h.strand == '-']
            if not minus: continue
            minus.sort(key=lambda x: x.end,
                       reverse=True)  # Sort in reverse order
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1))
            cur.internal.append(minus[0])
            for p1 in minus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p0.start - p1.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10:  ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme
                    insequence = p0.attr['repStart'] < p1.attr['repStart']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' %
                                    (args.prefix, len(all_locs) + 1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Add LTRs to HERV loci ############################################################
    print >> sys.stderr, 'Finding flanking LTRs'
    for loc in all_locs:
        loc.find_ltr(args.ltr_files, args.flank)
        loc.adjust_overlaps()

    print >> sys.stderr, "Initial counts:"
    print >> sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count)
                                   for cat, count in Counter(
                                       c.category()
                                       for c in all_locs).most_common())

    ### Filtering ########################################################################
    # `reject` collects every discarded locus (coverage filter here, manual
    # REJECT commands below); it is written to <prefix>_reject.gtf at the end.
    reject = set()
    if args.minpct > 0 or args.mincov > 0:
        print >> sys.stderr, "Removing loci with less than %d percent or %dbp model coverage" % (
            int(args.minpct * 100), args.mincov)
        for loc in all_locs:
            if loc.model_cov() < (mlen[loc.internal_name()] * args.minpct
                                  ) or loc.model_cov() < args.mincov:
                print >> sys.stderr, '%s\t%d\t%s' % (loc.id, loc.model_cov(),
                                                     loc.category())
                reject.add(loc)

        for rloc in reject:
            all_locs.remove(rloc)

        print >> sys.stderr, "After filtering:"
        print >> sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count)
                                       for cat, count in Counter(
                                           c.category()
                                           for c in all_locs).most_common())
        print >> sys.stderr, '%s%d' % ('Rejected'.ljust(20, ' '), len(reject))

    ### Deal with overlapping loci #######################################################
    # Create GTF with all_locs
    with open('tmp.gtf', 'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in all_locs):
            print >> outh, '\t'.join(g)

    # Cluster overlapping and bookended using bedtools
    # NOTE(review): neither `err` nor the process return code is checked, so
    # a bedtools failure only surfaces when the output is parsed below.
    p1 = Popen('bedtools cluster -i tmp.gtf',
               shell=True,
               stdout=PIPE,
               stderr=PIPE)
    out, err = p1.communicate()
    os.remove('tmp.gtf')

    # Parse bedtools output
    # The last column of each output line is used as the group key; the
    # first nine columns are parsed back into a GTFLine.
    overlap_groups = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        overlap_groups[f[-1]].append(GTFLine(f[:9]))

    # Remove clusters with one
    # (.keys() returns a list under Python 2, so deleting while looping is safe.)
    for k in overlap_groups.keys():
        if len(overlap_groups[k]) == 1:
            del overlap_groups[k]

    print >> sys.stderr, "%d overlap groups" % len(overlap_groups)

    if args.igv_preview and len(overlap_groups) > 0:
        print >> sys.stderr, "Loading IGV"
        # Create file for IGV viewing
        with open('tmp.gtf', 'w') as outh:
            liter = utils.sort_gtf(
                chain.from_iterable(loc.each_gtf() for loc in all_locs))
            print >> outh, '\n'.join('\t'.join(_) for _ in liter)
        igv = IGV()
        igv.new()
        igv.genome('hg19')
        igv.load(
            os.path.join(os.getcwd(), '../other_sources/rmsk_LTR.hg19.gtf'))
        igv.load(os.path.join(os.getcwd(), 'tmp.gtf'))

    # Interactive resolution: for each overlap group the user types one of
    #   REJECT [id,id,...]   TANDEM [id+id+...]   DIFF id-id   IGNORE
    # or presses return to skip the group.
    tandem = []
    for k in sorted(overlap_groups.keys(), key=lambda x: int(x)):
        ogroup = overlap_groups[k]
        if args.igv_preview:
            locus_str = '%s:%s-%s' % (ogroup[0].chrom,
                                      min(gl.start for gl in ogroup) - 5000,
                                      max(gl.end for gl in ogroup) + 5000)
            igv.goto(locus_str)
            igv.expand()

        # Get locus for each member of overlap group
        og_locus = {}
        for o in ogroup:
            tmp = [c for c in all_locs if c.id == o.attr['name']]
            assert len(tmp) == 1
            og_locus[o.attr['name']] = tmp[0]
        # Print out the model coverage
        for n, loc in og_locus.iteritems():
            print >> sys.stderr, '%s\t%d\t%s' % (n, loc.model_cov(),
                                                 loc.category())

        # Parse user input
        z = raw_input('Action to take: ').strip()
        if z == '': continue
        inputcmd = z.strip().split(' ')
        if inputcmd[0] == 'REJECT':
            if len(inputcmd) == 1:
                # Only max will be kept
                st = sorted([loc for n, loc in og_locus.iteritems()],
                            key=lambda x: x.model_cov(),
                            reverse=True)[1:]
                loc_ids = [_.id for _ in st]
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split(',')
            else:
                assert False
            for loc_id in loc_ids:
                reject.add(og_locus[loc_id])
        elif inputcmd[0] == 'TANDEM':
            if len(inputcmd) == 1:
                assert len(og_locus) == 2, 'More than 2 loci are present'
                tandem.append([loc for n, loc in og_locus.iteritems()])
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split('+')
                tandem.append([og_locus[loc_id] for loc_id in loc_ids])
            else:
                assert False
        elif inputcmd[0] == 'DIFF':
            # Trim the first-named locus (g1) with a 20 bp margin around g2.
            # NOTE(review): confirm these two conditions pick the intended
            # side of g1 to trim; the first one compares g1's start with
            # g2's end.
            n1, n2 = inputcmd[1].split('-')
            g1 = og_locus[n1]
            g2 = og_locus[n2]
            if g1.span()[0] < g2.span()[1]:
                g1.shorten(g2.span()[1] + 20, g1.span()[1])
            elif g1.span()[1] < g2.span()[0]:
                g1.shorten(g1.span()[0], g2.span()[0] - 20)
            else:
                print "no overlap!"
            print g1
        elif inputcmd[0] == 'IGNORE':
            continue
        else:
            assert False, 'Unknown command: "%s"' % inputcmd[0]

    # Remove rejected annotations
    for rloc in reject:
        if rloc in all_locs:
            all_locs.remove(rloc)

    # Create the tandem annotations
    for tgroup in tandem:
        # The combined locus reuses the id of the group's first member.
        tandem_loc = HERVLocus(id=tgroup[0].id)
        tandem_loc.internal = list(
            chain.from_iterable(loc.internal for loc in tgroup))
        if tandem_loc.strand() == '+':
            tandem_loc.internal.sort(key=lambda x: x.start)
        else:
            tandem_loc.internal.sort(key=lambda x: x.end, reverse=True)

        # Uses a fixed flank of 1000 here rather than args.flank.
        tandem_loc.find_ltr(args.ltr_files, 1000)
        tandem_loc.adjust_overlaps()
        tandem_loc.is_tandem = True
        all_locs.append(tandem_loc)
        # Remove from original
        # NOTE(review): list.remove raises ValueError if a member of this
        # group was already dropped by the reject step above.
        for rloc in tgroup:
            all_locs.remove(rloc)

    print >> sys.stderr, "After overlap removal:"
    print >> sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count)
                                   for cat, count in Counter(
                                       c.category()
                                       for c in all_locs).most_common())
    print >> sys.stderr, '%s%d' % ('Rejected'.ljust(20, ' '), len(reject))
    if args.igv_preview and len(overlap_groups) > 0: os.remove('tmp.gtf')

    ### Sort loci ########################################################################
    bychrom = defaultdict(list)
    for loc in all_locs:
        bychrom[loc.chrom()].append(loc)

    final_locs = []
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            for loc in sorted(bychrom[chrom], key=lambda x: x.span()[0]):
                final_locs.append(loc)

    # Renumber ids so they follow the final chromosome/position order.
    for i, loc in enumerate(final_locs):
        loc.id = '%s_%04d' % (args.prefix, i + 1)

    ### Rename loci according to cytoband #################################################
    # Create GTF with all_locs
    with open('tmp.gtf', 'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >> outh, '\t'.join(g)

    p1 = Popen(
        'bedtools intersect -wo -a tmp.gtf -b ../other_sources/cytoband.gtf',
        shell=True,
        stdout=PIPE,
        stderr=PIPE)
    out, err = p1.communicate()
    os.remove('tmp.gtf')

    # Each output line holds the locus record (first 9 columns) followed by
    # the cytoband record; the final column is dropped.
    byband = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        g1 = GTFLine(f[:9])
        g2 = GTFLine(f[9:-1])
        # NOTE(review): str.strip('chr') removes any leading/trailing 'c',
        # 'h' or 'r' characters, not just a literal "chr" prefix.
        band = '%s%s' % (g2.chrom.strip('chr'), g2.attr['gene_id'])
        byband[band].append(g1)

    # A band with a single locus is named <prefix>_<band>; when several loci
    # share a band, a suffix taken from `someletters` is added in start order.
    namemap = {}
    for band, glist in byband.iteritems():
        if len(glist) == 1:
            namemap[glist[0].attr['name']] = '%s_%s' % (args.prefix, band)
        else:
            glist.sort(key=lambda x: x.start)
            for i, gl in enumerate(glist):
                namemap[gl.attr['name']] = '%s_%s%s' % (args.prefix, band,
                                                        someletters[i])

    # NOTE(review): raises KeyError for any locus that produced no line in
    # the intersect output.
    for loc in final_locs:
        loc.locus_name = namemap[loc.id]

    ### Create annotation files ##########################################################
    print >> sys.stderr, "Writing annotation files"
    with open('%s.gtf' % args.prefix, 'w') as outh:
        liter = utils.sort_gtf(
            chain.from_iterable(loc.each_gtf() for loc in final_locs))
        print >> outh, '\n'.join('\t'.join(_) for _ in liter)
        # for loc in final_locs:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())

    with open('%s_reject.gtf' % args.prefix, 'w') as outh:
        liter = utils.sort_gtf(
            chain.from_iterable(loc.each_gtf() for loc in reject))
        print >> outh, '\n'.join('\t'.join(_) for _ in liter)
        # for loc in reject:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())

    with open('%s_span.gtf' % args.prefix, 'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >> outh, '\t'.join(g)

    with open('%s_table.txt' % args.prefix, 'w') as outh:
        # NOTE(review): the third header cell reads 'strand', but the third
        # value of each row below is loc.category().
        print >> outh, '\t'.join([
            'locus_name', 'id', 'strand', 'chrom', 'start', 'end', 'strand',
            'nfeats', 'width', 'model_cov', 'ltr5_model', 'int_model',
            'ltr3_model'
        ])
        for loc in final_locs:
            mgtf = GTFLine(loc.span_gtf())
            row = [
                loc.locus_name,
                loc.id,
                loc.category(),
                mgtf.chrom,
                mgtf.start,
                mgtf.end,
                mgtf.strand,
                mgtf.attr['nfeats'],
                loc.width(),
                loc.model_cov(),
                loc.ltr_up_name(),
                loc.internal_name(),
                loc.ltr_down_name(),
            ]
            print >> outh, '\t'.join(str(_) for _ in row)

    ### Extract sequences ################################################################
    if args.get_sequences:
        print >> sys.stderr, "Extracting sequences"
        genome_fasta = args.genome_fasta  # '/Users/bendall/Projects/References/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa'
        # Every sequence record of the FASTA is held in a dict keyed by id.
        genome = dict((s.id, s) for s in SeqIO.parse(genome_fasta, 'fasta'))

        with open('%s.full.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (loc.chrom(), loc.span()[0],
                                           loc.span()[1], loc.strand())
                print >> outh, '>%s|%s|%s' % (loc.locus_name, loc.category(),
                                              gcoord)
                print >> outh, str(loc.entire_sequence(genome).seq)

        with open('%s.internal.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (
                    loc.chrom(), min(p.start for p in loc.internal),
                    max(p.end for p in loc.internal), loc.strand())
                print >> outh, '>%s_int|%s|%s|%s' % (loc.locus_name,
                                                     loc.category(), gcoord,
                                                     loc.format_print_clust())
                print >> outh, str(loc.internal_sequence(genome).seq)

        # LTR FASTA files only contain loci for which a sequence is returned.
        with open('%s.5ltr.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_up_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (
                        loc.chrom(), min(p.start for p in loc.ltr_up),
                        max(p.end for p in loc.ltr_up), loc.strand())
                    print >> outh, '>%s_5LTR|%s|%s' % (
                        loc.locus_name, loc.ltr_up_name(), gcoord)
                    print >> outh, str(ltrseq.seq)

        with open('%s.3ltr.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_down_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (
                        loc.chrom(), min(p.start for p in loc.ltr_down),
                        max(p.end for p in loc.ltr_down), loc.strand())
                    print >> outh, '>%s_3LTR|%s|%s' % (
                        loc.locus_name, loc.ltr_down_name(), gcoord)
                    print >> outh, str(ltrseq.seq)

    ### IGV snapshots ####################################################################
    if args.igv_snapshot:
        print >> sys.stderr, "Taking IGV snapshots"
        igv = IGV()
        igv.new()
        igv.genome('hg19')
        igv.load(
            os.path.join(os.getcwd(), '../other_sources/rmsk_LTR.hg19.gtf'))
        if os.path.isdir('tmp'):
            for compare_gtf in glob('tmp/*.gtf'):
                igv.load(os.path.join(os.getcwd(), compare_gtf))

        igv.load(os.path.join(os.getcwd(), '%s.gtf' % args.prefix))
        igv.load(os.path.join(os.getcwd(), '%s_reject.gtf' % args.prefix))

        # Hard-coded switch: set to False to browse loci without saving images.
        do_snapshots = True

        if do_snapshots:
            if not os.path.exists(os.path.join(os.getcwd(), 'snapshots')):
                os.mkdir(os.path.join(os.getcwd(), 'snapshots'))
            if not os.path.exists(os.path.join(os.getcwd(), 'reject')):
                os.mkdir(os.path.join(os.getcwd(), 'reject'))

            # NOTE(review): only these three sub-directories are created;
            # confirm they cover every value loc.category() can return.
            categories = ['prototype', 'oneside', 'internal']
            for cat in categories:
                if not os.path.exists(
                        os.path.join(os.getcwd(), 'snapshots/%s' % cat)):
                    os.mkdir(os.path.join(os.getcwd(), 'snapshots/%s' % cat))
                if not os.path.exists(
                        os.path.join(os.getcwd(), 'reject/%s' % cat)):
                    os.mkdir(os.path.join(os.getcwd(), 'reject/%s' % cat))

        # Each view is padded by 5000 bp on both sides of the locus span.
        for loc in final_locs:
            rc, lc = loc.span()
            locus_str = '%s:%d-%d' % (loc.chrom(), rc - 5000, lc + 5000)
            print >> sys.stderr, '%s\t%s\t%s' % (loc.locus_name,
                                                 loc.category(), locus_str)
            igv.goto(locus_str)
            igv.expand()
            if do_snapshots:
                igv.snapshotDirectory(
                    os.path.join(os.getcwd(),
                                 'snapshots/%s' % loc.category().strip('*')))
                igv.snapshot(filename='%s.png' % loc.locus_name)

        for loc in reject:
            rc, lc = loc.span()
            locus_str = '%s:%d-%d' % (loc.chrom(), rc - 5000, lc + 5000)
            print >> sys.stderr, '%s\t%s\t%s' % (loc.id, loc.category(),
                                                 locus_str)
            igv.goto(locus_str)
            igv.expand()
            if do_snapshots:
                igv.snapshotDirectory(
                    os.path.join(os.getcwd(),
                                 'reject/%s' % loc.category().strip('*')))
                igv.snapshot(filename='%s.png' % loc.id)
def main(parser):
    """Report gap-length statistics for runs of same-score GTF records.

    Records from ``args.infile`` are split by strand ("+" / "-"), sorted
    with ``utils.sort_gtf`` and cut into runs of consecutive records that
    share chromosome (column 1) and score (column 6).  For every run with
    more than one record, the gap between neighbours is the next record's
    start minus the current record's end.  A run containing any gap of at
    least ``QUESTIONABLE`` is skipped entirely; every other run is echoed
    to stderr and its gaps are pooled.

    Min / mean / median / max of the pooled gaps are logged to stderr and
    the max is written to ``args.outfile``.  When no gaps were collected,
    only "No gaps found" is logged and nothing is written to
    ``args.outfile``.  The runs themselves are not merged or written out.

    Args:
        parser: argument parser whose ``parse_args()`` result provides
            ``infile`` and ``outfile``.

    Raises:
        KeyError: if a record's strand column is neither "+" nor "-".
    """
    # Imported locally so this function does not depend on the module header.
    from itertools import groupby

    args = parser.parse_args()
    bystrand = {"+": [], "-": []}
    for l in utils.tab_line_gen(args.infile):
        bystrand[l[6]].append(l)

    gaplens = []
    for strand in ["+", "-"]:
        ordered = utils.sort_gtf(bystrand[strand])
        # groupby yields every run of equal (chrom, score) keys, including
        # the final one, so no trailing run can be left unflushed.
        for _, run in groupby(ordered, key=lambda rec: (rec[0], rec[5])):
            group = list(run)
            if len(group) == 1:
                continue

            mygaps = [int(nxt[3]) - int(cur[4])
                      for cur, nxt in zip(group, group[1:])]
            if any(gap >= QUESTIONABLE for gap in mygaps):
                continue
            gaplens.extend(mygaps)

            # "chrom:start-end(strand) --- gap --- chrom:start-end(strand) ..."
            labels = ["%s:%s-%s(%s)" % (rec[0], rec[3], rec[4], rec[6])
                      for rec in group]
            pieces = [labels[0]]
            for gap, label in zip(mygaps, labels[1:]):
                pieces.append(" --- %d --- " % gap)
                pieces.append(label)
            print >>sys.stderr, "".join(pieces)

    if gaplens:
        print >>sys.stderr, "min gap length:    %d" % min(gaplens)
        print >>sys.stderr, "mean gap length:   %d" % (float(sum(gaplens)) / len(gaplens))
        print >>sys.stderr, "median gap length: %d" % sorted(gaplens)[len(gaplens) // 2]
        print >>sys.stderr, "max gap length:    %d" % max(gaplens)
        # Only emit a maximum when there is one; max() of an empty list raises.
        print >>args.outfile, "%d" % max(gaplens)
    else:
        print >>sys.stderr, "No gaps found"
Ejemplo n.º 7
0
def main(parser):
    """Write the GTF records of ``args.infile`` to ``args.outfile`` in sorted order.

    Records are read with ``utils.tab_line_gen``, ordered by
    ``utils.sort_gtf`` and written back one per line, tab-joined.
    """
    opts = parser.parse_args()
    records = utils.tab_line_gen(opts.infile)
    ordered = utils.sort_gtf(records)
    for fields in ordered:
        print >>opts.outfile, '\t'.join(fields)
def main(args):
    ### Read the GTF file ################################################################
    print >>sys.stderr, 'Loading GTF: %s' % args.internal_file
    gtf = [GTFLine(l) for l in utils.tab_line_gen(open(args.internal_file,'rU'))]
    
    ### Get model lengths
    # mlen = calculate_model_lengths(gtf)
    # print mlen
    mlen = calculate_model_lengths2(gtf)
    print >>sys.stderr, 'Model lengths: %s' %  mlen

    ### Correct the model coordinates ####################################################
    correct_model_coordinates(gtf,mlen)
    for g in gtf:
        if g.strand == '+':
            trueend = mlen[g.attr['repName']] + g.attr['repLeft']
        else:
            trueend = mlen[g.attr['repName']] + g.attr['repStart']
        assert trueend == g.attr['repEnd']

    ### Organize hits by chromosome ######################################################
    bychrom = defaultdict(list)
    for g in gtf:
        bychrom[g.chrom].append(g)

    ### List of HERV loci ################################################################
    print >>sys.stderr, 'Assembling HERV loci'
    all_locs = []

    ### Create HERV loci for plus strand #################################################
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            plus = [h for h in bychrom[chrom] if h.strand == '+']
            if not plus: continue
            plus.sort(key=lambda x: x.start)
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1))
            cur.internal.append(plus[0])
            for p1 in plus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p1.start - p0.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10: ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme 
                    insequence = p0.attr['repLeft'] < p1.attr['repLeft']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Create HERV loci for minus strand ################################################
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            minus = [h for h in bychrom[chrom] if h.strand == '-']
            if not minus: continue
            minus.sort(key=lambda x: x.end, reverse=True) # Sort in reverse order
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1))
            cur.internal.append(minus[0])
            for p1 in minus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p0.start - p1.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10: ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme 
                    insequence = p0.attr['repStart'] < p1.attr['repStart']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs)+1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Add LTRs to HERV loci ############################################################
    print >>sys.stderr, 'Finding flanking LTRs'    
    for loc in all_locs:
        loc.find_ltr(args.ltr_files, args.flank)
        loc.adjust_overlaps()
    
    print >>sys.stderr, "Initial counts:"
    print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20,' '),count) for cat,count in Counter(c.category() for c in all_locs).most_common())

    ### Filtering ########################################################################
    reject = set()
    if args.minpct > 0 or args.mincov > 0:
        print >>sys.stderr, "Removing loci with less than %d percent or %dbp model coverage" % (int(args.minpct*100), args.mincov)
        for loc in all_locs:
            if loc.model_cov() < (mlen[loc.internal_name()] * args.minpct) or loc.model_cov() < args.mincov:
                print >>sys.stderr, '%s\t%d\t%s' % (loc.id, loc.model_cov(), loc.category())
                reject.add(loc)
        
        for rloc in reject:
            all_locs.remove(rloc)
        
        print >>sys.stderr, "After filtering:"
        print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20,' '),count) for cat,count in Counter(c.category() for c in all_locs).most_common())
        print >>sys.stderr, '%s%d' % ('Rejected'.ljust(20,' '), len(reject))


    ### Deal with overlapping loci #######################################################
    # Create GTF with all_locs
    with open('tmp.gtf','w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in all_locs):
            print >>outh, '\t'.join(g)

    # Cluster overlapping and bookended using bedtools
    p1 = Popen('bedtools cluster -i tmp.gtf', shell=True, stdout=PIPE, stderr=PIPE)
    out,err = p1.communicate()
    os.remove('tmp.gtf')

    # Parse bedtools output
    overlap_groups = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        overlap_groups[f[-1]].append(GTFLine(f[:9]))
    
    # Remove clusters with one
    for k in overlap_groups.keys():
        if len(overlap_groups[k]) == 1:
            del overlap_groups[k]
    
    print >>sys.stderr, "%d overlap groups" % len(overlap_groups)
    
    if args.igv_preview and len(overlap_groups)>0:
        print >>sys.stderr, "Loading IGV"
        # Create file for IGV viewing
        with open('tmp.gtf','w') as outh:
            liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in all_locs))
            print >>outh, '\n'.join('\t'.join(_) for _ in liter)
        igv = IGV()
        igv.new()
        igv.genome('hg19')
        igv.load(os.path.join(os.getcwd(),'../other_sources/rmsk_LTR.hg19.gtf'))
        igv.load(os.path.join(os.getcwd(),'tmp.gtf'))

    tandem = []
    for k in sorted(overlap_groups.keys(), key=lambda x:int(x)):
        ogroup = overlap_groups[k]
        if args.igv_preview:
            locus_str = '%s:%s-%s' % (ogroup[0].chrom, min(gl.start for gl in ogroup)-5000, max(gl.end for gl in ogroup)+5000)
            igv.goto(locus_str)
            igv.expand()
        
        # Get locus for each member of overlap group
        og_locus = {}
        for o in ogroup:
            tmp = [c for c in all_locs if c.id == o.attr['name']]
            assert len(tmp)==1
            og_locus[o.attr['name']] = tmp[0]
        # Print out the model coverage
        for n,loc in og_locus.iteritems():
            print >>sys.stderr, '%s\t%d\t%s' % (n, loc.model_cov(), loc.category())

        # Parse user input
        z = raw_input('Action to take: ').strip()
        if z == '': continue
        inputcmd = z.strip().split(' ')
        if inputcmd[0] == 'REJECT':
            if len(inputcmd) == 1:
                # Only max will be kept
                st = sorted([loc for n,loc in og_locus.iteritems()], key=lambda x:x.model_cov(), reverse=True)[1:]
                loc_ids = [_.id for _ in st]
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split(',')
            else:
                assert False
            for loc_id in loc_ids:
                reject.add(og_locus[loc_id])
        elif inputcmd[0] == 'TANDEM':
            if len(inputcmd) == 1:
                assert len(og_locus)==2, 'More than 2 loci are present'
                tandem.append([loc for n,loc in og_locus.iteritems()])
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split('+')
                tandem.append([og_locus[loc_id] for loc_id in loc_ids])
            else:
                assert False
        elif inputcmd[0] == 'DIFF':
            n1,n2 = inputcmd[1].split('-')
            g1 = og_locus[n1]
            g2 = og_locus[n2]
            if g1.span()[0] < g2.span()[1]:
                g1.shorten(g2.span()[1]+20, g1.span()[1])
            elif g1.span()[1] < g2.span()[0]:
                g1.shorten(g1.span()[0], g2.span()[0]-20)
            else:
                print "no overlap!"
            print g1
        elif inputcmd[0] == 'IGNORE':
            continue
        else:
            assert False, 'Unknown command: "%s"' % inputcmd[0]

    # Remove rejected annotations
    for rloc in reject:
        if rloc in all_locs:
            all_locs.remove(rloc)
    
    # Create the tandem annotations
    for tgroup in tandem:
        tandem_loc = HERVLocus(id=tgroup[0].id)
        tandem_loc.internal = list(chain.from_iterable(loc.internal for loc in tgroup))
        if tandem_loc.strand() == '+':
            tandem_loc.internal.sort(key=lambda x:x.start)
        else:
            tandem_loc.internal.sort(key=lambda x:x.end, reverse=True)
    
        tandem_loc.find_ltr(args.ltr_files, 1000)
        tandem_loc.adjust_overlaps()
        tandem_loc.is_tandem = True
        all_locs.append(tandem_loc)
        # Remove from original
        for rloc in tgroup:
            all_locs.remove(rloc)
    
    print >>sys.stderr, "After overlap removal:"
    print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20,' '),count) for cat,count in Counter(c.category() for c in all_locs).most_common())
    print >>sys.stderr, '%s%d' % ('Rejected'.ljust(20,' '), len(reject))
    if args.igv_preview and len(overlap_groups)>0: os.remove('tmp.gtf')
    
    ### Sort loci ########################################################################
    bychrom = defaultdict(list)
    for loc in all_locs:
        bychrom[loc.chrom()].append(loc)
    
    final_locs = []
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            for loc in sorted(bychrom[chrom], key=lambda x:x.span()[0]):
                final_locs.append(loc)
    
    for i,loc in enumerate(final_locs):
        loc.id = '%s_%04d' % (args.prefix, i+1)

    ### Rename loci according to cytoband #################################################
    # Create GTF with all_locs
    with open('tmp.gtf','w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >>outh, '\t'.join(g)    

    p1 = Popen('bedtools intersect -wo -a tmp.gtf -b ../other_sources/cytoband.gtf', shell=True, stdout=PIPE, stderr=PIPE)
    out,err = p1.communicate()
    os.remove('tmp.gtf')

    byband = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        g1 = GTFLine(f[:9])
        g2 = GTFLine(f[9:-1])
        band = '%s%s' % (g2.chrom.strip('chr'),g2.attr['gene_id'])
        byband[band].append(g1)

    namemap = {}
    for band,glist in byband.iteritems():
        if len(glist) == 1:
            namemap[glist[0].attr['name']] = '%s_%s' % (args.prefix, band)
        else:
            glist.sort(key=lambda x:x.start)
            for i,gl in enumerate(glist):
                namemap[gl.attr['name']] = '%s_%s%s' % (args.prefix, band, someletters[i])

    for loc in final_locs:
        loc.locus_name = namemap[loc.id]

    ### Create annotation files ##########################################################
    print >>sys.stderr, "Writing annotation files"
    with open('%s.gtf' % args.prefix,'w') as outh:
        liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in final_locs))
        print >>outh, '\n'.join('\t'.join(_) for _ in liter)
        # for loc in final_locs:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())

    with open('%s_reject.gtf' % args.prefix,'w') as outh:
        liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in reject))
        print >>outh, '\n'.join('\t'.join(_) for _ in liter)        
        # for loc in reject:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())
    
    with open('%s_span.gtf' % args.prefix,'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >>outh, '\t'.join(g) 
    
    with open('%s_table.txt' % args.prefix,'w') as outh:
        print >>outh, '\t'.join(['locus_name','id','strand','chrom','start','end','strand','nfeats','width','model_cov','ltr5_model','int_model','ltr3_model'])
        for loc in final_locs:
            mgtf = GTFLine(loc.span_gtf())
            row = [loc.locus_name, loc.id, loc.category(),
                   mgtf.chrom, mgtf.start, mgtf.end, mgtf.strand,
                   mgtf.attr['nfeats'], loc.width(), loc.model_cov(),
                   loc.ltr_up_name(), loc.internal_name(), loc.ltr_down_name(),
                   ]
            print >>outh, '\t'.join(str(_) for _ in row)

    ### Extract sequences ################################################################
    if args.get_sequences:
        print >>sys.stderr, "Extracting sequences"
        genome_fasta = args.genome_fasta # '/Users/bendall/Projects/References/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa'
        genome = dict((s.id,s) for s in SeqIO.parse(genome_fasta,'fasta'))
        
        with open('%s.full.fasta' % args.prefix,'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (loc.chrom(), loc.span()[0], loc.span()[1], loc.strand())
                print >>outh, '>%s|%s|%s' % (loc.locus_name, loc.category(), gcoord)
                print >>outh, str(loc.entire_sequence(genome).seq)
        
        with open('%s.internal.fasta' % args.prefix,'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (loc.chrom(), min(p.start for p in loc.internal), max(p.end for p in loc.internal), loc.strand())
                print >>outh, '>%s_int|%s|%s|%s' % (loc.locus_name, loc.category(), gcoord, loc.format_print_clust())
                print >>outh, str(loc.internal_sequence(genome).seq)
        
        with open('%s.5ltr.fasta' % args.prefix,'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_up_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (loc.chrom(), min(p.start for p in loc.ltr_up), max(p.end for p in loc.ltr_up), loc.strand())
                    print >>outh, '>%s_5LTR|%s|%s' % (loc.locus_name, loc.ltr_up_name(), gcoord)
                    print >>outh, str(ltrseq.seq)
        
        with open('%s.3ltr.fasta' % args.prefix,'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_down_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (loc.chrom(), min(p.start for p in loc.ltr_down), max(p.end for p in loc.ltr_down), loc.strand())
                    print >>outh, '>%s_3LTR|%s|%s' % (loc.locus_name, loc.ltr_down_name(), gcoord)
                    print >>outh, str(ltrseq.seq)

    ### IGV snapshots ####################################################################
    if args.igv_snapshot:
            print >>sys.stderr, "Taking IGV snapshots"
            igv = IGV()
            igv.new()
            igv.genome('hg19')
            igv.load(os.path.join(os.getcwd(),'../other_sources/rmsk_LTR.hg19.gtf'))
            if os.path.isdir('tmp'):
                for compare_gtf in glob('tmp/*.gtf'):
                    igv.load(os.path.join(os.getcwd(), compare_gtf))
            
            igv.load(os.path.join(os.getcwd(),'%s.gtf' % args.prefix))
            igv.load(os.path.join(os.getcwd(),'%s_reject.gtf' % args.prefix))
            
            do_snapshots = True
            
            if do_snapshots:
                if not os.path.exists(os.path.join(os.getcwd(),'snapshots')):
                    os.mkdir(os.path.join(os.getcwd(),'snapshots'))
                if not os.path.exists(os.path.join(os.getcwd(),'reject')):
                    os.mkdir(os.path.join(os.getcwd(),'reject'))
                
                categories = ['prototype', 'oneside', 'internal']
                for cat in categories:
                    if not os.path.exists(os.path.join(os.getcwd(),'snapshots/%s' % cat)):
                        os.mkdir(os.path.join(os.getcwd(),'snapshots/%s' % cat))    
                    if not os.path.exists(os.path.join(os.getcwd(),'reject/%s' % cat)):
                        os.mkdir(os.path.join(os.getcwd(),'reject/%s' % cat))
                        
            for loc in final_locs:
                rc,lc = loc.span()
                locus_str = '%s:%d-%d' % (loc.chrom(), rc-5000, lc+5000)
                print >>sys.stderr, '%s\t%s\t%s' % (loc.locus_name, loc.category(), locus_str)
                igv.goto(locus_str)
                igv.expand()
                if do_snapshots:
                    igv.snapshotDirectory(os.path.join(os.getcwd(),'snapshots/%s' % loc.category().strip('*') ))
                    igv.snapshot(filename='%s.png' % loc.locus_name)
            
            for loc in reject:
                rc,lc = loc.span()
                locus_str = '%s:%d-%d' % (loc.chrom(), rc-5000, lc+5000)
                print >>sys.stderr, '%s\t%s\t%s' % (loc.id, loc.category(), locus_str)
                igv.goto(locus_str)
                igv.expand()
                if do_snapshots:
                    igv.snapshotDirectory(os.path.join(os.getcwd(),'reject/%s' % loc.category().strip('*') ))
                    igv.snapshot(filename='%s.png' % loc.id)    
Ejemplo n.º 9
0
def main(parser):
    """Report the gaps between consecutive annotations that share a score.

    Rows read from ``args.infile`` are split by strand (field 7), ordered with
    ``utils.sort_gtf``, and consecutive rows with the same score (field 6) and
    chromosome (field 1) are collected into groups. For each group of two or
    more rows, the gap between one row's end (field 5) and the next row's
    start (field 4) is computed. A group containing any gap of at least
    ``QUESTIONABLE`` is skipped entirely; every other group is echoed to
    stderr and its gaps are added to the summary.

    The min, mean, median and max of the accepted gaps are printed to stderr,
    and the max is written to ``args.outfile``. When no gaps were accepted,
    "No gaps found" is printed to stderr and nothing is written to
    ``args.outfile``.
    """
    args = parser.parse_args()

    # Only '+' and '-' are accepted; any other strand value raises KeyError.
    bystrand = {'+': [], '-': []}
    for l in utils.tab_line_gen(args.infile):
        bystrand[l[6]].append(l)

    # Collect runs of consecutive rows with identical (score, chrom),
    # plus-strand groups first, then minus-strand groups.
    groups = []
    for strand in ['+', '-']:
        current = []
        for l in utils.sort_gtf(bystrand[strand]):
            if current and (l[5] != current[-1][5] or l[0] != current[-1][0]):
                groups.append(current)
                current = []
            current.append(l)
        # Flush the trailing run; without this the last group of each strand
        # would be silently dropped.
        if current:
            groups.append(current)

    gaplens = []
    for group in groups:
        if len(group) == 1:
            # A lone row has no neighbour to measure a gap against.
            continue

        mygaps = [int(nxt[3]) - int(cur[4])
                  for cur, nxt in zip(group, group[1:])]
        if any(gap >= QUESTIONABLE for gap in mygaps):
            continue
        gaplens.extend(mygaps)

        # Render "chrom:start-end(strand) --- gap --- chrom:start-end(strand) ..."
        pieces = []
        for cur, gap in zip(group, mygaps):
            pieces.append('%s:%s-%s(%s)' % (cur[0], cur[3], cur[4], cur[6]))
            pieces.append(' --- %d --- ' % gap)
        last = group[-1]
        pieces.append('%s:%s-%s(%s)' % (last[0], last[3], last[4], last[6]))
        print >> sys.stderr, ''.join(pieces)

    if not gaplens:
        # There is no maximum to report, so max() must not be called on the
        # empty list.
        print >> sys.stderr, 'No gaps found'
        return

    print >> sys.stderr, 'min gap length:    %d' % min(gaplens)
    print >> sys.stderr, 'mean gap length:   %d' % (float(sum(gaplens)) /
                                                    len(gaplens))
    # Upper-middle element for even-length lists.
    print >> sys.stderr, 'median gap length: %d' % sorted(gaplens)[
        len(gaplens) // 2]
    print >> sys.stderr, 'max gap length:    %d' % max(gaplens)

    print >> args.outfile, '%d' % max(gaplens)