def main(args):
    if args.mapping:
        tdict = {}
        table = utils.tab_line_gen(args.mapping)
        header = table.next()
        for t in table:
            d = dict(zip(header, t))
            tdict[d[args.key]] = d
    else:
        # Lookup of any key in tdict returns a dictionary where the value of 'name' is ''
        tdict = defaultdict(lambda: {'name': ''})

    gtf = utils.tab_line_gen(args.infile)
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        assert 'locus' not in attrd
        if g[1] == 'merged':
            newname = tdict[attrd['name']][args.value]
            if newname == '':
                n1, n2 = attrd['name'].split('_')
                newname = '%s_%s_%s' % (n1, g[0].strip('chr'), n2)
            g[8] += ' locus "%s";' % newname
            print >>args.outfile, '\t'.join(g)
        else:
            newname = tdict[attrd['gene_id']][args.value]
            if newname == '':
                n1, n2 = attrd['gene_id'].split('_')
                newname = '%s_%s_%s' % (n1, g[0].strip('chr'), n2)
            g[8] += ' locus "%s";' % newname
            print >>args.outfile, '\t'.join(g)
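# The scripts in this collection all lean on a small `utils.tab_line_gen` helper and on
# the same attribute-parsing regex used inline above. The sketch below is an assumption
# about what that helper looks like (the real utils module is not shown here): it yields
# tab-split fields for each non-empty, non-comment line, which matches how the scripts
# index columns (g[0], g[8], ...) and re-read files containing '### locus ###' headers.
import re

def tab_line_gen(fh):
    """Yield each non-empty, non-comment line of a tab-delimited file as a list of fields."""
    for line in fh:
        line = line.rstrip('\n')
        if not line or line.startswith('#'):
            continue
        yield line.split('\t')

def parse_gtf_attributes(attr_string):
    """Parse a GTF attribute column (key "value"; pairs) into a dict, as the scripts do inline."""
    return dict(re.findall(r'(\S+)\s+"([\s\S]+?)";', attr_string))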
def main(args):
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]

    if args.prefix is None:
        prefix = Counter(g.attr['locus'].split('_')[0] for g in combined_gtf).most_common()[0][0]
    else:
        prefix = args.prefix

    namemap = {}
    if args.cytoband:
        byband = defaultdict(list)
        p1 = Popen('bedtools intersect -wo -a - -b %s' % args.cytoband,
                   shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        o, e = p1.communicate(input='\n'.join(str(g) for g in combined_gtf
                                              if g.feature.startswith('span')))
        for l in utils.tab_line_gen(o.strip('\n').split('\n')):
            g1 = GTFLine(l[:9])
            g2 = GTFLine(l[9:-1])
            band = '%s%s' % (g2.chrom.strip('chr'), g2.attr['gene_id'])
            byband[band].append(g1)
        for band, locs in byband.iteritems():
            if len(locs) == 1:
                namemap[locs[0].attr['locus']] = '%s_%s' % (prefix, band)
            else:
                locs.sort(key=lambda x: x.start)
                for i, loc in enumerate(locs):
                    namemap[loc.attr['locus']] = '%s_%s%s' % (prefix, band, someletters[i])
    else:
        for g in combined_gtf:
            namemap[g.attr['locus']] = g.attr['locus']

    for g in combined_gtf:
        g.attr['oLocus'] = g.attr['locus']
        g.attr['locus'] = namemap[g.attr['locus']]
        print g
def main(args):
    # Load the internal GTF
    combined_gtf = [GTFLine(l) for l in tab_line_gen(args.internalGTF)]
    combined_gtf = [g for g in combined_gtf if g.source != 'span_internal']

    # Load the LTR GTF
    for l in tab_line_gen(args.ltrGTF):
        l_b = GTFLine(l[10:])
        l_b.attr['locus'] = GTFLine(l[:9]).attr['locus']
        combined_gtf.append(l_b)

    byloc = defaultdict(list)
    for g in combined_gtf:
        byloc[g.attr['locus']].append(g)

    # for locid, ilocus in itertools.groupby(combined_gtf, key=lambda x: x.attr['locus']):
    for locid, locus in byloc.iteritems():
        locus = utils.remove_dups(locus)
        locus = utils.adjust_overlaps(locus)
        category = locus_category(locus)
        for a in locus:
            a.source = category
        spanning = utils.create_spanning(locus)
        spanning.attr = {'category': category, 'locus': locid}
        print >>args.outfile, '### %s ###' % locid
        print >>args.outfile, spanning
        print >>args.outfile, '\n'.join(str(_) for _ in sorted(locus, key=lambda x: x.start))
def main(args):
    # Load the internal GTF
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.internalGTF)]
    combined_gtf = [g for g in combined_gtf if not g.feature.startswith('span')]

    # Load the LTR GTF
    for l in utils.tab_line_gen(args.ltrGTF):
        l_b = GTFLine(l[-10:])
        l_b.attr['locus'] = GTFLine(l[:9]).attr['locus']
        combined_gtf.append(l_b)

    model_lengths = utils.guess_rmsk_model_lengths(combined_gtf)

    # Organize by locus
    byloc = defaultdict(list)
    for g in combined_gtf:
        byloc[g.attr['locus']].append(g)

    for locid, locus in byloc.iteritems():
        # Remove duplicate annotations
        locus = utils.remove_dups(locus)
        # Adjust overlaps
        locus = utils.adjust_overlaps(locus)
        # Determine category
        category = locus_category(locus)
        # Add information to all annotations
        strand = set([a.strand for a in locus])
        if len(strand) == 1 and '-' in strand:
            locus.sort(key=lambda x: x.end, reverse=True)
        else:
            locus.sort(key=lambda x: x.start)
        for i, a in enumerate(locus):
            a.source = category
            a.attr['exon_number'] = i + 1
        # Create spanning annotation
        spanning = utils.create_spanning(locus)
        # Calculate model coverage and percent
        internal_model = get_internal_model(locus)
        model_cov = calculate_internal_coverage(locus)
        model_pct = min(100, (float(model_cov) / model_lengths[internal_model]) * 100)
        spanning.attr = {'locus': locid,
                         'category': category,
                         'model_cov': model_cov,
                         'model_pct': '%.1f' % model_pct,
                         'exons': len(locus)}
        print >>args.outfile, '### %s ###' % locid
        print >>args.outfile, spanning
        print >>args.outfile, '\n'.join(str(_) for _ in sorted(locus, key=lambda x: x.start))
def main(parser):
    args = parser.parse_args()
    rows = []
    base_columns = ['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
    attr_columns = set()

    # Parse rows
    gtflines = utils.tab_line_gen(args.infile)
    for l in utils.sort_gtf(gtflines):
        rowd = dict(zip(base_columns, l[:8]))
        for k, v in re.findall('(\S+)\s+"([\s\S]+?)";', l[8]):
            attr_columns.add(k)
            rowd[k] = v
        rows.append(rowd)

    # Set column headers: the key column goes first
    columns = base_columns + list(attr_columns)
    if args.keycol in columns:
        columns.remove(args.keycol)
    columns = [args.keycol] + columns

    # Print the table
    print >>args.outfile, '\t'.join(columns)
    for rowd in rows:
        print >>args.outfile, '\t'.join([rowd[c] if c in rowd else '' for c in columns])
def main(args):
    ### Read the GTF file ###############################################################
    gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]
    bychrom = defaultdict(lambda: {'+': list(), '-': list()})
    for g in gtf:
        bychrom[g.chrom][g.strand].append(g)

    merged_hits = []
    for cchrom, strands in bychrom.iteritems():
        # Plus strand
        if len(strands['+']):
            strands['+'].sort(key=lambda x: x.start)
            cur = [strands['+'][0]]
            for g1 in strands['+'][1:]:
                g0 = cur[-1]
                # Genomic distance between hits
                gdist = g1.start - g0.end
                if gdist <= args.shortdist:
                    domerge = True
                else:
                    domerge = g0.attr['repLeft'] < g1.attr['repLeft']
                    domerge &= gdist < args.longdist
                if domerge:
                    cur.append(g1)
                else:
                    merged_hits.append(cur)
                    cur = [g1]
            merged_hits.append(cur)
        # Minus strand
        if len(strands['-']):
            strands['-'].sort(key=lambda x: x.end, reverse=True)
            cur = [strands['-'][0]]
            for g1 in strands['-'][1:]:
                g0 = cur[-1]
                # Genomic distance between hits
                gdist = g0.start - g1.end
                if gdist <= args.shortdist:
                    domerge = True
                else:
                    domerge = g0.attr['repStart'] < g1.attr['repStart']
                    domerge &= gdist < args.longdist
                if domerge:
                    cur.append(g1)
                else:
                    merged_hits.append(cur)
                    cur = [g1]
            merged_hits.append(cur)

    for i, cur in enumerate(merged_hits):
        locid = '%s_%04d' % (args.prefix, i + 1)
        spanning = utils.create_spanning(cur)
        spanning.attr['locus'] = locid
        for g in cur:
            g.attr['locus'] = locid
        print >>args.outfile, '### %s ###' % locid
        print >>args.outfile, spanning
        print >>args.outfile, '\n'.join(str(_) for _ in sorted(cur, key=lambda x: x.start))
def main(args):
    # Load GTF file
    gtf = utils.tab_line_gen(args.infile)

    # Skip lines not being merged
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        if 'name' in attrd and attrd['name'] == args.name:
            break
        else:
            print >>args.outfile, '\t'.join(g)

    # Get all lines for locus to be split
    mergelines = [g]
    sublines = []
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        if g[1] == 'merged' and attrd['name'] == args.name:
            mergelines.append(g)
        elif 'gene_id' in attrd and attrd['gene_id'] == args.name:
            sublines.append(g)
        else:
            break

    # Set the new category and names
    if args.category:
        category1, category2 = args.category.split(',')
    else:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        category1 = attrd['category']
        category2 = attrd['category']
    if args.newname:
        newname1, newname2 = args.newname.split(',')
    else:
        newname1 = '%s.1' % args.name
        newname2 = '%s.2' % args.name

    # Print out the edited lines
    sub1 = sublines[:args.split]
    print >>args.outfile, '\t'.join(make_mergeline(sub1, newname1, category1))
    for l in sub1:
        l[1] = category1
        l[8] = re.sub('gene_id "\S+";', 'gene_id "%s";' % newname1, l[8])
        l[8] = re.sub('transcript_id "\S+";', 'transcript_id "%s";' % newname1, l[8])
        print >>args.outfile, '\t'.join(l)
    sub2 = sublines[args.split:]
    print >>args.outfile, '\t'.join(make_mergeline(sub2, newname2, category2))
    for l in sub2:
        l[1] = category2
        l[8] = re.sub('gene_id "\S+";', 'gene_id "%s";' % newname2, l[8])
        l[8] = re.sub('transcript_id "\S+";', 'transcript_id "%s";' % newname2, l[8])
        print >>args.outfile, '\t'.join(l)

    # Resume printing the file
    print >>args.outfile, '\t'.join(g)
    for g in gtf:
        print >>args.outfile, '\t'.join(g)
def main(args):
    if args.chroms:
        chroms = [l.strip('\n').split('\t')[0] for l in args.chroms]
    else:
        chroms = None
    for l in utils.sort_gtf(utils.tab_line_gen(args.infile), chroms):
        print >>args.outfile, '\t'.join(l)
def main(args):
    gtf = [GTFLine(l).asdict() for l in utils.tab_line_gen(args.infile)]
    allkeys = set(chain.from_iterable(d.keys() for d in gtf))
    columns = GTFLine.GTFCOLS + GTFLine.ATTRORDER
    columns += [k for k in sorted(allkeys) if k not in columns]
    print >>args.outfile, '\t'.join(columns)
    for d in gtf:
        print >>args.outfile, '\t'.join(str(d[c]) if c in d else '' for c in columns)
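# GTFLine is used throughout but defined elsewhere. The minimal sketch below is an
# assumption about its interface based on how it is used in these scripts: it exposes the
# nine GTF columns, parses the attribute column into .attr, supports asdict(), and
# serializes back to a tab-delimited line via str(). GTFCOLS/ATTRORDER mirror the column
# handling in the table-conversion script above; the real class may differ in detail.
import re

class GTFLine(object):
    GTFCOLS = ['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
    ATTRORDER = ['gene_id', 'transcript_id', 'locus']

    def __init__(self, fields):
        (self.chrom, self.source, self.feature, start, end,
         self.score, self.strand, self.frame) = fields[:8]
        self.start, self.end = int(start), int(end)
        self.attr = dict(re.findall(r'(\S+)\s+"([\s\S]+?)";', fields[8]))

    def asdict(self):
        d = dict(zip(self.GTFCOLS, [self.chrom, self.source, self.feature, self.start,
                                    self.end, self.score, self.strand, self.frame]))
        d.update(self.attr)
        return d

    def __str__(self):
        attrstr = ' '.join('%s "%s";' % (k, v) for k, v in self.attr.items())
        fields = [self.chrom, self.source, self.feature, self.start, self.end,
                  self.score, self.strand, self.frame, attrstr]
        return '\t'.join(str(_) for _ in fields)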
def main(args):
    names = args.names.split(',')
    category = args.category
    gtf = utils.tab_line_gen(args.infile)

    # Skip lines not being merged
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        if 'name' in attrd and attrd['name'] in names:
            break
        else:
            print >>args.outfile, '\t'.join(g)

    mergelines = [g]
    sublines = []
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        if g[1] == 'merged' and attrd['name'] in names:
            mergelines.append(g)
        elif 'gene_id' in attrd and attrd['gene_id'] in names:
            sublines.append(g)
        else:
            break

    # Merge the merge lines
    new_merge = mergelines[0][:]
    nm_spos = min(int(m[3]) for m in mergelines)
    nm_epos = max(int(m[4]) for m in mergelines)
    nm_strand = set(m[6] for m in mergelines)
    all_attrd = [dict(re.findall('(\S+)\s+"([\s\S]+?)";', m[8])) for m in mergelines]
    nm_name = all_attrd[0]['name']
    nm_category = category if category is not None else all_attrd[0]['category']
    nm_nfeats = sum(int(d['nfeats']) for d in all_attrd)
    nm_length = (nm_epos - nm_spos)
    nm_cov = sum(int(d['cov']) for d in all_attrd)
    new_merge[3] = str(nm_spos)
    new_merge[4] = str(nm_epos)
    new_merge[6] = nm_strand.pop() if len(nm_strand) == 1 else '.'
    new_merge[8] = 'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' % (
        nm_name, nm_category, nm_nfeats, nm_length, nm_cov)
    print >>args.outfile, '\t'.join(new_merge)

    for l in sublines:
        l[1] = category if category is not None else all_attrd[0]['category']
        l[8] = re.sub('gene_id "\S+";', 'gene_id "%s";' % nm_name, l[8])
        l[8] = re.sub('transcript_id "\S+";', 'transcript_id "%s";' % nm_name, l[8])
        print >>args.outfile, '\t'.join(l)

    # Resume printing the file
    print >>args.outfile, '\t'.join(g)
    for g in gtf:
        print >>args.outfile, '\t'.join(g)
def main(parser):
    args = parser.parse_args()
    lines = utils.tab_line_gen(args.infile)
    clustered = utils.cluster_gtf(lines)
    reptypes = utils.by_attribute(clustered, 'repType')
    catcount = Counter()

    newlines = []
    for cnum, c in clustered.iteritems():
        c.sort(key=lambda x: int(x[3]))
        cluster_id = '%s_%04d' % (args.prefix, int(cnum))

        # Categorize cluster according to repeat types and orientation
        if reptypes[cnum] == ['ltr', 'internal', 'ltr']:
            category = 'prototype'
        elif reptypes[cnum] == ['ltr', 'internal'] or reptypes[cnum] == ['internal', 'ltr']:
            category = 'oneside'
        elif reptypes[cnum] == ['internal']:
            category = 'soloint'
        elif reptypes[cnum] == ['ltr']:
            category = 'sololtr'
        else:
            category = 'unusual'
        catcount[category] += 1

        # Create the parent (merged) annotation
        pstart = min(int(l[3]) for l in c)
        pend = max(int(l[4]) for l in c)
        strands = set(l[6] for l in c)
        if len(strands) == 1:
            pstrand = strands.pop()
        else:
            pstrand = '.'  # Strand is ambiguous
        pcov = utils.covered_len(c)
        pattr = 'name "%s"; category "%s"; nfeats "%d"; length "%d"; cov "%d";' % (
            cluster_id, category, len(c), (pend - pstart), pcov)
        pline = [c[0][0], 'merged', 'gene', str(pstart), str(pend), '.', pstrand, '.', pattr]
        newlines.append(pline)

        for l in c:
            l[1] = category
            attr = dict(re.findall('(\S+)\s+"([\s\S]+?)";', l[8]))
            if 'gene_id' in attr:
                del attr['gene_id']
            if 'transcript_id' in attr:
                del attr['transcript_id']
            l[8] = 'gene_id "%s"; transcript_id "%s"; ' % (cluster_id, cluster_id)
            l[8] = l[8] + ' '.join('%s "%s";' % (k, v) for k, v in attr.iteritems())
            newlines.append(l[:-1])

    for l in utils.sort_gtf(newlines):
        print >>args.outfile, '\t'.join(l)

    for cat in ['prototype', 'oneside', 'soloint', 'sololtr', 'unusual']:
        print >>sys.stderr, '%s: %d' % (cat, catcount[cat])
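# utils.covered_len(c) above reports how many bases of the cluster are actually covered by
# annotations, which can be smaller than (pend - pstart) when features have gaps or overlaps.
# A plausible implementation is sketched below (an assumption, not the actual utils code):
# merge the start/end intervals and sum the merged lengths.
def covered_len(lines):
    """Total bases covered by the union of GTF intervals (fields 4 and 5, 1-based inclusive)."""
    ivs = sorted((int(l[3]), int(l[4])) for l in lines)
    total = 0
    cur_s, cur_e = ivs[0]
    for s, e in ivs[1:]:
        if s <= cur_e + 1:
            # Overlapping or bookended: extend the current merged interval
            cur_e = max(cur_e, e)
        else:
            total += cur_e - cur_s + 1
            cur_s, cur_e = s, e
    total += cur_e - cur_s + 1
    return total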
def main(args):
    ltrtypes = defaultdict(list)
    gtf = utils.tab_line_gen(args.gtf)
    for g in gtf:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', g[8]))
        if attrd['repType'] == 'ltr':
            ltrtypes[attrd['locus']].append(attrd['repName'])

    titer = utils.tab_line_gen(args.infile)
    header = titer.next()
    print >>args.outfile, '\t'.join(header + ['subfamily'])
    for row in titer:
        if row[0] in ltrtypes:
            subfam = ','.join(sorted(set(ltrtypes[row[0]])))
        else:
            subfam = ''
        print >>args.outfile, '\t'.join(row + [subfam])
def main(args):
    # Filehandle for rejected loci
    if args.reject_gtf is None:
        import os
        args.reject_gtf = open(os.devnull, 'w')

    # Filtering parameters
    min_internal_pct = args.min_internal_pct * 100
    min_internal_bases = args.min_internal_bases

    # Load the internal GTF
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]
    model_lengths = utils.guess_rmsk_model_lengths(combined_gtf)

    loccounts = Counter()
    byloc = defaultdict(list)
    for g in combined_gtf:
        byloc[g.attr['locus']].append(g)

    if min_internal_pct > 0:
        print >>sys.stderr, "Removing loci matching less than %d percent of internal model..." % int(min_internal_pct)
    if min_internal_bases > 0:
        print >>sys.stderr, "Removing loci matching less than %d internal bases..." % min_internal_bases

    rejectflag = False
    for locid, locus in byloc.iteritems():
        spn = utils.get_span(locus)
        locus = [a for a in locus if a != spn]  # equivalently: not a.feature.startswith('span')
        category = spn.attr['category']
        model_pct = float(spn.attr['model_pct'])
        model_cov = int(spn.attr['model_cov'])
        if model_pct >= min_internal_pct and model_cov >= min_internal_bases:
            loccounts[category] += 1
            outh = args.outfile
        else:
            if not rejectflag:
                print >>sys.stderr, 'Removed loci:'
                print >>sys.stderr, '%-18s%-6s%-6s%s' % ('locus', 'bp', 'pct', 'category')
                rejectflag = True
            loccounts['rejected'] += 1
            print >>sys.stderr, '%-18s%-6d%-6.1f%s' % (locid, model_cov, model_pct, category)
            outh = args.reject_gtf
        print >>outh, '### %s ###' % locid
        print >>outh, spn
        print >>outh, '\n'.join(str(_) for _ in sorted(locus, key=lambda x: x.start))

    if not rejectflag:
        print >>sys.stderr, 'All passed filter.'
    print >>sys.stderr, 'Summary:'
    for cat in ['internal', 'prototype', 'oneside', 'unusual', 'rejected']:
        print >>sys.stderr, '%s%d' % (cat.ljust(20), loccounts[cat])
def main(args):
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]

    # Find overlaps within the span features
    overlap_groups = utils.find_overlaps([g for g in combined_gtf if g.feature.startswith('span')])
    overlap_groups = {k: v for k, v in overlap_groups.iteritems() if len(v) > 1}
    print >>sys.stderr, "Found %d groups with conflict." % len(overlap_groups)

    # Resolve commands
    if args.resolve_file is not None:
        resolve_cmds = json.load(args.resolve_file)
    else:
        if args.resolve is not None:
            # Resolve commands were provided as command-line argument
            resolve_cmds = args.resolve
        else:
            # Prompt user to enter resolve commands
            resolve_cmds = {}
            for groupid in sorted(overlap_groups.keys(), key=lambda x: int(x)):
                ogroup = overlap_groups[groupid]
                print >>sys.stderr, utils.groupstr(groupid, ogroup)
                concmd = utils.prompt_cmd()
                resolve_cmds[groupid] = concmd
        print >>sys.stderr, 'Commands for resolving conflicts (JSON):\n'
        print >>sys.stderr, json.dumps(resolve_cmds)

    # Resolve the conflicts
    byloc = defaultdict(list)
    for g in combined_gtf:
        byloc[g.attr['locus']].append(g)

    for groupid, ogroup in overlap_groups.iteritems():
        print >>sys.stderr, utils.groupstr(groupid, ogroup)
        locids = [a.attr['locus'] for a in ogroup]
        # Temporarily remove the loci
        fulllocs = {}
        for locid in locids:
            print >>sys.stderr, '\tRemoving %s' % locid
            fulllocs[locid] = byloc.pop(locid)
        assert groupid in resolve_cmds
        newlocs = resolve(resolve_cmds[groupid], ogroup, fulllocs)
        for newlocid, newlocus in newlocs.iteritems():
            print >>sys.stderr, '\tInserting %s' % newlocid
            # print >>sys.stderr, '\n'.join(str(_) for _ in newlocus)
        byloc.update(newlocs)

    for locid, locus in byloc.iteritems():
        print >>args.outfile, '### %s ###' % locid
        print >>args.outfile, '\n'.join(str(_) for _ in sorted(locus, key=lambda x: x.start))
def main(args):
    discard = set()
    discard_fh = open(args.discard, 'w')
    lines = utils.tab_line_gen(args.infile)
    for l in lines:
        attrd = dict(re.findall('(\S+)\s+"([\s\S]+?)";', l[8]))
        if l[1] == 'merged':
            cov = int(attrd['cov'])
            if cov < args.threshold:
                discard.add(attrd['name'])
                print >>discard_fh, '\t'.join(l)
            else:
                print >>args.outfile, '\t'.join(l)
        else:
            if attrd['gene_id'] in discard:
                print >>discard_fh, '\t'.join(l)
            else:
                print >>args.outfile, '\t'.join(l)
def main(args):
    ### Read the GTF file ###############################################################
    gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]

    ### Correct model coordinates #######################################################
    # The repStart, repEnd, and repLeft attributes downloaded from the UCSC rmsk database
    # do not always give the same model length. Here we guess what the correct model
    # length is, then correct each record.
    mlen = utils.guess_rmsk_model_lengths(gtf)
    print >>sys.stderr, 'Model lengths:'
    print >>sys.stderr, '\n'.join('%s%d' % (k.ljust(16), mlen[k]) for k in sorted(mlen.keys()))
    correct_rmsk_model_coordinates(gtf, mlen)

    # Check that model coordinates are correct
    for g in gtf:
        if g.strand == '+':
            trueend = mlen[g.attr['repName']] + g.attr['repLeft']
        else:
            trueend = mlen[g.attr['repName']] + g.attr['repStart']
        assert trueend == g.attr['repEnd']
        print >>args.outfile, g
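# The assertion above implies the convention for corrected records: on the plus strand
# repEnd - repLeft equals the model length (repLeft <= 0), and on the minus strand
# repEnd - repStart does (UCSC swaps repStart/repLeft for '-' hits). The sketch below is
# one plausible way to "guess" a consistent model length per repName by taking the most
# common implied length; it is an assumption about what utils.guess_rmsk_model_lengths
# does, not the actual implementation.
from collections import defaultdict, Counter

def guess_model_lengths(gtf):
    """Guess a single model length per repName from the implied lengths of each hit."""
    implied = defaultdict(Counter)
    for g in gtf:
        if g.strand == '+':
            implied[g.attr['repName']][int(g.attr['repEnd']) - int(g.attr['repLeft'])] += 1
        else:
            implied[g.attr['repName']][int(g.attr['repEnd']) - int(g.attr['repStart'])] += 1
    return dict((name, counts.most_common(1)[0][0]) for name, counts in implied.items())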
def main(parser):
    args = parser.parse_args()
    lines = utils.tab_line_gen(args.infile)
    for l in utils.sort_gtf(lines):
        print >>args.outfile, '\t'.join(l)
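# utils.sort_gtf is used both with and without an explicit chromosome order (see the script
# above that reads args.chroms). The sketch below is an assumed implementation, not the
# actual utils code: sort by chromosome (in the given order if provided, otherwise
# lexicographically) and then by integer start coordinate.
def sort_gtf(lines, chroms=None):
    """Return GTF field-lists sorted by chromosome and start position."""
    lines = list(lines)
    if chroms is None:
        chrom_key = lambda c: c
    else:
        order = dict((c, i) for i, c in enumerate(chroms))
        chrom_key = lambda c: order.get(c, len(order))
    return sorted(lines, key=lambda l: (chrom_key(l[0]), int(l[3])))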
def main(args):
    ### Read the GTF file ###############################################################
    print >>sys.stderr, 'Loading GTF: %s' % args.internal_file
    gtf = [GTFLine(l) for l in utils.tab_line_gen(open(args.internal_file, 'rU'))]

    ### Get model lengths
    # mlen = calculate_model_lengths(gtf)
    # print mlen
    mlen = calculate_model_lengths2(gtf)
    print >>sys.stderr, 'Model lengths: %s' % mlen

    ### Correct the model coordinates ###################################################
    correct_model_coordinates(gtf, mlen)
    for g in gtf:
        if g.strand == '+':
            trueend = mlen[g.attr['repName']] + g.attr['repLeft']
        else:
            trueend = mlen[g.attr['repName']] + g.attr['repStart']
        assert trueend == g.attr['repEnd']

    ### Organize hits by chromosome #####################################################
    bychrom = defaultdict(list)
    for g in gtf:
        bychrom[g.chrom].append(g)

    ### List of HERV loci ###############################################################
    print >>sys.stderr, 'Assembling HERV loci'
    all_locs = []

    ### Create HERV loci for plus strand ################################################
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            plus = [h for h in bychrom[chrom] if h.strand == '+']
            if not plus:
                continue
            plus.sort(key=lambda x: x.start)
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1))
            cur.internal.append(plus[0])
            for p1 in plus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p1.start - p0.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10:
                    ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme
                    insequence = p0.attr['repLeft'] < p1.attr['repLeft']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Create HERV loci for minus strand ###############################################
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            minus = [h for h in bychrom[chrom] if h.strand == '-']
            if not minus:
                continue
            minus.sort(key=lambda x: x.end, reverse=True)  # Sort in reverse order
            cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1))
            cur.internal.append(minus[0])
            for p1 in minus[1:]:
                p0 = cur.internal[-1]
                # Genomic distance between hits
                gdist = p0.start - p1.end
                # Determine whether p1 is in sequence with locus
                if gdist <= 10:
                    ## Overlapping (or nearly) in genome
                    insequence = True
                else:
                    ## Hits are in sequence and genomic distance is not extreme
                    insequence = p0.attr['repStart'] < p1.attr['repStart']
                    insequence &= gdist < args.longdist
                if insequence:
                    cur.internal.append(p1)
                else:
                    all_locs.append(cur)
                    cur = HERVLocus(id='%s_%04d' % (args.prefix, len(all_locs) + 1))
                    cur.internal.append(p1)
            all_locs.append(cur)

    ### Add LTRs to HERV loci ###########################################################
    print >>sys.stderr, 'Finding flanking LTRs'
    for loc in all_locs:
        loc.find_ltr(args.ltr_files, args.flank)
        loc.adjust_overlaps()

    print >>sys.stderr, "Initial counts:"
    print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count)
                                  for cat, count in Counter(c.category() for c in all_locs).most_common())

    ### Filtering #######################################################################
    reject = set()
    if args.minpct > 0 or args.mincov > 0:
        print >>sys.stderr, "Removing loci with less than %d percent or %dbp model coverage" % (
            int(args.minpct * 100), args.mincov)
        for loc in all_locs:
            if loc.model_cov() < (mlen[loc.internal_name()] * args.minpct) or loc.model_cov() < args.mincov:
                print >>sys.stderr, '%s\t%d\t%s' % (loc.id, loc.model_cov(), loc.category())
                reject.add(loc)
        for rloc in reject:
            all_locs.remove(rloc)

    print >>sys.stderr, "After filtering:"
    print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count)
                                  for cat, count in Counter(c.category() for c in all_locs).most_common())
    print >>sys.stderr, '%s%d' % ('Rejected'.ljust(20, ' '), len(reject))

    ### Deal with overlapping loci ######################################################
    # Create GTF with all_locs
    with open('tmp.gtf', 'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in all_locs):
            print >>outh, '\t'.join(g)
    # Cluster overlapping and bookended using bedtools
    p1 = Popen('bedtools cluster -i tmp.gtf', shell=True, stdout=PIPE, stderr=PIPE)
    out, err = p1.communicate()
    os.remove('tmp.gtf')
    # Parse bedtools output
    overlap_groups = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        overlap_groups[f[-1]].append(GTFLine(f[:9]))
    # Remove clusters with only one member
    for k in overlap_groups.keys():
        if len(overlap_groups[k]) == 1:
            del overlap_groups[k]

    print >>sys.stderr, "%d overlap groups" % len(overlap_groups)

    if args.igv_preview and len(overlap_groups) > 0:
        print >>sys.stderr, "Loading IGV"
        # Create file for IGV viewing
        with open('tmp.gtf', 'w') as outh:
            liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in all_locs))
            print >>outh, '\n'.join('\t'.join(_) for _ in liter)
        igv = IGV()
        igv.new()
        igv.genome('hg19')
        igv.load(os.path.join(os.getcwd(), '../other_sources/rmsk_LTR.hg19.gtf'))
        igv.load(os.path.join(os.getcwd(), 'tmp.gtf'))

    tandem = []
    for k in sorted(overlap_groups.keys(), key=lambda x: int(x)):
        ogroup = overlap_groups[k]
        if args.igv_preview:
            locus_str = '%s:%s-%s' % (ogroup[0].chrom,
                                      min(gl.start for gl in ogroup) - 5000,
                                      max(gl.end for gl in ogroup) + 5000)
            igv.goto(locus_str)
            igv.expand()
        # Get locus for each member of overlap group
        og_locus = {}
        for o in ogroup:
            tmp = [c for c in all_locs if c.id == o.attr['name']]
            assert len(tmp) == 1
            og_locus[o.attr['name']] = tmp[0]
        # Print out the model coverage
        for n, loc in og_locus.iteritems():
            print >>sys.stderr, '%s\t%d\t%s' % (n, loc.model_cov(), loc.category())
        # Parse user input
        z = raw_input('Action to take: ').strip()
        if z == '':
            continue
        inputcmd = z.strip().split(' ')
        if inputcmd[0] == 'REJECT':
            if len(inputcmd) == 1:
                # Only max will be kept
                st = sorted([loc for n, loc in og_locus.iteritems()],
                            key=lambda x: x.model_cov(), reverse=True)[1:]
                loc_ids = [_.id for _ in st]
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split(',')
            else:
                assert False
            for loc_id in loc_ids:
                reject.add(og_locus[loc_id])
        elif inputcmd[0] == 'TANDEM':
            if len(inputcmd) == 1:
                assert len(og_locus) == 2, 'More than 2 loci are present'
                tandem.append([loc for n, loc in og_locus.iteritems()])
            elif len(inputcmd) == 2:
                loc_ids = inputcmd[1].split('+')
                tandem.append([og_locus[loc_id] for loc_id in loc_ids])
            else:
                assert False
        elif inputcmd[0] == 'DIFF':
            n1, n2 = inputcmd[1].split('-')
            g1 = og_locus[n1]
            g2 = og_locus[n2]
            if g1.span()[0] < g2.span()[1]:
                g1.shorten(g2.span()[1] + 20, g1.span()[1])
            elif g1.span()[1] < g2.span()[0]:
                g1.shorten(g1.span()[0], g2.span()[0] - 20)
            else:
                print "no overlap!"
            print g1
        elif inputcmd[0] == 'IGNORE':
            continue
        else:
            assert False, 'Unknown command: "%s"' % inputcmd[0]

    # Remove rejected annotations
    for rloc in reject:
        if rloc in all_locs:
            all_locs.remove(rloc)

    # Create the tandem annotations
    for tgroup in tandem:
        tandem_loc = HERVLocus(id=tgroup[0].id)
        tandem_loc.internal = list(chain.from_iterable(loc.internal for loc in tgroup))
        if tandem_loc.strand() == '+':
            tandem_loc.internal.sort(key=lambda x: x.start)
        else:
            tandem_loc.internal.sort(key=lambda x: x.end, reverse=True)
        tandem_loc.find_ltr(args.ltr_files, 1000)
        tandem_loc.adjust_overlaps()
        tandem_loc.is_tandem = True
        all_locs.append(tandem_loc)
        # Remove from original
        for rloc in tgroup:
            all_locs.remove(rloc)

    print >>sys.stderr, "After overlap removal:"
    print >>sys.stderr, '\n'.join('%s%d' % (cat.ljust(20, ' '), count)
                                  for cat, count in Counter(c.category() for c in all_locs).most_common())
    print >>sys.stderr, '%s%d' % ('Rejected'.ljust(20, ' '), len(reject))

    if args.igv_preview and len(overlap_groups) > 0:
        os.remove('tmp.gtf')

    ### Sort loci #######################################################################
    bychrom = defaultdict(list)
    for loc in all_locs:
        bychrom[loc.chrom()].append(loc)
    final_locs = []
    for chrom in utils.CHROMNAMES:
        if chrom in bychrom:
            for loc in sorted(bychrom[chrom], key=lambda x: x.span()[0]):
                final_locs.append(loc)
    for i, loc in enumerate(final_locs):
        loc.id = '%s_%04d' % (args.prefix, i + 1)

    ### Rename loci according to cytoband ###############################################
    # Create GTF with all_locs
    with open('tmp.gtf', 'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >>outh, '\t'.join(g)
    p1 = Popen('bedtools intersect -wo -a tmp.gtf -b ../other_sources/cytoband.gtf',
               shell=True, stdout=PIPE, stderr=PIPE)
    out, err = p1.communicate()
    os.remove('tmp.gtf')
    byband = defaultdict(list)
    for ll in out.strip('\n').split('\n'):
        f = ll.split('\t')
        g1 = GTFLine(f[:9])
        g2 = GTFLine(f[9:-1])
        band = '%s%s' % (g2.chrom.strip('chr'), g2.attr['gene_id'])
        byband[band].append(g1)
    namemap = {}
    for band, glist in byband.iteritems():
        if len(glist) == 1:
            namemap[glist[0].attr['name']] = '%s_%s' % (args.prefix, band)
        else:
            glist.sort(key=lambda x: x.start)
            for i, gl in enumerate(glist):
                namemap[gl.attr['name']] = '%s_%s%s' % (args.prefix, band, someletters[i])
    for loc in final_locs:
        loc.locus_name = namemap[loc.id]

    ### Create annotation files #########################################################
    print >>sys.stderr, "Writing annotation files"
    with open('%s.gtf' % args.prefix, 'w') as outh:
        liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in final_locs))
        print >>outh, '\n'.join('\t'.join(_) for _ in liter)
        # for loc in final_locs:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())
    with open('%s_reject.gtf' % args.prefix, 'w') as outh:
        liter = utils.sort_gtf(chain.from_iterable(loc.each_gtf() for loc in reject))
        print >>outh, '\n'.join('\t'.join(_) for _ in liter)
        # for loc in reject:
        #     print >>outh, '\n'.join('\t'.join(g) for g in loc.each_gtf())
    with open('%s_span.gtf' % args.prefix, 'w') as outh:
        for g in utils.sort_gtf(loc.span_gtf() for loc in final_locs):
            print >>outh, '\t'.join(g)
    with open('%s_table.txt' % args.prefix, 'w') as outh:
        print >>outh, '\t'.join(['locus_name', 'id', 'category', 'chrom', 'start', 'end', 'strand',
                                 'nfeats', 'width', 'model_cov',
                                 'ltr5_model', 'int_model', 'ltr3_model'])
        for loc in final_locs:
            mgtf = GTFLine(loc.span_gtf())
            row = [loc.locus_name, loc.id, loc.category(),
                   mgtf.chrom, mgtf.start, mgtf.end, mgtf.strand,
                   mgtf.attr['nfeats'], loc.width(), loc.model_cov(),
                   loc.ltr_up_name(), loc.internal_name(), loc.ltr_down_name()]
            print >>outh, '\t'.join(str(_) for _ in row)

    ### Extract sequences ###############################################################
    if args.get_sequences:
        print >>sys.stderr, "Extracting sequences"
        genome_fasta = args.genome_fasta
        genome = dict((s.id, s) for s in SeqIO.parse(genome_fasta, 'fasta'))
        with open('%s.full.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (loc.chrom(), loc.span()[0], loc.span()[1], loc.strand())
                print >>outh, '>%s|%s|%s' % (loc.locus_name, loc.category(), gcoord)
                print >>outh, str(loc.entire_sequence(genome).seq)
        with open('%s.internal.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                gcoord = '%s:%d-%d(%s)' % (loc.chrom(),
                                           min(p.start for p in loc.internal),
                                           max(p.end for p in loc.internal),
                                           loc.strand())
                print >>outh, '>%s_int|%s|%s|%s' % (loc.locus_name, loc.category(), gcoord,
                                                    loc.format_print_clust())
                print >>outh, str(loc.internal_sequence(genome).seq)
        with open('%s.5ltr.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_up_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (loc.chrom(),
                                               min(p.start for p in loc.ltr_up),
                                               max(p.end for p in loc.ltr_up),
                                               loc.strand())
                    print >>outh, '>%s_5LTR|%s|%s' % (loc.locus_name, loc.ltr_up_name(), gcoord)
                    print >>outh, str(ltrseq.seq)
        with open('%s.3ltr.fasta' % args.prefix, 'w') as outh:
            for loc in final_locs:
                ltrseq = loc.ltr_down_sequence(genome)
                if ltrseq:
                    gcoord = '%s:%d-%d(%s)' % (loc.chrom(),
                                               min(p.start for p in loc.ltr_down),
                                               max(p.end for p in loc.ltr_down),
                                               loc.strand())
                    print >>outh, '>%s_3LTR|%s|%s' % (loc.locus_name, loc.ltr_down_name(), gcoord)
                    print >>outh, str(ltrseq.seq)

    ### IGV snapshots ###################################################################
    if args.igv_snapshot:
        print >>sys.stderr, "Taking IGV snapshots"
        igv = IGV()
        igv.new()
        igv.genome('hg19')
        igv.load(os.path.join(os.getcwd(), '../other_sources/rmsk_LTR.hg19.gtf'))
        if os.path.isdir('tmp'):
            for compare_gtf in glob('tmp/*.gtf'):
                igv.load(os.path.join(os.getcwd(), compare_gtf))
        igv.load(os.path.join(os.getcwd(), '%s.gtf' % args.prefix))
        igv.load(os.path.join(os.getcwd(), '%s_reject.gtf' % args.prefix))
        do_snapshots = True
        if do_snapshots:
            if not os.path.exists(os.path.join(os.getcwd(), 'snapshots')):
                os.mkdir(os.path.join(os.getcwd(), 'snapshots'))
            if not os.path.exists(os.path.join(os.getcwd(), 'reject')):
                os.mkdir(os.path.join(os.getcwd(), 'reject'))
            categories = ['prototype', 'oneside', 'internal']
            for cat in categories:
                if not os.path.exists(os.path.join(os.getcwd(), 'snapshots/%s' % cat)):
                    os.mkdir(os.path.join(os.getcwd(), 'snapshots/%s' % cat))
                if not os.path.exists(os.path.join(os.getcwd(), 'reject/%s' % cat)):
                    os.mkdir(os.path.join(os.getcwd(), 'reject/%s' % cat))
        for loc in final_locs:
            rc, lc = loc.span()
            locus_str = '%s:%d-%d' % (loc.chrom(), rc - 5000, lc + 5000)
            print >>sys.stderr, '%s\t%s\t%s' % (loc.locus_name, loc.category(), locus_str)
            igv.goto(locus_str)
            igv.expand()
            if do_snapshots:
                igv.snapshotDirectory(os.path.join(os.getcwd(),
                                                   'snapshots/%s' % loc.category().strip('*')))
                igv.snapshot(filename='%s.png' % loc.locus_name)
        for loc in reject:
            rc, lc = loc.span()
            locus_str = '%s:%d-%d' % (loc.chrom(), rc - 5000, lc + 5000)
            print >>sys.stderr, '%s\t%s\t%s' % (loc.id, loc.category(), locus_str)
            igv.goto(locus_str)
            igv.expand()
            if do_snapshots:
                igv.snapshotDirectory(os.path.join(os.getcwd(),
                                                   'reject/%s' % loc.category().strip('*')))
                igv.snapshot(filename='%s.png' % loc.id)
def main(parser):
    args = parser.parse_args()
    lines = utils.tab_line_gen(args.infile)
    bystrand = {"+": [], "-": []}
    for l in lines:
        bystrand[l[6]].append(l)
    bystrand["+"] = list(utils.sort_gtf(bystrand["+"]))
    bystrand["-"] = list(utils.sort_gtf(bystrand["-"]))

    # Group consecutive lines that share the same score and chromosome
    grouped = {"+": [], "-": []}
    for strand in ["+", "-"]:
        score = None
        chrom = None
        tmp = []
        for l in bystrand[strand]:
            if score is not None:
                if l[5] != score or l[0] != chrom:
                    grouped[strand].append(tmp)
                    tmp = []
            tmp.append(l)
            score = l[5]
            chrom = l[0]
        if tmp:
            grouped[strand].append(tmp)

    gaplens = []
    merged = []
    for g in grouped["+"] + grouped["-"]:
        if len(g) == 1:
            merged.append(g[0])
        else:
            mygaps = []
            s = ""
            for i in range(len(g) - 1):
                gaplen = int(g[i + 1][3]) - int(g[i][4])
                s += "%s:%s-%s(%s)" % (g[i][0], g[i][3], g[i][4], g[i][6])
                s += " --- %d --- " % gaplen
                mygaps.append(gaplen)
            s += "%s:%s-%s(%s)" % (g[-1][0], g[-1][3], g[-1][4], g[-1][6])
            if any(gap >= QUESTIONABLE for gap in mygaps):
                continue
            else:
                gaplens.extend(mygaps)
                print >>sys.stderr, s
            # spos = min(int(l[3]) for l in g)
            # epos = max(int(l[4]) for l in g)
            # attrs = [dict(re.findall('(\S+)\s+"([\s\S]+?)";', l[8])) for l in g]
            # newline = [g[0][0], 'joined', 'exon', str(spos), str(epos), g[0][5], g[0][6], '.']
            # newattr = {'joined': ','.join(a['id'] for a in attrs),
            #            'repType': attrs[0]['repType'],
            #            }
            # newline.append(' '.join('%s "%s";' % (k, v) for k, v in newattr.iteritems()))
            # merged.append(newline)

    if gaplens:
        print >>sys.stderr, "min gap length: %d" % min(gaplens)
        print >>sys.stderr, "mean gap length: %d" % (float(sum(gaplens)) / len(gaplens))
        print >>sys.stderr, "median gap length: %d" % sorted(gaplens)[len(gaplens) / 2]
        print >>sys.stderr, "max gap length: %d" % max(gaplens)
        print >>args.outfile, "%d" % max(gaplens)
    else:
        print >>sys.stderr, "No gaps found"
def main(args):
    combined_gtf = [GTFLine(l) for l in utils.tab_line_gen(args.infile)]
    for g in combined_gtf:
        if args.fromAttr in g.attr:
            g.attr[args.toAttr] = g.attr[args.fromAttr]
        print >>args.outfile, g
#! /usr/bin/env python
import sys
import re
from collections import defaultdict

import utils

cyto = dict([l.strip('\n').split('\t') for l in open('tmp/cyto_name_map.txt', 'rU')])
lines = list(utils.tab_line_gen(open('filtered.hg19.gtf', 'rU')))

### Separate merged and unmerged lines
### Group unmerged lines according to gene
merged = []
unmerged = defaultdict(list)
for l in lines:
    attrs = dict(re.findall('(\S+)\s+"([\s\S]+?)";', l[8]))
    if l[1] == 'merged':
        merged.append((attrs['name'], l))
    else:
        unmerged[attrs['gene_id']].append(l)

sdata = defaultdict(dict)
for mname, ml in merged:
    attrs = dict(re.findall('(\S+)\s+"([\s\S]+?)";', ml[8]))
    sdata[mname]['category'] = attrs['category']
    sdata[mname]['band'] = cyto[mname]
    sdata[mname]['id'] = mname
    sdata[mname]['start'] = int(ml[3])
    sdata[mname]['chrom'] = ml[0]
    sdata[mname]['alias'] = ''