def gtf_add_isoform(gtf, iso, out=sys.stdout, quiet=False):
    """Append an ``isoform_id`` attribute to matching GTF records.

    The isoform table is read as tab-delimited text: the second column is
    used as the transcript id key and the first column as the value written
    out as ``isoform_id``.  Lines of that table starting with '#' are skipped.

    Every GTF record is re-emitted to ``out``.  Records whose
    ``transcript_id`` is present in the table get ``isoform_id "<value>";``
    appended to the attribute column.  Full-line comments are copied through
    unchanged; a trailing ``# ...`` comment on a record is re-attached after
    the rewritten attributes, separated by a tab.

    Args:
        gtf: GTF input, read through ``gzip_reader``.
        iso: isoform table, read through ``gzip_reader``.
        out: writable text stream receiving the annotated GTF.
        quiet: when true, do not write progress messages to stderr.

    On a record that cannot be parsed, the offending line and a traceback are
    written to stderr and the process exits with status 1.
    """
    isoforms = {}
    if not quiet:
        sys.stderr.write('Reading isoforms...\n')
    for line in gzip_reader(iso):
        if line[0] == '#':
            continue
        cols = line.rstrip().split('\t')
        isoforms[cols[1]] = cols[0]

    if not quiet:
        sys.stderr.write('Reading/Writing GTF...\n')
    for line in gzip_reader(gtf):
        try:
            comment = None
            idx = line.find('#')
            if idx == 0:
                # Whole-line comment: pass through to the requested stream.
                out.write(line)
                continue
            if idx > 0:
                # Split off a trailing comment.  The record is everything
                # *before* the '#'; the comment's own newline is dropped
                # because one is written explicitly below.
                comment = line[idx:].rstrip('\r\n')
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            for field in attrs.split(';'):
                field = field.strip()
                if not field:
                    continue
                # Split on the first space only so quoted values that
                # contain spaces do not break parsing.
                key, _, val = field.partition(' ')
                val = val.strip()
                if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val

            if not attrs.endswith(';'):
                attrs = '%s;' % attrs
            if transcript_id in isoforms:
                attrs = '%s isoform_id "%s";' % (attrs, isoforms[transcript_id])

            out.write('\t'.join([
                chrom, source, feature, start, end, score, strand, frame, attrs
            ]))
            if comment:
                out.write('\t%s' % comment)
            out.write('\n')
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)
def gtf_addreflink(gtf, reflink, out=sys.stdout, quiet=False, replace=False):
    """Annotate GTF records with values looked up in a refLink table.

    The refLink table is read as tab-delimited text.  Each row is keyed by
    its third column (index 2), which is matched against the GTF
    ``transcript_id``; the first column (index 0) and seventh column
    (index 6) are stored as the values to write.

    For a matching record:
      * ``replace=False`` appends ``gene_name "<col 0>"; isoform_id "<col 6>";``
        to the existing attributes.
      * ``replace=True`` rewrites the attribute column as ``gene_id "<col 6>";
        transcript_id "<id>"; gene_name "<col 0>"; orig_gene_id "<old gene_id>";``.

    Non-matching records are written back with only a terminating ';'
    enforced on the attribute column.  Full-line comments are copied through;
    a trailing ``# ...`` comment is re-attached after the record.

    Args:
        gtf: GTF input, read through ``gzip_reader``.
        reflink: refLink table, read through ``gzip_reader``.
        out: writable text stream receiving the annotated GTF.
        quiet: when true, do not write progress messages to stderr.
        replace: selects the rewrite mode described above.

    On a record that cannot be parsed, the offending line and a traceback are
    written to stderr and the process exits with status 1.
    """
    link_values = {}
    if not quiet:
        sys.stderr.write('Reading refLink...\n')
    for line in gzip_reader(reflink):
        cols = line.rstrip().split('\t')
        link_values[cols[2]] = (cols[0], cols[6])

    if not quiet:
        sys.stderr.write('Reading GTF...\n')
    for line in gzip_reader(gtf):
        try:
            comment = None
            idx = line.find('#')
            if idx == 0:
                # Whole-line comment: pass through to the requested stream.
                out.write(line)
                continue
            if idx > 0:
                # Keep the text before '#' as the record; the comment's own
                # newline is dropped because one is written explicitly below.
                comment = line[idx:].rstrip('\r\n')
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            gene_id = None
            for field in attrs.split(';'):
                field = field.strip()
                if not field:
                    continue
                # Split on the first space only so quoted values that
                # contain spaces do not break parsing.
                key, _, val = field.partition(' ')
                val = val.strip()
                if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val
                elif key == 'gene_id':
                    gene_id = val

            if not attrs.endswith(';'):
                attrs = '%s;' % attrs

            if transcript_id in link_values:
                link_first, link_seventh = link_values[transcript_id]
                if replace:
                    attrs = ('gene_id "%s"; transcript_id "%s"; '
                             'gene_name "%s"; orig_gene_id "%s";'
                             % (link_seventh, transcript_id, link_first, gene_id))
                else:
                    attrs = '%s gene_name "%s"; isoform_id "%s";' % (
                        attrs, link_first, link_seventh)

            out.write('\t'.join([chrom, source, feature, start, end, score,
                                 strand, frame, attrs]))
            if comment:
                out.write('\t%s' % comment)
            out.write('\n')
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)
def gtf_add_isoform(gtf, iso, out=sys.stdout, quiet=False):
    """Write the GTF to ``out`` with ``isoform_id`` added where known.

    ``iso`` is a tab-delimited table (lines starting with '#' are ignored)
    whose second column is looked up against each record's ``transcript_id``
    and whose first column supplies the ``isoform_id`` value.

    Records are re-emitted with the same nine columns; the attribute column
    is terminated with ';' if needed and, for matching transcripts, extended
    with ``isoform_id "<value>";``.  Full-line comments are copied verbatim,
    and a trailing ``# ...`` comment is written back after the record,
    separated by a tab.

    Args:
        gtf: GTF input, read through ``gzip_reader``.
        iso: isoform table, read through ``gzip_reader``.
        out: writable text stream for the result.
        quiet: suppress stderr progress messages when true.

    If a record fails to parse, the line and a traceback go to stderr and the
    process exits with status 1.
    """
    if not quiet:
        sys.stderr.write('Reading isoforms...\n')
    isoforms = {}
    for line in gzip_reader(iso):
        if line[0] == '#':
            continue
        cols = line.rstrip().split('\t')
        isoforms[cols[1]] = cols[0]

    if not quiet:
        sys.stderr.write('Reading/Writing GTF...\n')
    for line in gzip_reader(gtf):
        try:
            idx = line.find('#')
            if idx == 0:
                # Comment-only line: honour ``out`` rather than sys.stdout.
                out.write(line)
                continue

            comment = None
            if idx > 0:
                # The record is the text *before* the '#'.  Strip the
                # comment's newline; the record's newline is written below.
                comment = line[idx:].rstrip('\r\n')
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            for chunk in attrs.split(';'):
                chunk = chunk.strip()
                if not chunk:
                    continue
                # partition() keeps values with embedded spaces intact.
                key, _, val = chunk.partition(' ')
                val = val.strip()
                if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val

            if not attrs.endswith(';'):
                attrs = '%s;' % attrs
            if transcript_id in isoforms:
                attrs = '%s isoform_id "%s";' % (attrs, isoforms[transcript_id])

            out.write('\t'.join([chrom, source, feature, start, end, score,
                                 strand, frame, attrs]))
            if comment:
                out.write('\t%s' % comment)
            out.write('\n')
        except Exception:
            # Narrower than a bare except: KeyboardInterrupt still propagates.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)
def bedgraph_clean(bedgraph, chrom_sizes, out=sys.stdout):
    """Drop or clip bedGraph records that fall outside the reference bounds.

    ``chrom_sizes`` is a tab-delimited file whose first column is a reference
    name and whose second column is its integer length.  The first line of
    ``bedgraph`` is copied to ``out`` untouched (treated as a header).  For
    each following record:

      * records on a reference missing from ``chrom_sizes`` are dropped;
      * records starting at or beyond the reference length are dropped;
      * records ending beyond the reference length have their end column
        replaced by the reference length.

    The most recently parsed region (``ref:start-end``) is exposed to
    ``gzip_reader`` through its ``callback`` argument.
    """
    chrom_lengths = {}
    with open(chrom_sizes) as handle:
        for record in handle:
            fields = record.strip().split("\t")
            chrom_lengths[fields[0]] = int(fields[1])

    # Updated on every data record; read lazily by the callback below.
    position = ""
    reader = gzip_reader(bedgraph, callback=lambda: position)
    for lineno, record in enumerate(reader):
        if lineno == 0:
            # header
            out.write(record)
            continue

        fields = record.strip().split("\t")
        chrom = fields[0]
        begin = int(fields[1])
        stop = int(fields[2])
        position = "%s:%s-%s" % (chrom, begin, stop)

        # Unknown reference, or a start past the end of it: nothing to keep.
        if chrom not in chrom_lengths or begin >= chrom_lengths[chrom]:
            continue

        limit = chrom_lengths[chrom]
        if stop > limit:
            # Clip the record to the end of the reference.
            fields[2] = limit

        out.write("%s\n" % "\t".join(map(str, fields)))
def gtf_add_xref(gtf, xref, column=4, out=sys.stdout, quiet=False):
    """Append a ``gene_name`` attribute to GTF records using an xref table.

    The xref table is read as tab-delimited text (lines starting with '#'
    are skipped).  Its first column is matched against each record's
    ``transcript_id`` and the value at index ``column`` is written as
    ``gene_name "<value>";`` at the end of the attribute column.

    All records are re-emitted to ``out``; the attribute column is
    terminated with ';' if it was not already.  Full-line comments are
    copied through unchanged, and a trailing ``# ...`` comment on a record is
    re-attached after the rewritten attributes, separated by a tab.

    Args:
        gtf: GTF input, read through ``gzip_reader``.
        xref: cross-reference table, read through ``gzip_reader``.
        column: zero-based index of the xref column holding the gene name.
        out: writable text stream receiving the annotated GTF.
        quiet: when true, do not write progress messages to stderr.

    On a record that cannot be parsed, the offending line and a traceback are
    written to stderr and the process exits with status 1.
    """
    gene_names = {}
    if not quiet:
        sys.stderr.write("Reading xref...\n")
    for line in gzip_reader(xref):
        if line[0] == "#":
            continue
        cols = line.rstrip().split("\t")
        gene_names[cols[0]] = cols[column]

    if not quiet:
        sys.stderr.write("Reading/writing GTF...\n")
    for line in gzip_reader(gtf):
        try:
            comment = None
            idx = line.find("#")
            if idx == 0:
                # Whole-line comment: pass through to the requested stream.
                out.write(line)
                continue
            if idx > 0:
                # Keep the text before '#' as the record; the comment's own
                # newline is dropped because one is written explicitly below.
                comment = line[idx:].rstrip("\r\n")
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split("\t")

            transcript_id = None
            for field in attrs.split(";"):
                field = field.strip()
                if not field:
                    continue
                # Split on the first space only so quoted values that
                # contain spaces do not break parsing.
                key, _, val = field.partition(" ")
                val = val.strip()
                if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
                    val = val[1:-1]
                if key == "transcript_id":
                    transcript_id = val

            if not attrs.endswith(";"):
                attrs = "%s;" % attrs
            if transcript_id in gene_names:
                attrs = '%s gene_name "%s";' % (attrs, gene_names[transcript_id])

            out.write("\t".join([chrom, source, feature, start, end, score,
                                 strand, frame, attrs]))
            if comment:
                out.write("\t%s" % comment)
            out.write("\n")
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
            import traceback
            sys.stderr.write("Error parsing line:\n%s\n" % line)
            traceback.print_exc()
            sys.exit(1)
def qseq_reader(fname=None, fileobj=None, quiet=False):
    """Yield one ``QseqRecord`` per line of tab-delimited input.

    Lines come from ``fileobj`` when it is given (and truthy); otherwise the
    file named by ``fname`` is read through ``gzip_reader``.  Each line is
    stripped, split on tabs, and its first eleven fields are passed
    positionally to ``QseqRecord``.

    Raises:
        ValueError: when neither ``fname`` nor ``fileobj`` is supplied.  As
            this is a generator, the error surfaces on first iteration.
    """
    if fileobj:
        lines = fileobj
    elif fname:
        lines = gzip_reader(fname, quiet=quiet)
    else:
        raise ValueError('Must pass fname or fileobj!')

    for raw in lines:
        fields = raw.strip().split('\t')
        yield QseqRecord(*fields[:11])
def gtf_filter(fname, filters, out=sys.stdout):
    """Write only the records that every filter accepts.

    Each line read through ``gzip_reader`` is split on tabs (after removing
    newline characters from both ends) and handed to ``filt.process(cols)``
    for each filter in order.  Evaluation stops at the first filter that
    returns a falsy value; a record is written to ``out``, tab-joined, only
    if no filter rejected it.
    """
    for record in gzip_reader(fname):
        fields = record.strip('\n').split('\t')
        # all() short-circuits, so later filters are not run after a reject.
        if all(filt.process(fields) for filt in filters):
            out.write('\t'.join(str(value) for value in fields) + '\n')
def gtf_filter(fname, filters, out=sys.stdout):
    """Run each record through a chain of filters and write the survivors.

    Each line read through ``gzip_reader`` is split on tabs (after removing
    newline characters from both ends).  The columns are then passed through
    the filters in order, each filter receiving the value returned by the
    previous ``process`` call.  A falsy return drops the record and stops the
    chain; otherwise the final columns are tab-joined and written to ``out``.
    """
    for record in gzip_reader(fname):
        fields = record.strip('\n').split('\t')
        for filt in filters:
            fields = filt.process(fields)
            if not fields:
                # Rejected: skip the write in the else-branch below.
                break
        else:
            out.write('\t'.join(str(value) for value in fields) + '\n')
def gtf_remove_dup(fname, out=sys.stdout, quiet=False):
    """Copy a GTF to ``out``, dropping records flagged as duplicates.

    A record is considered a duplicate when its ``transcript_id`` attribute
    contains the substring ``_dup``.  Comment lines (starting with '#') and
    all other records, including those without a ``transcript_id``, are
    written through unchanged.

    Args:
        fname: GTF input, read through ``gzip_reader``.
        out: writable text stream receiving the kept lines.
        quiet: when true, do not write progress or summary messages to
            stderr (also forwarded to ``gzip_reader``).

    Returns:
        A ``(good_count, dup_count)`` tuple: the number of records kept and
        the number removed.  Comment lines are not counted.

    On a record that cannot be parsed, the offending line and a traceback are
    written to stderr and the process exits with status 1.
    """
    if not quiet:
        sys.stderr.write('Reading GTF...\n')

    dup_count = 0
    good_count = 0

    for line in gzip_reader(fname, quiet=quiet):
        try:
            if line.startswith('#'):
                out.write(line)
                continue

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            for field in attrs.split(';'):
                field = field.strip()
                if not field:
                    continue
                # Split on the first space only so quoted values that
                # contain spaces do not break parsing.
                key, _, val = field.partition(' ')
                val = val.strip()
                if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val

            # Records without a transcript_id cannot be duplicates; keep
            # them instead of failing on a substring test against None.
            if transcript_id is not None and '_dup' in transcript_id:
                dup_count += 1
                continue

            good_count += 1
            out.write(line)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)

    if not quiet:
        sys.stderr.write('Kept %s transcript/exon annotations\n' % good_count)
        sys.stderr.write('Removed %s duplicate transcript/exon annotations\n' % dup_count)

    return (good_count, dup_count)
def gtf_addreflink(gtf, reflink, out=sys.stdout, quiet=False, replace=False):
    """Write the GTF to ``out`` with refLink-derived attributes added.

    ``reflink`` is read as tab-delimited text.  Rows are indexed by their
    third column (index 2), which is compared with each GTF record's
    ``transcript_id``; the first (index 0) and seventh (index 6) columns are
    kept as the values to emit.

    Matching records are rewritten in one of two ways:
      * ``replace=False``: ``gene_name "<col 0>"; isoform_id "<col 6>";`` is
        appended to the existing attribute column.
      * ``replace=True``: the attribute column becomes ``gene_id "<col 6>";
        transcript_id "<id>"; gene_name "<col 0>"; orig_gene_id "<old gene_id>";``.

    Other records are re-emitted with only a terminating ';' enforced on the
    attributes.  Full-line comments are copied verbatim; a trailing ``# ...``
    comment is written back after the record, separated by a tab.

    Args:
        gtf: GTF input, read through ``gzip_reader``.
        reflink: refLink table, read through ``gzip_reader``.
        out: writable text stream for the result.
        quiet: suppress stderr progress messages when true.
        replace: choose the rewrite mode described above.

    If a record fails to parse, the line and a traceback go to stderr and the
    process exits with status 1.
    """
    if not quiet:
        sys.stderr.write('Reading refLink...\n')
    link_values = {}
    for line in gzip_reader(reflink):
        cols = line.rstrip().split('\t')
        link_values[cols[2]] = (cols[0], cols[6])

    if not quiet:
        sys.stderr.write('Reading GTF...\n')
    for line in gzip_reader(gtf):
        try:
            idx = line.find('#')
            if idx == 0:
                # Comment-only line: honour ``out`` rather than sys.stdout.
                out.write(line)
                continue

            comment = None
            if idx > 0:
                # The record is the text *before* the '#'.  Strip the
                # comment's newline; the record's newline is written below.
                comment = line[idx:].rstrip('\r\n')
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            gene_id = None
            for chunk in attrs.split(';'):
                chunk = chunk.strip()
                if not chunk:
                    continue
                # partition() keeps values with embedded spaces intact.
                key, _, val = chunk.partition(' ')
                val = val.strip()
                if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val
                elif key == 'gene_id':
                    gene_id = val

            if not attrs.endswith(';'):
                attrs = '%s;' % attrs

            if transcript_id in link_values:
                link_first, link_seventh = link_values[transcript_id]
                if replace:
                    attrs = ('gene_id "%s"; transcript_id "%s"; '
                             'gene_name "%s"; orig_gene_id "%s";'
                             % (link_seventh, transcript_id, link_first, gene_id))
                else:
                    attrs = '%s gene_name "%s"; isoform_id "%s";' % (
                        attrs, link_first, link_seventh)

            out.write('\t'.join([chrom, source, feature, start, end, score,
                                 strand, frame, attrs]))
            if comment:
                out.write('\t%s' % comment)
            out.write('\n')
        except Exception:
            # Narrower than a bare except: KeyboardInterrupt still propagates.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)