def make_overlap_graph(mgene_id, merged_g2t, merged_tid_lines):
    # make temporary gff file for gene
    tmp_out = open('%s.gff' % mgene_id, 'w')
    for tid in merged_g2t[mgene_id]:
        for line in merged_tid_lines[tid]:
            a = line.split('\t')
            if a[2] == 'exon':
                print >> tmp_out, line,
    tmp_out.close()

    tid_overlap_graph = nx.Graph()

    # intersect with self
    proc = subprocess.Popen('intersectBed -wo -s -a %s.gff -b %s.gff' % (mgene_id,mgene_id), shell=True, stdout=subprocess.PIPE)
    line = proc.stdout.readline()
    while line:
        a = line.split('\t')
        tid1 = gff.gtf_kv(a[8])['transcript_id']
        tid2 = gff.gtf_kv(a[17])['transcript_id']

        # ignore same and ignore different ref genes
        if tid1 != tid2:
            tid_overlap_graph.add_edge(tid1,tid2)

        line = proc.stdout.readline()
    proc.communicate()

    os.remove('%s.gff' % mgene_id)

    return tid_overlap_graph

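# Illustrative usage sketch (not from the original source): the overlap graph
# returned above is naturally consumed via its connected components, each one
# a cluster of mutually overlapping transcripts. Transcript ids here are
# hypothetical.
def _demo_overlap_components():
    import networkx as nx
    g = nx.Graph()
    g.add_edge('TCONS_1', 'TCONS_2')
    g.add_edge('TCONS_2', 'TCONS_3')
    g.add_node('TCONS_9')  # overlaps nothing; forms its own component
    for component in nx.connected_components(g):
        # one cluster of overlapping transcripts per component
        print sorted(component)
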
def hash_genes_repeats_nt(gtf_file, repeats_gff, gene_key='gene_id', add_star=True):
    gene_repeat_nt = {}

    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (gtf_file, repeats_gff), shell=True, stdout=subprocess.PIPE)
    line = p.stdout.readline()
    while line:
        a = line.split('\t')

        # get names
        gene_id = gtf_kv(a[8])[gene_key]
        rep_kv = gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # get overlap
        nt_overlap = int(a[18])

        if not gene_id in gene_repeat_nt:
            gene_repeat_nt[gene_id] = {}

        gene_repeat_nt[gene_id][(rep,fam)] = gene_repeat_nt[gene_id].get((rep,fam),0) + nt_overlap
        if add_star:
            gene_repeat_nt[gene_id][('*',fam)] = gene_repeat_nt[gene_id].get(('*',fam),0) + nt_overlap
            gene_repeat_nt[gene_id][('*','*')] = gene_repeat_nt[gene_id].get(('*','*'),0) + nt_overlap

        line = p.stdout.readline()
    p.communicate()

    return gene_repeat_nt

def hash_repeats_genes(gtf_file, repeats_gff, gene_key='gene_id', add_star=True, stranded=False):
    repeat_genes = {}

    if add_star:
        if stranded:
            repeat_genes[('*','*','+')] = set()
            repeat_genes[('*','*','-')] = set()
        else:
            repeat_genes[('*','*')] = set()

    for line in open(repeats_gff):
        a = line.split('\t')
        kv = gtf_kv(a[8])
        if stranded:
            repeat_genes[(kv['repeat'],kv['family'],'+')] = set()
            repeat_genes[(kv['repeat'],kv['family'],'-')] = set()
            if add_star:
                repeat_genes[('*',kv['family'],'+')] = set()
                repeat_genes[('*',kv['family'],'-')] = set()
        else:
            repeat_genes[(kv['repeat'],kv['family'])] = set()
            if add_star:
                repeat_genes[('*',kv['family'])] = set()

    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (gtf_file, repeats_gff), shell=True, stdout=subprocess.PIPE)
    line = p.stdout.readline()
    while line:
        a = line.split('\t')

        # get names
        gene_id = gtf_kv(a[8])[gene_key]
        rep_kv = gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # get strands
        gene_strand = a[6]
        te_strand = a[15]

        if stranded:
            if gene_strand == te_strand:
                orient = '+'
            else:
                orient = '-'

            repeat_genes[(rep,fam,orient)].add(gene_id)
            if add_star:
                repeat_genes[('*',fam,orient)].add(gene_id)
                repeat_genes[('*','*',orient)].add(gene_id)
        else:
            repeat_genes[(rep,fam)].add(gene_id)
            if add_star:
                repeat_genes[('*',fam)].add(gene_id)
                repeat_genes[('*','*')].add(gene_id)

        line = p.stdout.readline()
    p.communicate()

    return repeat_genes

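# Illustrative sketch (not from the original source): the hashing functions
# above all parse `intersectBed -wo` output, which concatenates the 9 columns
# of the A record, the 9 columns of the B record, and the overlap in bp, so
# a[8] is the A attribute string, a[17] the B attribute string, and a[18] the
# overlap. The sample line below is fabricated to show the layout.
def _demo_intersectbed_wo_columns():
    line = '\t'.join(['chr1', 'src', 'exon', '100', '200', '.', '+', '.',
                      'gene_id "G1"; transcript_id "T1";',
                      'chr1', 'rmsk', 'repeat', '150', '250', '.', '+', '.',
                      'repeat "AluY"; family "Alu";',
                      '51'])
    a = line.split('\t')
    print a[8]         # A (gene) attributes
    print a[17]        # B (repeat) attributes
    print int(a[18])   # overlapping nucleotides
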
def filter_single(ref_gtf):
    # intersect with self
    #p = subprocess.Popen('intersectBed -sorted -wo -s -a %s -b %s' % (ref_gtf, ref_gtf), shell=True, stdout=subprocess.PIPE)
    p = subprocess.Popen('intersectBed -wo -s -a %s -b %s' % (ref_gtf, ref_gtf), shell=True, stdout=subprocess.PIPE)

    # compute overlap sets
    gene_overlaps = {}
    for line in p.stdout:
        a = line.split('\t')
        gid1 = gff.gtf_kv(a[8])['gene_id']
        gid2 = gff.gtf_kv(a[17])['gene_id']
        if gid1 != gid2:
            gene_overlaps.setdefault(gid1, set()).add(gid2)
            gene_overlaps.setdefault(gid2, set()).add(gid1)
    p.communicate()

    # filter overlapping genes out
    single_gtf_fd, single_gtf_file = tempfile.mkstemp()
    single_gtf_out = open(single_gtf_file, 'w')
    for line in open(ref_gtf):
        a = line.split('\t')
        gene_id = gff.gtf_kv(a[8])['gene_id']
        if gene_id not in gene_overlaps:
            print >> single_gtf_out, line,
    single_gtf_out.close()

    return single_gtf_fd, single_gtf_file

def preprocess_anchors(anchor_gff, mode, max_anchors, anchor_is_gtf, min_length, window):
    # get lengths
    anchor_lengths = {}
    for line in open(anchor_gff):
        a = line.split('\t')
        if anchor_is_gtf:
            anchor_id = gff.gtf_kv(a[8])['transcript_id']
        else:
            anchor_id = (a[0], int(a[3]), int(a[4]))
        anchor_lengths[anchor_id] = anchor_lengths.get(anchor_id,0) + int(a[4])-int(a[3])+1

    # filter small
    if min_length != None:
        for anchor_id in anchor_lengths.keys():
            if anchor_lengths[anchor_id] < min_length:
                del anchor_lengths[anchor_id]

    # sample
    if max_anchors < len(anchor_lengths):
        anchors_chosen = set(random.sample(anchor_lengths.keys(), max_anchors))
    else:
        anchors_chosen = set(anchor_lengths.keys())

    # make new GFF
    prep_anchor_fd, prep_anchor_gff = tempfile.mkstemp()
    print >> sys.stderr, 'Opening tempfile %s for preprocessed anchors.' % prep_anchor_gff
    prep_anchor_out = open(prep_anchor_gff, 'w')
    for line in open(anchor_gff):
        a = line.split('\t')
        if anchor_is_gtf:
            anchor_id = gff.gtf_kv(a[8])['transcript_id']
        else:
            anchor_id = (a[0], int(a[3]), int(a[4]))

        if anchor_id in anchors_chosen:
            if mode == 'span':
                print >> prep_anchor_out, line,
            elif mode == 'mid':
                # standardize size
                start = int(a[3])
                end = int(a[4])
                mid = start + (end-start)/2
                a[3] = str(mid - window/2)
                a[4] = str(mid + window/2)
                a[-1] = a[-1].rstrip()
                if int(a[3]) > 0:
                    print >> prep_anchor_out, '\t'.join(a)
            else:
                print >> sys.stderr, 'Unknown mode %s' % mode
                exit(1)
    prep_anchor_out.close()

    return prep_anchor_fd, prep_anchor_gff

def hash_genes_repeats(gtf_file, repeats_gff, gene_key='gene_id', add_star=True, stranded=False):
    gene_repeats = {}

    for line in open(gtf_file):
        a = line.split('\t')
        gene_id = gtf_kv(a[8])[gene_key]
        gene_repeats[gene_id] = set()

    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (gtf_file, repeats_gff), shell=True, stdout=subprocess.PIPE)
    line = p.stdout.readline()
    while line:
        a = line.split('\t')

        # get names
        gene_id = gtf_kv(a[8])[gene_key]
        rep_kv = gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # get strands
        gene_strand = a[6]
        te_strand = a[15]

        if stranded:
            if gene_strand == te_strand:
                orient = '+'
            else:
                orient = '-'

            gene_repeats[gene_id].add((rep, fam, orient))
            if add_star:
                gene_repeats[gene_id].add(('*', fam, orient))
                gene_repeats[gene_id].add(('*', '*', orient))
        else:
            gene_repeats[gene_id].add((rep, fam))
            if add_star:
                gene_repeats[gene_id].add(('*', fam))
                gene_repeats[gene_id].add(('*', '*'))

        line = p.stdout.readline()
    p.communicate()

    return gene_repeats

def initialize_coverage(anchor_gff, mode, anchor_is_gtf, bins):
    print >> sys.stderr, 'Initializing coverage using anchor gff %s' % anchor_gff

    coverage = {}
    for line in open(anchor_gff):
        a = line.split('\t')
        chrom = a[0]
        start = int(a[3])
        end = int(a[4])

        if anchor_is_gtf:
            anchor_id = gff.gtf_kv(a[8])['transcript_id']
        else:
            anchor_id = '%s:%d-%d' % (chrom, start, end)

        if not anchor_id in coverage:
            if mode == 'span':
                coverage[anchor_id] = [0]*bins
            elif mode == 'mid':
                coverage[anchor_id] = [0]*(end-start+1)
            else:
                print >> sys.stderr, 'Unknown mode %s' % mode
                exit(1)

    print >> sys.stderr, '%d anchors found.' % len(coverage)

    return coverage

def main(): usage = "usage: %prog [options] <gtf_file>" parser = OptionParser(usage) # parser.add_option() (options, args) = parser.parse_args() gtf_file = args[0] genes = {} for line in open(gtf_file): a = line.split() gene_id = a[9][1:-2] genes.setdefault(gene_id, []).append(line) for gene_id in genes: start = min([int(line.split()[3]) for line in genes[gene_id]]) end = max([int(line.split()[4]) for line in genes[gene_id]]) a = genes[gene_id][0].split("\t") kv = gff.gtf_kv(a[8]) succinct_kv = {"gene_id": kv["gene_id"]} succinct_kv["transcript_id"] = ",".join(list(set([line.split()[11][1:-2] for line in genes[gene_id]]))) d = [a[0], "gtf", "gene", str(start), str(end), ".", a[6], ".", gff.kv_gtf(succinct_kv)] print "\t".join(d)
def main(): usage = 'usage: %prog [options] <gtf> <fpkm tracking | diff>' parser = OptionParser(usage) parser.add_option('-a', dest='all_isoforms', default=False, action='store_true', help='Consider all isoforms. Default is to ignore bs ones') parser.add_option('-p', dest='pseudocount', default=0.125) parser.add_option('-r', dest='random_zeros', default=False, action='store_true', help='Randomly choose an isoform for zero FPKM genes [Default: %default]') (options,args) = parser.parse_args() if len(args) != 2: parser.error('Must provide gtf file and fpkm tracking') else: gtf_file = args[0] fpkm_file = args[1] gene_max_iso = map_genes(gtf_file, fpkm_file, options.pseudocount, options.all_isoforms, options.random_zeros) # filter gtf file for line in open(gtf_file): a = line.split('\t') kv = gff.gtf_kv(a[8]) gene_id = kv['gene_id'] tid = kv['transcript_id'] if gene_max_iso.get(gene_id,None) == tid: print line,
def hash_repeat_family():
    repeat_family = {}
    for line in open('%s/hg19.fa.out.tp.gff' % os.environ['MASK']):
        a = line.split('\t')
        kv = gtf_kv(a[8])
        repeat_family[kv['repeat']] = kv['family']
    return repeat_family

def main():
    usage = 'usage: %prog [options] -k <key> <gtf file>'
    parser = OptionParser(usage)
    parser.add_option('-k', dest='key', help='Key to extract')
    parser.add_option('-l', dest='line_too', action='store_true', default=False, help='Print the line too [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) == 1:
        if args[0] == '-':
            gtf_open = sys.stdin
        else:
            gtf_open = open(args[0])
    else:
        parser.error(usage)

    if not options.key:
        parser.error('Must provide key')
    else:
        keys = options.key.split(',')

    for line in gtf_open:
        if not line.startswith('##'):
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if options.line_too:
                key_str = '\t'.join([kv.get(key,'-') for key in keys])
                print('%s\t%s' % (key_str, line.rstrip()))
            else:
                print('\t'.join([kv.get(key,'-') for key in keys]))

def main(): usage = 'usage: %prog [options] <transcript .gff>' parser = OptionParser(usage) parser.add_option('-c', dest='cgff_file', default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf', help='Gtf file mapping transcripts to chromosomes [Default: %default]') (options,args) = parser.parse_args() if len(args) != 1: parser.error('Must provide gff file mapping features to transcripts') else: tgff_file = args[0] # get transcript information transcripts = {} for line in open(options.cgff_file): a = line.split('\t') if a[2] == 'exon': trans_id = gff.gtf_kv(a[8])['transcript_id'] if not trans_id in transcripts: transcripts[trans_id] = Transcript(trans_id,a[0],a[6]) transcripts[trans_id].add_exon(int(a[3]), int(a[4])) # process transcript features for line in open(tgff_file): feat = Feature(line) map_feature(transcripts[feat.trans_id], feat)
def main(): usage = 'usage: %prog [options] <gtf file> <cell type>' parser = OptionParser(usage) parser.add_option('-t', dest='expr_t', type='float', default=.1, help='Minimum allowed fpkm value') (options,args) = parser.parse_args() if len(args) != 2: parser.error('Must provide gtf file and cell type') else: gtf_file = args[0] cell_type = args[1] # get expression data cuff = cufflinks.fpkm_tracking() # find cell type experiment index cell_indexes = [i for i in range(len(cuff.experiments)) if cuff.experiments[i]==cell_type] if len(cell_indexes) == 0: parser.error('Cell type %s does not match any quantified experiments' % cell_type) else: cell_i = cell_indexes[0] # parser gtf file for line in open(gtf_file): a = line.split('\t') gene_id = gff.gtf_kv(a[8])['gene_id'] expr_vec = cuff.gene_expr(gene_id) if expr_vec[cell_i] > options.expr_t: print line,
def header_gff(header, seq, gff_file, options):
    header_seqs = {}
    for line in open(gff_file):
        a = line.split('\t')
        a[-1] = a[-1].rstrip()
        if (not options.exon or a[2] == 'exon') and a[0] == header:
            kv = gff.gtf_kv(a[8])
            #head_id = kv.get(options.header_key,a[8]+'_'+a[0]+':'+a[3]+'-'+a[4])
            head_id = kv.get(options.header_key,a[8])
            if options.gene_too:
                head_id += ' gene=%s' % kv.get('gene_id','')

            feat_start = int(a[3])
            feat_end = int(a[4])
            feat_seq = seq[feat_start-1:feat_end]

            if a[6] == '+':
                header_seqs[head_id] = header_seqs.get(head_id,'') + feat_seq
            else:
                header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id,'')

    for header in header_seqs:
        print '>%s' % header
        if options.split_lines:
            i = 0
            while i < len(header_seqs[header]):
                print header_seqs[header][i:i+60]
                i += 60
        else:
            print header_seqs[header]

def hash_te(te_gff_in):
    te_bp = {}
    for line in te_gff_in:
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        family = kv['family']
        length = int(a[4]) - int(a[3]) + 1

        te_bp[(rep,family)] = te_bp.get((rep,family),0) + length
        te_bp[('*',family)] = te_bp.get(('*',family),0) + length
        te_bp[('*','*')] = te_bp.get(('*','*'),0) + length

        if rep.startswith('LTR'):
            te_bp[('LTR*',family)] = te_bp.get(('LTR*',family),0) + length
        if rep.startswith('LTR12'):
            te_bp[('LTR12*',family)] = te_bp.get(('LTR12*',family),0) + length
        if rep.startswith('LTR7') and (len(rep) < 5 or rep[4].isalpha()):
            te_bp[('LTR7*',family)] = te_bp.get(('LTR7*',family),0) + length
        if rep.startswith('THE1') and len(rep) == 5:
            te_bp[('THE1*',family)] = te_bp.get(('THE1*',family),0) + length
        if rep.startswith('MER61') and len(rep) == 6:
            te_bp[('MER61*',family)] = te_bp.get(('MER61*',family),0) + length
        if rep.startswith('L1PA'):
            te_bp[('L1PA*',family)] = te_bp.get(('L1PA*',family),0) + length

    return te_bp

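# Illustrative sketch (not from the original source): the prefix rules above
# define ad hoc superfamilies; e.g. the LTR7 rule admits 'LTR7' itself and
# lettered variants like 'LTR7B', but excludes numeric extensions like
# 'LTR70'. Repeat names below are examples.
def _demo_ltr7_rule():
    for rep in ['LTR7', 'LTR7B', 'LTR70', 'LTR12C']:
        match = rep.startswith('LTR7') and (len(rep) < 5 or rep[4].isalpha())
        print rep, match  # True, True, False, False
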
def main(): usage = 'usage: %prog [options] <gtf_file>' parser = OptionParser(usage) #parser.add_option() (options, args) = parser.parse_args() gtf_file = args[0] genes = {} for line in open(gtf_file): a = line.split() gene_id = a[9][1:-2] genes.setdefault(gene_id, []).append(line) for gene_id in genes: start = min([int(line.split()[3]) for line in genes[gene_id]]) end = max([int(line.split()[4]) for line in genes[gene_id]]) a = genes[gene_id][0].split('\t') kv = gff.gtf_kv(a[8]) succinct_kv = {'gene_id': kv['gene_id']} succinct_kv['transcript_id'] = ','.join( list(set([line.split()[11][1:-2] for line in genes[gene_id]]))) d = [ a[0], 'gtf', 'gene', str(start), str(end), '.', a[6], '.', gff.kv_gtf(succinct_kv) ] print '\t'.join(d)
def gff_df(gff_file, gene_index):
    """Read GFF w/ keys into DataFrame."""
    chrms = []
    starts = []
    ends = []
    strands = []
    gtf_lists = {}

    for line in open(gff_file):
        a = line.split('\t')
        a[-1] = a[-1].rstrip()
        chrms.append(a[0])
        starts.append(int(a[3]))
        ends.append(int(a[4]))
        strands.append(a[6])
        for kv in gff.gtf_kv(a[-1]).items():
            gtf_lists.setdefault(kv[0], []).append(kv[1])

    df = pd.DataFrame({'chr': chrms, 'start': starts, 'end': ends, 'strand': strands})
    for k, kl in gtf_lists.items():
        df[k] = kl
    df.set_index(gene_index, inplace=True)

    return df

def header_gff(header, seq, gff_file, options):
    header_seqs = {}
    for line in open(gff_file):
        a = line.split('\t')
        a[-1] = a[-1].rstrip()
        if (not options.exon or a[2] == 'exon') and a[0] == header:
            try:
                kv = gff.gtf_kv(a[8])
            except:
                kv = {}
            head_id = kv.get(options.header_key,a[0]+':'+a[3]+'-'+a[4])
            #head_id = kv.get(options.header_key,a[8])
            if options.gene_too:
                head_id += ' gene=%s' % kv.get('gene_id','')

            feat_start = int(a[3])
            feat_end = int(a[4])
            feat_seq = seq[feat_start-1:feat_end]

            if a[6] == '+':
                header_seqs[head_id] = header_seqs.get(head_id,'') + feat_seq
            else:
                header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id,'')

    for header in header_seqs:
        print '>%s' % header
        if options.split_lines:
            i = 0
            while i < len(header_seqs[header]):
                print header_seqs[header][i:i+60]
                i += 60
        else:
            print header_seqs[header]

def main(): usage = 'usage: %prog [options] <gene/transcript id>' parser = OptionParser(usage) parser.add_option('-c', dest='cuff_dir', default='%s/research/common/data/lncrna'%os.environ['HOME'], help='Cufflinks output directory with .fpkm_tracking files [Default: %default]') parser.add_option('-l', dest='lnc_gtf', default='%s/research/common/data/lncrna/lnc_catalog.gtf'%os.environ['HOME'], help='lncRNA catalog gtf file [Default: %default]') parser.add_option('-t', dest='transcript_expr', default=False, action='store_true', help='Return transcript expression rather than gene [Default: %default]') (options,args) = parser.parse_args() if options.transcript_expr: cuff = cufflinks.fpkm_tracking('%s/isoforms.fpkm_tracking' % options.cuff_dir) if args[0].find('XLOC') != -1: trans_ids = set() for line in open(options.lnc_gtf): a = line.split('\t') kv = gff.gtf_kv(a[8]) if kv['gene_id'] == args[0]: trans_ids.add(kv['transcript_id']) else: trans_ids = [args[0]] for trans_id in trans_ids: print '%s:' % trans_id cuff.gene_expr_print(trans_id) else: cuff = cufflinks.fpkm_tracking('%s/genes.fpkm_tracking' % options.cuff_dir) if args[0].find('XLOC') != -1: gene_id = args[0] else: t2g = gff.t2g(options.lnc_gtf) gene_id = t2g[args[0]] cuff.gene_expr_print(gene_id)
def main(): usage = 'usage: %prog [options] <transcript .gff>' parser = OptionParser(usage) parser.add_option( '-c', dest='cgff_file', default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf', help='Gtf file mapping transcripts to chromosomes [Default: %default]') (options, args) = parser.parse_args() if len(args) != 1: parser.error('Must provide gff file mapping features to transcripts') else: tgff_file = args[0] # get transcript information transcripts = {} for line in open(options.cgff_file): a = line.split('\t') if a[2] == 'exon': trans_id = gff.gtf_kv(a[8])['transcript_id'] if not trans_id in transcripts: transcripts[trans_id] = Transcript(trans_id, a[0], a[6]) transcripts[trans_id].add_exon(int(a[3]), int(a[4])) # process transcript features for line in open(tgff_file): feat = Feature(line) map_feature(transcripts[feat.trans_id], feat)
def main(): usage = 'usage: %prog [options] <bed file>' parser = OptionParser(usage) parser.add_option('-g', dest='orig_gtf', help='The original gtf file of the TransMap\'d genes to be used to transfer gene id\'s') parser.add_option('-m', dest='merge_dist', type='int', default=30, help='Minimum distance two exons can be apart for them to be merged [Default: %default]') (options,args) = parser.parse_args() if len(args) != 1: parser.error('Must provide bed file') else: bed_file = args[0] # map transcript id's to gene id's if possible t2g = {} if options.orig_gtf: for line in open(options.orig_gtf): a = line.split('\t') kv = gff.gtf_kv(a[8]) t2g[kv['transcript_id']] = kv['gene_id'] # hash to disambiguate multi-mapping transcripts transcript_maps = {} for line in open(bed_file): a = line.split('\t') a[-1] = a[-1].rstrip() tid = a[3] gid = t2g.get(a[3],a[3]) transcript_maps[tid] = transcript_maps.get(tid,0) + 1 if transcript_maps[tid] > 1: gid += '_v%d' % transcript_maps[tid] tid += '_v%d' % transcript_maps[tid] gene_start = int(a[1]) gene_end = int(a[2]) block_sizes = [int(x) for x in a[10].split(',') if x] block_starts = [int(x) for x in a[11].split(',') if x] exon_cols = [] last_end = None exon_num = 1 for i in range(len(block_starts)): exon_start = gene_start+1+block_starts[i] exon_end = gene_start+1+block_starts[i]+block_sizes[i]-1 if last_end and last_end+options.merge_dist >= exon_start: # merge w/ last exon_cols[-1][4] = str(exon_end) else: exon_cols.append([a[0], 'TransMap', 'exon', str(exon_start), str(exon_end), '.', a[5], '.', 'gene_id "%s"; transcript_id "%s"; exon_number "%d"' % (gid,tid,exon_num)]) exon_num += 1 last_end = exon_end for cols in exon_cols: print '\t'.join(cols)
def te_target_size_bed(te_gff, ref_bed, read_len):
    # hash TE intervals by BED region
    bed_te_intervals = {}
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (ref_bed, te_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')

        bchrom = a[0]
        bstart = int(a[1])
        bend = int(a[2])
        bid = (bchrom, bstart)

        rep_kv = gff.gtf_kv(a[11])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        tstart = int(a[6])
        tend = int(a[7])

        ostart = max(bstart, tstart)
        oend = min(bend, tend)

        if not bid in bed_te_intervals:
            bed_te_intervals[bid] = {}
        bed_te_intervals[bid].setdefault((rep,fam),[]).append((ostart,oend))
        bed_te_intervals[bid].setdefault(('*',fam),[]).append((ostart,oend))
        bed_te_intervals[bid].setdefault(('*','*'),[]).append((ostart,oend))
    p.communicate()

    target_size = {}
    for bid in bed_te_intervals:
        bchrom, bstart = bid

        for te in bed_te_intervals[bid]:
            bt_intervals = bed_te_intervals[bid][te]
            bt_intervals.sort()

            # merge intervals, limited at the start by the BED region's start
            merged_intervals = [(max(bstart, bt_intervals[0][0]-read_len+1), bt_intervals[0][1])]
            for i in range(1,len(bt_intervals)):
                start1, end1 = merged_intervals[-1]
                start2, end2 = bt_intervals[i]

                if end1+1 < start2-read_len+1:
                    merged_intervals.append((start2-read_len+1,end2))
                else:
                    merged_intervals[-1] = (start1, end2)

            # sum
            target_size[te] = target_size.get(te,0) + sum([e-s+1 for (s,e) in merged_intervals])

    return target_size

def hash_genes_repeats_num(gtf_file, repeats_gff, gene_key='gene_id', add_star=True, stranded=False):
    gene_repeat_num = {}

    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (gtf_file, repeats_gff), shell=True, stdout=subprocess.PIPE)
    line = p.stdout.readline()
    while line:
        a = line.split('\t')

        # get names
        gene_id = gtf_kv(a[8])[gene_key]
        rep_kv = gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # get strands
        gene_strand = a[6]
        te_strand = a[15]

        if not gene_id in gene_repeat_num:
            gene_repeat_num[gene_id] = {}

        if stranded:
            if gene_strand == te_strand:
                orient = '+'
            else:
                orient = '-'

            gene_repeat_num[gene_id][(rep,fam,orient)] = gene_repeat_num[gene_id].get((rep,fam,orient),0) + 1
            if add_star:
                gene_repeat_num[gene_id][('*',fam,orient)] = gene_repeat_num[gene_id].get(('*',fam,orient),0) + 1
                gene_repeat_num[gene_id][('*','*',orient)] = gene_repeat_num[gene_id].get(('*','*',orient),0) + 1
        else:
            gene_repeat_num[gene_id][(rep,fam)] = gene_repeat_num[gene_id].get((rep,fam),0) + 1
            if add_star:
                gene_repeat_num[gene_id][('*',fam)] = gene_repeat_num[gene_id].get(('*',fam),0) + 1
                gene_repeat_num[gene_id][('*','*')] = gene_repeat_num[gene_id].get(('*','*'),0) + 1

        line = p.stdout.readline()
    p.communicate()

    return gene_repeat_num

def count_te_fragments(bam_file, te_gff, strand_split=False):
    # count fragments and hash multi-mappers
    num_fragments = 0
    multi_maps = {}
    paired_poll = {False:0, True:0}
    for aligned_read in pysam.Samfile(bam_file, 'rb'):
        if aligned_read.is_paired:
            num_fragments += 0.5/aligned_read.opt('NH')
        else:
            num_fragments += 1.0/aligned_read.opt('NH')

        if aligned_read.opt('NH') > 1:
            multi_maps[aligned_read.qname] = aligned_read.opt('NH')

        paired_poll[aligned_read.is_paired] += 1

    # guess paired-ness
    if paired_poll[True] > 0 and paired_poll[False] > 0:
        print >> sys.stderr, 'Paired-ness of the reads is ambiguous'
    if paired_poll[True] > paired_poll[False]:
        is_paired = True
    else:
        is_paired = False

    # hash read counts by TE family
    te_fragments = {}
    proc = subprocess.Popen('intersectBed -split -wo -bed -abam %s -b %s' % (bam_file, te_gff), shell=True, stdout=subprocess.PIPE)
    for line in proc.stdout:
        a = line.split('\t')

        te_kv = gff.gtf_kv(a[20])
        rep = te_kv['repeat']
        fam = te_kv['family']

        if is_paired:
            read_inc = 0.5/multi_maps.get(a[3],1.0)
        else:
            read_inc = 1.0/multi_maps.get(a[3],1.0)

        rep_star = '*'
        if strand_split:
            rstrand = a[5]
            tstrand = a[18]
            if rstrand == tstrand:
                rep += '+'
                rep_star += '+'
            else:
                rep += '-'
                rep_star += '-'

        te_fragments[(rep,fam)] = te_fragments.get((rep,fam),0.0) + read_inc
        te_fragments[(rep_star,fam)] = te_fragments.get((rep_star,fam),0.0) + read_inc
        te_fragments[(rep_star,'*')] = te_fragments.get((rep_star,'*'),0.0) + read_inc
    proc.communicate()

    return num_fragments, te_fragments

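# Illustrative sketch (not from the original source): multi-mapping reads are
# down-weighted by their number of alignments (the BAM NH tag), and paired
# reads count half per mate so that each fragment sums to one. Numbers below
# are fabricated.
def _demo_nh_weighting():
    nh = 4                      # read aligns to 4 places
    single_inc = 1.0 / nh       # each single-end alignment adds 0.25
    paired_inc = 0.5 / nh       # each paired-end mate alignment adds 0.125
    print single_inc * nh       # the read sums to 1.0 across alignments
    print paired_inc * nh * 2   # both mates across all alignments sum to 1.0
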
def te_target_size(te_gff, read_len):
    te_bp = {}
    active_te_intervals = {}

    for line in open(te_gff):
        a = line.split('\t')

        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        fam = kv['family']

        chrom = a[0]
        start = int(a[3])
        end = int(a[4])

        # process closed intervals
        for arep, afam in active_te_intervals.keys():
            achrom, astart, aend = active_te_intervals[(arep,afam)]

            if achrom != chrom or aend + read_len < start:
                # add
                te_bp[(arep,afam)] = te_bp.get((arep,afam),0) + aend - astart + 1 + read_len
                # close
                del active_te_intervals[(arep,afam)]

        # update/add te
        if (rep,fam) in active_te_intervals:
            achrom, astart, aend = active_te_intervals[(rep,fam)]
            active_te_intervals[(rep,fam)] = (chrom, min(astart,start), max(aend, end))
        else:
            active_te_intervals[(rep,fam)] = (chrom, start, end)

        if ('*',fam) in active_te_intervals:
            achrom, astart, aend = active_te_intervals[('*',fam)]
            active_te_intervals[('*',fam)] = (chrom, min(astart,start), max(aend, end))
        else:
            active_te_intervals[('*',fam)] = (chrom, start, end)

        if ('*','*') in active_te_intervals:
            achrom, astart, aend = active_te_intervals[('*','*')]
            active_te_intervals[('*','*')] = (chrom, min(astart,start), max(aend, end))
        else:
            active_te_intervals[('*','*')] = (chrom, start, end)

    # close remaining
    for arep, afam in active_te_intervals.keys():
        achrom, astart, aend = active_te_intervals[(arep,afam)]
        # add
        te_bp[(arep,afam)] = te_bp.get((arep,afam),0) + aend - astart + 1 + read_len

    return te_bp

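# Illustrative sketch (not from the original source): te_target_size treats a
# TE as hit by any read starting up to roughly read_len bp upstream of it, so
# intervals closer together than read_len merge into one run, and each run
# contributes its span plus a read_len extension. Fabricated example:
def _demo_te_target_size_merge():
    read_len = 50
    # two TE copies 30 bp apart: 30 <= read_len, so they merge into one run
    ivals = [(1000, 1100), (1130, 1200)]
    merged = (min(s for s, e in ivals), max(e for s, e in ivals))
    target_bp = merged[1] - merged[0] + 1 + read_len
    print target_bp  # 251: the merged 201 bp span plus the 50 bp extension
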
def count_te_fragments(bam_file, te_gff, strand_split):
    # count fragments and hash multi-mappers
    num_fragments = 0
    multi_maps = {}
    paired_poll = {False: 0, True: 0}
    for aligned_read in pysam.Samfile(bam_file, 'rb'):
        if aligned_read.is_paired:
            num_fragments += 0.5 / aligned_read.opt('NH')
        else:
            num_fragments += 1.0 / aligned_read.opt('NH')

        if aligned_read.opt('NH') > 1:
            multi_maps[aligned_read.qname] = aligned_read.opt('NH')

        paired_poll[aligned_read.is_paired] += 1

    # guess paired-ness
    if paired_poll[True] > 0 and paired_poll[False] > 0:
        print >> sys.stderr, 'Paired-ness of the reads is ambiguous'
    if paired_poll[True] > paired_poll[False]:
        is_paired = True
    else:
        is_paired = False

    # hash read counts by TE locus
    te_fragments = {}
    proc = subprocess.Popen('intersectBed -split -wo -bed -abam %s -b %s' % (bam_file, te_gff), shell=True, stdout=subprocess.PIPE)
    for line in proc.stdout:
        a = line.split('\t')

        if is_paired:
            read_inc = 0.5 / multi_maps.get(a[3], 1.0)
        else:
            read_inc = 1.0 / multi_maps.get(a[3], 1.0)

        te_chrom = a[12]
        te_start = int(a[15])
        te_kv = gff.gtf_kv(a[20])

        if strand_split:
            rstrand = a[5]
            tstrand = a[18]
            if rstrand == tstrand:
                te_key = (te_chrom, te_start, '+')
            else:
                te_key = (te_chrom, te_start, '-')
        else:
            te_key = (te_chrom, te_start)

        te_fragments[te_key] = te_fragments.get(te_key, 0.0) + read_inc
    proc.communicate()

    return num_fragments, te_fragments

def main(): usage = 'usage: %prog [options] <chain_file> <net_file> <gtf_from> <gtf_to>' parser = OptionParser(usage) #parser.add_option() (options,args) = parser.parse_args() if len(args) != 4: parser.error('Must provide chain file and two GTF files') else: chain_file = args[0] net_file = args[1] gtf_from = args[2] gtf_to = args[3] # transmap to new genome from_map_gtf_fd, from_map_gtf_file = tempfile.mkstemp() subprocess.call('chain_map.py -k gene_id -n %s %s %s > %s' % (net_file,chain_file,gtf_from,from_map_gtf_file), shell=True) # intersect w/ gtf_to homologues = {} p = subprocess.Popen('intersectBed -wo -s -a %s -b %s' % (from_map_gtf_file,gtf_to), shell=True, stdout=subprocess.PIPE) for line in p.stdout: a = line.split('\t') kv_to = gff.gtf_kv(a[17]) gid_from = a[8].split(';')[1].strip() gid_to = kv_to['gene_id'] homologues.setdefault(gid_from,set()).add(gid_to) p.communicate() # find all genes genes = set() for line in open(gtf_from): a = line.split('\t') genes.add(gff.gtf_kv(a[8])['gene_id']) # print table for g in genes: print '%s\t%s' % (g,' '.join(homologues.get(g,['-']))) os.close(from_map_gtf_fd) os.remove(from_map_gtf_file)
def intersect_gene_te(gtf_file, upstream, downstream):
    # focus on promoter
    tmp_fd, tmp_file = tempfile.mkstemp()
    gff.promoters(gtf_file, upstream, downstream, tmp_file)

    # intersect genes w/ repeats
    # hash transposon nt by gene
    gene_trans = {}
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (tmp_file,hg19_reps_gff), shell=True, stdout=subprocess.PIPE)
    line = p.stdout.readline()
    while line:
        a = line.split('\t')

        # get names
        gene = gff.gtf_kv(a[8])['transcript_id']
        rep_kv = gff.gtf_kv(a[17])
        rep = rep_kv['repeat']
        fam = rep_kv['family']

        # add nt
        if gene not in gene_trans:
            gene_trans[gene] = {}
        gene_trans[gene][(rep,fam)] = gene_trans[gene].get((rep,fam),0) + int(a[18])
        gene_trans[gene][('*',fam)] = gene_trans[gene].get(('*',fam),0) + int(a[18])
        gene_trans[gene][('*','*')] = gene_trans[gene].get(('*','*'),0) + int(a[18])

        line = p.stdout.readline()
    p.communicate()

    # create a fake family for dTE-lncRNAs
    for line in open(gtf_file):
        a = line.split('\t')
        tid = gff.gtf_kv(a[8])['transcript_id']
        if tid not in gene_trans:
            gene_trans[tid] = {('n','n'):1}

    return gene_trans

def make_te_read_fastas(te_gff, bam_file, read_tes, out_dir, stranded, max_reads):
    # open TE read fasta files
    te_fastas = {}
    for line in open(te_gff):
        a = line.split('\t')
        dfam_te = gff.gtf_kv(a[8])['dfam']
        if not (dfam_te,'fwd') in te_fastas:
            te_fastas[(dfam_te,'fwd')] = open('%s/%s_fwd.fa' % (out_dir,dfam_te), 'w')
            te_fastas[(dfam_te,'rev')] = open('%s/%s_rev.fa' % (out_dir,dfam_te), 'w')

    # initialize counters for total reads
    te_totals = {}
    for dfam_te, orient in te_fastas:
        te_totals[(dfam_te,orient)] = 0

    # print reads to fasta files
    for aligned_read in pysam.Samfile(bam_file, 'rb'):
        this_read_tes = read_tes.get(aligned_read.qname,{})

        for dfam_te in this_read_tes.keys():
            if this_read_tes[dfam_te] != None:
                (rstrand, tstrand) = this_read_tes[dfam_te]

                # only print if we match the read strand
                if (aligned_read.is_reverse and rstrand == '-') or (not aligned_read.is_reverse and rstrand == '+'):
                    # TE determines reversal
                    if tstrand == '+':
                        rseq = aligned_read.seq
                    else:
                        rseq = dna.rc(aligned_read.seq)

                    # count, and print
                    if not stranded or rstrand == tstrand:
                        te_totals[(dfam_te,'fwd')] += 1
                        if te_totals[(dfam_te,'fwd')] < max_reads:
                            print >> te_fastas[(dfam_te,'fwd')], '>%s\n%s' % (aligned_read.qname,rseq)
                    else:
                        te_totals[(dfam_te,'rev')] += 1
                        if te_totals[(dfam_te,'rev')] < max_reads:
                            print >> te_fastas[(dfam_te,'rev')], '>%s\n%s' % (aligned_read.qname,rseq)

                    # specify printed
                    this_read_tes[dfam_te] = None

    # post-process fasta files
    te_renorm = {}
    for dfam_te, orient in te_fastas:
        # close
        te_fastas[(dfam_te, orient)].close()

        # return renormalization factors
        if te_totals[(dfam_te,orient)] > 10:
            te_renorm[(dfam_te,orient)] = max(1.0, te_totals[(dfam_te,orient)]/float(max_reads))

    return te_renorm

def __init__(self, exon_gtf, promoter_length):
    a = exon_gtf.split('\t')
    a[-1] = a[-1].rstrip()

    self.gtf_kv = gff.gtf_kv(a[8])
    self.chr = a[0]
    self.strand = a[6]

    if self.strand == '+':
        self.start = max(0, int(a[3]) - promoter_length)
    else:
        self.start = int(a[4])

def read_genes(gtf_file, key_id='transcript_id'):
    genes = {}
    for line in open(gtf_file):
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        if not kv[key_id] in genes:
            genes[kv[key_id]] = Gene(a[0], a[6], kv)
        if a[2] == 'exon':
            genes[kv[key_id]].add_exon(int(a[3]), int(a[4]))
    return genes

def map_dfam_family():
    repeat_family = {}
    for line in open('%s/hg19.fa.out.tp.gff' % os.environ['MASK']):
        a = line.split('\t')
        kv = gtf_kv(a[8])
        repeat_family[kv['repeat']] = kv['family']

    dfam_family = {}
    for repeat in repeat_family:
        dfam_tes = map_rm_dfam(repeat, quiet=True)
        for dfam_te in dfam_tes:
            dfam_family[dfam_te] = repeat_family[repeat]

    return dfam_family

def map_dfam_repeat():
    repeats = set()
    for line in open('%s/hg19.fa.out.tp.gff' % os.environ['MASK']):
        a = line.split('\t')
        kv = gtf_kv(a[8])
        repeats.add(kv['repeat'])

    dfam_repeat = {}
    for repeat in repeats:
        dfam_tes = map_rm_dfam(repeat, quiet=True)
        for dfam_te in dfam_tes:
            dfam_repeat.setdefault(dfam_te, set()).add(repeat)

    return dfam_repeat

def main(): usage = "usage: %prog [options] <fpkm_tracking>" parser = OptionParser(usage) parser.add_option("-d", dest="diff_file", help="Limit to significantly differentially expressed genes") parser.add_option("-g", dest="gtf", help="GTF file of genes to display") parser.add_option("-m", dest="min_fpkm", default=0.125, help="Minimum FPKM (for logs) [Default: %default]") parser.add_option("-o", dest="out_pdf", default="cuff_heat.pdf", help="Output PDF [Default: %default]") parser.add_option("-s", dest="sample", default=1000, help="Sample genes rather than use all [Default: %default]") (options, args) = parser.parse_args() if len(args) != 1: parser.error("Must provide fpkm_tracking") else: fpkm_tracking = args[0] # load expression data cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking) # determine genes all_genes = set(cuff.genes) if options.gtf: all_genes = set() for line in open(options.gtf): a = line.split("\t") all_genes.add(gff.gtf_kv(a[8])["gene_id"]) if options.diff_file: # limit to differentially expressed genes diff_genes = find_diff(options.diff_file) all_genes &= diff_genes # sample genes to display if len(all_genes) <= options.sample: display_genes = all_genes else: display_genes = random.sample(all_genes, options.sample) # build data frame df = {"Gene": [], "FPKM": [], "Sample": []} for gene_id in display_genes: ge = cuff.gene_expr(gene_id) if not math.isnan(ge[0]): for i in range(len(cuff.experiments)): df["Gene"].append(gene_id) df["Sample"].append(cuff.experiments[i]) df["FPKM"].append(math.log(ge[i] + options.min_fpkm, 2)) # plot ggplot.plot("%s/cuff_heat.r" % os.environ["RDIR"], df, [options.out_pdf])
def map_dfam_repeat():
    repeats = set()
    for line in open('%s/hg19.fa.out.tp.gff' % os.environ['MASK']):
        a = line.split('\t')
        kv = gtf_kv(a[8])
        repeats.add(kv['repeat'])

    dfam_repeat = {}
    for repeat in repeats:
        dfam_tes = map_rm_dfam(repeat, quiet=True)
        for dfam_te in dfam_tes:
            dfam_repeat[dfam_te] = repeat

    return dfam_repeat

def measure_te(rm_file):
    repeat_bp = {}
    for line in open(rm_file):
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        family = kv['family']
        length = int(a[4]) - int(a[3]) + 1

        repeat_bp[(rep,family)] = repeat_bp.get((rep,family),0) + length
        repeat_bp[('*',family)] = repeat_bp.get(('*',family),0) + length
        repeat_bp[('*','*')] = repeat_bp.get(('*','*'),0) + length

    return repeat_bp

def main(): usage = 'usage: %prog [options] <gtf file> <bam file>' parser = OptionParser(usage) parser.add_option('-i', dest='intersect_done', default=False, action='store_true', help='intersectBed is already done [Default: %default]') parser.add_option('-o', dest='output_prefix', help='Prefix for the intersectBed intermediate file [Default: %default]') (options,args) = parser.parse_args() if len(args) != 2: parser.error('Must provide gtf file and bam file') else: gtf_file = args[0] bam_file = args[1] if options.output_prefix: ib_file = '%s_reads_genes.gff' % options.output_prefix else: ib_file = 'reads_genes.gff' if not options.intersect_done: # overlap genes w/ aligned reads p = subprocess.Popen('intersectBed -s -wo -abam -bed -a %s -b %s > %s' % (bam_file,gtf_file,ib_file), shell=True) os.waitpid(p.pid,0) # count transcriptome alignments per read read_aligns = {} for line in open(ib_file): a = line.split('\t') chrom = a[0] start = int(a[1]) read_id = a[3] read_aligns.setdefault(read_id,set()).add((chrom,start)) # hash reads by gene gene_reads = {} for line in open(ib_file): a = line.split('\t') read_id = a[3] gene_id = gff.gtf_kv(a[14])['transcript_id'] gene_reads.setdefault(gene_id,[]).append(read_id) # print gene stats for gene_id in gene_reads: align_counts = [len(read_aligns[read_id]) for read_id in gene_reads[gene_id]] multi_count = float(len([ac for ac in align_counts if ac > 1])) cols = (gene_id, len(align_counts), util.mean(align_counts), multi_count/float(len(align_counts))) print '%-15s %7d %7.2f %7.2f' % cols
def map_genes(gtf_file, fpkm_file, pseudocount=0.125, all_isoforms=False, random_zeros=False):
    # get expression data
    if fpkm_file[-5:] == '.diff':
        transcript_fpkm = diff_fpkm(fpkm_file, pseudocount)
    else:
        transcript_fpkm = cuff_fpkm(fpkm_file, pseudocount)

    # get genes
    if all_isoforms:
        g2t = gff.g2t(gtf_file)
    else:
        g2t = {}
        for line in open(gtf_file):
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if kv['transcript_type'] not in ['intron', 'prerna', 'nonsense_mediated_decay', 'retained_intron', 'non_stop_decay']:
                g2t.setdefault(kv['gene_id'], set()).add(kv['transcript_id'])

    # map gene_id's to max expression isoform
    gene_max_iso = {}
    min_fpkm = math.log(pseudocount, 2)
    for gid in g2t:
        max_fpkm_tid = None
        max_fpkm = min_fpkm
        for tid in g2t[gid]:
            if transcript_fpkm.get(tid, min_fpkm) > max_fpkm:
                max_fpkm_tid = tid
                max_fpkm = transcript_fpkm[tid]
        gene_max_iso[gid] = max_fpkm_tid

    # choose isoforms for None
    if random_zeros:
        for gid in g2t:
            if gene_max_iso[gid] is None:
                gene_max_iso[gid] = random.choice(list(g2t[gid]))

    return gene_max_iso

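# Illustrative sketch (not from the original source): the isoform selection
# above reduces to an argmax over a transcript->FPKM dict with a pseudocount
# floor, so that genes whose isoforms were never quantified map to None.
# Values and ids below are fabricated.
def _demo_max_isoform():
    import math
    pseudocount = 0.125
    min_fpkm = math.log(pseudocount, 2)
    transcript_fpkm = {'T1': 2.5, 'T2': 4.0}   # log2 FPKMs
    max_tid = None
    max_fpkm = min_fpkm
    for tid in ['T1', 'T2', 'T3']:             # T3 was never quantified
        if transcript_fpkm.get(tid, min_fpkm) > max_fpkm:
            max_tid = tid
            max_fpkm = transcript_fpkm[tid]
    print max_tid  # T2
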
def main(): usage = 'usage: %prog [options] <gene id>' parser = OptionParser(usage) parser.add_option( '-l', dest='lncrna_gtf', default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf', help='lncRNA gtf file [Default: %default]') parser.add_option( '-s', dest='span', action='store_true', default=False, help='Map the gene\'s entire span, i.e. introns too [Default: %default]' ) (options, args) = parser.parse_args() if len(args) != 1: parser.error('Must provide gene id') else: gene_id = args[0] # get human genome hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19() # get gene exon intervals gene_ivals = [] for line in open(options.lncrna_gtf): a = line.split('\t') if gff.gtf_kv(a[8])['gene_id'] == gene_id: chrom = a[0] start = int(a[3]) end = int(a[4]) # ignoring orientation at the moment gene_ivals.append(hg19[chrom][start:end]) # get hg19 msa msa = worldbase.Bio.MSA.UCSC.hg19_multiz46way() # map returned sequences back to genome name idDict = ~(msa.seqDict) # print alignments for gi in gene_ivals: for src, dest, edg in msa[gi].edges(): print repr(gi), repr(src), repr(dest), idDict[dest], edg.length()
def process_chrom(transcripts_gtf, chrom, seq, transcript_seqs, transcript_genes):
    # find chr transcripts
    for line in open(transcripts_gtf):
        a = line.split('\t')
        if a[0] == chrom:
            kv = gff.gtf_kv(a[8])
            tid = kv['transcript_id']
            gid = kv['gene_id']

            exon_start = int(a[3])
            exon_end = int(a[4])
            exon_seq = seq[exon_start-1:exon_end]

            if a[6] == '+':
                transcript_seqs[tid] = transcript_seqs.get(tid,'') + exon_seq
            else:
                transcript_seqs[tid] = dna.rc(exon_seq) + transcript_seqs.get(tid,'')

            transcript_genes[tid] = gid

def main(): usage = 'usage: %prog [options] <trans id>' parser = OptionParser(usage) parser.add_option( '-l', dest='lnc_file', default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf', help='lncRNA catalog file [Default: %default]') (options, args) = parser.parse_args() if len(args) != 1: parser.error('Must provide transcript id') else: trans_id = args[0] for line in open(options.lnc_file): a = line.split('\t') kv = gff.gtf_kv(a[8]) if kv['transcript_id'] == trans_id: print kv['gene_id'] break
def get_promoters(gtf_file, promoter_length):
    promoters = []

    gene_id = ''
    for line in open(gtf_file):
        a = line.split('\t')
        a[-1] = a[-1].rstrip()

        this_gene_id = gff.gtf_kv(a[8])['gene_id']
        if this_gene_id != gene_id:
            if gene_id:
                promoters.append(find_promoter(gene_id, exons, promoter_length))
            gene_id = this_gene_id
            exons = [line]
        else:
            exons.append(line)

    if gene_id:
        promoters.append(find_promoter(gene_id, exons, promoter_length))

    return promoters

def main(): usage = 'usage: %prog [options] <gtf file> <fpkm tracking>' parser = OptionParser(usage) #parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]') (options,args) = parser.parse_args() if len(args) != 2: parser.error(usage) else: gtf_file = args[0] fpkm_tracking_file = args[1] # get genes genes = set() for line in open(gtf_file): a = line.split('\t') genes.add(gff.gtf_kv(a[8])['gene_id']) # get expression cuff = cufflinks.fpkm_tracking(fpkm_tracking_file) log_fpkms = [] for gene_id in genes: max_fpkm = max(cuff.gene_expr(gene_id)) if max_fpkm > 0: log_fpkms.append(math.log(max_fpkm,2)) # construct R data objects fpkms_r = ro.FloatVector(log_fpkms) df = ro.DataFrame({'fpkm':fpkms_r}) # construct plot gp = ggplot2.ggplot(df) + \ ggplot2.aes_string(x='fpkm') + \ ggplot2.geom_histogram(binwidth=0.2) # save to file gtf_pre = os.path.splitext(gtf_file)[0] grdevices.pdf(file='%s_fpkmhist.pdf' % gtf_pre) gp.plot() grdevices.dev_off()
def main(): usage = 'usage: %prog [options] <gtf file>' parser = OptionParser(usage) parser.add_option( '-g', dest='greater', action='store_true', default=False, help= 'Keep genes w/ CSF value greater than the one given [Default: %default]' ) parser.add_option( '-l', dest='less', action='store_true', default=True, help= 'Keep genes w/ CSF value less than the one given [Default: %default]') parser.add_option('-t', dest='csf_t', type='float', default=100.0, help='CSF threshold [Default: %default]') (options, args) = parser.parse_args() if len(args) == 1: gtf_open = open(args[0]) else: gtf_open = sys.stdin line = gtf_open.readline() while line: a = line.split('\t') csf = float(gff.gtf_kv(a[8])['csf']) if (options.less and csf <= options.csf_t) or (options.greater and csf >= options.csf_t): print line, line = gtf_open.readline()
def gff_intervals(gff_file, gtf_key):
    chr_features = {}
    interval_map = {}

    for line in open(gff_file):
        a = line.split('\t')
        a[-1] = a[-1].rstrip()

        chrom = a[0]
        start = int(a[3])
        end = int(a[4])
        strand = a[6]

        if gtf_key:
            feature_id = gff.gtf_kv(a[8]).get(gtf_key, a[8])
        else:
            feature_id = a[8]

        chr_features.setdefault(chrom, IntervalTree()).insert_interval(Interval(start, end))
        interval_map.setdefault(chrom, {}).setdefault((start, end), []).append((feature_id, strand))

    return chr_features, interval_map

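# Illustrative sketch (not from the original source): assuming bx-python's
# IntervalTree (the insert_interval/Interval API used above), the trees built
# here would be queried with find(), which returns the intervals overlapping
# a range. Coordinates below are fabricated.
def _demo_interval_query():
    from bx.intervals.intersection import Interval, IntervalTree
    tree = IntervalTree()
    tree.insert_interval(Interval(100, 200))
    tree.insert_interval(Interval(500, 600))
    for ival in tree.find(150, 550):
        print ival.start, ival.end  # both intervals overlap [150, 550]
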
def main(): usage = 'usage: %prog [options] <gtf> <fpkm tracking | diff>' parser = OptionParser(usage) parser.add_option( '-a', dest='all_isoforms', default=False, action='store_true', help='Consider all isoforms. Default is to ignore bs ones') parser.add_option('-p', dest='pseudocount', default=0.125) parser.add_option( '-r', dest='random_zeros', default=False, action='store_true', help= 'Randomly choose an isoform for zero FPKM genes [Default: %default]') (options, args) = parser.parse_args() if len(args) != 2: parser.error('Must provide gtf file and fpkm tracking') else: gtf_file = args[0] fpkm_file = args[1] gene_max_iso = map_genes(gtf_file, fpkm_file, options.pseudocount, options.all_isoforms, options.random_zeros) # filter gtf file for line in open(gtf_file): a = line.split('\t') kv = gff.gtf_kv(a[8]) gene_id = kv['gene_id'] tid = kv['transcript_id'] if gene_max_iso.get(gene_id, None) == tid: print line,