def main():
    """Write promoter regions for a GTF file as GFF, then convert them to FASTA.

    Command line: one positional argument (the GTF file) plus options
    -d (downstream promoter length), -u (upstream promoter length) and
    -o (output file prefix).

    Produces '<prefix>.gff' via gff.promoters and '<prefix>.fa' by running
    the external 'gff2fa.py' script on that GFF file.

    Exits through parser.error (status 2) unless exactly one positional
    argument is given.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=0, help='Downstream promoter length [Default: %default]')
    parser.add_option('-u', dest='upstream', type='int', default=2000, help='Upstream promoter length [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='promoter', help='Output file prefix [Default: %default]')
    (options, args) = parser.parse_args()

    # parser.error() prints the message and exits, so no else branch is needed.
    if len(args) != 1:
        parser.error('Must provide gtf file')
    gtf_file = args[0]

    gff_path = '%s.gff' % options.output_pre
    fa_path = '%s.fa' % options.output_pre

    # presumably writes the promoter features to gff_path - verify against gff.promoters
    gff.promoters(gtf_file, options.upstream, options.downstream, gff_path)

    # Pass an argument list and redirect stdout ourselves rather than building a
    # shell string: the prefix comes from the command line, and a prefix with
    # spaces or shell metacharacters would otherwise break or alter the command.
    with open(fa_path, 'w') as fa_out:
        p = subprocess.Popen(['gff2fa.py', gff_path], stdout=fa_out)
        # Wait through the Popen object so its return code bookkeeping stays consistent.
        p.wait()
def intersect_gene_te(gtf_file, upstream, downstream):
    """Sum the nucleotides of repeat overlap in each transcript's promoter.

    Promoter features for gtf_file are written to a temporary file with
    gff.promoters, then intersected against hg19_reps_gff using
    'intersectBed -wo'.

    Args:
        gtf_file: path to the GTF file of transcripts.
        upstream: upstream promoter length, passed through to gff.promoters.
        downstream: downstream promoter length, passed through to gff.promoters.

    Returns:
        dict mapping transcript_id to a dict keyed by (repeat, family) tuples
        whose values are summed overlap counts. Each transcript with overlaps
        also carries the aggregate keys ('*', family) and ('*', '*').
        Transcripts in gtf_file with no overlap at all map to {('n', 'n'): 1}.
    """
    # Local import so this function does not depend on the file's import block.
    import os

    # mkstemp returns an open descriptor; only the path is needed here, so
    # close it immediately instead of leaking it.
    tmp_fd, tmp_file = tempfile.mkstemp()
    os.close(tmp_fd)

    # hash transposon nt by gene
    gene_trans = {}
    try:
        # focus on promoter
        gff.promoters(gtf_file, upstream, downstream, tmp_file)

        # intersect genes w/ repeats
        # universal_newlines=True yields text lines on Python 3 as well, so the
        # str split below does not fail on bytes.
        p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (tmp_file, hg19_reps_gff), shell=True, stdout=subprocess.PIPE, universal_newlines=True)
        for line in p.stdout:
            a = line.split('\t')

            # get names: column 9 holds the promoter attributes and column 18
            # the repeat attributes
            gene = gff.gtf_kv(a[8])['transcript_id']
            rep_kv = gff.gtf_kv(a[17])
            rep = rep_kv['repeat']
            fam = rep_kv['family']

            # add nt: the final column is the overlap size appended by -wo
            overlap = int(a[18])
            counts = gene_trans.setdefault(gene, {})
            for key in ((rep, fam), ('*', fam), ('*', '*')):
                counts[key] = counts.get(key, 0) + overlap

        p.stdout.close()
        p.wait()
    finally:
        # The promoter file is only an intermediate; always remove it.
        os.remove(tmp_file)

    # create a fake family for dTE-lncRNAs
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            # Skip blank and comment lines, which have no attribute column.
            if not line.strip() or line.startswith('#'):
                continue
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            if tid not in gene_trans:
                gene_trans[tid] = {('n', 'n'): 1}

    return gene_trans
def intersect_gene_te(gtf_file, upstream, downstream):
    """Tally repeat-overlap nucleotides per transcript promoter.

    NOTE(review): an identical definition of this function appears earlier in
    this file; being later, this one is the binding in effect. Consider
    removing the duplicate.

    gff.promoters writes promoter features for gtf_file to a temporary file,
    which is intersected with hg19_reps_gff via 'intersectBed -wo'.

    Args:
        gtf_file: path to the GTF file of transcripts.
        upstream: upstream promoter length, forwarded to gff.promoters.
        downstream: downstream promoter length, forwarded to gff.promoters.

    Returns:
        dict of transcript_id -> dict of (repeat, family) -> summed overlap
        count. Aggregate keys ('*', family) and ('*', '*') are added for
        every transcript with overlaps. Transcripts in gtf_file without any
        overlap are given the placeholder {('n', 'n'): 1}.
    """
    # Imported locally so the function is self-contained.
    import os

    # Only the path is used; close the descriptor mkstemp opened so it does
    # not leak.
    tmp_fd, tmp_file = tempfile.mkstemp()
    os.close(tmp_fd)

    # hash transposon nt by gene
    gene_trans = {}
    try:
        # focus on promoter
        gff.promoters(gtf_file, upstream, downstream, tmp_file)

        # intersect genes w/ repeats
        # Text mode keeps line.split('\t') valid on Python 3, where the pipe
        # would otherwise yield bytes.
        p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (tmp_file,hg19_reps_gff), shell=True, stdout=subprocess.PIPE, universal_newlines=True)
        for line in p.stdout:
            a = line.split('\t')

            # get names: a[8] is the promoter attribute column, a[17] the
            # repeat attribute column
            gene = gff.gtf_kv(a[8])['transcript_id']
            rep_kv = gff.gtf_kv(a[17])
            rep = rep_kv['repeat']
            fam = rep_kv['family']

            # add nt: a[18] is the overlap size that -wo appends
            overlap_nt = int(a[18])
            per_gene = gene_trans.setdefault(gene, {})
            for key in ((rep, fam), ('*', fam), ('*', '*')):
                per_gene[key] = per_gene.get(key, 0) + overlap_nt

        p.stdout.close()
        p.wait()
    finally:
        # Remove the intermediate promoter file even if something above failed.
        os.remove(tmp_file)

    # create a fake family for dTE-lncRNAs
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            # Blank and comment lines carry no attribute column; skip them.
            if not line.strip() or line.startswith('#'):
                continue
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            if tid not in gene_trans:
                gene_trans[tid] = {('n','n'):1}

    return gene_trans