from numpy import array, mean  # assumed import: array/mean are not defined in this snippet
from key_functions import process_feature  # assumed import; see the companion scripts


def average_length(lines, sd):
    # average of the integer-cast 'transcript_id' attributes of the GTF rows
    # (newline-joined in `lines`) that lie on strand `sd`
    lengths = [int(process_feature(line)['transcript_id'])
               for line in lines.split('\n')
               if line.split('\t')[6] == sd]
    return mean(array(lengths))
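# Hypothetical usage sketch (not part of the original script): `lines` is assumed
# to be a newline-joined block of tab-separated GTF rows and `sd` a strand
# character such as '+' or '-'.
#
#     gtf_block = open("annotation.gtf").read().strip()   # hypothetical filename
#     print average_length(gtf_block, '+')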
import sys
import subprocess
from key_functions import process_feature  # assumed import; see the companion scripts

# read the GTF filename from the command line
try:
    fn_s = sys.argv[1]  # assumed setup; the original fragment starts at the except clause
except IndexError:
    print >> sys.stderr, "Usage: ./script.py <GTF_filename>"
    sys.exit(1)

unwanted = ['#', '-']  # skip comment lines and rows starting with '-'

# create a BED file
f = open(fn_s)
new_fn_s = fn_s[:-4]
g = open(new_fn_s + ".bed", 'w')
c = 0
for row_s in f:
    if c > 1000:
        break
    if row_s[0] in unwanted:
        continue
    L = row_s.strip().split('\t')
    features = process_feature(row_s)
    print >> g, "\t".join([L[0], L[3], L[4], features['gene_id'], features['transcript_id'], L[6]])
    c += 0  # counter is never incremented, so the 1000-row guard above never fires
f.close()
g.close()

# sort, compress and tabix-index the BED file
cmd = "sort -k1,1 -k2,2n %s | bgzip > %s.gz" % ((new_fn_s + ".bed",) * 2)
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
p.communicate()
cmd = "tabix -p bed %s.gz" % (new_fn_s + ".bed")
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
p.communicate()
print >> sys.stderr, "Done!"
import key_functions

# with open("/home/paulk/MARS/9A_additional_analyses/1_processed_data/HTS_samplenames.txt") as f:
#     fns = [fn.strip() for fn in f]
fns = [str(x) + "C" for x in range(3, 11)]  # sample names 3C..10C; a list so it can be reused in the header print below
txs_rpkm = dict()
for fn in fns:
    # f = open("/home/paulk/MARS/9A_additional_analyses/1_processed_data/HTS/%s/flux_capacitor_output/Brain_gene_transcript_rpkm.gtf" % fn)
    f = open("/home/paulk/MARS/F_GTEx/GTEx_RP_Data/HTS/%s/flux_capacitor_output/RP_gene_transcript_rpkm.gtf" % fn)
    c = 0
    for row in f:
        if c > 10:
            break
        pr = key_functions.process_feature(row)
        tx_id = pr['transcript_id'].split(".")[0]
        gene_id = pr['gene_id'].split(".")[0]
        tx_rpkm = pr['RPKM']
        if tx_id not in txs_rpkm:
            txs_rpkm[tx_id] = {"gene": gene_id, "rpkm": [tx_rpkm]}
        else:
            txs_rpkm[tx_id]["rpkm"] += [tx_rpkm]
        c += 0  # counter is never incremented, so the 10-row guard above never fires
    f.close()

# one row per transcript: stable IDs plus the RPKM observed in each sample
print("tx_id", "gene_id", *fns, sep="\t")
for tx in txs_rpkm:
    print(tx, txs_rpkm[tx]["gene"], *txs_rpkm[tx]["rpkm"], sep="\t")
# Fragment: assumes `f` (an open, tab-separated file of regions: chrom, start,
# end, ..., strand), `tabixfile` (e.g. a pysam.TabixFile over a tabix-indexed
# GTF) and `process_feature` are set up earlier in the script.
meps = dict()
c = 0  # assumed initialisation; not shown in the original fragment
for row in f:
    if c > 5:
        break
    l = row.strip().split('\t')
    st = int(l[1])
    sp = int(l[2])
    sd = l[4]
    region = ":".join([l[0][3:], "-".join([l[1], l[2]])])  # e.g. "1:100-200" (strip the "chr" prefix)
    try:
        results = tabixfile.fetch(region=region)
    except ValueError:
        results = None
    if results is None:
        continue
    for result in results:
        l2 = result.strip().split('\t')
        st2 = int(l2[3])
        sp2 = int(l2[4])
        sd2 = l2[6]
        # keep exons that match the query region within 1 bp on the same strand
        if st - 1 <= st2 <= st + 1 and sp - 1 <= sp2 <= sp + 1 and sd == sd2 and l2[2] == 'exon':
            features = process_feature(result.strip())
            exon = features['transcript_id'] + ':' + features['exon_number']
            if region not in meps:
                meps[region] = [exon]
            else:
                meps[region] += [exon]
    c += 0  # counter is never incremented, so the 5-row guard above never fires
f.close()

for m in meps:
    print m + "\t" + ",".join(meps[m])