def average_length(lines,sd):
	"""Return the mean of the integer 'transcript_id' values of records on strand `sd`.

	lines -- one string of newline-separated, tab-delimited records; the
	         seventh column (index 6) of each record is compared with `sd`.
	sd    -- value the seventh column must equal for a record to be counted.

	Blank lines and records with fewer than seven columns are skipped, so a
	trailing newline in `lines` no longer raises IndexError.  The values are
	gathered into a real list before being handed to `array`, because a lazy
	`map` object (Python 3) is not a sequence of numbers.

	NOTE(review): the value averaged is process_feature(line)['transcript_id']
	converted with int(); given the function name, confirm that this field
	really carries a length.
	NOTE(review): `mean` and `array` are presumably numpy's; they are defined
	outside this excerpt.
	"""
	lengths = []
	for line in lines.split('\n'):
		fields = line.split('\t')
		# Too few columns (blank line, comment, truncated record): nothing to compare.
		if len(fields) <= 6:
			continue
		if fields[6] == sd:
			lengths.append(int(process_feature(line)['transcript_id']))
	return mean(array(lengths))
Example #2
0
except IndexError:
	print >> sys.stderr, "Usage: ./script.py <GTF_filename>"
	sys.exit( 1 )
	
# Rows whose first character is one of these are not converted.
unwanted = [ '#', '-' ]

# create a BED file
# Each kept row is written as six tab-separated columns:
#   L[0], L[3], L[4], gene_id, transcript_id, L[6]
# NOTE(review): L[3]/L[4] are copied unchanged from the input; BED is
# conventionally 0-based half-open while GTF is 1-based inclusive -- confirm
# that downstream consumers expect the unshifted coordinates.
new_fn_s = fn_s[:-4]  # drops the last four characters (presumably a ".gtf" suffix)
c = 0
# Context managers close both handles even when a row fails to parse.
with open( fn_s ) as f, open( new_fn_s + ".bed", 'w' ) as g:
	for row_s in f:
		# Debug row cap.  It is inert because `c` is advanced by 0 below;
		# change the increment to 1 to stop after roughly 1000 rows.
		if c > 1000: break
		# A blank line has no columns to index; skip it instead of crashing.
		if not row_s.strip(): continue
		if row_s[0] in unwanted: continue
		L = row_s.strip().split( '\t' )
		features = process_feature( row_s )
		g.write( "\t".join( [ L[0], L[3], L[4], features['gene_id'], features['transcript_id'], L[6] ] ) + "\n" )
		c += 0

# Sort the BED file by column 1, then numerically by column 2, and compress
# the sorted stream with bgzip into "<name>.bed.gz".
# The commands are given as argument lists (no shell), so a file name that
# contains spaces or shell metacharacters is passed through untouched.
bed_fn_s = new_fn_s + ".bed"
gz_fn_s = bed_fn_s + ".gz"
with open( gz_fn_s, 'wb' ) as gz_f:
	sort_p = subprocess.Popen( [ "sort", "-k1,1", "-k2,2n", bed_fn_s ], stdout=subprocess.PIPE )
	# With no file argument bgzip compresses stdin to stdout, as in "sort ... | bgzip > file".
	p = subprocess.Popen( [ "bgzip" ], stdin=sort_p.stdout, stdout=gz_f )
	# Drop our copy of the pipe so sort sees a closed pipe if bgzip exits early.
	sort_p.stdout.close()
	p.communicate()
	sort_p.wait()
# Do not go on to index a file that was not written correctly.
if sort_p.returncode != 0 or p.returncode != 0:
	sys.stderr.write( "Error: could not sort/compress %s\n" % bed_fn_s )
	sys.exit( 1 )

# Build the tabix index for the compressed BED file.
if subprocess.call( [ "tabix", "-p", "bed", gz_fn_s ] ) != 0:
	sys.stderr.write( "Error: could not index %s\n" % gz_fn_s )
	sys.exit( 1 )

sys.stderr.write( "Done!\n" )
import key_functions

#with open( "/home/paulk/MARS/9A_additional_analyses/1_processed_data/HTS_samplenames.txt" ) as f:
#	fns = [ fn.strip() for fn in	f ]

# Sample names "3C" .. "10C".  Built as a real list rather than a lazy `map`
# object: the names are iterated by the loop below and again when the header
# row is printed, and a Python 3 `map` iterator would be empty the second time.
fns = [ str( x ) + "C" for x in range( 3, 11 ) ]

# tx_id -> { "gene": gene_id, "rpkm": [ one value per file in which the transcript was seen ] }
# NOTE(review): values are appended in file order, so a transcript missing
# from one file ends up with fewer values than there are samples -- confirm
# that every file lists the same transcripts.
txs_rpkm = dict()
for fn in fns:
#	f = open( "/home/paulk/MARS/9A_additional_analyses/1_processed_data/HTS/%s/flux_capacitor_output/Brain_gene_transcript_rpkm.gtf" % fn )
	# The context manager closes the file even if a row fails to parse.
	with open( "/home/paulk/MARS/F_GTEx/GTEx_RP_Data/HTS/%s/flux_capacitor_output/RP_gene_transcript_rpkm.gtf" % fn ) as f:
		c = 0
		for row in f:
			# Debug row cap.  It is inert because `c` is advanced by 0 below;
			# change the increment to 1 to read only the first rows of each file.
			if c > 10: break
			pr = key_functions.process_feature( row )
			# Keep only the part of each identifier before the first "."
			tx_id = pr['transcript_id'].split( "." )[0]
			gene_id = pr['gene_id'].split( "." )[0]
			tx_rpkm = pr['RPKM']

			# The first sighting creates the record (and fixes its gene id);
			# later sightings only append another value.
			record = txs_rpkm.setdefault( tx_id, { "gene": gene_id, "rpkm": [] } )
			record[ "rpkm" ].append( tx_rpkm )
			c += 0
	
# Tab-separated table on stdout: a header row (fixed labels followed by the
# sample names), then one row per transcript with its gene and collected values.
header_cells = [ "tx_id", "gene_id" ] + list( fns )
print( *header_cells, sep="\t" )
for tx, entry in txs_rpkm.items():
	print( tx, entry["gene"], *entry["rpkm"], sep="\t" )
# region string ("<chrom>:<start>-<end>") -> list of "<transcript_id>:<exon_number>"
# for every fetched exon record that matches the row's coordinates and strand.
meps = dict()
for row in f:
	# Debug row cap.  It is inert because `c` is advanced by 0 at the end of
	# the loop body; change the increment to 1 to process only the first rows.
	if c > 5: break
	l = row.strip().split('\t')
	st = int(l[1])
	sp = int(l[2])
	sd = l[4]
	# l[0][3:] drops the first three characters of column 1 (presumably a "chr" prefix).
	region = ":".join([l[0][3:],"-".join([l[1],l[2]])])
	try:
		results = tabixfile.fetch(region=region)
	except ValueError:
		# The index rejected this region; move on to the next row.
		continue
	if results is None: continue
	for result in results:
		l2 = result.strip().split('\t')
		st2 = int(l2[3])
		sp2 = int(l2[4])
		sd2 = l2[6]
		# Accept exon records on the same strand whose start and end are each
		# within one base of the row's start and end.
		if abs(st2 - st) <= 1 and abs(sp2 - sp) <= 1 and sd == sd2 and l2[2] == 'exon':
			features = process_feature(result.strip())
			exon = features['transcript_id'] + ':' + features['exon_number']
			# Accumulate per region.  The earlier test looked the exon string up
			# in a dict keyed by region, so it never matched and each hit
			# replaced the region's list instead of extending it.
			meps.setdefault(region, []).append(exon)
	c += 0
f.close()

# One output line per region: the region, a tab, then its exons joined by commas.
for m, exon_ids in meps.items():
	print(m + "\t" + ",".join(exon_ids))
    if c > 5:
        break
    l = row.strip().split("\t")
    st = int(l[1])
    sp = int(l[2])
    sd = l[4]
    region = ":".join([l[0][3:], "-".join([l[1], l[2]])])
    try:
        results = tabixfile.fetch(region=region)
    except ValueError:
        results = None
    if results == None:
        continue
    for result in results:
        l2 = result.strip().split("\t")
        st2 = int(l2[3])
        sp2 = int(l2[4])
        sd2 = l2[6]
        if st - 1 <= st2 <= st + 1 and sp - 1 <= sp2 <= sp + 1 and sd == sd2 and l2[2] == "exon":
            features = process_feature(result.strip())
            exon = features["transcript_id"] + ":" + features["exon_number"]
            if exon not in meps:
                meps[region] = [exon]
            else:
                meps[region] += [exon]
    c += 0
f.close()

# One output line per region: the region, a tab, then its exons joined by commas.
for m, exon_ids in meps.items():
    print(m + "\t" + ",".join(exon_ids))