-
Notifications
You must be signed in to change notification settings - Fork 2
/
LongestFeature_conservation.py
100 lines (76 loc) · 3.06 KB
/
LongestFeature_conservation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#python3
#Given a gff of the longest 'feature' for every gene (5UTR, CDS, or 3UTR), get conservation data about that feature
import argparse
import os
import sys

import gffutils
import numpy as np
import pysam
def maketempgff(gff, gene):
    """Write the exon records of one gene to ``temp.gff`` in the working directory.

    The exon children of ``gene`` are looked up in the gffutils database
    expected at ``<absolute path of gff>.db``. This function does not create
    that database; it must already exist. Each exon is written as one line,
    so the resulting file holds only the exonic coordinates of that gene and
    can be intersected with conservation scores afterwards.

    Args:
        gff: Path to the GFF file whose ``.db`` database is queried.
        gene: ID of the gene (as stored in the database) whose exons are wanted.

    Raises:
        SystemExit: With exit status 1 if the database file does not exist.
    """
    db_fn = os.path.abspath(gff) + '.db'
    if not os.path.isfile(db_fn):
        # Report on stderr and exit non-zero so pipelines notice the failure.
        print('GFF db does not exist!', file=sys.stderr)
        sys.exit(1)
    db = gffutils.FeatureDB(db_fn)
    with open('temp.gff', 'w') as outfh:
        for exon in db.children(gene, featuretype='exon'):
            outfh.write(str(exon) + '\n')
def intersect(gff, tbx, mincoverage=0.5):
    """Return the median score of all tabix rows overlapping the intervals in a GFF.

    Every non-blank, non-comment line of ``gff`` contributes one interval
    (column 1 = sequence name, columns 4 and 5 = start and end). For each
    interval the overlapping rows are fetched from ``tbx`` and their BED
    ``score`` field is collected.

    Args:
        gff: Path to a tab-delimited GFF file.
        tbx: An open ``pysam.TabixFile`` over a BED file of scores.
        mincoverage: Minimum ratio of (number of scores retrieved) to (total
            interval length in nt) required to report a median. Defaults to
            0.5, the threshold that was previously hard-coded.

    Returns:
        The median of the collected scores, or ``None`` when no score was
        retrieved or the coverage requirement is not met.
    """
    coords = []  # (chrm, start, stop) per GFF line; start/stop are 1-based, inclusive
    with open(gff, 'r') as infh:
        for line in infh:
            line = line.strip()
            # Blank lines and '#' directives/comments carry no coordinates.
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            coords.append((fields[0], int(fields[3]), int(fields[4])))

    phastconsvalues = []
    totallength = 0
    for chrm, start, stop in coords:
        # GFF intervals are closed, so an interval start..stop spans stop - start + 1 nt.
        totallength += stop - start + 1
        # pysam's fetch() takes 0-based, half-open coordinates, so the 1-based
        # inclusive GFF start has to be shifted down by one to include the
        # first base of the interval.
        for row in tbx.fetch(chrm, max(start - 1, 0), stop, parser=pysam.asBed()):
            phastconsvalues.append(float(row.score))

    # Without any scores there is nothing to take a median of; np.median([])
    # would only emit a RuntimeWarning and return nan.
    if not phastconsvalues:
        return None

    # Check to see if we had scores for at least some fraction of the exonic nt.
    # NOTE(review): comparing a row count against a length in nt presumes the
    # BED file has one row per base - confirm against the input data.
    if len(phastconsvalues) >= totallength * mincoverage:
        return np.median(phastconsvalues)
    return None
def iterategff(gff, phastconsbed):
    """Compute a median conservation score per gene and write them to a table.

    Iterates through a "longest feature" GFF gene by gene. For each gene the
    exon records are written to ``temp.gff`` (via ``maketempgff``), intersected
    with the score file (via ``intersect``) and the temporary file is removed
    again. Results are written to ``<gff>.phyloPmedians.txt`` as a
    tab-delimited table with the header ``gene`` / ``medianphastconsscore``.

    A gffutils database ``<absolute path of gff>.db`` is created if it does
    not exist yet.

    Args:
        gff: Path to the longest-feature GFF file.
        phastconsbed: Path to the BED file of scores; it has to have a tabix
            index in the same directory.
    """
    scores = {}  # {gene : score}
    tbx = pysam.TabixFile(phastconsbed)
    try:
        # Make gff database
        print('Indexing gff...')
        db_fn = os.path.abspath(gff) + '.db'
        if not os.path.isfile(db_fn):
            gffutils.create_db(gff, db_fn, merge_strategy='merge', verbose=True)
        db = gffutils.FeatureDB(db_fn)
        print('Done indexing!')

        for genecount, gene in enumerate(db.features_of_type('gene'), start=1):
            if genecount % 100 == 0:
                print('Gene {0}...'.format(genecount))

            geneid = str(gene.id)
            try:
                # Make a gff that is only the exonic coordinates of this gene
                # (saved as temp.gff), then intersect it with the scores.
                maketempgff(gff, geneid)
                medphastcons = intersect('temp.gff', tbx)
            finally:
                # Never leave the scratch file behind, even if a step failed.
                if os.path.exists('temp.gff'):
                    os.remove('temp.gff')

            # Reduce IDs of the form '<prefix>:<name>.<version>' to '<name>'.
            # IDs without a ':' are used as they are (minus any '.version')
            # instead of raising IndexError.
            idparts = geneid.split(':')
            genename = idparts[1] if len(idparts) > 1 else idparts[0]
            scores[genename.split('.')[0]] = medphastcons
    finally:
        # Release the tabix file handle whether or not the loop completed.
        tbx.close()

    # The 'phyloPmedians' file name is kept unchanged so that existing
    # downstream consumers of this output keep working.
    outfile = gff + '.phyloPmedians.txt'
    with open(outfile, 'w') as outfh:
        outfh.write('\t'.join(['gene', 'medianphastconsscore']) + '\n')
        for genename, score in scores.items():
            outfh.write('\t'.join([genename, str(score)]) + '\n')
if __name__ == '__main__':
    # Command-line entry point. Both inputs are mandatory: without them the
    # run would only fail later with an opaque TypeError, so let argparse
    # reject the call up front with a usage message.
    parser = argparse.ArgumentParser()
    parser.add_argument('--gff', type=str, required=True,
                        help='Longest feature gff.')
    parser.add_argument('--phastconsbed', type=str, required=True,
                        help='Phastcons values in bed format. Must be bgzip-compressed '
                             'and have a tabix index in the same directory.')
    args = parser.parse_args()
    iterategff(args.gff, args.phastconsbed)