/
UTR5gfftosequence.py
90 lines (64 loc) · 3.38 KB
/
UTR5gfftosequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#Takes a gff file (e.g. from get5UTRcoords.py) and a directory of fasta files of each chromosome and retrieves sequence for each entry. get5UTRcoords.py is designed to leave start codons on each 5' UTR. Introns are removed and exons are joined together. This script is also designed to remove the last 3 nt of the last exon, which if the input gfffile came from get5UTRcoords.py, should be the start codon.
#Usage: python UTR5gfftosequence.py <gfffile> <directory_containing_chromosome_sequences> <output.fasta>
#Returns dictionary where key is sequence name and value is sequence.
import os
import sys
from Bio import SeqIO
from Bio.SeqUtils import GC
import numpy as np
import gffutils
def _spliced_utr_sequence(chrom_seq, exoncoords, strand):
    """Join the exons of one 5' UTR into a single 5'->3' sequence.

    chrom_seq  -- sequence of the chromosome the UTR lies on; must support
                  slicing, .upper() and .reverse_complement() (e.g. a Biopython Seq).
    exoncoords -- [start, stop] pairs sorted by ascending start. They are
                  treated as 1-based inclusive coordinates.
    strand     -- '+' or '-'. Any other value yields an empty string.

    The base at the 3' end of the 3'-most exon is left out of the result.
    NOTE(review): the file header says the last 3 nt (the start codon) are
    removed, but the slices here drop exactly one base -- confirm against the
    coordinates that get5UTRcoords.py actually writes.
    """
    sequence = ''
    last = len(exoncoords) - 1
    if strand == '+':
        for idx, (start, stop) in enumerate(exoncoords):
            if idx < last:
                sequence += chrom_seq[start - 1:stop].upper()
            else:
                #3'-most exon. To check for start codon, take three more nucleotides (change stop - 1 to stop + 2)
                sequence += chrom_seq[start - 1:stop - 1].upper()
    elif strand == '-':
        #Walk exons from highest to lowest coordinate so the result reads 5'->3' on the - strand
        for idx, (start, stop) in enumerate(reversed(exoncoords)):
            if idx < last:
                sequence += chrom_seq[start - 1:stop].upper().reverse_complement()
            else:
                #3'-most exon. To check for start codon, take three more nucleotides (change start to start - 3)
                sequence += chrom_seq[start:stop].upper().reverse_complement()
    return sequence


def gfftosequence(gff, sequencedir):
    """Retrieve the spliced sequence of every '5UTR' feature in a gff file.

    gff         -- path to the gff file; 'exon' children of each '5UTR'
                   feature are joined together (introns removed).
    sequencedir -- directory whose files are all indexed as fasta.

    Returns a dictionary where key is the feature ID and value is its sequence.

    Side effects: writes 'seqdb.idx' and '<basename of gff>.db' in the current
    working directory (the .db is reused if it already exists) and removes
    both before returning, also when an error interrupts the run.
    """
    seqdirectory = os.path.abspath(sequencedir)
    seqfiles = [os.path.join(seqdirectory, name) for name in os.listdir(seqdirectory)]
    index_fn = 'seqdb.idx'
    db_fn = os.path.basename(gff) + '.db'

    seqs = {} #dictionary where key is ID and value is sequence
    try:
        #Make index of fasta files
        print('Indexing sequences...')
        seqdb = SeqIO.index_db(index_fn, seqfiles, 'fasta')
        print('{0} sequences indexed'.format(len(seqdb)))

        #Make gff database
        if not os.path.isfile(db_fn):
            gffutils.create_db(gff, db_fn)
        db = gffutils.FeatureDB(db_fn)

        for counter, UTR in enumerate(db.features_of_type('5UTR'), 1):
            ID = UTR.attributes['ID']
            #Attribute values may come back as a list; a list cannot be a dict key
            if isinstance(ID, (list, tuple)):
                ID = ID[0]

            #Sort by start: the splicing logic relies on ascending exon order,
            #which the database query does not promise by itself
            exoncoords = sorted([exon.start, exon.stop] for exon in db.children(UTR, featuretype = 'exon'))

            if exoncoords and UTR.strand in ('+', '-'):
                #Look the chromosome up once per UTR rather than once per exon
                seqs[ID] = _spliced_utr_sequence(seqdb[UTR.chrom].seq, exoncoords, UTR.strand)
            else:
                seqs[ID] = ''

            #Report every 10th sequence up to 50, then every 50th
            step = 10 if counter <= 50 else 50
            if counter % step == 0:
                print('Retrieving sequence %i' % (counter))

        print('Retrieved {0} sequences.'.format(len(seqs)))
    finally:
        #Always clean up; a leftover seqdb.idx would be picked up by the next run
        for path in (db_fn, index_fn):
            if os.path.isfile(path):
                os.remove(path)

    return seqs
if __name__ == '__main__':
    #Arguments: <gfffile> <directory_containing_chromosome_sequences> <output.fasta>
    seqdictionary = gfftosequence(sys.argv[1], sys.argv[2])
    #'with' closes the output file even if a write fails part-way through
    with open(sys.argv[3], 'w') as outfh:
        for ID, sequence in seqdictionary.items():
            outfh.write('>' + ID + '\n' + str(sequence) + '\n') #print in fasta format