-
Notifications
You must be signed in to change notification settings - Fork 0
/
snp_utr_filter.py
executable file
·102 lines (85 loc) · 2.93 KB
/
snp_utr_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/python
# This script reads a nucleotide FASTA file, the output of ORFFinder
# and a SNP file, and divides the SNPs into those located in the
# coding region (CDS) and those located in the UTRs.
# The output is written to snps_cds.tsv, snps_utr.tsv and snps_na.tsv
# (snps_na.tsv receives the SNPs of sequences without a known CDS).
# USAGE: ./snp_utr_filter.py some.fasta orffinder_results.tsv snps.tsv
from Bio import SeqIO
from Bio import Seq
import sys
# Read the FASTA file; returns a dict. Key = seq-identifier, value = sequence record
def sequence_reader(infile):
    """Read a FASTA file into a dictionary of sequence records.

    Args:
        infile: Path of the nucleotide FASTA file.

    Returns:
        The dictionary built by ``SeqIO.to_dict`` from the records that
        ``SeqIO.parse(handle, "fasta")`` yields, keyed by sequence identifier.
    """
    # Python 2 needs "U" for universal newlines; Python 3 text mode already
    # translates newlines and rejects the "U" flag from version 3.11 on.
    mode = "rU" if sys.version_info[0] < 3 else "r"
    # to_dict() consumes the parser completely, so the handle can be closed
    # as soon as the dictionary has been built.
    with open(infile, mode) as fasta:
        return SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
# Read the ORFPredictor output; returns a dict. Key = seq-identifier, value = list
# with two entries:
# 1. the CDS in the form that can be searched for in the sequences, i.e. its
#    reverse complement if the value in column 2 is negative
# 2. the CDS itself, exactly as given in the file
# [maybe_reverse_complement, original_sequence]
def orf_reader(infile):
    """Parse the tab-separated ORF prediction file.

    Blank lines and lines starting with ``#`` are skipped. On every other
    line column 1 is used as the sequence identifier, column 2 is read as an
    integer whose sign decides whether the CDS has to be reverse
    complemented (presumably the reading frame - confirm against the tool
    output), and column 5 is the CDS.

    Args:
        infile: Path of the tab-separated ORF file.

    Returns:
        dict: identifier -> ``[searchable_cds, original_cds]``. The first
        entry is the reverse complement of the CDS when column 2 is negative
        and the CDS itself otherwise; the second entry is always the CDS as
        found in the file.

    Raises:
        IndexError: if a data line has fewer than five columns.
        ValueError: if column 2 is not an integer.
    """
    orfs = {}
    with open(infile, "r") as handle:
        for line in handle:
            # Remove the line terminator first: if the CDS is the last
            # column it would otherwise become part of the sequence and the
            # later substring search could never match.
            line = line.rstrip("\r\n")
            if not line.strip() or line.startswith("#"):
                continue
            fields = line.split("\t")
            seq_id = fields[0]
            cds_seq = fields[4]
            if int(fields[1]) < 0:
                orfs[seq_id] = [Seq.reverse_complement(cds_seq), cds_seq]
            else:
                orfs[seq_id] = [cds_seq, cds_seq]
    return orfs
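# Locate each CDS in its original sequence (everything outside of it counts as UTR);
# returns a dict. Key = seq-identifier, value = [start, end] of the CDS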
def utr_filter(sequences, orfs):
    """Locate every CDS inside its original sequence.

    Args:
        sequences: dict mapping a sequence identifier to a record whose
            ``seq`` attribute can be converted with ``str()``.
        orfs: dict mapping a sequence identifier to a list whose first entry
            is the CDS string to search for (the layout produced by
            ``orf_reader``).

    Returns:
        dict: identifier -> ``[start, end]``, the 1-based position of the
        first and of the last base of the CDS (both inclusive). Identifiers
        without a sequence, with an empty CDS, or whose CDS does not occur
        in the sequence are left out and reported on stderr.
    """
    cds = {}
    for orf in orfs:
        if orf not in sequences:
            sys.stderr.write("WARNING: no sequence found for ORF %s\n" % orf)
            continue
        sequence = str(sequences[orf].seq)
        searchable = orfs[orf][0]
        offset = sequence.find(searchable)
        # find() returns -1 for "not found"; without this check the CDS
        # would silently be placed at the very beginning of the sequence.
        if not searchable or offset < 0:
            sys.stderr.write("WARNING: CDS of %s not found in its sequence\n" % orf)
            continue
        start = offset + 1
        # Last base of the CDS, inclusive, so that an inclusive range check
        # does not count the first base behind the CDS as coding.
        end = offset + len(searchable)
        cds[orf] = [start, end]
        # Progress output; the single-argument form prints the same text
        # under Python 2 and Python 3.
        print(orf)
        print(start)
        print(end)
    return cds
def snp_reader(infile):
    """Read the SNP file and return its lines.

    Args:
        infile: Path of the SNP file.

    Returns:
        list: every line of the file, line terminators included.
    """
    # The context manager closes the handle once all lines have been read.
    with open(infile, "r") as handle:
        return handle.readlines()
def _write_snp_rows(path, rows):
    """Write already split SNP rows to *path*, re-joined with tabs."""
    with open(path, "w") as handle:
        for row in rows:
            # The last field still carries the original line terminator, so
            # no newline is added here.
            handle.write("\t".join(row))


def snp_divider(cds, lines):
    """Sort SNP lines into CDS, UTR and unassignable SNPs and write them out.

    Every line is split on tabs; column 1 is the sequence identifier and
    column 2 the SNP position. A SNP whose identifier has an entry in *cds*
    goes to ``snps_cds.tsv`` when ``start <= position <= end`` and to
    ``snps_utr.tsv`` otherwise. SNPs of identifiers without an entry go to
    ``snps_na.tsv``. All three files are (re)created in the current working
    directory.

    Args:
        cds: dict mapping a sequence identifier to ``[start, end]``.
        lines: iterable of tab-separated SNP lines.

    Raises:
        IndexError: if a line of a known identifier has no second column.
        ValueError: if that column is not an integer.
    """
    utr_snps = []
    cds_snps = []
    na_snps = []
    for line in lines:
        line_array = line.split("\t")
        seq_id = line_array[0]
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if seq_id not in cds:
            na_snps.append(line_array)
            continue
        position = int(line_array[1])
        start = int(cds[seq_id][0])
        end = int(cds[seq_id][1])
        if start <= position <= end:
            cds_snps.append(line_array)
        else:
            utr_snps.append(line_array)
    _write_snp_rows("snps_utr.tsv", utr_snps)
    _write_snp_rows("snps_cds.tsv", cds_snps)
    _write_snp_rows("snps_na.tsv", na_snps)
def _print_usage():
    """Print the command-line help text to stdout."""
    print("#################################################################################")
    print("You need to pass 3 parameters to use this script:")
    print("1. A FASTA-file of reference nucleotide-sequences")
    print("2. The tab-seperated output of ORFFINDER, that is used to divide the SNPs")
    print("3. The SNPs.tsv that shall be divided")
    print("")
    print("Example: ./utr_filter.py nucleotide_sequences.fasta orffilter_results.tsv snps.tsv")


def main():
    """Run the complete pipeline on the three files named on the command line.

    ``sys.argv[1]`` is the FASTA file, ``sys.argv[2]`` the ORF file and
    ``sys.argv[3]`` the SNP file; additional arguments are ignored. When
    fewer than three arguments are given, or one of the files cannot be
    read, the usage text is printed and the script exits with status 1
    instead of continuing with undefined data. Any other error propagates
    with its traceback.
    """
    if len(sys.argv) < 4:
        _print_usage()
        sys.exit(1)
    try:
        sequences = sequence_reader(sys.argv[1])
        orfs = orf_reader(sys.argv[2])
        cds = utr_filter(sequences, orfs)
        snps = snp_reader(sys.argv[3])
    except (IOError, OSError) as err:
        # Only I/O problems are treated as a usage error; programming and
        # parsing errors should stay visible.
        sys.stderr.write("ERROR: %s\n" % err)
        _print_usage()
        sys.exit(1)
    snp_divider(cds, snps)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()