/
uniprot_data.py
73 lines (59 loc) · 1.7 KB
/
uniprot_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/python
import json
import os
import re
import urllib.request
from ftplib import FTP

import requests
'''
Implemented for downloading 'proteome' sequence
data from the Uniprot FTP server.
'''
# Specify the species group for which data is to be downloaded.
# The options are - Archaea, Bacteria, Eukaryota and Viruses.
species = 'Viruses'
# Specify the location where you wish to store the downloaded data.
# (species is already a str; the previous str() wrapper was redundant.)
store = './dat/' + species + '/'
def path_to_dir(store):
    """Ensure the download directory exists and return its path.

    If *store* is an empty string, fall back to a ``dat/<species>``
    folder on the user's desktop — the original code computed the home
    directory for this purpose but never used it, silently reusing
    ``./dat/`` instead.

    Parameters
    ----------
    store : str
        Target directory path; pass '' to request the desktop fallback.

    Returns
    -------
    str
        The directory path that is guaranteed to exist after the call.
        (Returning it lets callers see the fallback location; existing
        callers that ignore the return value are unaffected.)
    """
    if store == '':
        # Fallback: store the data under the user's desktop.
        home = os.path.expanduser('~')
        store = os.path.join(home, 'Desktop', 'dat', species) + '/'
    # exist_ok avoids a crash (or TOCTOU race) when the folder exists.
    os.makedirs(store, exist_ok=True)
    return store
# --- FTP download ---------------------------------------------------------
# NOTE(review): the module-level counters `i = 0` and `j = 0` that
# previously sat here were never referenced anywhere in the file and
# have been removed as dead code.
def ftp_download():
    """Download all non-DNA FASTA archives for *species* from UniProt.

    Connects anonymously to the UniProt FTP server, changes into the
    current-release reference-proteomes area, and retrieves every
    '*.fasta.gz' file (skipping '*DNA.fasta.gz') from the directory
    matching the module-level ``species`` into the ``store`` directory.

    Side effects: writes downloaded files under ``store`` and prints a
    summary count plus the final FTP working directory.
    """
    ftp_host = 'ftp.uniprot.org'
    ftp_user = 'anonymous'
    ftp_pass = ''
    ftp_path = '/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes'
    ftp = FTP(ftp_host)
    try:
        ftp.login(ftp_user, ftp_pass)
        ftp.cwd(ftp_path)
        count = 0
        # Navigate to the required directory and thereby download data.
        for entry in ftp.nlst():  # renamed from `dir`, which shadowed the builtin
            if not re.search(species, entry):
                continue
            path = ftp_path + '/' + entry
            ftp.cwd(path)
            for name in ftp.nlst():
                # Keep the protein FASTA archives; skip the DNA ones.
                if re.search('fasta.gz', name) and not re.search('DNA.fasta.gz', name):
                    final = path + '/' + name
                    # Fixed: os.path.join takes separate components, not a
                    # pre-concatenated string.
                    fullfilename = os.path.join(store, name)
                    # Fixed for Python 3: urlretrieve lives in urllib.request
                    # (bare `urllib.urlretrieve` raises AttributeError).
                    urllib.request.urlretrieve('ftp://' + ftp_host + final, fullfilename)
                    count += 1
        # Report using the configured species rather than hard-coding "viruses".
        print('Number of files downloaded for ' + species + ': ' + str(count))
        print(ftp.pwd())
    finally:
        # Always close the control connection, even if a transfer fails.
        ftp.quit()
if __name__ == '__main__':
    # Create the output directory first, then pull the proteome archives.
    path_to_dir(store)
    ftp_download()