/
main.py
145 lines (127 loc) · 5.51 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Archivo principal del pipeline_ensembles
# Autor: GUISANDE DONADIO, CE
import os, glob, shutil, sys, csv
from pprint import pprint as pp
from Bio.PDB import PDBParser, PDBIO, PPBuilder
from Bio.SeqUtils import seq1
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import generic_protein# FUNCIONES Y DATOS GLOBALES
# Base directory of the pipeline: bajar_estructura changes into
# path + "/ent_files/" and then back into path.
path = ""
# When true, bajar_estructura reports files that are already on disk.
verbose = True
# NOTE(review): not referenced by any function visible in this file section;
# presumably populated elsewhere - confirm before relying on its shape.
data = {}
def cargar_txt_en_lista(file_path):
    """Read a text file and return its lines as a list of strings.

    Every newline character is removed from each line; any other
    whitespace is left untouched.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order.
    """
    with open(file_path, "r") as handle:
        return [raw_line.replace("\n", "") for raw_line in handle]
def get_uniprot_from_pdb(pdb_id, chain):
    """Look up the UniProt accession mapped to a PDB chain.

    Scans ``archivos_importantes/pdb_chain_uniprot.csv`` (relative to the
    current working directory) and returns the third column of the first
    row whose first column equals ``pdb_id`` (compared case-insensitively)
    and whose second column equals ``chain`` exactly.

    Args:
        pdb_id: PDB identifier, in any letter case.
        chain: Chain identifier (case-sensitive).

    Returns:
        The value of the third column of the first matching row, or
        ``None`` when no row matches.
    """
    # Normalise once so that lower-case ids match too, like the other
    # helpers in this file that call .upper()/.lower() on the id.
    wanted_id = pdb_id.upper()
    mapping_path = "archivos_importantes/pdb_chain_uniprot.csv"
    with open(mapping_path, "r", newline="") as openfile:
        for row in csv.reader(openfile, delimiter=","):
            # Blank lines and comment/short rows cannot hold a mapping.
            if len(row) < 3:
                continue
            if row[0].upper() == wanted_id and row[1] == chain:
                return row[2]
    return None
def get_seq_from_uniprot(uniprot):
    """Download the FASTA record of a UniProt accession and return its sequence.

    The record is fetched with ``wget`` into ``fasta/seq_<uniprot>.fasta``
    (relative to the current working directory) and parsed with
    ``SeqIO.parse``.

    Args:
        uniprot: UniProt accession used to build the URL and the file name.

    Returns:
        The sequence of the last record in the downloaded file as a ``str``,
        or ``None`` when the file does not exist or contains no FASTA record
        (for example after a failed download).
    """
    import subprocess  # local import keeps this block self-contained

    seqfile = "fasta/seq_" + uniprot + ".fasta"
    url = "https://www.uniprot.org/uniprot/" + uniprot + ".fasta"
    try:
        # Argument list instead of a shell string: the accession is never
        # interpreted by a shell.
        subprocess.run(["wget", url, "-O", seqfile], check=False)
    except OSError:
        # wget could not be started; fall through and use whatever is on disk.
        pass

    seq = None
    # The original tested the function object `os.path.isfile` (always true);
    # the file name has to be passed for the check to mean anything.
    if os.path.isfile(seqfile):
        for entry in SeqIO.parse(seqfile, "fasta"):
            seq = str(entry.seq)
    return seq
def bajar_estructura(pdb_id):
    """Download and decompress the PDB entry file for ``pdb_id`` if missing.

    Works inside ``path + "/ent_files/"`` (``path`` is the module-level
    global) and always changes back to ``path`` before returning.

    Args:
        pdb_id: PDB identifier; it is lower-cased to build file name and URL.

    Returns:
        ``"pdb<id>.ent"`` (the file name inside ``ent_files``), or the string
        ``"OBSOLETE"`` when ``wget`` finishes with status 2048.
    """
    os.chdir(path + "/ent_files/")
    code = pdb_id.lower()
    filename = "pdb" + code + ".ent"
    url = (
        "ftp://ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/"
        + code[1:3] + "/pdb" + code + ".ent.gz"
    )

    if os.path.isfile(filename):
        if verbose:
            print(">> File " + filename + " already exists")
    else:
        wget_code = os.system("wget " + url)
        # NOTE(review): 2048 looks like the raw POSIX wait status for exit
        # code 8 (wget: server issued an error response) - confirm.
        if wget_code == 2048:
            os.chdir(path)
            return "OBSOLETE"
        # Uncompress the downloaded archive in place.
        os.system("gunzip pdb" + code + ".ent.gz")

    os.chdir(path)
    return filename
def split_pdb_by_chain(pdb_id):
    """Write each chain of each model to its own PDB file and collect sequences.

    Reads ``ent_files/pdb<id>.ent`` and writes the chains into
    ``pdb_chains/<ID>/`` as ``<ID>-<model number>_<chain id>.pdb``, where the
    model number is ``model.get_id() + 1``. Files that already exist are not
    rewritten.

    Args:
        pdb_id: PDB identifier (upper-cased for output names, lower-cased for
            the input file name).

    Returns:
        A dict mapping each output file name to the concatenation of the
        sequences of the peptides that ``PPBuilder`` builds for that chain.
    """
    out_dir = "pdb_chains/" + pdb_id.upper()
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    structure = PDBParser().get_structure(
        pdb_id, "ent_files/pdb" + pdb_id.lower() + ".ent"
    )

    sequences = dict()
    for model in structure:
        for chain in model:
            outfilename = (
                pdb_id.upper() + "-" + str(model.get_id() + 1)
                + "_" + str(chain.get_id()) + ".pdb"
            )
            out_path = out_dir + "/" + outfilename
            if not os.path.isfile(out_path):
                writer = PDBIO()
                writer.set_structure(chain)
                writer.save(out_path)
            # NOTE(review): the sequence is collected for every chain, whether
            # or not its file already existed - confirm intended nesting.
            peptides = PPBuilder().build_peptides(chain)
            chain_seq = Seq("", generic_protein)
            for peptide in peptides:
                chain_seq += peptide.get_sequence()
            sequences[outfilename] = chain_seq
    return sequences
def read_expdata(path_to_pdbfile):
    """Read the experimental technique and model count from a PDB file header.

    Lines are scanned from the top. ``EXPDTA`` lines are accumulated into the
    technique string; scanning stops as soon as that string contains
    ``"X-RAY"``, after a ``NUMMDL`` line has been read, or when the first
    ``REMARK``, ``ATOM`` or ``HETATM`` record is reached.

    Records are recognised by the record name at the start of the line, so
    header text that merely contains those words (for example a ``TITLE``
    mentioning "ATOMIC RESOLUTION") no longer ends the scan early.

    Args:
        path_to_pdbfile: Path of the PDB-format file to inspect.

    Returns:
        A tuple ``(expdata, nummodel)``: the text of the ``EXPDTA`` record(s)
        (``""`` if none was seen) and the integer from ``NUMMDL`` (``1`` if
        none was seen).

    Raises:
        ValueError: If the ``NUMMDL`` value is not an integer.
    """
    expdata = ""
    nummodel = 1
    with open(path_to_pdbfile, "r") as openfile:
        for row in openfile:
            if row.startswith("EXPDTA"):
                # strip() already removes the newline; slicing with [9:-1]
                # would cut a real character from an unterminated last line.
                expdata += row[9:].strip()
                if "X-RAY" in expdata:
                    break
            elif row.startswith("NUMMDL"):
                nummodel = int(row[9:].strip())
                break
            elif row.startswith(("REMARK", "ATOM", "HETATM")):
                # Past the part of the header this function cares about.
                break
    return expdata, nummodel
def read_atom_full(pdb_file):
    """Parse the ATOM records of a PDB file into lists of basic fields.

    Only lines whose first five characters are ``"ATOM "`` are read. The
    fields are taken from the fixed columns of the PDB ATOM record; the
    remaining fields (insertion code, coordinates, occupancy, temperature
    factor, element, charge) are not extracted.

    Args:
        pdb_file: Path of the PDB-format file to read.

    Returns:
        A list with one entry per ATOM line, each of the form
        ``[serial, atom_name, alt_loc, res_name, chain_id, res_seq]`` where
        ``serial`` and ``res_seq`` are ints, ``atom_name`` is stripped of
        blanks and the other fields are the raw column text.

    Raises:
        ValueError: If the serial or residue number columns are not integers.
        IndexError: If an ATOM line is too short to hold the chain column.
    """
    return_lst = []
    with open(pdb_file, "r") as openfile:
        for row in openfile:
            if row[0:5] != "ATOM ":
                continue
            return_lst.append([
                int(row[6:11]),       # serial, columns 7-11
                row[12:16].strip(),   # atom name, columns 13-16
                # Single-character fields: the index is column - 1. The
                # previous row[16+1] / row[21+1] read the neighbouring
                # residue-name and residue-number columns instead.
                row[16],              # altLoc, column 17
                row[17:20],           # residue name, columns 18-20
                row[21],              # chain identifier, column 22
                int(row[22:26]),      # residue sequence number, columns 23-26
            ])
    return return_lst
def print_fixed(query_str, fix=100):
    """Print ``query_str`` wrapped at a fixed number of characters per line.

    Items are printed one by one; a line break is emitted after every
    ``fix``-th item and one more line break after the last item.

    Args:
        query_str: String (or other iterable) whose items are printed.
        fix: Number of items per output line.

    Returns:
        None.
    """
    for position, symbol in enumerate(query_str, start=1):
        print(symbol, end="")
        if not position % fix:
            print()
    print()
def save_alidic_to_fasta(ali_dic, outpath):
    """Write a mapping of identifiers to sequences as a FASTA file.

    Each entry becomes a ``>identifier`` header line followed by one line
    with the sequence, in the iteration order of ``ali_dic``. An existing
    file at ``outpath`` is overwritten.

    Args:
        ali_dic: Mapping from record identifier to sequence.
        outpath: Path of the FASTA file to write.
    """
    with open(outpath, "w") as openfile:
        # The original loop bound `itemsin` but used `item`, which raised
        # NameError on the first entry.
        for name, sequence in ali_dic.items():
            openfile.write(">" + str(name) + "\n")
            openfile.write(str(sequence) + "\n")
# Three-letter residue codes mapped to one-letter symbols.
# 'TER' maps to '*' and 'XAA' to 'X'.
d = {
    'CYS': 'C',
    'ASP': 'D',
    'SER': 'S',
    'GLN': 'Q',
    'LYS': 'K',
    'ILE': 'I',
    'PRO': 'P',
    'THR': 'T',
    'PHE': 'F',
    'ASN': 'N',
    'GLY': 'G',
    'HIS': 'H',
    'LEU': 'L',
    'ARG': 'R',
    'TRP': 'W',
    'TER': '*',
    'ALA': 'A',
    'VAL': 'V',
    'GLU': 'E',
    'TYR': 'Y',
    'MET': 'M',
    'XAA': 'X',
}