def extract_feature(self):
    """Add the D2 category shape distribution feature to the unbound proteins.

    For the ligand and the receptor of every complex in ``self._database`` a
    ``(number of residues, self.number_of_bins)`` array is built, one row per
    residue, from the residues ``NeighborSearch`` reports within
    ``self.radius`` of the residue centre.  The array is cached as
    ``<self._get_dir_name()><protein name>.npy``; when that file already
    exists it is loaded instead of being recomputed.
    """
    seed(self.seed)
    names_on_line = 0
    print_info_nn(
        " >>> Adding D2 category based shape distribution for database {0} ... ".format(
            self._database.name))
    began = datetime.now()
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        formation = self._database.complexes[complex_name].unbound_formation
        for protein in (formation.ligand, formation.receptor):
            cache_stem = self._get_dir_name() + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                names_on_line += 1
                if names_on_line > 15:
                    names_on_line = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                searcher = NeighborSearch(protein.atoms)
                computed = np.zeros(
                    (len(protein.residues), self.number_of_bins))
                for row, residue in enumerate(protein.residues):
                    close_by = searcher.search(
                        residue.center, self.radius, "R")
                    computed[row, :] = self._compute_distribution(close_by)
                # np.save appends the ".npy" suffix to the stem itself.
                np.save(cache_stem, computed)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(
                    Features.D2_CATEGORY_SHAPE_DISTRIBUTION, cached[row, :])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the D1 surface-atom shape distribution to the unbound proteins.

    For each unbound ligand/receptor the surface atoms and normals are
    obtained from the protein's PDB file via ``get_surface_atoms`` and one
    row of ``2 * (self.number_of_bins + 1)`` values is computed per residue
    with ``self.get_distributions``.  Results are cached as
    ``<self._get_dir_name()><protein name>.npy`` and re-read from there.
    """
    seed(self.seed)
    print_info_nn(
        " >>> Adding D1 surface atoms shape distribution for {0} ... ".format(
            self._database.name))
    began = datetime.now()
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        formation = self._database.complexes[complex_name].unbound_formation
        for protein in (formation.ligand, formation.receptor):
            cache_stem = self._get_dir_name() + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                print_info("{0}".format(protein.name))
                pdb_path = (self._database.directory + pdb_directory
                            + protein.name + '.pdb')
                surface, normals = get_surface_atoms(pdb_path)
                row_width = 2 * (self.number_of_bins + 1)
                computed = np.zeros((len(protein.residues), row_width))
                for row, residue in enumerate(protein.residues):
                    computed[row, :] = self.get_distributions(
                        residue.center, surface, normals)
                # np.save appends the ".npy" suffix to the stem itself.
                np.save(cache_stem, computed)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(
                    Features.D1_SURFACE_SHAPE_DISTRIBUTION, cached[row, :])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the PSAIA-derived ``PROTRUSION_INDEX`` feature to unbound proteins.

    For each unbound ligand/receptor ``run_psaia`` is executed on the
    protein's PDB file and a ``(number of residues, 28)`` array is filled
    with ``self._normalize_features(*values)`` for every residue whose id is
    present in the PSAIA result; rows of residues that are missing (or all
    rows when PSAIA returns ``None``) stay zero.  The array is cached as
    ``<dir><protein name>.npy``; columns 21 onwards of each row are stored
    on the residue as ``Features.PROTRUSION_INDEX``.
    """
    # The message previously claimed "secondary structure", a copy/paste
    # left-over; this extractor adds the protrusion index.
    print_info_nn(" >>> Adding protrusion index for database {0} ... ".format(
        self._database.name))
    overall_time = datetime.now()
    counter = 0
    if not os.path.exists(self.__get_dir_name()):
        # makedirs also creates missing parent directories.
        os.makedirs(self.__get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            protrusion_file = self.__get_dir_name() + protein.name
            if not os.path.exists(protrusion_file + ".npy"):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                pdb_file = (self._database.directory + pdb_directory
                            + protein.name + ".pdb")
                result_dict = run_psaia(pdb_file)
                protrusion_array = np.zeros(
                    (len(protein.residues), 5 + 5 + 5 + 6 + 6 + 1))
                if result_dict is not None:
                    for index, residue in enumerate(
                            protein.biopython_residues):
                        key = self.get_residue_id(residue.get_full_id())
                        if key in result_dict:
                            values = result_dict[key]
                            protrusion_array[index, :] = \
                                self._normalize_features(*values)
                        else:
                            print('key not found in PSAIA processing!')
                # np.save appends the ".npy" suffix itself.
                np.save(protrusion_file, protrusion_array)
            protrusion_array = np.load(protrusion_file + ".npy")
            for index, residue in enumerate(protein.residues):
                residue.add_feature(Features.PROTRUSION_INDEX,
                                    protrusion_array[index, 21:])
    print_info("took {0} seconds.".format(
        (datetime.now() - overall_time).seconds))
def extract_feature(self):
    """Add STRIDE-derived features to every residue of the unbound proteins.

    For each unbound ligand/receptor an ``(n, 11)`` array is built from
    ``stride_dict_from_pdb_file``: columns 0-6 hold a one-hot encoding of the
    secondary structure code, columns 7-10 hold phi, psi, ASA and relative
    ASA.  Residues absent from the STRIDE result keep an all-zero row.  The
    array is cached as ``<dir><protein name>.npy``.

    Raises:
        ValueError: if STRIDE reports a code missing from
            ``ss_abbreviations``.
    """
    code_to_column = {
        code: column for column, code in enumerate(ss_abbreviations)}
    print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(
        self._database.name))
    began = datetime.now()
    names_on_line = 0
    if not os.path.exists(self.__get_dir_name()):
        os.mkdir(self.__get_dir_name())
    for complex_name in self._database.complexes.keys():
        formation = self._database.complexes[complex_name].unbound_formation
        for protein in (formation.ligand, formation.receptor):
            cache_path = self.__get_dir_name() + protein.name + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                names_on_line += 1
                if names_on_line > 15:
                    names_on_line = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                pdb_path = (self._database.directory + pdb_directory
                            + protein.name + ".pdb")
                residue_count = len(protein.residues)
                stride_records = stride_dict_from_pdb_file(pdb_path)
                table = np.zeros((residue_count, 11))
                for row, bio_residue in enumerate(protein.biopython_residues):
                    key = self.get_residue_id(bio_residue.get_full_id())
                    if key not in stride_records:
                        continue
                    (_, code, phi, psi, asa, rasa) = stride_records[key]
                    if code not in code_to_column:
                        raise ValueError(
                            "unknown secondary structure! Add to dictionary!")
                    one_hot = np.zeros(len(code_to_column))
                    one_hot[code_to_column[code]] = 1
                    table[row, :7] = one_hot
                    table[row, 7] = phi
                    table[row, 8] = psi
                    table[row, 9] = asa
                    table[row, 10] = rasa
                np.save(cache_path, table)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.SECONDARY_STRUCTURE,
                                    cached[row, :7])
                residue.add_feature(Features.PHI, cached[row, 7])
                residue.add_feature(Features.PSI, cached[row, 8])
                residue.add_feature(Features.ACCESSIBLE_SURFACE_AREA,
                                    cached[row, 9])
                residue.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
                                    cached[row, 10])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def get_rfpp(self):
    """Compute the rank of the first positive prediction (RFPP) per complex.

    For every fold in ``self.pyml_result`` the examples are grouped by the
    complex they belong to (``self.database.example_complex`` keyed by
    ``fold.patternID``), sorted by decreasing ``fold.decisionFunc`` and
    scanned for the first example whose ``fold.Y`` and ``fold.givenY`` are
    both positive.  Each complex's rank (as a percentage of its example
    count, rounded up) and the mean over all complexes are printed.

    Returns:
        ``None`` when ``self.pyml_result`` is ``None``; otherwise a dict
        mapping complex name to ``(rank, number_of_examples)`` with a
        1-based rank.  Complexes without such an example are absent.
    """
    if self.pyml_result is None:
        return None
    # NOTE(review): the result of this call was never used; the call is kept
    # in case it has side effects on the database -- confirm and drop.
    self.database.get_pair_index_map()
    example_complex = self.database.example_complex

    rfpp = {}
    for fold in self.pyml_result:
        # First pass: number of examples each complex has in this fold, so
        # that the per-complex tables can be allocated with the right size.
        examples_per_complex = {}
        for i in range(len(fold.L)):
            name = example_complex[fold.patternID[i]]
            examples_per_complex[name] = examples_per_complex.get(name, 0) + 1

        # Second pass: one (Y, givenY, decision value) row per example.
        tables = {}
        filled = {}
        for i in range(len(fold.L)):
            name = example_complex[fold.patternID[i]]
            if name not in tables:
                tables[name] = ndarray((examples_per_complex[name], 3))
                filled[name] = 0
            tables[name][filled[name], :] = [
                int(fold.Y[i]), int(fold.givenY[i]), fold.decisionFunc[i]]
            filled[name] += 1

        for name, table in tables.items():
            # Highest decision value first.
            ranked = table[(-table[:, 2]).argsort()]
            for position in range(ranked.shape[0]):
                if ranked[position, 0] > 0 and ranked[position, 1] > 0:
                    rfpp[name] = (position + 1, ranked.shape[0])
                    break

    total_percent = 0
    for name in rfpp:
        rank, n = rfpp[name]
        # 100.0 forces true division; with Python 2 integer division the
        # value was truncated before math.ceil could round it up.
        percent = math.ceil(rank * 100.0 / n)
        total_percent += percent
        print_info("{0} : {1}".format(name, percent))
    # Previously the plain sum was reported under the "Average" label.
    average = float(total_percent) / len(rfpp) if rfpp else 0
    print_info("Average RFPP {0}".format(average))
    return rfpp
def extract_feature(self):
    """Add the ``HALF_SPHERE_EXPOSURE`` feature to the unbound proteins.

    For each residue with a side-chain vector (``self.get_side_chain_vector``)
    its neighbours (``Features.RESIDUE_NEIGHBOURHOOD``, terminated by -1) are
    split into an "up" and a "down" half depending on whether the angle
    between the side-chain direction and the vector to the neighbour's CA
    atom is below pi/2.  Per-amino-acid counts in each half are divided by
    ``1 + <neighbours in that half>`` and stacked into one row per residue.
    Residues without a side-chain vector get NaN rows.  The array is cached
    as ``<self._get_dir_name()><protein name>.npy``.
    """
    names_on_line = 0
    began = datetime.now()
    amino_acid_count = len(standard_aa_names)
    print_info_nn(" >>> Adding Half Surface Exposure ... ".format(self._database.name))
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        formation = self._database.complexes[complex_name].unbound_formation
        for protein in (formation.ligand, formation.receptor):
            cache_stem = self._get_dir_name() + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                names_on_line += 1
                if names_on_line > 15:
                    names_on_line = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                bio_residues = protein.biopython_residues
                residue_count = len(bio_residues)
                up_total = np.zeros(residue_count)
                down_total = np.zeros(residue_count)
                up_by_type = np.zeros((amino_acid_count, residue_count))
                down_by_type = np.zeros((amino_acid_count, residue_count))
                for col, bio_residue in enumerate(bio_residues):
                    side_chain = self.get_side_chain_vector(bio_residue)
                    if side_chain is None:
                        up_total[col] = np.nan
                        down_total[col] = np.nan
                        up_by_type[:, col] = np.nan
                        down_by_type[:, col] = np.nan
                        continue
                    # The residue itself is counted once in both halves.
                    own_type = self._residue_index_table[
                        bio_residue.get_resname()]
                    up_by_type[own_type, col] += 1
                    down_by_type[own_type, col] += 1
                    neighbour_ids = protein.residues[col].get_feature(
                        Features.RESIDUE_NEIGHBOURHOOD)
                    for neighbour_id in neighbour_ids:
                        if neighbour_id == -1:
                            # -1 ends the list of valid neighbour indices.
                            break
                        neighbour = bio_residues[int(neighbour_id)]
                        if not (is_aa(neighbour) and neighbour.has_id('CA')):
                            continue
                        ca_position = neighbour['CA'].get_vector()
                        neighbour_type = self._residue_index_table[
                            neighbour.get_resname()]
                        angle = side_chain[1].angle(
                            (ca_position - side_chain[0]))
                        if angle < np.pi / 2.0:
                            up_total[col] += 1
                            up_by_type[neighbour_type, col] += 1
                        else:
                            down_total[col] += 1
                            down_by_type[neighbour_type, col] += 1
                up_by_type = (up_by_type / (1.0 + up_total)).T
                down_by_type = (down_by_type / (1.0 + down_total)).T
                # np.save appends the ".npy" suffix to the stem itself.
                np.save(cache_stem, np.hstack((up_by_type, down_by_type)))
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.HALF_SPHERE_EXPOSURE,
                                    cached[row, :])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the D1 surface shape distribution to the unbound proteins.

    For every residue the residues within ``self.radius`` of its centre are
    collected with ``NeighborSearch``; those whose
    ``RELATIVE_ACCESSIBLE_SURFACE_AREA`` feature is at least
    ``self.rASA_threshold`` are passed, together with the residue itself and
    the residue's centre, to ``self._compute_distribution``.  The resulting
    ``(number of residues, self.number_of_bins + 1)`` array is cached as
    ``<self._get_dir_name()><protein name>.npy``.
    """
    seed(self.seed)
    print_info_nn(
        " >>> Adding D1 surface shape distribution for database {0} ... ".format(
            self._database.name))
    overall_time = datetime.now()
    counter = 0
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            shape_dist_file = self._get_dir_name() + protein.name
            if not os.path.exists(shape_dist_file + ".npy"):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                neighbour_search = NeighborSearch(protein.atoms)
                distributions = np.zeros(
                    (len(protein.residues), self.number_of_bins + 1))
                for i, query_residue in enumerate(protein.residues):
                    nearby_residues = [protein.biopython_residues[i]]
                    candidates = neighbour_search.search(
                        query_residue.center, self.radius, "R")
                    for candidate in candidates:
                        # One scan instead of "in" followed by ".index()".
                        try:
                            candidate_index = \
                                protein.biopython_residues.index(candidate)
                        except ValueError:
                            continue
                        exposure = protein.residues[
                            candidate_index].get_feature(
                                Features.RELATIVE_ACCESSIBLE_SURFACE_AREA)
                        if exposure >= self.rASA_threshold:
                            nearby_residues.append(candidate)
                    # The query residue keeps its own name, so its centre is
                    # used here; previously the loop above rebound the
                    # variable and the centre of the last neighbour was
                    # passed instead.
                    distributions[i, :] = self._compute_distribution(
                        nearby_residues, query_residue.center)
                # np.save appends the ".npy" suffix itself.
                np.save(shape_dist_file, distributions)
            distributions = np.load(shape_dist_file + ".npy")
            for i, residue in enumerate(protein.residues):
                residue.add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION,
                                    distributions[i, :])
    print_info("took {0} seconds.".format(
        (datetime.now() - overall_time).seconds))
def extract_feature(self):
    """Add DSSP-derived ASA, relative ASA, phi and psi to unbound residues.

    For each unbound ligand/receptor DSSP is run on the protein's PDB file
    and a ``(number of residues, 6)`` array is filled: columns 2-5 receive
    ``dssp[key][2:]`` for residues DSSP knows and stay zero otherwise;
    columns 0-1 are unused.  The array is cached as
    ``<dir><protein name>.npy`` and columns 2-5 are stored on the residues
    as ASA, relative ASA, phi and psi respectively.
    """
    print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(
        self._database.name))
    overall_time = datetime.now()
    if not os.path.exists(self.__get_dir_name()):
        # makedirs also creates missing parent directories.
        os.makedirs(self.__get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            dssp_file = self.__get_dir_name() + protein.name + ".npy"
            if not os.path.exists(dssp_file):
                print_info_nn("... running DSSP for protein " + protein.name)
                start_time = datetime.now()
                dssp = DSSP(
                    protein.structure[0],
                    self._database.directory + pdb_directory
                    + protein.name + ".pdb")
                # Zero-initialised: np.ndarray would leave columns 0-1 (and
                # any row not visited below) as uninitialised memory that
                # then gets written to the cache file.
                dssp_array = np.zeros((len(protein.residues), 6))
                for i, res in enumerate(protein.biopython_residues):
                    (_, _, cid, rid) = res.get_full_id()
                    key = (cid, rid)
                    if key in dssp:
                        dssp_array[i, 2:] = (dssp[key])[2:]
                    # Residues unknown to DSSP keep their zero row.
                np.save(dssp_file, dssp_array)
                print_info("took {0} seconds.".format(
                    (datetime.now() - start_time).seconds))
            dssp = np.load(dssp_file)
            for i, res in enumerate(protein.residues):
                res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, dssp[i, 2])
                res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
                                dssp[i, 3])
                res.add_feature(Features.PHI, dssp[i, 4])
                res.add_feature(Features.PSI, dssp[i, 5])
    print_info("took {0} seconds.".format(
        (datetime.now() - overall_time).seconds))
def __compute_profiles(self, db='nr', niter=3):
    """Add PSSM/PSFM profile features to every unbound residue.

    For each unbound ligand/receptor whose ``<pssm dir><name>.mat`` file is
    missing, the PSI-BLAST executable is run through ``os.system`` on the
    protein's FASTA file.  The matrix file is then parsed and the normalised
    PSSM, PSFM and their windowed variants are attached column by column to
    the residues.

    Args:
        db: database name passed to ``-db``.
        niter: value passed to ``-num_iterations``.
    """
    print_info_nn(" >>> Adding the profile features for dataset {0} ...".format(
        self._database.name))
    began = datetime.now()
    for complex_name in self._database.complexes.keys():
        formation = self._database.complexes[complex_name].unbound_formation
        for protein in (formation.ligand, formation.receptor):
            fasta_path = (self._database.directory
                          + unbound_sequence_directory
                          + protein.name + ".fasta")
            output_stem = (self._database.directory + pssm_directory
                           + protein.name)
            matrix_path = output_stem + ".mat"
            if not os.path.exists(matrix_path):
                print_info("... processing protein {0} ... ".format(
                    protein.name))
                template = ("cd {4} \n "
                            "{5} "
                            "-query {0} -db {1} -out {2}.psi.txt "
                            "-num_iterations {3} -out_ascii_pssm {2}.mat")
                command = template.format(
                    fasta_path, db, output_stem, niter,
                    psiblast_db_folder, psiblast_executable)
                print_info(command)
                exit_status = os.system(command)
                if exit_status != 0:
                    print_error('Failed with error code {0}'.format(
                        exit_status))
                else:
                    print_info('Successful!')
            # The third element of the parse result is not used here.
            pssm, psfm, _ = ProfileExtractor.__parse_pssm_file(matrix_path)
            windowed_pssm = ProfileExtractor.__get_wpsm(pssm)
            windowed_psfm = ProfileExtractor.__get_wpsm(psfm)
            for col, residue in enumerate(protein.residues):
                residue.add_feature(
                    Features.POSITION_SPECIFIC_SCORING_MATRIX,
                    self._normalize(pssm[:, col]))
                residue.add_feature(
                    Features.POSITION_SPECIFIC_FREQUENCY_MATRIX,
                    self._normalize(psfm[:, col]))
                residue.add_feature(
                    Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
                    self._normalize(windowed_pssm[:, col]))
                residue.add_feature(
                    Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX,
                    self._normalize(windowed_psfm[:, col]))
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def __save_sequences_to_fasta(self):
    """Write one FASTA file per unbound ligand/receptor sequence.

    Files are written to
    ``<database dir><unbound_sequence_directory><protein name>.fasta`` and
    contain a ``>name`` header line followed by the sequence.  Existing
    files are left untouched.
    """
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            fasta_file = (self._database.directory
                          + unbound_sequence_directory
                          + protein.name + ".fasta")
            if os.path.exists(fasta_file):
                continue
            print_info("... Saving sequence for protein " + protein.name)
            # The context manager closes the handle even if a write fails.
            with open(fasta_file, "w") as f:
                f.write(">{0}\n".format(protein.name))
                f.write(protein.sequence + "\n")
def __extract_examples(self):
    """Collect the positive and negative residue-pair examples of DBD4.

    In a protein complex with receptor R and ligand L, residues r on R and
    r' on L form a positive example when, in the bound form, their closest
    atoms are nearer than ``self.interaction_threshold``; every other pair
    is a negative example.  Only pairs whose bound residues both have an
    unbound counterpart are kept, because the bound and unbound formations
    may contain slightly different residues.

    Side effects: extends ``self.examples`` with
    ``(ligand index, receptor index, +1/-1)`` tuples (positives first per
    complex), records ``(start, end of positives, end)`` per complex in
    ``self.complexes_example_range`` and maps ``"<ligand>_<receptor>"`` keys
    to the complex name in ``self.example_complex``.
    """
    print_info("Finding the positive and negative examples in DBD4 ... {0}".format(
        self.positives_size))
    start_time = datetime.now()
    counter = 1
    start_index = 0
    neg_no = 0
    pos_no = 0
    for complex_name in self.complexes.keys():
        print_info_nn("{0}/{1}... processing complex {2}".format(
            counter, len(self.complexes), complex_name))
        protein_complex = self.complexes[complex_name]
        bound_ligand = protein_complex.bound_formation.ligand
        bound_receptor = protein_complex.bound_formation.receptor
        bound_ligand_bio_residues = bound_ligand.biopython_residues
        bound_receptor_bio_residues = bound_receptor.biopython_residues
        bound_ligand_residues = bound_ligand.residues
        bound_receptor_residues = bound_receptor.residues
        ligand_b2u = protein_complex.ligand_bound_to_unbound
        receptor_b2u = protein_complex.receptor_bound_to_unbound

        # Receptor atom coordinates do not depend on the ligand residue, so
        # they are gathered once per complex instead of once per pair.
        receptor_coords = [
            [atom.get_coord() for atom in bio_residue.get_list()]
            for bio_residue in bound_receptor_bio_residues
        ]

        pos = []
        neg = []
        for i, ligand_bio_residue in enumerate(bound_ligand_bio_residues):
            ligand_residue = bound_ligand_residues[i]
            if ligand_residue not in ligand_b2u:
                # No unbound counterpart: no pair with this residue is kept.
                continue
            ligand_coords = [atom.get_coord()
                             for atom in ligand_bio_residue.get_list()]
            for j in range(len(bound_receptor_bio_residues)):
                receptor_residue = bound_receptor_residues[j]
                if receptor_residue not in receptor_b2u:
                    continue
                # Index lookups stay per pair and in ligand-then-receptor
                # order, exactly as before.
                unbound_ligand_res_index = self.__get_residue_index(
                    ligand_b2u[ligand_residue])
                unbound_receptor_res_index = self.__get_residue_index(
                    receptor_b2u[receptor_residue])
                # The distance matrix is only computed for pairs that are
                # actually recorded.
                closest = cdist(ligand_coords, receptor_coords[j]).min()
                if closest < self.interaction_threshold:
                    pos.append((unbound_ligand_res_index,
                                unbound_receptor_res_index, +1))
                else:
                    neg.append((unbound_ligand_res_index,
                                unbound_receptor_res_index, -1))

        self.examples.extend(pos)
        self.examples.extend(neg)
        pos_no += len(pos)
        neg_no += len(neg)
        self.complexes_example_range[complex_name] = (
            start_index,
            start_index + len(pos),
            start_index + len(neg) + len(pos))
        print_info(" ( {0:03d}/{1:05d} ) -{2}".format(
            len(pos), len(neg), self.complexes_example_range[complex_name]))
        start_index += len(pos) + len(neg)
        counter += 1
        for e in pos + neg:
            self.example_complex["{0}_{1}".format(e[0], e[1])] = complex_name
    print_info("Finding examples in DBD4 took "
               + str((datetime.now() - start_time).seconds) + " seconds. ")
    print_info("The total number of examples found: " + str(pos_no + neg_no))
def extract_feature(self):
    """Add STRIDE-derived features to every residue of the unbound proteins.

    For each unbound ligand/receptor an ``(n, 11)`` array is built from
    ``stride_dict_from_pdb_file``: columns 0-6 hold a one-hot encoding of the
    secondary structure code, columns 7-10 hold phi, psi, ASA and relative
    ASA.  Residues absent from the STRIDE result keep an all-zero row.  The
    array is cached as ``<dir><protein name>.npy``.

    Raises:
        ValueError: if STRIDE reports a code missing from
            ``ss_abbreviations``.
    """
    code_to_column = {
        code: column for column, code in enumerate(ss_abbreviations)}
    print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(
        self._database.name))
    began = datetime.now()
    names_on_line = 0
    if not os.path.exists(self.__get_dir_name()):
        os.mkdir(self.__get_dir_name())
    for complex_name in self._database.complexes.keys():
        formation = self._database.complexes[complex_name].unbound_formation
        for protein in (formation.ligand, formation.receptor):
            cache_path = self.__get_dir_name() + protein.name + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                names_on_line += 1
                if names_on_line > 15:
                    names_on_line = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                pdb_path = (self._database.directory + pdb_directory
                            + protein.name + ".pdb")
                residue_count = len(protein.residues)
                stride_records = stride_dict_from_pdb_file(pdb_path)
                table = np.zeros((residue_count, 11))
                for row, bio_residue in enumerate(protein.biopython_residues):
                    key = self.get_residue_id(bio_residue.get_full_id())
                    if key not in stride_records:
                        continue
                    (_, code, phi, psi, asa, rasa) = stride_records[key]
                    if code not in code_to_column:
                        raise ValueError(
                            "unknown secondary structure! Add to dictionary!")
                    one_hot = np.zeros(len(code_to_column))
                    one_hot[code_to_column[code]] = 1
                    table[row, :7] = one_hot
                    table[row, 7] = phi
                    table[row, 8] = psi
                    table[row, 9] = asa
                    table[row, 10] = rasa
                np.save(cache_path, table)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.SECONDARY_STRUCTURE,
                                    cached[row, :7])
                residue.add_feature(Features.PHI, cached[row, 7])
                residue.add_feature(Features.PSI, cached[row, 8])
                residue.add_feature(Features.ACCESSIBLE_SURFACE_AREA,
                                    cached[row, 9])
                residue.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
                                    cached[row, 10])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the ``RESIDUE_DEPTH`` feature to every unbound residue.

    For each unbound ligand/receptor ``ResidueDepth`` is run on the
    protein's model and PDB file, and the two values it returns per residue
    are stored in a ``(number of residues, 2)`` array.  ``None`` entries and
    residues unknown to ``ResidueDepth`` are replaced by 0 (with a
    diagnostic message).  The array is cached as
    ``<self._get_dir_name()><protein name>.npy`` and each row is passed
    through ``self._normalize`` before being attached to the residue.
    """
    print_info_nn(" >>> Adding residue depth for database {0} ... ".format(
        self._database.name))
    overall_time = datetime.now()
    counter = 0
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            residue_depth_file = (self._get_dir_name() + protein.name
                                  + ".npy")
            if not os.path.exists(residue_depth_file):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                pdb_file = (self._database.directory + pdb_directory
                            + protein.name + ".pdb")
                rd = ResidueDepth(protein.structure[0], pdb_file)
                # Zero-initialised so that no uninitialised memory can end
                # up in the cache file.
                rd_array = np.zeros((len(protein.residues), 2))
                for i, res in enumerate(protein.biopython_residues):
                    (_, _, c, (h, rn, ic)) = res.get_full_id()
                    key = (c, (h, rn, ic))
                    if key in rd:
                        rdv = rd[key]
                        # print(...) with a single argument behaves the same
                        # as the old print statement on Python 2 and is also
                        # valid on Python 3.
                        if rdv[0] is None:
                            rdv = (0, rdv[1])
                            print("WTH?")
                        if rdv[1] is None:
                            rdv = (rdv[0], 0)
                            print("WTH?")
                        rd_array[i, :2] = rdv
                    else:
                        print_error('WTH')
                        rd_array[i, :2] = [0, 0]
                np.save(residue_depth_file, rd_array)
            surface_features = np.load(residue_depth_file)
            for i, res in enumerate(protein.residues):
                res.add_feature(Features.RESIDUE_DEPTH,
                                self._normalize(surface_features[i, :2]))
    print_info("took {0} seconds.".format(
        (datetime.now() - overall_time).seconds))
def extract_feature(self):
    """Add the plain D2 shape distribution feature to the unbound proteins.

    For the ligand and the receptor of every complex in ``self._database`` a
    ``(number of residues, self.number_of_bins)`` array is built, one row per
    residue, from the residues ``NeighborSearch`` reports within
    ``self.radius`` of the residue centre.  The array is cached as
    ``<self._get_dir_name()><protein name>.npy``; when that file already
    exists it is loaded instead of being recomputed.
    """
    seed(self.seed)
    names_on_line = 0
    began = datetime.now()
    print_info_nn(" >>> Adding D2 shape distribution for database {0} ... ".format(
        self._database.name))
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        formation = self._database.complexes[complex_name].unbound_formation
        for protein in (formation.ligand, formation.receptor):
            cache_stem = self._get_dir_name() + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                names_on_line += 1
                if names_on_line > 15:
                    names_on_line = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                searcher = NeighborSearch(protein.atoms)
                computed = np.zeros(
                    (len(protein.residues), self.number_of_bins))
                for row, residue in enumerate(protein.residues):
                    close_by = searcher.search(
                        residue.center, self.radius, "R")
                    computed[row, :] = self._compute_distribution(close_by)
                # np.save appends the ".npy" suffix to the stem itself.
                np.save(cache_stem, computed)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(
                    Features.D2_PLAIN_SHAPE_DISTRIBUTION, cached[row, :])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the ``RESIDUE_NEIGHBOURHOOD`` feature to every unbound residue.

    Two residues are neighbours when the minimum distance between their
    coordinates (``get_coordinates()``) is at most 7.5; a residue is always
    its own neighbour.  Per protein the neighbour indices are stored row by
    row in an array padded with -1 up to the longest neighbour list, cached
    as ``<self._get_dir_name()><protein name>.npy``.
    """
    counter = 0
    print_info_nn(" >>> Adding Residue Neighbourhood ... ")
    overall_time = datetime.now()
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            residue_neighbourhood_file = self._get_dir_name() + protein.name
            if not os.path.exists(residue_neighbourhood_file + ".npy"):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                # Fetch each residue's coordinates once instead of twice
                # per residue pair.
                coordinates = [residue.get_coordinates()
                               for residue in protein.residues]
                neighbourhood = []
                max_length = 0
                for query_coordinates in coordinates:
                    # The unused Gaussian "similarity" that used to be
                    # computed for every pair has been dropped.
                    neighbours = [
                        j for j, other_coordinates in enumerate(coordinates)
                        if cdist(query_coordinates,
                                 other_coordinates).min() <= 7.5
                    ]
                    neighbourhood.append(neighbours)
                    max_length = max(max_length, len(neighbours))
                # -1 marks the unused tail of every row.
                neighbourhood_array = -np.ones(
                    (len(protein.residues), max_length))
                for i, neighbours in enumerate(neighbourhood):
                    if neighbours:
                        neighbourhood_array[i, :len(neighbours)] = neighbours
                # np.save appends the ".npy" suffix itself.
                np.save(residue_neighbourhood_file, neighbourhood_array)
            neighbourhood_array = np.load(
                residue_neighbourhood_file + ".npy")
            for index, residue in enumerate(protein.residues):
                residue.add_feature(Features.RESIDUE_NEIGHBOURHOOD,
                                    neighbourhood_array[index, :])
    print_info("took {0} seconds.".format(
        (datetime.now() - overall_time).seconds))
def extract_feature(self):
    """Add DSSP-derived ASA, relative ASA, phi and psi to unbound residues.

    For each unbound ligand/receptor DSSP is run on the protein's PDB file
    and a ``(number of residues, 6)`` array is filled: columns 2-5 receive
    ``dssp[key][2:]`` for residues DSSP knows and stay zero otherwise;
    columns 0-1 are unused.  The array is cached as
    ``<dir><protein name>.npy`` and columns 2-5 are stored on the residues
    as ASA, relative ASA, phi and psi respectively.
    """
    print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(
        self._database.name))
    overall_time = datetime.now()
    if not os.path.exists(self.__get_dir_name()):
        # makedirs also creates missing parent directories.
        os.makedirs(self.__get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            dssp_file = self.__get_dir_name() + protein.name + ".npy"
            if not os.path.exists(dssp_file):
                print_info_nn("... running DSSP for protein " + protein.name)
                start_time = datetime.now()
                dssp = DSSP(
                    protein.structure[0],
                    self._database.directory + pdb_directory
                    + protein.name + ".pdb")
                # Zero-initialised: np.ndarray would leave columns 0-1 (and
                # any row not visited below) as uninitialised memory that
                # then gets written to the cache file.
                dssp_array = np.zeros((len(protein.residues), 6))
                for i, res in enumerate(protein.biopython_residues):
                    (_, _, cid, rid) = res.get_full_id()
                    key = (cid, rid)
                    if key in dssp:
                        dssp_array[i, 2:] = (dssp[key])[2:]
                    # Residues unknown to DSSP keep their zero row.
                np.save(dssp_file, dssp_array)
                print_info("took {0} seconds.".format(
                    (datetime.now() - start_time).seconds))
            dssp = np.load(dssp_file)
            for i, res in enumerate(protein.residues):
                res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, dssp[i, 2])
                res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
                                dssp[i, 3])
                res.add_feature(Features.PHI, dssp[i, 4])
                res.add_feature(Features.PSI, dssp[i, 5])
    print_info("took {0} seconds.".format(
        (datetime.now() - overall_time).seconds))
def __read_pdb_files(self):
    """Parse the PDB files of every complex and fill ``self.complexes``.

    Complexes are discovered through their ``*_l_b.pdb`` (bound ligand) file
    in the PDB directory, in sorted order; the bound receptor and the two
    unbound files are derived by replacing the ``_l_b.pdb`` suffix.
    ``PDBConstructionWarning`` is suppressed while the files are read.
    """
    print_info("Parsing the pdb files in directory {0} ....".format(
        os.path.abspath(self.directory)))
    bound_ligand_paths = sorted(
        glob.glob(self.directory + pdb_directory + "*_l_b.pdb"))
    processed = 0
    for bound_ligand_path in bound_ligand_paths:
        complex_name = basename(bound_ligand_path).replace("_l_b.pdb", "")
        bound_receptor_path = bound_ligand_path.replace(
            "_l_b.pdb", "_r_b.pdb")
        unbound_ligand_path = bound_ligand_path.replace(
            "_l_b.pdb", "_l_u.pdb")
        unbound_receptor_path = bound_ligand_path.replace(
            "_l_b.pdb", "_r_u.pdb")
        print_info("Reading complex " + complex_name)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PDBConstructionWarning)
            bound_ligand = Protein(*read_pdb_file(bound_ligand_path))
            bound_receptor = Protein(*read_pdb_file(bound_receptor_path))
            unbound_ligand = Protein(*read_pdb_file(unbound_ligand_path))
            unbound_receptor = Protein(*read_pdb_file(unbound_receptor_path))
        bound_pair = ProteinPair(bound_ligand, bound_receptor)
        unbound_pair = ProteinPair(unbound_ligand, unbound_receptor)
        self.complexes[complex_name] = ProteinComplex(
            complex_name, unbound_pair, bound_pair)
        processed += 1
    print_info("Total number of complexes processed : " + str(processed))
def extract_feature(self):
    """Add the D1 surface shape distribution to the unbound proteins.

    For every residue the residues within ``self.radius`` of its centre are
    collected with ``NeighborSearch``; those whose
    ``RELATIVE_ACCESSIBLE_SURFACE_AREA`` feature is at least
    ``self.rASA_threshold`` are passed, together with the residue itself and
    the residue's centre, to ``self._compute_distribution``.  The resulting
    ``(number of residues, self.number_of_bins + 1)`` array is cached as
    ``<self._get_dir_name()><protein name>.npy``.
    """
    seed(self.seed)
    print_info_nn(
        " >>> Adding D1 surface shape distribution for database {0} ... ".format(
            self._database.name))
    overall_time = datetime.now()
    counter = 0
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            shape_dist_file = self._get_dir_name() + protein.name
            if not os.path.exists(shape_dist_file + ".npy"):
                # Progress output: every 16th freshly computed protein is
                # reported through print_info instead of print_info_nn.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                neighbour_search = NeighborSearch(protein.atoms)
                distributions = np.zeros(
                    (len(protein.residues), self.number_of_bins + 1))
                for i, query_residue in enumerate(protein.residues):
                    nearby_residues = [protein.biopython_residues[i]]
                    candidates = neighbour_search.search(
                        query_residue.center, self.radius, "R")
                    for candidate in candidates:
                        # One scan instead of "in" followed by ".index()".
                        try:
                            candidate_index = \
                                protein.biopython_residues.index(candidate)
                        except ValueError:
                            continue
                        exposure = protein.residues[
                            candidate_index].get_feature(
                                Features.RELATIVE_ACCESSIBLE_SURFACE_AREA)
                        if exposure >= self.rASA_threshold:
                            nearby_residues.append(candidate)
                    # The query residue keeps its own name, so its centre is
                    # used here; previously the loop above rebound the
                    # variable and the centre of the last neighbour was
                    # passed instead.
                    distributions[i, :] = self._compute_distribution(
                        nearby_residues, query_residue.center)
                # np.save appends the ".npy" suffix itself.
                np.save(shape_dist_file, distributions)
            distributions = np.load(shape_dist_file + ".npy")
            for i, residue in enumerate(protein.residues):
                residue.add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION,
                                    distributions[i, :])
    print_info("took {0} seconds.".format(
        (datetime.now() - overall_time).seconds))
def _load(self, file_name=None):
    """Restore the object model from a pickle file.

    Loads ``directory``, ``complexes``, ``residues``,
    ``complexes_example_range``, ``examples`` and ``example_complex`` (in
    that order) and assigns them to ``self``.

    Args:
        file_name: path of the pickle file; defaults to
            ``self.directory + pickle_directory + dbd4_object_model_file``.
    """
    if file_name is None:
        object_model_file_name = (self.directory + pickle_directory
                                  + dbd4_object_model_file)
    else:
        object_model_file_name = file_name
    # Binary mode matches the "wb" used when saving; the context manager
    # closes the file even if unpickling fails.
    with open(object_model_file_name, "rb") as f:
        print_info_nn("Loading the object model from {0} ... ".format(
            object_model_file_name))
        start_time = datetime.now()
        # NOTE(review): unpickling executes arbitrary code; only load files
        # produced by a trusted _save().
        (self.directory, self.complexes, self.residues,
         self.complexes_example_range, self.examples,
         self.example_complex) = cPickle.load(f)
    gc.collect()
    print_info("took {0} seconds.".format(
        (datetime.now() - start_time).seconds))
def _save(self, file_name=None):
    """Pickle the object model to disk.

    Dumps the tuple ``(directory, complexes, residues,
    complexes_example_range, examples, example_complex)``.  The pickle
    directory under ``self.directory`` is created if missing, also when an
    explicit ``file_name`` is given.

    Args:
        file_name: destination path; defaults to
            ``self.directory + pickle_directory + dbd4_object_model_file``.
    """
    pickle_dir = self.directory + pickle_directory
    if not os.path.exists(pickle_dir):
        # makedirs also creates missing parent directories.
        os.makedirs(pickle_dir)
    if file_name is None:
        object_model_file_name = pickle_dir + dbd4_object_model_file
    else:
        object_model_file_name = file_name
    # The context manager closes the file even if pickling fails.
    with open(object_model_file_name, "wb") as f:
        print_info_nn("Saving the object model into {0} ... ".format(
            object_model_file_name))
        start_time = datetime.now()
        cPickle.dump((self.directory, self.complexes, self.residues,
                      self.complexes_example_range, self.examples,
                      self.example_complex), f)
    print_info("took {0} seconds.".format(
        (datetime.now() - start_time).seconds))
def _load(self, file_name=None):
    """
    Loads all the attributes of the class from a pickle file: directory, complexes, residues,
    complexes_example_range, examples and example_complex (in that order, as written by _save).

    :param file_name: path of the pickle file; when None, the default
        self.directory + pickle_directory + dbd4_object_model_file is used.
    """
    if file_name is None:
        object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
    else:
        object_model_file_name = file_name
    # NOTE: unpickling can execute arbitrary code, so only object models produced by _save should be loaded.
    # The file is read in binary mode to mirror the "wb" used by _save, and the with-block closes it even
    # when unpickling fails.
    with open(object_model_file_name, "rb") as f:
        print_info_nn("Loading the object model from {0} ... ".format(object_model_file_name))
        start_time = datetime.now()
        (self.directory, self.complexes, self.residues, self.complexes_example_range, self.examples,
         self.example_complex) = cPickle.load(f)
    gc.collect()
    print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
def _save(self, file_name=None):
    """
    Saves all the attributes of the class in pickle format: directory, complexes, residues,
    complexes_example_range, examples and example_complex are written as one tuple, in that order.

    The directory self.directory + pickle_directory is created when it does not exist yet, whether or not
    file_name is given.

    :param file_name: path of the pickle file; when None, the default
        self.directory + pickle_directory + dbd4_object_model_file is used.
    """
    if not os.path.exists(self.directory + pickle_directory):
        os.mkdir(self.directory + pickle_directory)
    if file_name is None:
        object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
    else:
        object_model_file_name = file_name
    # the with-block guarantees the file is flushed and closed even when pickling fails half way
    with open(object_model_file_name, "wb") as f:
        print_info_nn("Saving the object model into {0} ... ".format(object_model_file_name))
        start_time = datetime.now()
        cPickle.dump((self.directory, self.complexes, self.residues, self.complexes_example_range,
                      self.examples, self.example_complex), f)
    print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
def extract_feature(self):
    """
    Attaches a B factor value (Features.B_VALUE) to every residue of the unbound ligand and receptor of each
    complex in the database.

    The value of a residue is the largest B factor among its atoms. Values of a protein are cached in
    <feature directory><protein name>.npy and are only computed when that file is missing.
    """
    shown_on_line = 0
    started_at = datetime.now()
    print_info_nn(" >>> Adding B Factor ... ".format(self._database.name))
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        unbound = self._database.complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_path = self._get_dir_name() + protein.name
            if not os.path.exists(cache_path + ".npy"):
                # progress output: a new line is started after 15 names
                shown_on_line += 1
                if shown_on_line > 15:
                    shown_on_line = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                values = np.zeros(len(protein.residues))
                for position, bio_residue in enumerate(protein.biopython_residues):
                    values[position] = max(atom.get_bfactor() for atom in bio_residue)
                np.save(cache_path, values)
            # always read back from disk so cached and freshly computed values take the same path
            values = np.load(cache_path + ".npy")
            for position, residue in enumerate(protein.residues):
                residue.add_feature(Features.B_VALUE, values[position])
    print_info("took {0} seconds.".format((datetime.now() - started_at).seconds))
def __read_pdb_files(self):
    """
    Builds self.complexes from the pdb files found in self.directory + pdb_directory.

    Every file named <complex>_l_b.pdb (bound ligand) identifies one complex; the bound receptor, unbound ligand
    and unbound receptor are read from the sibling files ending in _r_b.pdb, _l_u.pdb and _r_u.pdb. Files are
    processed in sorted order and PDBConstructionWarning is silenced while they are read.
    """
    print_info("Parsing the pdb files in directory {0} ....".format(os.path.abspath(self.directory)))
    bound_ligand_paths = sorted(glob.glob(self.directory + pdb_directory + "*_l_b.pdb"))
    processed = 0
    for bound_ligand_path in bound_ligand_paths:
        complex_name = basename(bound_ligand_path).replace("_l_b.pdb", "")
        # sibling files are derived from the bound ligand path by swapping the suffix
        bound_receptor_path = bound_ligand_path.replace("_l_b.pdb", "_r_b.pdb")
        unbound_ligand_path = bound_ligand_path.replace("_l_b.pdb", "_l_u.pdb")
        unbound_receptor_path = bound_ligand_path.replace("_l_b.pdb", "_r_u.pdb")
        print_info("Reading complex " + complex_name)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PDBConstructionWarning)
            # read order: bound ligand, bound receptor, unbound ligand, unbound receptor
            bound_l, bound_r, unbound_l, unbound_r = [
                Protein(*read_pdb_file(path))
                for path in (bound_ligand_path, bound_receptor_path, unbound_ligand_path, unbound_receptor_path)
            ]
        bound_pair = ProteinPair(bound_l, bound_r)
        unbound_pair = ProteinPair(unbound_l, unbound_r)
        self.complexes[complex_name] = ProteinComplex(complex_name, unbound_pair, bound_pair)
        processed += 1
    print_info("Total number of complexes processed : " + str(processed))
def main():
    """
    Runs the SVM experiment on DBD4 once per configured feature set and hands the collected results to
    save_results together with the sample count and the feature sets.
    """
    print_info("Starting the experiment")
    started_at = datetime.now()
    seed = 1
    # number_of_samples = 5000
    number_of_samples = 20000
    dbd4 = DBD4(size=number_of_samples, ratio=-1, seed=seed)
    mtrand.seed(seed)

    # Only one combination is active; the others are kept (commented out) as a record of what was tried.
    feature_sets = [
        # [Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #  Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX],
        # [Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX, Features.D2_PLAIN_SHAPE_DISTRIBUTION],
        # [Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX, Features.D1_PLAIN_SHAPE_DISTRIBUTION],
        # [Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX, Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
        #  Features.D2_SURFACE_SHAPE_DISTRIBUTION],
        [
            Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
            Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
            Features.D1_SURFACE_SHAPE_DISTRIBUTION,
        ],
        # [Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX, Features.D2_CATEGORY_SHAPE_DISTRIBUTION],
        # [Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
        #  Features.HALF_SPHERE_EXPOSURE,
        #  Features.SECONDARY_STRUCTURE,
        #  Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX,
        #  Features.POSITION_SPECIFIC_SCORING_MATRIX,
        #  Features.POSITION_SPECIFIC_FREQUENCY_MATRIX,
        #  Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
        #  Features.D2_SURFACE_SHAPE_DISTRIBUTION,
        #  Features.RESIDUE_DEPTH],
        #  (disabled inside that set: PROTRUSION_INDEX, B_VALUE, PHI, PSI, D1_SURFACE_SHAPE_DISTRIBUTION,
        #   D2_PLAIN_SHAPE_DISTRIBUTION)
    ]

    run_options = dict(number_of_bins=20, radius=15, number_of_samples=-1, seed=seed, gamma=0.5, save=True,
                       folds=5, rASA=.5)
    results = []
    for feature_set in feature_sets:
        print_special("Feature set {0}".format(feature_set))
        experiment = Experiment(feature_set, dbd4, Classifier.SVM)
        experiment.run(**run_options)
        results.append(experiment.pyml_result)
    # NOTE(review): elapsed time is reported once after all feature sets; with the single active set this is
    # the same as reporting it at the end of the loop body.
    print_info("Took {0} seconds.".format((datetime.now() - started_at).seconds))
    save_results(number_of_samples, results, feature_sets)
def __extract_examples(self):
    """
    Extracts the set of all positive and negative examples from the DBD4 dataset.

    In protein complex C, with receptor R and ligand L, two residues r on R and r' on L are a positive example
    if in the bound form their closest atoms are nearer than self.interaction_threshold. All other pairs
    (r, r') are negative examples. Pairs in which either residue has no unbound counterpart are skipped,
    because the unbound and bound formations may have slightly different residues.

    Results are stored in:
      * self.examples: tuples (unbound ligand residue index, unbound receptor residue index, +1 or -1); for each
        complex all its positives are appended first, then all its negatives;
      * self.complexes_example_range[complex name]: (start, end of positives, end of negatives) offsets of the
        examples of that complex, counted from the first example added by this call;
      * self.example_complex["<ligand index>_<receptor index>"]: name of the complex the pair belongs to.
    """
    print_info("Finding the positive and negative examples in DBD4 ... {0}".format(self.positives_size))
    start_time = datetime.now()
    counter = 1
    start_index = 0
    neg_no = 0
    pos_no = 0
    for complex_name in self.complexes.keys():
        print_info_nn("{0}/{1}... processing complex {2}".format(counter, len(self.complexes), complex_name))
        protein_complex = self.complexes[complex_name]
        bound_ligand = protein_complex.bound_formation.ligand
        bound_receptor = protein_complex.bound_formation.receptor
        bound_ligand_residues = bound_ligand.residues
        bound_receptor_residues = bound_receptor.residues
        ligand_b2u = protein_complex.ligand_bound_to_unbound
        receptor_b2u = protein_complex.receptor_bound_to_unbound
        # Atom coordinates depend on a single residue only, so they are extracted once per residue here
        # instead of once per (ligand residue, receptor residue) pair.
        ligand_coords = [[atom.get_coord() for atom in bio_residue.get_list()]
                         for bio_residue in bound_ligand.biopython_residues]
        receptor_coords = [[atom.get_coord() for atom in bio_residue.get_list()]
                           for bio_residue in bound_receptor.biopython_residues]
        pos = []
        neg = []
        for i, l_atoms in enumerate(ligand_coords):
            if not receptor_coords:
                break
            # a ligand residue without unbound counterpart cannot take part in any example
            if bound_ligand_residues[i] not in ligand_b2u:
                continue
            unbound_ligand_res = ligand_b2u[bound_ligand_residues[i]]
            for j, r_atoms in enumerate(receptor_coords):
                if bound_receptor_residues[j] not in receptor_b2u:
                    continue
                # the distance matrix is only computed for pairs that can actually become an example
                dist_mat = cdist(l_atoms, r_atoms)
                unbound_receptor_res = receptor_b2u[bound_receptor_residues[j]]
                unbound_ligand_res_index = self.__get_residue_index(unbound_ligand_res)
                unbound_receptor_res_index = self.__get_residue_index(unbound_receptor_res)
                if dist_mat.min() < self.interaction_threshold:
                    pos.append((unbound_ligand_res_index, unbound_receptor_res_index, +1))
                else:
                    neg.append((unbound_ligand_res_index, unbound_receptor_res_index, -1))
        self.examples.extend(pos)
        self.examples.extend(neg)
        pos_no += len(pos)
        neg_no += len(neg)
        self.complexes_example_range[complex_name] = (start_index,
                                                      start_index + len(pos),
                                                      start_index + len(neg) + len(pos))
        print_info(" ( {0:03d}/{1:05d} ) -{2}".format(len(pos), len(neg),
                                                     self.complexes_example_range[complex_name]))
        start_index += len(pos) + len(neg)
        counter += 1
        for e in pos + neg:
            self.example_complex["{0}_{1}".format(e[0], e[1])] = complex_name
    print_info("Finding examples in DBD4 took " + str((datetime.now() - start_time).seconds) + " seconds. ")
    print_info("The total number of examples found: " + str(pos_no + neg_no))
def main():
    """
    Writes, for every DBD4 complex, a PDF with diagnostic figures for one interacting and one non-interacting
    residue pair.

    Only pairs whose bound residues both have an unbound counterpart are considered. The non-interacting pair
    is the first such pair whose minimum atom distance is not below dbd4.interaction_threshold; the interacting
    pair is the index-th pair below the threshold, where index is drawn with randint(1, 30) per complex (no
    interacting pair is plotted when fewer exist). Scanning stops as soon as both have been plotted.

    Each plotted pair contributes three pages to <reports_directory>/geometry/figures/<complex name>.pdf: two
    3D scatter plots (ligand in red, receptor in blue) and one plot of the two distance profiles.
    """
    seed = 1
    number_of_samples = 20000
    dbd4 = DBD4(size=number_of_samples, ratio=1, thresh=6, seed=seed)
    SecondaryStructureExtractor(dbd4).extract_feature()

    def plot_pair(pdf, ligand_search, receptor_search, ligand, receptor, ligand_residue, receptor_residue,
                  i, j, first_title, second_title):
        # Renders the three pages of one residue pair into the open pdf.
        # presumably the last get_coords flag selects surface-only points -- TODO confirm against get_coords
        b_l_points, b_l_dist = get_coords(ligand_search, ligand, ligand_residue, 0.5, False)
        b_r_points, b_r_dist = get_coords(receptor_search, receptor, receptor_residue, 0.5, False)
        u_l_points, u_l_dist = get_coords(ligand_search, ligand, ligand_residue, 0.5, True)
        u_r_points, u_r_dist = get_coords(receptor_search, receptor, receptor_residue, 0.5, True)
        pages = ((first_title, b_l_points, b_r_points), (second_title, u_l_points, u_r_points))
        for title, ligand_points, receptor_points in pages:
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(ligand_points[:, 0], ligand_points[:, 1], ligand_points[:, 2], c='r')
            ax.scatter(receptor_points[:, 0], receptor_points[:, 1], receptor_points[:, 2], c='b')
            plt.title(title)
            pdf.savefig()
            plt.close()
        plt.figure()
        plt.plot(u_l_dist)
        plt.plot(u_r_dist)
        # NOTE(review): four labels are supplied although only two curves are drawn
        plt.legend(["bound ligand {0}".format(i), "bound receptor {0}".format(j), "unbound ligand",
                    "unbound receptor"])
        pdf.savefig()
        plt.close()

    for complex_name in dbd4.complexes:
        print_info(complex_name)
        c = dbd4.complexes[complex_name]
        b_ligand = c.bound_formation.ligand
        b_receptor = c.bound_formation.receptor
        u_ligand = c.unbound_formation.ligand
        u_receptor = c.unbound_formation.receptor
        ligand_bio_residues = b_ligand.biopython_residues
        receptor_bio_residues = b_receptor.biopython_residues
        # neighbour searches are built on the unbound structures only
        u_l_ns = NeighborSearch(u_ligand.atoms)
        u_r_ns = NeighborSearch(u_receptor.atoms)
        index = randint(1, 30)
        positives = 0
        lb2u = c.ligand_bound_to_unbound
        rb2u = c.receptor_bound_to_unbound
        positive_done = False
        negative_done = False
        with PdfPages('{0}/geometry/figures/{1}.pdf'.format(reports_directory, complex_name)) as pdf:
            try:
                for i, ligand_bio_residue in enumerate(ligand_bio_residues):
                    for j, receptor_bio_residue in enumerate(receptor_bio_residues):
                        if not (b_ligand.residues[i] in lb2u and b_receptor.residues[j] in rb2u):
                            continue
                        l_atoms = ligand_bio_residue.get_list()
                        r_atoms = receptor_bio_residue.get_list()
                        dist_mat = cdist([atom.get_coord() for atom in l_atoms],
                                         [atom.get_coord() for atom in r_atoms])
                        if positive_done and negative_done:
                            print("getting out of loop...")
                            raise GetOutOfLoop
                        if dist_mat.min() < dbd4.interaction_threshold:
                            if positive_done:
                                continue
                            positives += 1
                            # only the index-th interacting pair is plotted
                            if positives != index:
                                continue
                            plot_pair(pdf, u_l_ns, u_r_ns, u_ligand, u_receptor,
                                      lb2u[b_ligand.residues[i]], rb2u[b_receptor.residues[j]], i, j,
                                      "Interacting Residues Bound Conformation",
                                      "Interacting Surface Residues Bound Conformation")
                            positive_done = True
                        else:
                            if negative_done:
                                continue
                            plot_pair(pdf, u_l_ns, u_r_ns, u_ligand, u_receptor,
                                      lb2u[b_ligand.residues[i]], rb2u[b_receptor.residues[j]], i, j,
                                      "Non-Interacting Residues Bound Conformation",
                                      "Non-Interacting Surface Residues Bound Conformation")
                            negative_done = True
            except GetOutOfLoop:
                # raised on purpose to leave both nested loops at once
                pass
def extract_feature(self):
    """
    Attaches the half sphere exposure feature (Features.HALF_SPHERE_EXPOSURE) to every residue of the unbound
    ligand and receptor of each complex in the database.

    For every residue two vectors with one entry per standard amino acid type are built, one for each side of
    the residue. The residue's own type is counted on both sides; each neighbour listed in the residue's
    Features.RESIDUE_NEIGHBOURHOOD that is an amino acid with a CA atom is counted on the first side when the
    angle between u[1] and (neighbour CA - u[0]) is below pi/2, otherwise on the second side, where u is the
    result of self.get_side_chain_vector. Each vector is divided by 1 + the number of neighbours on its side and
    the two are concatenated. Residues for which no side chain vector is available get NaN everywhere.

    The per-protein matrix is cached in <feature directory><protein name>.npy and only computed when that file
    is missing.
    """
    shown_on_line = 0
    started_at = datetime.now()
    amino_acid_count = len(standard_aa_names)
    print_info_nn(" >>> Adding Half Surface Exposure ... ".format(self._database.name))
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        unbound = self._database.complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_path = self._get_dir_name() + protein.name
            if not os.path.exists(cache_path + ".npy"):
                # progress output: a new line is started after 15 names
                shown_on_line += 1
                if shown_on_line > 15:
                    shown_on_line = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                bio_residues = protein.biopython_residues
                residue_count = len(bio_residues)
                up_total = np.zeros(residue_count)
                down_total = np.zeros(residue_count)
                up_by_type = np.zeros((amino_acid_count, residue_count))
                down_by_type = np.zeros((amino_acid_count, residue_count))
                for column, bio_residue in enumerate(bio_residues):
                    side_chain = self.get_side_chain_vector(bio_residue)
                    if side_chain is None:
                        up_total[column] = np.nan
                        down_total[column] = np.nan
                        up_by_type[:, column] = np.nan
                        down_by_type[:, column] = np.nan
                        continue
                    own_row = self._residue_index_table[bio_residue.get_resname()]
                    up_by_type[own_row, column] += 1
                    down_by_type[own_row, column] += 1
                    neighbour_indices = protein.residues[column].get_feature(Features.RESIDUE_NEIGHBOURHOOD)
                    for neighbour_index in neighbour_indices:
                        # -1 terminates the neighbour list
                        if neighbour_index == -1:
                            break
                        neighbour = bio_residues[int(neighbour_index)]
                        if not (is_aa(neighbour) and neighbour.has_id('CA')):
                            continue
                        ca_position = neighbour['CA'].get_vector()
                        neighbour_row = self._residue_index_table[neighbour.get_resname()]
                        if side_chain[1].angle(ca_position - side_chain[0]) < np.pi / 2.0:
                            up_total[column] += 1
                            up_by_type[neighbour_row, column] += 1
                        else:
                            down_total[column] += 1
                            down_by_type[neighbour_row, column] += 1
                # normalise per residue, then transpose so that rows correspond to residues
                up_by_type = (up_by_type / (1.0 + up_total)).T
                down_by_type = (down_by_type / (1.0 + down_total)).T
                np.save(cache_path, np.hstack((up_by_type, down_by_type)))
            hse = np.load(cache_path + ".npy")
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.HALF_SPHERE_EXPOSURE, hse[row, :])
    print_info("took {0} seconds.".format((datetime.now() - started_at).seconds))
def main():
    """
    Writes, for every DBD4 complex, a PDF with diagnostic figures for one interacting and one non-interacting
    residue pair.

    Only pairs whose bound residues both have an unbound counterpart are considered. The non-interacting pair
    is the first such pair whose minimum atom distance is not below dbd4.interaction_threshold; the interacting
    pair is the index-th pair below the threshold, where index is drawn with randint(1, 30) per complex (no
    interacting pair is plotted when fewer exist). Scanning stops as soon as both have been plotted.

    Each plotted pair contributes three pages to <reports_directory>/geometry/figures/<complex name>.pdf: two
    3D scatter plots (ligand in red, receptor in blue) and one plot of the two distance profiles.
    """
    seed = 1
    number_of_samples = 20000
    dbd4 = DBD4(size=number_of_samples, ratio=1, thresh=6, seed=seed)
    SecondaryStructureExtractor(dbd4).extract_feature()

    def plot_pair(pdf, ligand_search, receptor_search, ligand, receptor, ligand_residue, receptor_residue,
                  i, j, first_title, second_title):
        # Renders the three pages of one residue pair into the open pdf.
        # presumably the last get_coords flag selects surface-only points -- TODO confirm against get_coords
        b_l_points, b_l_dist = get_coords(ligand_search, ligand, ligand_residue, 0.5, False)
        b_r_points, b_r_dist = get_coords(receptor_search, receptor, receptor_residue, 0.5, False)
        u_l_points, u_l_dist = get_coords(ligand_search, ligand, ligand_residue, 0.5, True)
        u_r_points, u_r_dist = get_coords(receptor_search, receptor, receptor_residue, 0.5, True)
        pages = ((first_title, b_l_points, b_r_points), (second_title, u_l_points, u_r_points))
        for title, ligand_points, receptor_points in pages:
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(ligand_points[:, 0], ligand_points[:, 1], ligand_points[:, 2], c='r')
            ax.scatter(receptor_points[:, 0], receptor_points[:, 1], receptor_points[:, 2], c='b')
            plt.title(title)
            pdf.savefig()
            plt.close()
        plt.figure()
        plt.plot(u_l_dist)
        plt.plot(u_r_dist)
        # NOTE(review): four labels are supplied although only two curves are drawn
        plt.legend(["bound ligand {0}".format(i), "bound receptor {0}".format(j), "unbound ligand",
                    "unbound receptor"])
        pdf.savefig()
        plt.close()

    for complex_name in dbd4.complexes:
        print_info(complex_name)
        c = dbd4.complexes[complex_name]
        b_ligand = c.bound_formation.ligand
        b_receptor = c.bound_formation.receptor
        u_ligand = c.unbound_formation.ligand
        u_receptor = c.unbound_formation.receptor
        ligand_bio_residues = b_ligand.biopython_residues
        receptor_bio_residues = b_receptor.biopython_residues
        # neighbour searches are built on the unbound structures only
        u_l_ns = NeighborSearch(u_ligand.atoms)
        u_r_ns = NeighborSearch(u_receptor.atoms)
        index = randint(1, 30)
        positives = 0
        lb2u = c.ligand_bound_to_unbound
        rb2u = c.receptor_bound_to_unbound
        positive_done = False
        negative_done = False
        with PdfPages('{0}/geometry/figures/{1}.pdf'.format(reports_directory, complex_name)) as pdf:
            try:
                for i, ligand_bio_residue in enumerate(ligand_bio_residues):
                    for j, receptor_bio_residue in enumerate(receptor_bio_residues):
                        if not (b_ligand.residues[i] in lb2u and b_receptor.residues[j] in rb2u):
                            continue
                        l_atoms = ligand_bio_residue.get_list()
                        r_atoms = receptor_bio_residue.get_list()
                        dist_mat = cdist([atom.get_coord() for atom in l_atoms],
                                         [atom.get_coord() for atom in r_atoms])
                        if positive_done and negative_done:
                            print("getting out of loop...")
                            raise GetOutOfLoop
                        if dist_mat.min() < dbd4.interaction_threshold:
                            if positive_done:
                                continue
                            positives += 1
                            # only the index-th interacting pair is plotted
                            if positives != index:
                                continue
                            plot_pair(pdf, u_l_ns, u_r_ns, u_ligand, u_receptor,
                                      lb2u[b_ligand.residues[i]], rb2u[b_receptor.residues[j]], i, j,
                                      "Interacting Residues Bound Conformation",
                                      "Interacting Surface Residues Bound Conformation")
                            positive_done = True
                        else:
                            if negative_done:
                                continue
                            plot_pair(pdf, u_l_ns, u_r_ns, u_ligand, u_receptor,
                                      lb2u[b_ligand.residues[i]], rb2u[b_receptor.residues[j]], i, j,
                                      "Non-Interacting Residues Bound Conformation",
                                      "Non-Interacting Surface Residues Bound Conformation")
                            negative_done = True
            except GetOutOfLoop:
                # raised on purpose to leave both nested loops at once
                pass