Beispiel #1
0
 def extract_feature(self):
     """
     Attach the PSAIA based protrusion features to every residue of the unbound ligands and receptors.

     For each protein a ``<protein name>.npy`` file inside the directory returned by ``__get_dir_name()`` serves as
     a cache. When the file is missing, ``run_psaia`` is executed on the protein's PDB file and the normalized
     values are stored with one row per residue (5 + 5 + 5 + 6 + 6 + 1 columns); residues that PSAIA did not report,
     or all residues when ``run_psaia`` returns None, keep an all-zero row. The columns from index 21 onwards are
     then added to each residue as ``Features.PROTRUSION_INDEX``.
     """
     # The message used to announce "secondary structure", which is not what this extractor computes.
     print_info_nn(" >>> Adding protrusion index for database {0} ... ".format(self._database.name))
     overall_time = datetime.now()
     counter = 0
     cache_dir = self.__get_dir_name()
     if not os.path.exists(cache_dir):
         # makedirs (instead of mkdir) also creates missing parent directories
         os.makedirs(cache_dir)
     for protein_complex in self._database.complexes.values():
         unbound = protein_complex.unbound_formation
         for protein in (unbound.ligand, unbound.receptor):
             protrusion_file = cache_dir + protein.name
             if not os.path.exists(protrusion_file + ".npy"):
                 # progress output: comma separated names, every 16th name ends the line
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                 result_dict = run_psaia(pdb_file)
                 protrusion_array = np.zeros((len(protein.residues), 5 + 5 + 5 + 6 + 6 + 1))
                 if result_dict is not None:
                     for index, residue in enumerate(protein.biopython_residues):
                         key = self.get_residue_id(residue.get_full_id())
                         if key in result_dict:
                             protrusion_array[index, :] = self._normalize_features(*result_dict[key])
                         else:
                             print('key not found in PSAIA processing!')
                 # np.save appends the ".npy" suffix itself
                 np.save(protrusion_file, protrusion_array)
             protrusion_array = np.load(protrusion_file + ".npy")
             for index, residue in enumerate(protein.residues):
                 residue.add_feature(Features.PROTRUSION_INDEX, protrusion_array[index, 21:])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
Beispiel #2
0
 def __compute_profiles(self, db='nr', niter=3):
     """
     Add the PSSM/PSFM profile features (plain and windowed) to every residue of the unbound proteins.

     When ``<pssm directory>/<protein name>.mat`` does not exist yet, the PSI-BLAST executable is started through
     the shell to create it; the outcome of that call is only reported, the matrix file is parsed in either case.

     :param db: value passed to the ``-db`` option of the PSI-BLAST command line
     :param niter: value passed to the ``-num_iterations`` option of the PSI-BLAST command line
     """
     print_info_nn(" >>> Adding the profile features for dataset {0} ...".format(self._database.name))
     start_time = datetime.now()
     command_template = "cd {4} \n " \
                        "{5} " \
                        "-query {0} -db {1} -out {2}.psi.txt -num_iterations {3} -out_ascii_pssm {2}.mat"
     for protein_complex in self._database.complexes.values():
         unbound = protein_complex.unbound_formation
         for protein in (unbound.ligand, unbound.receptor):
             fasta_file = self._database.directory + unbound_sequence_directory + protein.name + ".fasta"
             output_file = self._database.directory + pssm_directory + protein.name
             matrix_file = output_file + ".mat"
             if not os.path.exists(matrix_file):
                 print_info("... processing protein {0} ...    ".format(protein.name))
                 command = command_template.format(
                     fasta_file, db, output_file, niter, psiblast_db_folder, psiblast_executable)
                 print_info(command)
                 error_code = os.system(command)
                 if error_code != 0:
                     print_error('Failed with error code {0}'.format(error_code))
                 else:
                     print_info('Successful!')
             pssm, psfm, info = ProfileExtractor.__parse_pssm_file(matrix_file)
             wpssm = ProfileExtractor.__get_wpsm(pssm)
             wpsfm = ProfileExtractor.__get_wpsm(psfm)
             # column i of each matrix belongs to residue i
             for i, res in enumerate(protein.residues):
                 pssm_column = self._normalize(pssm[:, i])
                 res.add_feature(Features.POSITION_SPECIFIC_SCORING_MATRIX, pssm_column)
                 psfm_column = self._normalize(psfm[:, i])
                 res.add_feature(Features.POSITION_SPECIFIC_FREQUENCY_MATRIX, psfm_column)
                 wpssm_column = self._normalize(wpssm[:, i])
                 res.add_feature(Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX, wpssm_column)
                 wpsfm_column = self._normalize(wpsfm[:, i])
                 res.add_feature(Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX, wpsfm_column)
     elapsed = datetime.now() - start_time
     print_info("took {0} seconds.".format(elapsed.seconds))
 def extract_feature(self):
     """
     Attach the D2 category based shape distribution to every residue of the unbound ligands and receptors.

     Distributions are cached per protein as ``<protein name>.npy`` inside the directory returned by
     ``_get_dir_name()``. A missing file is rebuilt: for each residue the residues found by a NeighborSearch within
     ``self.radius`` of its center are handed to ``_compute_distribution`` and the resulting row
     (``self.number_of_bins`` values) is stored.
     """
     seed(self.seed)
     counter = 0
     print_info_nn(" >>> Adding D2 category based shape distribution for database {0} ... ".format(self._database.name))
     overall_time = datetime.now()
     cache_dir = self._get_dir_name()
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)
     for protein_complex in self._database.complexes.values():
         unbound = protein_complex.unbound_formation
         for protein in (unbound.ligand, unbound.receptor):
             cache_prefix = cache_dir + protein.name
             cache_file = cache_prefix + ".npy"
             if not os.path.exists(cache_file):
                 counter += 1
                 # progress output: comma separated names, every 16th name ends the line
                 if counter > 15:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 else:
                     print_info_nn("{0}, ".format(protein.name))
                 neighbour_search = NeighborSearch(protein.atoms)
                 computed = np.zeros((len(protein.residues), self.number_of_bins))
                 for row, residue in enumerate(protein.residues):
                     close_residues = neighbour_search.search(residue.center, self.radius, "R")
                     computed[row, :] = self._compute_distribution(close_residues)
                 # np.save appends the ".npy" suffix itself
                 np.save(cache_prefix, computed)
             distributions = np.load(cache_file)
             for row, residue in enumerate(protein.residues):
                 residue.add_feature(Features.D2_CATEGORY_SHAPE_DISTRIBUTION, distributions[row, :])
     elapsed = datetime.now() - overall_time
     print_info("took {0} seconds.".format(elapsed.seconds))
    def extract_feature(self):
        """
        Attach the D1 surface atoms shape distribution to every residue of the unbound ligands and receptors.

        Distributions are cached per protein as ``<protein name>.npy`` inside the directory returned by
        ``_get_dir_name()``. A missing file is rebuilt from the surface atoms and normals that ``get_surface_atoms``
        returns for the protein's PDB file; every residue gets a row of ``2 * (self.number_of_bins + 1)`` values
        produced by ``get_distributions`` for its center.
        """
        seed(self.seed)
        print_info_nn(" >>> Adding D1 surface atoms shape distribution for {0} ... ".format(self._database.name))
        overall_time = datetime.now()
        cache_dir = self._get_dir_name()
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        for protein_complex in self._database.complexes.values():
            unbound = protein_complex.unbound_formation
            for protein in (unbound.ligand, unbound.receptor):
                cache_prefix = cache_dir + protein.name
                cache_file = cache_prefix + ".npy"
                if not os.path.exists(cache_file):
                    print_info("{0}".format(protein.name))
                    pdb_file_name = self._database.directory + pdb_directory + protein.name + '.pdb'
                    surface, normals = get_surface_atoms(pdb_file_name)
                    row_width = 2 * (self.number_of_bins + 1)
                    computed = np.zeros((len(protein.residues), row_width))
                    for row, residue in enumerate(protein.residues):
                        computed[row, :] = self.get_distributions(residue.center, surface, normals)
                    # np.save appends the ".npy" suffix itself
                    np.save(cache_prefix, computed)
                distributions = np.load(cache_file)
                for row, residue in enumerate(protein.residues):
                    residue.add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, distributions[row, :])
        elapsed = datetime.now() - overall_time
        print_info("took {0} seconds.".format(elapsed.seconds))
    def extract_feature(self):
        """
        Attach the STRIDE derived features to every residue of the unbound ligands and receptors.

        Per protein an array with 11 columns is cached as ``<protein name>.npy`` inside the directory returned by
        ``__get_dir_name()``: columns 0-6 hold the one-hot encoded secondary structure (position taken from
        ``ss_abbreviations``), columns 7-10 hold phi, psi, ASA and relative ASA. Residues without a STRIDE record
        keep an all-zero row.

        :raises ValueError: if STRIDE reports a secondary structure code that is not in ``ss_abbreviations``
        """
        secondary_structure_dict = {
            abbreviation: position for position, abbreviation in enumerate(ss_abbreviations)
        }
        print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        cache_dir = self.__get_dir_name()
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        for protein_complex in self._database.complexes.values():
            unbound = protein_complex.unbound_formation
            for protein in (unbound.ligand, unbound.receptor):
                cache_file = cache_dir + protein.name + ".npy"
                if not os.path.exists(cache_file):
                    counter += 1
                    # progress output: comma separated names, every 16th name ends the line
                    if counter > 15:
                        counter = 0
                        print_info("{0}".format(protein.name))
                    else:
                        print_info_nn("{0}, ".format(protein.name))

                    pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                    residue_count = len(protein.residues)
                    stride_records = stride_dict_from_pdb_file(pdb_file)
                    encoded = np.zeros((residue_count, 11))
                    for row, bio_residue in enumerate(protein.biopython_residues):
                        key = self.get_residue_id(bio_residue.get_full_id())
                        if key not in stride_records:
                            continue
                        _, s, phi, psi, asa, rasa = stride_records[key]
                        if s not in secondary_structure_dict:
                            raise ValueError("unknown secondary structure! Add to dictionary!")
                        one_hot = np.zeros(len(secondary_structure_dict))
                        one_hot[secondary_structure_dict[s]] = 1
                        encoded[row, :7] = one_hot
                        # columns 7..10: phi, psi, ASA, relative ASA
                        for column, value in enumerate((phi, psi, asa, rasa), 7):
                            encoded[row, column] = value
                    np.save(cache_file, encoded)
                stride_features = np.load(cache_file)
                for row, res in enumerate(protein.residues):
                    res.add_feature(Features.SECONDARY_STRUCTURE, stride_features[row, :7])
                    res.add_feature(Features.PHI, stride_features[row, 7])
                    res.add_feature(Features.PSI, stride_features[row, 8])
                    res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, stride_features[row, 9])
                    res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, stride_features[row, 10])
        elapsed = datetime.now() - overall_time
        print_info("took {0} seconds.".format(elapsed.seconds))
 def extract_feature(self):
     """
     Attach the half sphere exposure (HSE) feature to every residue of the unbound ligands and receptors.

     The per-protein result is cached as ``<protein name>.npy`` inside the directory returned by
     ``_get_dir_name()``. When the file is missing it is rebuilt: for every residue the neighbours listed in its
     ``Features.RESIDUE_NEIGHBOURHOOD`` feature (a list of residue indices terminated/padded with -1, so that
     feature must already be present) are split into an "up" and a "down" half, depending on whether the angle
     between ``u[1]`` and the vector from ``u[0]`` to the neighbour's CA atom is below pi/2, where ``u`` is the value
     returned by ``get_side_chain_vector``. Neighbours are counted per amino acid type, the residue's own type is
     counted once in both halves, and the counts are divided by ``1 + <number of neighbours in that half>``.
     Each residue ends up with a row of ``2 * len(standard_aa_names)`` values (up half followed by down half);
     residues for which ``get_side_chain_vector`` returns None get a row of NaN.
     """
     counter = 0
     overall_time = datetime.now()
     number_of_amino_acids = len(standard_aa_names)
     # The template previously had no placeholder, so the database name passed to format() was never shown.
     print_info_nn(" >>> Adding Half Sphere Exposure for database {0} ... ".format(self._database.name))
     cache_dir = self._get_dir_name()
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)
     for protein_complex in self._database.complexes.values():
         unbound = protein_complex.unbound_formation
         for protein in (unbound.ligand, unbound.receptor):
             hse_file = cache_dir + protein.name
             if not os.path.exists(hse_file + ".npy"):
                 # progress output: comma separated names, every 16th name ends the line
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 bio_residues = protein.biopython_residues
                 number_of_residues = len(bio_residues)
                 # un/dn: number of neighbours in the up/down half, uc/dc: the same per amino acid type
                 un = np.zeros(number_of_residues)
                 dn = np.zeros(number_of_residues)
                 uc = np.zeros((number_of_amino_acids, number_of_residues))
                 dc = np.zeros((number_of_amino_acids, number_of_residues))
                 for index, residue in enumerate(bio_residues):
                     u = self.get_side_chain_vector(residue)
                     if u is None:
                         un[index] = np.nan
                         dn[index] = np.nan
                         uc[:, index] = np.nan
                         dc[:, index] = np.nan
                         continue
                     # the residue itself contributes to both halves
                     residue_index = self._residue_index_table[residue.get_resname()]
                     uc[residue_index, index] += 1
                     dc[residue_index, index] += 1
                     neighbours_indices = protein.residues[index].get_feature(Features.RESIDUE_NEIGHBOURHOOD)
                     for neighbour_index in neighbours_indices:
                         if neighbour_index == -1:
                             # -1 marks the end of the valid neighbour indices
                             break
                         neighbour_residue = bio_residues[int(neighbour_index)]
                         if not (is_aa(neighbour_residue) and neighbour_residue.has_id('CA')):
                             continue
                         neighbour_vector = neighbour_residue['CA'].get_vector()
                         residue_index = self._residue_index_table[neighbour_residue.get_resname()]
                         if u[1].angle((neighbour_vector - u[0])) < np.pi / 2.0:
                             un[index] += 1
                             uc[residue_index, index] += 1
                         else:
                             dn[index] += 1
                             dc[residue_index, index] += 1
                 # normalise and switch to one row per residue
                 uc = (uc / (1.0 + un)).T
                 dc = (dc / (1.0 + dn)).T
                 hse_array = np.hstack((uc, dc))
                 # np.save appends the ".npy" suffix itself
                 np.save(hse_file, hse_array)
             hse = np.load(hse_file + ".npy")
             for i, residue in enumerate(protein.residues):
                 residue.add_feature(Features.HALF_SPHERE_EXPOSURE, hse[i, :])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
Beispiel #7
0
    def __extract_examples(self):
        """
        This function collects the set of all positive and negative examples from the DBD4 dataset. In protein
        complex C, with receptor R and ligand L, two residues r on R and r' on L are considered as a positive example
        if in the bound form their closest pair of atoms is nearer than self.interaction_threshold. All other pairs
        (r,r') with r on R and r' on L are considered as negative examples. Only pairs in which both bound residues
        have an unbound counterpart are used; the examples refer to the indices of the unbound residues.

        Results are stored on the instance:
          * self.examples: (ligand residue index, receptor residue index, +1/-1) tuples; per complex all positives
            come first, followed by all negatives
          * self.complexes_example_range[complex name]: (start, end of positives, end of negatives) offsets into
            self.examples
          * self.example_complex["<ligand index>_<receptor index>"]: name of the complex the pair belongs to
        """
        print_info("Finding the positive and negative examples in DBD4 ... {0}".format(self.positives_size))
        start_time = datetime.now()
        counter = 1
        start_index = 0
        neg_no = 0
        pos_no = 0
        for complex_name in self.complexes.keys():
            print_info_nn("{0}/{1}... processing complex {2}".format(counter, len(self.complexes), complex_name))
            protein_complex = self.complexes[complex_name]
            bound_ligand = protein_complex.bound_formation.ligand
            bound_receptor = protein_complex.bound_formation.receptor
            bound_ligand_residues = bound_ligand.residues
            bound_receptor_residues = bound_receptor.residues
            ligand_b2u = protein_complex.ligand_bound_to_unbound
            receptor_b2u = protein_complex.receptor_bound_to_unbound

            # The receptor side does not depend on the ligand residue, so its unbound counterparts and atom
            # coordinates are gathered once per complex instead of once per residue pair. Bound residues without an
            # unbound counterpart are dropped here (the unbound and bound formations may have slightly different
            # residues), which also avoids computing distances that would be thrown away.
            receptor_candidates = []
            for j, receptor_bio_residue in enumerate(bound_receptor.biopython_residues):
                bound_receptor_residue = bound_receptor_residues[j]
                if bound_receptor_residue in receptor_b2u:
                    receptor_candidates.append((
                        receptor_b2u[bound_receptor_residue],
                        [atom.get_coord() for atom in receptor_bio_residue.get_list()],
                    ))

            pos = []
            neg = []
            for i, ligand_bio_residue in enumerate(bound_ligand.biopython_residues):
                if not receptor_candidates:
                    break
                bound_ligand_residue = bound_ligand_residues[i]
                if bound_ligand_residue not in ligand_b2u:
                    continue
                unbound_ligand_res = ligand_b2u[bound_ligand_residue]
                l_atoms = [atom.get_coord() for atom in ligand_bio_residue.get_list()]
                for unbound_receptor_res, r_atoms in receptor_candidates:
                    unbound_ligand_res_index = self.__get_residue_index(unbound_ligand_res)
                    unbound_receptor_res_index = self.__get_residue_index(unbound_receptor_res)
                    # minimal atom-atom distance between the two bound residues
                    if cdist(l_atoms, r_atoms).min() < self.interaction_threshold:
                        pos.append((unbound_ligand_res_index, unbound_receptor_res_index, +1))
                    else:
                        neg.append((unbound_ligand_res_index, unbound_receptor_res_index, -1))

            # extend() copies the references itself, no explicit copy of the lists is needed
            self.examples.extend(pos)
            self.examples.extend(neg)
            pos_no += len(pos)
            neg_no += len(neg)
            end_of_positives = start_index + len(pos)
            end_of_negatives = end_of_positives + len(neg)
            self.complexes_example_range[complex_name] = (start_index, end_of_positives, end_of_negatives)
            print_info(" ( {0:03d}/{1:05d} ) -{2}".format(len(pos), len(neg), self.complexes_example_range[complex_name]))
            start_index = end_of_negatives
            counter += 1
            for e in pos + neg:
                self.example_complex["{0}_{1}".format(e[0], e[1])] = complex_name

        print_info("Finding examples in DBD4 took " + str((datetime.now() - start_time).seconds) + " seconds. ")
        print_info("The total number of examples found: " + str(pos_no + neg_no))
    def extract_feature(self):
        """
        Attach the D1 surface shape distribution to every residue of the unbound ligands and receptors.

        Distributions are cached per protein as ``<protein name>.npy`` inside the directory returned by
        ``_get_dir_name()``. A missing file is rebuilt: for each query residue the residues found by a NeighborSearch
        within ``self.radius`` of its center are filtered down to those that belong to the protein and whose
        ``Features.RELATIVE_ACCESSIBLE_SURFACE_AREA`` is at least ``self.rASA_threshold`` (so that feature must
        already be present); the query residue itself is always included. ``_compute_distribution`` turns this set and
        the query residue's center into a row of ``self.number_of_bins + 1`` values.

        NOTE: earlier versions passed the center of the last accepted neighbour (instead of the query residue) to
        ``_compute_distribution`` because the loop variable was overwritten. Cache files written by such a version
        have to be deleted to benefit from the fix.
        """
        seed(self.seed)
        print_info_nn(" >>> Adding D1 surface shape distribution for database {0} ... ".format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        cache_dir = self._get_dir_name()
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        for protein_complex in self._database.complexes.values():
            unbound = protein_complex.unbound_formation
            for protein in (unbound.ligand, unbound.receptor):
                shape_dist_file = cache_dir + protein.name
                if not os.path.exists(shape_dist_file + ".npy"):
                    # progress output: comma separated names, every 16th name ends the line
                    counter += 1
                    if counter <= 15:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        counter = 0
                        print_info("{0}".format(protein.name))
                    neighbour_search = NeighborSearch(protein.atoms)
                    distributions = np.zeros((len(protein.residues), self.number_of_bins + 1))
                    for i, query_residue in enumerate(protein.residues):
                        nearby_residues = [protein.biopython_residues[i]]
                        candidates = neighbour_search.search(query_residue.center, self.radius, "R")
                        for candidate in candidates:
                            # a single scan answers both "is it part of this protein" and "at which position"
                            try:
                                candidate_index = protein.biopython_residues.index(candidate)
                            except ValueError:
                                continue
                            exposure = protein.residues[candidate_index].get_feature(
                                Features.RELATIVE_ACCESSIBLE_SURFACE_AREA)
                            if exposure >= self.rASA_threshold:
                                nearby_residues.append(candidate)
                        # query_residue is never rebound above, so this is the center the search started from
                        distributions[i, :] = self._compute_distribution(nearby_residues, query_residue.center)
                    # np.save appends the ".npy" suffix itself
                    np.save(shape_dist_file, distributions)
                distributions = np.load(shape_dist_file + ".npy")
                for i, residue in enumerate(protein.residues):
                    residue.add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, distributions[i, :])
        print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
 def extract_feature(self):
     """
     Attach the DSSP derived features (ASA, relative ASA, phi, psi) to every residue of the unbound proteins.

     Per protein an array with 6 columns is cached as ``<protein name>.npy`` inside the directory returned by
     ``__get_dir_name()``. When the file is missing, DSSP is run on the first model of the protein's structure and
     the elements from position 2 onwards of each DSSP record are stored in columns 2-5; columns 0-1 are unused.
     Residues without a DSSP record keep zeros. Columns 2, 3, 4 and 5 are then added to the residues as
     ACCESSIBLE_SURFACE_AREA, RELATIVE_ACCESSIBLE_SURFACE_AREA, PHI and PSI.
     """
     print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
     overall_time = datetime.now()
     cache_dir = self.__get_dir_name()
     if not os.path.exists(cache_dir):
         # makedirs (instead of mkdir) also creates missing parent directories
         os.makedirs(cache_dir)
     for protein_complex in self._database.complexes.values():
         unbound = protein_complex.unbound_formation
         for protein in (unbound.ligand, unbound.receptor):
             dssp_file = cache_dir + protein.name + ".npy"
             if not os.path.exists(dssp_file):
                 print_info_nn("... running DSSP for protein " + protein.name)
                 start_time = datetime.now()
                 dssp = DSSP(protein.structure[0], self._database.directory + pdb_directory + protein.name + ".pdb")
                 # Zero-initialised on purpose: np.ndarray() would leave the unused columns 0-1 (and the rows of
                 # residues missing from the DSSP output) as arbitrary memory that ends up in the cache file.
                 dssp_array = np.zeros((len(protein.residues), 6))
                 for i, res in enumerate(protein.biopython_residues):
                     (_, _, cid, rid) = res.get_full_id()
                     key = (cid, rid)
                     if key in dssp:
                         dssp_array[i, 2:] = (dssp[key])[2:]
                 np.save(dssp_file, dssp_array)
                 print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
             dssp_features = np.load(dssp_file)
             for i, res in enumerate(protein.residues):
                 # stored record layout: (_, s, ASA, rASA, phi, psi)
                 res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, dssp_features[i, 2])
                 res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, dssp_features[i, 3])
                 res.add_feature(Features.PHI, dssp_features[i, 4])
                 res.add_feature(Features.PSI, dssp_features[i, 5])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
    def extract_feature(self):
        """
        Add secondary structure, phi, psi, ASA and relative ASA (all taken from STRIDE) to the unbound residues.

        The values of a protein are cached in ``<protein name>.npy`` inside the directory returned by
        ``__get_dir_name()`` as an array with 11 columns: a one-hot secondary structure code in columns 0-6 (the
        position of the code within ``ss_abbreviations``) followed by phi, psi, ASA and relative ASA. Rows of
        residues that have no entry in the STRIDE output stay zero.

        :raises ValueError: if a STRIDE record carries a secondary structure code missing from ``ss_abbreviations``
        """
        secondary_structure_dict = dict(
            (abbreviation, position) for position, abbreviation in enumerate(ss_abbreviations))
        print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        cache_dir = self.__get_dir_name()
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        for complex_name in self._database.complexes.keys():
            unbound_formation = self._database.complexes[complex_name].unbound_formation
            for protein in (unbound_formation.ligand, unbound_formation.receptor):
                stride_file = cache_dir + protein.name + ".npy"
                if not os.path.exists(stride_file):
                    counter += 1
                    # progress output: comma separated names, every 16th name ends the line
                    if counter > 15:
                        counter = 0
                        print_info("{0}".format(protein.name))
                    else:
                        print_info_nn("{0}, ".format(protein.name))

                    pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                    n = len(protein.residues)
                    stride_output = stride_dict_from_pdb_file(pdb_file)
                    stride_array = np.zeros((n, 11))
                    for index, bio_residue in enumerate(protein.biopython_residues):
                        key = self.get_residue_id(bio_residue.get_full_id())
                        if key not in stride_output:
                            continue
                        (_, s, phi, psi, asa, rasa) = stride_output[key]
                        if s not in secondary_structure_dict:
                            raise ValueError("unknown secondary structure! Add to dictionary!")
                        ss = np.zeros(len(secondary_structure_dict))
                        ss[secondary_structure_dict[s]] = 1
                        stride_array[index, :7] = ss
                        stride_array[index, 7] = phi
                        stride_array[index, 8] = psi
                        stride_array[index, 9] = asa
                        stride_array[index, 10] = rasa
                    np.save(stride_file, stride_array)
                cached = np.load(stride_file)
                for i, res in enumerate(protein.residues):
                    res.add_feature(Features.SECONDARY_STRUCTURE, cached[i, :7])
                    res.add_feature(Features.PHI, cached[i, 7])
                    res.add_feature(Features.PSI, cached[i, 8])
                    res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, cached[i, 9])
                    res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, cached[i, 10])
        time_spent = datetime.now() - overall_time
        print_info("took {0} seconds.".format(time_spent.seconds))
    def extract_feature(self):
        """
        Attach the normalized residue depth feature to every residue of the unbound ligands and receptors.

        Per protein an array with 2 columns is cached as ``<protein name>.npy`` inside the directory returned by
        ``_get_dir_name()``. When the file is missing, ``ResidueDepth`` is run on the first model of the protein's
        structure and the two values it reports per residue are stored; a value reported as None is replaced by 0,
        and residues missing from the ResidueDepth result keep zeros (both cases are reported on the console).
        The cached pair is passed through ``_normalize`` and added as ``Features.RESIDUE_DEPTH``.
        """
        print_info_nn(" >>> Adding residue depth for database {0} ... ".format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        cache_dir = self._get_dir_name()
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        for protein_complex in self._database.complexes.values():
            unbound = protein_complex.unbound_formation
            for protein in (unbound.ligand, unbound.receptor):
                residue_depth_file = cache_dir + protein.name + ".npy"
                if not os.path.exists(residue_depth_file):
                    # progress output: comma separated names, every 16th name ends the line
                    counter += 1
                    if counter <= 15:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        counter = 0
                        print_info("{0}".format(protein.name))

                    pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                    rd = ResidueDepth(protein.structure[0], pdb_file)
                    # zero-initialised, so rows that receive no value below are well defined
                    rd_array = np.zeros((len(protein.residues), 2))
                    for (i, res) in enumerate(protein.biopython_residues):
                        (_, _, c, (h, rn, ic)) = res.get_full_id()
                        key = (c, (h, rn, ic))
                        if key in rd:
                            rdv = rd[key]
                            # print(...) with a single argument gives the same output on Python 2 and Python 3,
                            # whereas the former print statement was a syntax error on Python 3
                            if rdv[0] is None:
                                rdv = (0, rdv[1])
                                print("WTH?")
                            if rdv[1] is None:
                                rdv = (rdv[0], 0)
                                print("WTH?")
                            rd_array[i, :2] = rdv
                        else:
                            # residue unknown to ResidueDepth: the row stays [0, 0]
                            print_error('WTH')

                    np.save(residue_depth_file, rd_array)
                surface_features = np.load(residue_depth_file)
                for i, res in enumerate(protein.residues):
                    res.add_feature(Features.RESIDUE_DEPTH, self._normalize(surface_features[i, :2]))
        print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
 def extract_feature(self):
     """
     Attach the plain D2 shape distribution to every residue of the unbound ligands and receptors.

     Distributions are cached per protein as ``<protein name>.npy`` inside the directory returned by
     ``_get_dir_name()``. A missing file is rebuilt: for each residue the residues found by a NeighborSearch within
     ``self.radius`` of its center are handed to ``_compute_distribution`` and the resulting row
     (``self.number_of_bins`` values) is stored.
     """
     seed(self.seed)
     counter = 0
     overall_time = datetime.now()
     print_info_nn(" >>> Adding D2 shape distribution for database {0} ... ".format(self._database.name))
     cache_dir = self._get_dir_name()
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)
     for complex_name in self._database.complexes.keys():
         unbound_formation = self._database.complexes[complex_name].unbound_formation
         for protein in (unbound_formation.ligand, unbound_formation.receptor):
             cache_prefix = cache_dir + protein.name
             cache_file = cache_prefix + ".npy"
             if not os.path.exists(cache_file):
                 counter += 1
                 # progress output: comma separated names, every 16th name ends the line
                 if counter > 15:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 else:
                     print_info_nn("{0}, ".format(protein.name))
                 neighbour_search = NeighborSearch(protein.atoms)
                 shape = (len(protein.residues), self.number_of_bins)
                 fresh_distributions = np.zeros(shape)
                 for position, residue in enumerate(protein.residues):
                     surrounding = neighbour_search.search(residue.center, self.radius, "R")
                     fresh_distributions[position, :] = self._compute_distribution(surrounding)
                 # np.save appends the ".npy" suffix itself
                 np.save(cache_prefix, fresh_distributions)
             distributions = np.load(cache_file)
             for position, residue in enumerate(protein.residues):
                 residue.add_feature(Features.D2_PLAIN_SHAPE_DISTRIBUTION, distributions[position, :])
     time_spent = datetime.now() - overall_time
     print_info("took {0} seconds.".format(time_spent.seconds))
Beispiel #13
0
 def extract_feature(self):
     """
     Attach the residue neighbourhood to every residue of the unbound ligands and receptors.

     The neighbourhood of a residue is the list of indices (positions in ``protein.residues``) of all residues,
     itself included, whose closest pair of coordinates is at most 7.5 apart. The lists of one protein are stored
     as the rows of a single array; shorter lists are padded with -1 at the end. The array is cached as
     ``<protein name>.npy`` inside the directory returned by ``_get_dir_name()`` and each row is added to its
     residue as ``Features.RESIDUE_NEIGHBOURHOOD``.
     """
     counter = 0
     print_info_nn(" >>> Adding Residue Neighbourhood ... ")
     overall_time = datetime.now()
     cache_dir = self._get_dir_name()
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)
     for protein_complex in self._database.complexes.values():
         unbound = protein_complex.unbound_formation
         for protein in (unbound.ligand, unbound.receptor):
             residue_neighbourhood_file = cache_dir + protein.name
             if not os.path.exists(residue_neighbourhood_file + ".npy"):
                 # progress output: comma separated names, every 16th name ends the line
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 neighbourhood = []
                 for query_residue in protein.residues:
                     # fetched once per query residue instead of once per residue pair
                     query_coordinates = query_residue.get_coordinates()
                     close_indices = []
                     for j, neighbour_residue in enumerate(protein.residues):
                         distance = cdist(query_coordinates, neighbour_residue.get_coordinates()).min()
                         if distance <= 7.5:
                             close_indices.append(j)
                     neighbourhood.append(close_indices)
                 max_length = max(len(indices) for indices in neighbourhood) if neighbourhood else 0
                 # -1 marks the unused tail of a row
                 neighbourhood_array = -np.ones((len(protein.residues), max_length))
                 for i, indices in enumerate(neighbourhood):
                     neighbourhood_array[i, :len(indices)] = indices
                 # np.save appends the ".npy" suffix itself
                 np.save(residue_neighbourhood_file, neighbourhood_array)
             neighbourhood_array = np.load(residue_neighbourhood_file + ".npy")
             for index, residue in enumerate(protein.residues):
                 residue.add_feature(Features.RESIDUE_NEIGHBOURHOOD, neighbourhood_array[index, :])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
 def extract_feature(self):
     """
     Add the DSSP derived features to the residues of every unbound ligand and receptor.

     DSSP is run once per protein. For every residue the elements of its DSSP record from index 2
     onwards are stored in columns 2-5 of a (number of residues, 6) array; residues without a DSSP
     record keep zeros. Columns 0 and 1 are never filled and stay zero. The array is cached as
     "<feature dir><protein name>.npy" and its columns 2-5 are attached as ACCESSIBLE_SURFACE_AREA,
     RELATIVE_ACCESSIBLE_SURFACE_AREA, PHI and PSI respectively.
     """
     print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
     overall_time = datetime.now()
     directory = self.__get_dir_name()
     if not os.path.exists(directory):
         os.mkdir(directory)
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
         for protein in proteins:
             dssp_file = directory + protein.name + ".npy"
             if not os.path.exists(dssp_file):
                 print_info_nn("... running DSSP for protein " + protein.name)
                 start_time = datetime.now()
                 dssp = DSSP(protein.structure[0], self._database.directory + pdb_directory + protein.name + ".pdb")
                 # np.zeros rather than np.ndarray: the latter leaves the memory uninitialised, which
                 # would write arbitrary values into the cache for every cell that is not assigned below.
                 dssp_array = np.zeros((len(protein.residues), 6))
                 for (i, res) in enumerate(protein.biopython_residues):
                     (_, _, cid, rid) = res.get_full_id()
                     key = (cid, rid)
                     if key in dssp:
                         dssp_array[i, 2:] = (dssp[key])[2:]
                     # else: no DSSP record for this residue, the row stays zero.
                 np.save(dssp_file, dssp_array)
                 print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
             dssp_features = np.load(dssp_file)
             for i, res in enumerate(protein.residues):
                 res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, dssp_features[i, 2])
                 res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, dssp_features[i, 3])
                 res.add_feature(Features.PHI, dssp_features[i, 4])
                 res.add_feature(Features.PSI, dssp_features[i, 5])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
    def extract_feature(self):
        """
        Add the D1_SURFACE_SHAPE_DISTRIBUTION feature to the residues of every unbound ligand and receptor.

        For each residue the residues returned by a neighbour search around its centre (radius
        self.radius) are filtered down to those that belong to the protein and whose
        RELATIVE_ACCESSIBLE_SURFACE_AREA feature is at least self.rASA_threshold. The residue itself is
        always included. The selection and the residue's centre are handed to _compute_distribution,
        whose result fills one row of a (number of residues, number_of_bins + 1) array. The array is
        cached as "<feature dir><protein name>.npy" and reused on later runs.

        The RELATIVE_ACCESSIBLE_SURFACE_AREA feature is read here, so it is expected to be attached
        to the residues before this method runs.
        """
        # presumably makes the sampling inside _compute_distribution reproducible -- TODO confirm
        seed(self.seed)
        print_info_nn(" >>> Adding D1 surface shape distribution for database {0} ... ".format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        directory = self._get_dir_name()
        if not os.path.exists(directory):
            os.makedirs(directory)
        for complex_name in self._database.complexes.keys():
            protein_complex = self._database.complexes[complex_name]
            proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
            for protein in proteins:
                shape_dist_file = directory + protein.name
                if not os.path.exists(shape_dist_file + ".npy"):
                    # Progress output: 15 names on one line, then a line break.
                    counter += 1
                    if counter <= 15:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        counter = 0
                        print_info("{0}".format(protein.name))
                    neighbour_search = NeighborSearch(protein.atoms)
                    biopython_residues = protein.biopython_residues
                    distributions = np.zeros((len(protein.residues), self.number_of_bins + 1))
                    for i, residue in enumerate(protein.residues):
                        nearby_residues = [biopython_residues[i]]
                        for nearby_residue in neighbour_search.search(residue.center, self.radius, "R"):
                            # One scan instead of "in" followed by "index"; hits outside the protein are skipped.
                            try:
                                neighbour_index = biopython_residues.index(nearby_residue)
                            except ValueError:
                                continue
                            # A separate name is essential here: rebinding "residue" would make the
                            # centre passed below belong to the last neighbour instead of residue i.
                            neighbour = protein.residues[neighbour_index]
                            if neighbour.get_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA) >= self.rASA_threshold:
                                nearby_residues.append(nearby_residue)
                        distributions[i, :] = self._compute_distribution(nearby_residues, residue.center)
                    np.save(shape_dist_file, distributions)
                distributions = np.load(shape_dist_file + ".npy")
                for i, residue in enumerate(protein.residues):
                    residue.add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, distributions[i, :])
        print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
Beispiel #16
0
    def _load(self, file_name=None):
        """
        Restore the object model from a pickle file.

        The file must contain the pickled 6-tuple (directory, complexes, residues,
        complexes_example_range, examples, example_complex); each element is assigned to the
        attribute of the same name, so self.directory is overwritten by the stored value.

        :param file_name: path of the pickle file. When None, the default location
            self.directory + pickle_directory + dbd4_object_model_file is used.
        """
        if file_name is None:
            object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
        else:
            object_model_file_name = file_name

        # Binary mode matches the "wb" used when writing; the context manager closes the file
        # even when unpickling fails.
        # NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
        with open(object_model_file_name, "rb") as f:
            print_info_nn("Loading the object model from {0} ... ".format(
                object_model_file_name))
            start_time = datetime.now()
            (self.directory, self.complexes, self.residues,
             self.complexes_example_range, self.examples,
             self.example_complex) = cPickle.load(f)
        # Collect immediately after the load, as the original code did.
        gc.collect()
        print_info("took {0} seconds.".format(
            (datetime.now() - start_time).seconds))
Beispiel #17
0
    def _save(self, file_name=None):
        """
        Write the object model to a pickle file.

        The 6-tuple (directory, complexes, residues, complexes_example_range, examples,
        example_complex) is pickled into the target file. The directory
        self.directory + pickle_directory is created first when it does not exist, regardless of
        whether file_name is given.

        :param file_name: path of the pickle file. When None, the default location
            self.directory + pickle_directory + dbd4_object_model_file is used.
        """
        pickle_path = self.directory + pickle_directory
        if not os.path.exists(pickle_path):
            os.mkdir(pickle_path)
        if file_name is None:
            object_model_file_name = pickle_path + dbd4_object_model_file
        else:
            object_model_file_name = file_name

        # The context manager closes the file even when pickling fails.
        with open(object_model_file_name, "wb") as f:
            print_info_nn("Saving the object model into {0} ... ".format(
                object_model_file_name))
            start_time = datetime.now()
            cPickle.dump((self.directory, self.complexes, self.residues,
                          self.complexes_example_range, self.examples,
                          self.example_complex), f)
        print_info("took {0} seconds.".format(
            (datetime.now() - start_time).seconds))
Beispiel #18
0
    def _load(self, file_name=None):
        """
        Load the attributes of the object model from a pickle file.

        The pickle is expected to hold a 6-tuple which is unpacked, in this order, into
        self.directory, self.complexes, self.residues, self.complexes_example_range, self.examples
        and self.example_complex. Note that self.directory is replaced by the stored value.

        :param file_name: path of the pickle file. When None, the file
            self.directory + pickle_directory + dbd4_object_model_file is read.
        """
        if file_name is None:
            object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
        else:
            object_model_file_name = file_name

        # Read in binary mode (the file is written with "wb") and let the context manager
        # close the handle even if cPickle.load raises.
        # NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
        with open(object_model_file_name, "rb") as f:
            print_info_nn("Loading the object model from {0} ... ".format(object_model_file_name))
            start_time = datetime.now()
            (self.directory,
             self.complexes,
             self.residues,
             self.complexes_example_range,
             self.examples,
             self.example_complex) = cPickle.load(f)
        # Collect immediately after the load, as the original code did.
        gc.collect()
        print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
Beispiel #19
0
    def _save(self, file_name=None):
        """
        Save the attributes of the object model in pickle format.

        self.directory, self.complexes, self.residues, self.complexes_example_range, self.examples
        and self.example_complex are pickled together as one tuple. The directory
        self.directory + pickle_directory is created beforehand if it is missing, even when an
        explicit file_name is given.

        :param file_name: path of the pickle file. When None, the file
            self.directory + pickle_directory + dbd4_object_model_file is written.
        """
        pickle_path = self.directory + pickle_directory
        if not os.path.exists(pickle_path):
            os.mkdir(pickle_path)
        if file_name is None:
            object_model_file_name = pickle_path + dbd4_object_model_file
        else:
            object_model_file_name = file_name

        object_model = (self.directory,
                        self.complexes,
                        self.residues,
                        self.complexes_example_range,
                        self.examples,
                        self.example_complex)
        # The context manager guarantees the handle is closed even if cPickle.dump raises.
        with open(object_model_file_name, "wb") as f:
            print_info_nn("Saving the object model into {0} ... ".format(object_model_file_name))
            start_time = datetime.now()
            cPickle.dump(object_model, f)
        print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
    def extract_feature(self):
        """
        Add the B_VALUE feature to the residues of every unbound ligand and receptor.

        The value stored for a residue is the largest B factor among its atoms. Per protein the
        values are cached as "<feature dir><protein name>.npy" and reused on later runs.
        """
        counter = 0
        overall_time = datetime.now()
        # The message has no placeholder, so no format() call is needed.
        print_info_nn(" >>> Adding B Factor ... ")
        directory = self._get_dir_name()
        if not os.path.exists(directory):
            os.makedirs(directory)
        for complex_name in self._database.complexes.keys():
            protein_complex = self._database.complexes[complex_name]
            proteins = [
                protein_complex.unbound_formation.ligand,
                protein_complex.unbound_formation.receptor
            ]
            for protein in proteins:
                b_factor_filename = directory + protein.name
                if not os.path.exists(b_factor_filename + ".npy"):
                    # Progress output: 15 names on one line, then a line break.
                    counter += 1
                    if counter <= 15:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        counter = 0
                        print_info("{0}".format(protein.name))

                    b_factor_array = np.zeros(len(protein.residues))
                    for index, residue in enumerate(protein.biopython_residues):
                        b_factor_array[index] = max(
                            atom.get_bfactor() for atom in residue)

                    np.save(b_factor_filename, b_factor_array)
                b_factor_array = np.load(b_factor_filename + ".npy")
                # Index explicitly so that a cache shorter than the residue list still fails loudly.
                for i, residue in enumerate(protein.residues):
                    residue.add_feature(Features.B_VALUE, b_factor_array[i])
        print_info("took {0} seconds.".format(
            (datetime.now() - overall_time).seconds))
Beispiel #21
0
    def __extract_examples(self):
        """
        Collect all positive and negative examples from the DBD4 dataset.

        In protein complex C, with receptor R and ligand L, two residues r on R and r' on L are
        considered a positive example if, in the bound form, their closest pair of atoms is nearer
        than self.interaction_threshold. All other pairs (r, r') are negative examples. Pairs in which
        either residue has no unbound counterpart are skipped, because the bound and unbound
        formations may have slightly different residues.

        Side effects:
          * self.examples is extended with (unbound ligand residue index, unbound receptor residue
            index, label) tuples, label being +1 or -1; per complex all positives precede the negatives.
          * self.complexes_example_range[complex_name] is set to
            (start, start + #positives, start + #positives + #negatives).
          * self.example_complex maps "<ligand index>_<receptor index>" to the complex name.
        """
        print_info(
            "Finding the positive and negative examples in DBD4 ... {0}".
            format(self.positives_size))
        start_time = datetime.now()
        counter = 1
        start_index = 0
        neg_no = 0
        pos_no = 0
        for complex_name in self.complexes.keys():
            print_info_nn("{0}/{1}... processing complex {2}".format(
                counter, len(self.complexes), complex_name))
            protein_complex = self.complexes[complex_name]
            bound_ligand = protein_complex.bound_formation.ligand
            bound_receptor = protein_complex.bound_formation.receptor
            bound_ligand_residues = bound_ligand.residues
            bound_receptor_residues = bound_receptor.residues
            ligand_b2u = protein_complex.ligand_bound_to_unbound
            receptor_b2u = protein_complex.receptor_bound_to_unbound
            # Atom coordinates of every receptor residue, gathered once per complex instead of
            # once per (ligand residue, receptor residue) pair.
            receptor_atoms = [
                [atom.get_coord() for atom in bio_residue.get_list()]
                for bio_residue in bound_receptor.biopython_residues
            ]
            pos = []
            neg = []
            for i, ligand_bio_residue in enumerate(bound_ligand.biopython_residues):
                bound_ligand_residue = bound_ligand_residues[i]
                # Without an unbound counterpart no pair involving this residue can become an example.
                if bound_ligand_residue not in ligand_b2u:
                    continue
                unbound_ligand_res = ligand_b2u[bound_ligand_residue]
                l_atoms = [
                    atom.get_coord() for atom in ligand_bio_residue.get_list()
                ]
                for j, r_atoms in enumerate(receptor_atoms):
                    bound_receptor_residue = bound_receptor_residues[j]
                    if bound_receptor_residue not in receptor_b2u:
                        continue
                    unbound_receptor_res = receptor_b2u[bound_receptor_residue]
                    dist_mat = cdist(l_atoms, r_atoms)
                    unbound_ligand_res_index = self.__get_residue_index(
                        unbound_ligand_res)
                    unbound_receptor_res_index = self.__get_residue_index(
                        unbound_receptor_res)
                    if dist_mat.min() < self.interaction_threshold:
                        pos.append((unbound_ligand_res_index,
                                    unbound_receptor_res_index, +1))
                    else:
                        neg.append((unbound_ligand_res_index,
                                    unbound_receptor_res_index, -1))
            self.examples.extend(pos)
            self.examples.extend(neg)
            pos_no += len(pos)
            neg_no += len(neg)
            self.complexes_example_range[complex_name] = (
                start_index,
                start_index + len(pos),
                start_index + len(neg) + len(pos))
            print_info(" ( {0:03d}/{1:05d} ) -{2}".format(
                len(pos), len(neg),
                self.complexes_example_range[complex_name]))
            start_index += len(pos) + len(neg)
            counter += 1
            for e in pos + neg:
                self.example_complex["{0}_{1}".format(e[0], e[1])] = complex_name

        print_info("Finding examples in DBD4 took " +
                   str((datetime.now() - start_time).seconds) + " seconds. ")
        print_info("The total number of examples found: " +
                   str(pos_no + neg_no))
 def extract_feature(self):
     """
     Add the HALF_SPHERE_EXPOSURE feature to the residues of every unbound ligand and receptor.

     For each residue the side chain vector returned by get_side_chain_vector splits its neighbours
     (taken from the RESIDUE_NEIGHBOURHOOD feature, so that feature is expected to be attached
     already) into an "up" half, where the angle between the vector and the direction to the
     neighbour's CA atom is below pi/2, and a "down" half. Per half the neighbours are counted per
     amino acid type; the residue's own type is counted once in both halves. Each count vector is
     divided by (1 + number of neighbours in that half). Residues without a side chain vector get
     NaN in every entry.

     The result per protein is an array of shape (number of residues, 2 * number of amino acid
     types), "up" columns first, cached as "<feature dir><protein name>.npy".
     """
     counter = 0
     overall_time = datetime.now()
     number_of_amino_acids = len(standard_aa_names)
     # The message has no placeholder, so no format() call is needed.
     print_info_nn(" >>> Adding Half Surface Exposure ... ")
     directory = self._get_dir_name()
     if not os.path.exists(directory):
         os.makedirs(directory)
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [
             protein_complex.unbound_formation.ligand,
             protein_complex.unbound_formation.receptor
         ]
         for protein in proteins:
             hse_file = directory + protein.name
             if not os.path.exists(hse_file + ".npy"):
                 # Progress output: 15 names on one line, then a line break.
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 biopython_residues = protein.biopython_residues
                 number_of_residues = len(biopython_residues)
                 # un/dn: neighbour counts per residue in the up/down half.
                 # uc/dc: per amino acid type counts, one column per residue.
                 un = np.zeros(number_of_residues)
                 dn = np.zeros(number_of_residues)
                 uc = np.zeros((number_of_amino_acids, number_of_residues))
                 dc = np.zeros((number_of_amino_acids, number_of_residues))
                 for index, residue in enumerate(biopython_residues):
                     u = self.get_side_chain_vector(residue)
                     if u is None:
                         un[index] = np.nan
                         dn[index] = np.nan
                         uc[:, index] = np.nan
                         dc[:, index] = np.nan
                         continue
                     own_type = self._residue_index_table[residue.get_resname()]
                     uc[own_type, index] += 1
                     dc[own_type, index] += 1
                     neighbours_indices = protein.residues[index].get_feature(
                         Features.RESIDUE_NEIGHBOURHOOD)
                     for neighbour_index in neighbours_indices:
                         # -1 is treated as the end marker of the neighbour list.
                         if neighbour_index == -1:
                             break
                         neighbour_residue = biopython_residues[int(neighbour_index)]
                         if not (is_aa(neighbour_residue) and neighbour_residue.has_id('CA')):
                             continue
                         neighbour_vector = neighbour_residue['CA'].get_vector()
                         neighbour_type = self._residue_index_table[
                             neighbour_residue.get_resname()]
                         if u[1].angle(neighbour_vector - u[0]) < np.pi / 2.0:
                             un[index] += 1
                             uc[neighbour_type, index] += 1
                         else:
                             dn[index] += 1
                             dc[neighbour_type, index] += 1
                 # Normalise by (1 + neighbour count) and transpose to one row per residue.
                 uc = (uc / (1.0 + un)).T
                 dc = (dc / (1.0 + dn)).T
                 hse_array = np.hstack((uc, dc))
                 np.save(hse_file, hse_array)
             hse = np.load(hse_file + ".npy")
             for i, residue in enumerate(protein.residues):
                 residue.add_feature(Features.HALF_SPHERE_EXPOSURE, hse[i, :])
     print_info("took {0} seconds.".format(
         (datetime.now() - overall_time).seconds))