def extract_feature(self):
    """
    Attach the D2 category based shape distribution to every unbound residue.

    For the ligand and receptor of each complex in self._database, a
    (number of residues x self.number_of_bins) array is built by calling
    self._compute_distribution on the residues that NeighborSearch returns
    within self.radius of each residue center. The array is cached as
    "<self._get_dir_name()><protein name>.npy"; when that file already exists
    the computation is skipped. The cached array is then loaded and row i is
    added to residue i under Features.D2_CATEGORY_SHAPE_DISTRIBUTION.
    """
    seed(self.seed)
    progress = 0
    print_info_nn(" >>> Adding D2 category based shape distribution for database {0} ... ".format(self._database.name))
    started_at = datetime.now()
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        unbound = self._database.complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_stem = self._get_dir_name() + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: 15 names on one line, the 16th ends the line.
                progress = (progress + 1) % 16
                if progress:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    print_info("{0}".format(protein.name))
                searcher = NeighborSearch(protein.atoms)
                computed = np.zeros((len(protein.residues), self.number_of_bins))
                for row, residue in enumerate(protein.residues):
                    neighbours = searcher.search(residue.center, self.radius, "R")
                    computed[row] = self._compute_distribution(neighbours)
                # np.save appends the ".npy" extension to cache_stem itself.
                np.save(cache_stem, computed)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.D2_CATEGORY_SHAPE_DISTRIBUTION, cached[row])
    elapsed = datetime.now() - started_at
    print_info("took {0} seconds.".format(elapsed.seconds))
    def extract_feature(self):
        """
        Attach the D1 surface-atom shape distribution to every unbound residue.

        For the ligand and receptor of each complex, the surface atoms and
        normals of the protein's PDB file are obtained with get_surface_atoms
        and self.get_distributions is evaluated at every residue center. The
        resulting (number of residues x 2 * (self.number_of_bins + 1)) array is
        cached as "<self._get_dir_name()><protein name>.npy"; an existing cache
        file is reused. Row i of the cached array is added to residue i under
        Features.D1_SURFACE_SHAPE_DISTRIBUTION.
        """
        seed(self.seed)
        print_info_nn(" >>> Adding D1 surface atoms shape distribution for {0} ... ".format(self._database.name))
        started_at = datetime.now()
        if not os.path.exists(self._get_dir_name()):
            os.makedirs(self._get_dir_name())
        for complex_name in self._database.complexes.keys():
            unbound = self._database.complexes[complex_name].unbound_formation
            for protein in (unbound.ligand, unbound.receptor):
                cache_stem = self._get_dir_name() + protein.name
                cache_path = cache_stem + ".npy"
                if not os.path.exists(cache_path):
                    print_info("{0}".format(protein.name))
                    pdb_path = self._database.directory + pdb_directory + protein.name + '.pdb'
                    surface, normals = get_surface_atoms(pdb_path)
                    width = 2 * (self.number_of_bins + 1)
                    computed = np.zeros((len(protein.residues), width))
                    for row, residue in enumerate(protein.residues):
                        computed[row] = self.get_distributions(residue.center, surface, normals)
                    # np.save appends the ".npy" extension to cache_stem itself.
                    np.save(cache_stem, computed)
                cached = np.load(cache_path)
                for row, residue in enumerate(protein.residues):
                    residue.add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, cached[row])
        elapsed = datetime.now() - started_at
        print_info("took {0} seconds.".format(elapsed.seconds))
Example #3
0
 def extract_feature(self):
     """
     Attach the PSAIA protrusion-index features to every unbound residue.

     For the ligand and receptor of each complex, run_psaia is executed on the
     protein's PDB file and the values it reports per residue are passed through
     self._normalize_features into one row of a 28-column array
     (5 + 5 + 5 + 6 + 6 + 1). The array is cached as
     "<dir><protein name>.npy"; an existing cache file is reused. Columns 21
     onward of row i are added to residue i under Features.PROTRUSION_INDEX.

     NOTE(review): when run_psaia returns None an all-zero array is still
     written to the cache, so a failed PSAIA run is not retried on the next
     call -- confirm this is intended.
     """
     # The original message said "secondary structure" (copy-paste slip).
     print_info_nn(" >>> Adding protrusion index for database {0} ... ".format(self._database.name))
     overall_time = datetime.now()
     counter = 0
     number_of_columns = 5 + 5 + 5 + 6 + 6 + 1
     if not os.path.exists(self.__get_dir_name()):
         # makedirs (unlike mkdir) also creates missing parent directories,
         # matching the other feature extractors.
         os.makedirs(self.__get_dir_name())
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
         for protein in proteins:
             protrusion_file = self.__get_dir_name() + protein.name
             if not os.path.exists(protrusion_file + ".npy"):
                 # Progress output: 15 names on one line, the 16th ends the line.
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                 result_dict = run_psaia(pdb_file)
                 # Rows of residues PSAIA did not report stay at zero.
                 protrusion_array = np.zeros((len(protein.residues), number_of_columns))
                 if result_dict is not None:
                     for index, residue in enumerate(protein.biopython_residues):
                         key = self.get_residue_id(residue.get_full_id())
                         if key in result_dict:
                             values = result_dict[key]
                             protrusion_array[index, :] = self._normalize_features(*values)
                         else:
                             print('key not found in PSAIA processing!')
                 # np.save appends the ".npy" extension itself.
                 np.save(protrusion_file, protrusion_array)
             protrusion_array = np.load(protrusion_file + ".npy")
             for index, residue in enumerate(protein.residues):
                 residue.add_feature(Features.PROTRUSION_INDEX, protrusion_array[index, 21:])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
    def extract_feature(self):
        """
        Attach the STRIDE-derived features to every unbound residue.

        For the ligand and receptor of each complex, stride_dict_from_pdb_file
        is run on the protein's PDB file and an (number of residues x 11) array
        is filled: columns 0-6 hold a one-hot encoding of the secondary
        structure code (indexed through ss_abbreviations), columns 7-10 hold
        phi, psi, asa and rasa. Residues missing from the STRIDE result keep an
        all-zero row. The array is cached as "<dir><protein name>.npy" and an
        existing cache file is reused. The cached values are then added to the
        residues as SECONDARY_STRUCTURE, PHI, PSI, ACCESSIBLE_SURFACE_AREA and
        RELATIVE_ACCESSIBLE_SURFACE_AREA features.

        Raises:
            ValueError: if STRIDE reports a code that is not in ss_abbreviations.
        """
        ss_index = dict((abbreviation, position) for position, abbreviation in enumerate(ss_abbreviations))
        print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
        started_at = datetime.now()
        progress = 0
        if not os.path.exists(self.__get_dir_name()):
            os.mkdir(self.__get_dir_name())
        for complex_name in self._database.complexes.keys():
            unbound = self._database.complexes[complex_name].unbound_formation
            for protein in (unbound.ligand, unbound.receptor):
                cache_path = self.__get_dir_name() + protein.name + ".npy"
                if not os.path.exists(cache_path):
                    # Progress output: 15 names on one line, the 16th ends the line.
                    progress = (progress + 1) % 16
                    if progress:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        print_info("{0}".format(protein.name))

                    pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                    residue_count = len(protein.residues)
                    stride_records = stride_dict_from_pdb_file(pdb_file)
                    table = np.zeros((residue_count, 11))
                    for row, bio_residue in enumerate(protein.biopython_residues):
                        key = self.get_residue_id(bio_residue.get_full_id())
                        if key not in stride_records:
                            continue
                        (_, s, phi, psi, asa, rasa) = stride_records[key]
                        if s not in ss_index:
                            raise ValueError("unknown secondary structure! Add to dictionary!")
                        one_hot = np.zeros(len(ss_index))
                        one_hot[ss_index[s]] = 1
                        table[row, :7] = one_hot
                        table[row, 7] = phi
                        table[row, 8] = psi
                        table[row, 9] = asa
                        table[row, 10] = rasa
                    np.save(cache_path, table)
                loaded = np.load(cache_path)
                for row, res in enumerate(protein.residues):
                    record = loaded[row]
                    res.add_feature(Features.SECONDARY_STRUCTURE, record[:7])
                    res.add_feature(Features.PHI, record[7])
                    res.add_feature(Features.PSI, record[8])
                    res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, record[9])
                    res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, record[10])
        elapsed = datetime.now() - started_at
        print_info("took {0} seconds.".format(elapsed.seconds))
Example #5
0
    def get_rfpp(self):
        """
        Compute the rank of the first positive prediction (RFPP) per complex.

        For every fold in self.pyml_result the examples are grouped by complex
        (self.database.example_complex[patternID]) and ordered by decreasing
        fold.decisionFunc. The 1-based position of the first example whose
        fold.Y and fold.givenY are both positive is recorded for that complex.
        The per-complex percentage and the mean percentage are printed with
        print_info.

        Returns:
            None when self.pyml_result is None; otherwise a dict mapping
            complex name to (rank, number of examples of that complex in the
            fold). Complexes without any such example are absent from the dict.
        """
        if self.pyml_result is None:
            return None

        rfpp = {}
        for fold in self.pyml_result:
            complex_names = [self.database.example_complex[fold.patternID[i]] for i in range(len(fold.L))]

            # First pass: number of examples per complex, needed to size the tables.
            complex_sizes = {}
            for complex_name in complex_names:
                complex_sizes[complex_name] = complex_sizes.get(complex_name, 0) + 1

            # Second pass: one (Y, givenY, decision value) row per example.
            complex_performance_map = {}
            rows_filled = {}
            for i, complex_name in enumerate(complex_names):
                if complex_name not in complex_performance_map:
                    complex_performance_map[complex_name] = ndarray((complex_sizes[complex_name], 3))
                    rows_filled[complex_name] = 0
                row = rows_filled[complex_name]
                complex_performance_map[complex_name][row, :] = [
                    int(fold.Y[i]), int(fold.givenY[i]), fold.decisionFunc[i]]
                rows_filled[complex_name] = row + 1

            for complex_name in complex_performance_map:
                perf_table = complex_performance_map[complex_name]
                # Highest decision value first.
                perf_table = perf_table[(-perf_table[:, 2]).argsort()]
                for i in range(perf_table.shape[0]):
                    if perf_table[i, 0] > 0 and perf_table[i, 1] > 0:
                        rfpp[complex_name] = (i + 1, perf_table.shape[0])
                        break

        total = 0.0
        for complex_name in rfpp:
            rank, n = rfpp[complex_name]
            # Float division: under Python 2 "(rank * 100) / n" truncates before ceil.
            percent = math.ceil(rank * 100.0 / n)
            total += percent
            print_info("{0} : {1}".format(complex_name, percent))

        # The original printed the sum of the percentages as the "average".
        average = total / len(rfpp) if rfpp else 0
        print_info("Average RFPP {0}".format(average))
        return rfpp
 def extract_feature(self):
     """
     Attach the half sphere exposure (HSE) features to every unbound residue.

     For each residue with a side-chain vector (self.get_side_chain_vector),
     the neighbours listed in its Features.RESIDUE_NEIGHBOURHOOD feature are
     split into an "up" and a "down" group depending on whether the angle
     between u[1] and (neighbour CA - u[0]) is below pi/2. Per-amino-acid
     counts of both groups (including the residue itself) are divided by
     1 + the group size and stacked into a
     (number of residues x 2 * len(standard_aa_names)) array. Residues
     without a side-chain vector get NaN rows. The array is cached as
     "<dir><protein name>.npy" (an existing file is reused) and row i is added
     to residue i under Features.HALF_SPHERE_EXPOSURE.

     NOTE(review): presumably u[0] is the CA position and u[1] the side-chain
     direction -- confirm against get_side_chain_vector.
     """
     counter = 0
     overall_time = datetime.now()
     number_of_amino_acids = len(standard_aa_names)
     # The original format string had no placeholder, so the database name
     # passed to format() was silently dropped.
     print_info_nn(" >>> Adding Half Sphere Exposure for database {0} ... ".format(self._database.name))
     if not os.path.exists(self._get_dir_name()):
         os.makedirs(self._get_dir_name())
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
         for protein in proteins:
             hse_file = self._get_dir_name() + protein.name
             if not os.path.exists(hse_file + ".npy"):
                 # Progress output: 15 names on one line, the 16th ends the line.
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 number_of_residues = len(protein.biopython_residues)
                 # un/dn: neighbour counts in the up/down half sphere.
                 # uc/dc: the same counts broken down by amino-acid type.
                 un = np.zeros(number_of_residues)
                 dn = np.zeros(number_of_residues)
                 uc = np.zeros((number_of_amino_acids, number_of_residues))
                 dc = np.zeros((number_of_amino_acids, number_of_residues))
                 for index, residue in enumerate(protein.biopython_residues):
                     u = self.get_side_chain_vector(residue)
                     if u is None:
                         un[index] = np.nan
                         dn[index] = np.nan
                         uc[:, index] = np.nan
                         dc[:, index] = np.nan
                         continue
                     # The residue itself is counted in both half spheres.
                     residue_index = self._residue_index_table[residue.get_resname()]
                     uc[residue_index, index] += 1
                     dc[residue_index, index] += 1
                     neighbours_indices = protein.residues[index].get_feature(Features.RESIDUE_NEIGHBOURHOOD)
                     for neighbour_index in neighbours_indices:
                         # -1 marks the end of the neighbour list.
                         if neighbour_index == -1:
                             break
                         neighbour_residue = protein.biopython_residues[int(neighbour_index)]
                         if is_aa(neighbour_residue) and neighbour_residue.has_id('CA'):
                             neighbour_vector = neighbour_residue['CA'].get_vector()
                             residue_index = self._residue_index_table[neighbour_residue.get_resname()]
                             if u[1].angle((neighbour_vector - u[0])) < np.pi / 2.0:
                                 un[index] += 1
                                 uc[residue_index, index] += 1
                             else:
                                 dn[index] += 1
                                 dc[residue_index, index] += 1
                 # Normalise per residue and transpose to one row per residue.
                 uc = (uc / (1.0 + un)).T
                 dc = (dc / (1.0 + dn)).T
                 hse_array = np.hstack((uc, dc))
                 # np.save appends the ".npy" extension itself.
                 np.save(hse_file, hse_array)
             hse = np.load(hse_file + ".npy")
             for i, residue in enumerate(protein.residues):
                 residue.add_feature(Features.HALF_SPHERE_EXPOSURE, hse[i, :])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
    def extract_feature(self):
        """
        Attach the D1 surface shape distribution to every unbound residue.

        For residue i of each ligand/receptor, the residues found by
        NeighborSearch within self.radius of its center are filtered to those
        whose RELATIVE_ACCESSIBLE_SURFACE_AREA feature is at least
        self.rASA_threshold; residue i itself is always included.
        self._compute_distribution is then evaluated on that set around the
        center of residue i, giving one row of a
        (number of residues x self.number_of_bins + 1) array. The array is
        cached as "<dir><protein name>.npy" (an existing file is reused) and
        row i is added to residue i under
        Features.D1_SURFACE_SHAPE_DISTRIBUTION.

        NOTE: earlier versions passed the center of the last neighbour instead
        of the center of residue i; cache files written by them are stale.
        """
        seed(self.seed)
        print_info_nn(
            " >>> Adding D1 surface shape distribution for database {0} ... ".
            format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        if not os.path.exists(self._get_dir_name()):
            os.makedirs(self._get_dir_name())
        for complex_name in self._database.complexes.keys():
            protein_complex = self._database.complexes[complex_name]
            proteins = [
                protein_complex.unbound_formation.ligand,
                protein_complex.unbound_formation.receptor
            ]
            for protein in proteins:
                shape_dist_file = self._get_dir_name() + protein.name
                if not os.path.exists(shape_dist_file + ".npy"):
                    # Progress output: 15 names on one line, the 16th ends the line.
                    counter += 1
                    if counter <= 15:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        counter = 0
                        print_info("{0}".format(protein.name))
                    neighbour_search = NeighborSearch(protein.atoms)
                    distributions = np.zeros(
                        (len(protein.residues), self.number_of_bins + 1))
                    for i in range(len(protein.residues)):
                        center_residue = protein.residues[i]
                        nearby_residues = [protein.biopython_residues[i]]
                        candidates = neighbour_search.search(
                            center_residue.center, self.radius, "R")
                        for nearby_residue in candidates:
                            # A single index() scan replaces the former
                            # "not in" test followed by index().
                            try:
                                neighbour_index = protein.biopython_residues.index(
                                    nearby_residue)
                            except ValueError:
                                continue
                            # Separate name: the original rebound "residue"
                            # here and so lost the center of residue i.
                            neighbour = protein.residues[neighbour_index]
                            if neighbour.get_feature(
                                    Features.RELATIVE_ACCESSIBLE_SURFACE_AREA
                            ) >= self.rASA_threshold:
                                nearby_residues.append(nearby_residue)
                        distributions[i, :] = self._compute_distribution(
                            nearby_residues, center_residue.center)
                    # np.save appends the ".npy" extension itself.
                    np.save(shape_dist_file, distributions)
                distributions = np.load(shape_dist_file + ".npy")
                for i, residue in enumerate(protein.residues):
                    residue.add_feature(
                        Features.D1_SURFACE_SHAPE_DISTRIBUTION,
                        distributions[i, :])
        print_info("took {0} seconds.".format(
            (datetime.now() - overall_time).seconds))
 def extract_feature(self):
     """
     Attach the DSSP-derived features to every unbound residue.

     For the ligand and receptor of each complex, DSSP is run on the first
     model of the protein structure and elements [2:] of each residue's DSSP
     record are stored in columns 2-5 of an (number of residues x 6) array.
     Residues missing from the DSSP result keep zeros. The array is cached as
     "<dir><protein name>.npy" and an existing cache file is reused. Columns
     2, 3, 4 and 5 of row i are added to residue i as
     ACCESSIBLE_SURFACE_AREA, RELATIVE_ACCESSIBLE_SURFACE_AREA, PHI and PSI.

     NOTE(review): this assumes a DSSP record of the form
     (_, s, ASA, rASA, phi, psi), i.e. exactly four values after index 2 --
     confirm against the installed Biopython version.
     """
     print_info_nn(
         " >>> Adding secondary structure for database {0} ... ".format(
             self._database.name))
     overall_time = datetime.now()
     if not os.path.exists(self.__get_dir_name()):
         # makedirs (unlike mkdir) also creates missing parent directories.
         os.makedirs(self.__get_dir_name())
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [
             protein_complex.unbound_formation.ligand,
             protein_complex.unbound_formation.receptor
         ]
         for protein in proteins:
             dssp_file = self.__get_dir_name() + protein.name + ".npy"
             if not os.path.exists(dssp_file):
                 print_info_nn("... running DSSP for protein " +
                               protein.name)
                 start_time = datetime.now()
                 dssp = DSSP(
                     protein.structure[0], self._database.directory +
                     pdb_directory + protein.name + ".pdb")
                 # Zero-initialised: np.ndarray left columns 0-1 (never
                 # written) as uninitialised memory in the cache file, and
                 # zeros are also the default for residues DSSP skipped.
                 dssp_array = np.zeros((len(protein.residues), 6))
                 for (i, res) in enumerate(protein.biopython_residues):
                     (_, _, cid, rid) = res.get_full_id()
                     key = (cid, rid)
                     if key in dssp:
                         dssp_array[i, 2:] = (dssp[key])[2:]
                 np.save(dssp_file, dssp_array)
                 print_info("took {0} seconds.".format(
                     (datetime.now() - start_time).seconds))
             dssp = np.load(dssp_file)
             for i, res in enumerate(protein.residues):
                 # column layout: (_, s, ASA, rASA, phi, psi)
                 res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, dssp[i, 2])
                 res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
                                 dssp[i, 3])
                 res.add_feature(Features.PHI, dssp[i, 4])
                 res.add_feature(Features.PSI, dssp[i, 5])
     print_info("took {0} seconds.".format(
         (datetime.now() - overall_time).seconds))
Example #9
0
 def __compute_profiles(self, db='nr', niter=3):
     """
     Run PSI-BLAST where needed and attach the profile features to the residues.

     For the ligand and receptor of each complex, if "<pssm dir><name>.mat" does
     not exist a shell command is built that changes into psiblast_db_folder and
     runs psiblast_executable on the protein's FASTA file; the command and its
     outcome are reported. The ".mat" file is then parsed and column i of the
     PSSM, the PSFM and their windowed versions is normalised with
     self._normalize and added to residue i.

     :param db: value passed to the "-db" option of the PSI-BLAST command.
     :param niter: value passed to the "-num_iterations" option.
     """
     print_info_nn(" >>> Adding the profile features for dataset {0} ...".format(self._database.name))
     began = datetime.now()
     command_template = "cd {4} \n " \
                        "{5} " \
                        "-query {0} -db {1} -out {2}.psi.txt -num_iterations {3} -out_ascii_pssm {2}.mat"
     for complex_name in self._database.complexes.keys():
         unbound = self._database.complexes[complex_name].unbound_formation
         for protein in (unbound.ligand, unbound.receptor):
             fasta_file = self._database.directory + unbound_sequence_directory + protein.name + ".fasta"
             output_file = self._database.directory + pssm_directory + protein.name
             matrix_file = output_file + ".mat"
             if not os.path.exists(matrix_file):
                 print_info("... processing protein {0} ...    ".format(protein.name))
                 command = command_template.format(
                     fasta_file, db, output_file, niter, psiblast_db_folder, psiblast_executable)
                 print_info(command)
                 error_code = os.system(command)
                 if error_code != 0:
                     print_error('Failed with error code {0}'.format(error_code))
                 else:
                     print_info('Successful!')
             # Parsed even after a failed run, exactly as before.
             pssm, psfm, _info = ProfileExtractor.__parse_pssm_file(matrix_file)
             wpssm = ProfileExtractor.__get_wpsm(pssm)
             wpsfm = ProfileExtractor.__get_wpsm(psfm)
             profile_matrices = (
                 (Features.POSITION_SPECIFIC_SCORING_MATRIX, pssm),
                 (Features.POSITION_SPECIFIC_FREQUENCY_MATRIX, psfm),
                 (Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX, wpssm),
                 (Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX, wpsfm),
             )
             for column, res in enumerate(protein.residues):
                 for feature, matrix in profile_matrices:
                     res.add_feature(feature, self._normalize(matrix[:, column]))
     elapsed = datetime.now() - began
     print_info("took {0} seconds.".format(elapsed.seconds))
Example #10
0
 def __save_sequences_to_fasta(self):
     """
     Write the sequence of every unbound ligand and receptor to a FASTA file.

     The file is "<database dir><unbound_sequence_directory><name>.fasta" and
     holds a ">name" header line followed by the sequence. Proteins whose file
     already exists are skipped.
     """
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
         for protein in proteins:
             fasta_file = self._database.directory + unbound_sequence_directory + protein.name + ".fasta"
             if os.path.exists(fasta_file):
                 continue
             print_info("... Saving sequence for protein " + protein.name)
             # "with" closes the handle even if a write raises; the file is
             # only written, so plain "w" is enough.
             with open(fasta_file, "w") as fasta:
                 fasta.write(">{0}\n".format(protein.name))
                 fasta.write(protein.sequence + "\n")
Example #11
0
    def __extract_examples(self):
        """
        This function returns the set of all positive and negative examples from DBD4 dataset. In protein complex C,
        with receptor R and ligand L, two residues r on R and r' on L are considered as a positive example if in the
        bound form they are nearer than the threshold distance. All other pairs (r,r') with r on R and r' on L are
        considered as negative examples. Extracted examples are saved in self.examples

        Only pairs whose two bound residues both have an unbound counterpart are used. Each example is a tuple
        (unbound ligand residue index, unbound receptor residue index, +1 or -1). For every complex the positives are
        appended before the negatives, and self.complexes_example_range[complex_name] records
        (start, start + number of positives, start + number of positives and negatives). self.example_complex maps
        "<ligand index>_<receptor index>" to the complex name.
        """
        print_info("Finding the positive and negative examples in DBD4 ... {0}".format(self.positives_size))
        start_time = datetime.now()
        counter = 1
        start_index = 0
        neg_no = 0
        pos_no = 0
        for complex_name in self.complexes.keys():
            print_info_nn("{0}/{1}... processing complex {2}".format(counter, len(self.complexes), complex_name))
            protein_complex = self.complexes[complex_name]
            bound_ligand = protein_complex.bound_formation.ligand
            bound_receptor = protein_complex.bound_formation.receptor
            bound_ligand_residues = bound_ligand.residues
            bound_receptor_residues = bound_receptor.residues
            ligand_b2u = protein_complex.ligand_bound_to_unbound
            receptor_b2u = protein_complex.receptor_bound_to_unbound
            # Receptor coordinates are the same for every ligand residue: build them once per complex
            # instead of once per residue pair.
            receptor_coords = [[atom.get_coord() for atom in residue.get_list()]
                               for residue in bound_receptor.biopython_residues]
            pos = []
            neg = []
            for i, bound_ligand_residue in enumerate(bound_ligand.biopython_residues):
                # if the residues have an unbound counterpart
                # this is due to the fact that the unbound and bound formations may have slightly different residues
                if bound_ligand_residues[i] not in ligand_b2u:
                    continue
                unbound_ligand_res = ligand_b2u[bound_ligand_residues[i]]
                l_atoms = [atom.get_coord() for atom in bound_ligand_residue.get_list()]
                for j, r_atoms in enumerate(receptor_coords):
                    if bound_receptor_residues[j] not in receptor_b2u:
                        continue
                    unbound_receptor_res = receptor_b2u[bound_receptor_residues[j]]
                    # The index lookups stay inside the pair loop so they happen in the original order.
                    unbound_ligand_res_index = self.__get_residue_index(unbound_ligand_res)
                    unbound_receptor_res_index = self.__get_residue_index(unbound_receptor_res)
                    # The distance matrix is only computed for pairs that are actually used.
                    if cdist(l_atoms, r_atoms).min() < self.interaction_threshold:
                        pos.append((unbound_ligand_res_index, unbound_receptor_res_index, +1))
                    else:
                        neg.append((unbound_ligand_res_index, unbound_receptor_res_index, -1))
            self.examples.extend(pos)
            self.examples.extend(neg)
            pos_no += len(pos)
            neg_no += len(neg)
            self.complexes_example_range[complex_name] = (
                start_index, start_index + len(pos), start_index + len(neg) + len(pos))
            print_info(" ( {0:03d}/{1:05d} ) -{2}".format(len(pos), len(neg), self.complexes_example_range[complex_name]))
            start_index += len(pos) + len(neg)
            counter += 1
            for e in pos + neg:
                self.example_complex["{0}_{1}".format(e[0], e[1])] = complex_name

        print_info("Finding examples in DBD4 took " + str((datetime.now() - start_time).seconds) + " seconds. ")
        print_info("The total number of examples found: " + str(pos_no + neg_no))
    def extract_feature(self):
        """
        Add secondary structure, phi, psi, ASA and rASA features computed by STRIDE.

        Every unbound ligand and receptor gets an (number of residues x 11) array: columns 0-6 are a one-hot
        encoding of the STRIDE secondary structure code (position taken from ss_abbreviations) and columns 7-10
        are phi, psi, asa and rasa. Residues STRIDE does not report stay all-zero. The array is stored as
        "<dir><protein name>.npy"; if the file is already there it is loaded instead of recomputed.

        Raises:
            ValueError: when a secondary structure code is not listed in ss_abbreviations.
        """
        code_to_column = dict((code, column) for column, code in enumerate(ss_abbreviations))
        print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
        t0 = datetime.now()
        printed = 0
        if not os.path.exists(self.__get_dir_name()):
            os.mkdir(self.__get_dir_name())
        for complex_name in self._database.complexes.keys():
            formation = self._database.complexes[complex_name].unbound_formation
            for protein in [formation.ligand, formation.receptor]:
                npy_path = self.__get_dir_name() + protein.name + ".npy"
                if not os.path.exists(npy_path):
                    # Progress output: 15 names on one line, the 16th ends the line.
                    printed += 1
                    if printed > 15:
                        printed = 0
                        print_info("{0}".format(protein.name))
                    else:
                        print_info_nn("{0}, ".format(protein.name))

                    pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                    n = len(protein.residues)
                    stride_result = stride_dict_from_pdb_file(pdb_file)
                    feature_table = np.zeros((n, 11))
                    for index, residue in enumerate(protein.biopython_residues):
                        key = self.get_residue_id(residue.get_full_id())
                        if key in stride_result:
                            (_, s, phi, psi, asa, rasa) = stride_result[key]
                            if s not in code_to_column:
                                raise ValueError("unknown secondary structure! Add to dictionary!")
                            ss = np.zeros(len(code_to_column))
                            ss[code_to_column[s]] = 1
                            feature_table[index, :7] = ss
                            # phi, psi, asa, rasa go to columns 7, 8, 9, 10 in that order
                            for column, value in enumerate((phi, psi, asa, rasa), 7):
                                feature_table[index, column] = value
                    np.save(npy_path, feature_table)
                cached_table = np.load(npy_path)
                for i, res in enumerate(protein.residues):
                    res.add_feature(Features.SECONDARY_STRUCTURE, cached_table[i, :7])
                    res.add_feature(Features.PHI, cached_table[i, 7])
                    res.add_feature(Features.PSI, cached_table[i, 8])
                    res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, cached_table[i, 9])
                    res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, cached_table[i, 10])
        print_info("took {0} seconds.".format((datetime.now() - t0).seconds))
    def extract_feature(self):
        """
        Attach the residue depth feature to every unbound residue.

        For the ligand and receptor of each complex, ResidueDepth is run on the first model of the protein structure
        and the two values it reports per residue are stored in an (number of residues x 2) array; a missing residue
        or a None value is reported through print_error and stored as 0. The array is cached as
        "<dir><protein name>.npy" and an existing cache file is reused. Row i, passed through self._normalize, is
        added to residue i under Features.RESIDUE_DEPTH.

        NOTE(review): presumably the two values are (residue depth, C-alpha depth) -- confirm against the installed
        Biopython ResidueDepth.
        """
        print_info_nn(" >>> Adding residue depth for database {0} ... ".format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        if not os.path.exists(self._get_dir_name()):
            os.makedirs(self._get_dir_name())
        for complex_name in self._database.complexes.keys():
            protein_complex = self._database.complexes[complex_name]
            proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
            for protein in proteins:
                residue_depth_file = self._get_dir_name() + protein.name + ".npy"
                if not os.path.exists(residue_depth_file):
                    # Progress output: 15 names on one line, the 16th ends the line.
                    counter += 1
                    if counter <= 15:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        counter = 0
                        print_info("{0}".format(protein.name))

                    pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                    rd = ResidueDepth(protein.structure[0], pdb_file)
                    # Zero-initialised so a row that is never written cannot leak uninitialised memory to the cache.
                    rd_array = np.zeros((len(protein.residues), 2))
                    for (i, res) in enumerate(protein.biopython_residues):
                        (_, _, c, (h, rn, ic)) = res.get_full_id()
                        key = (c, (h, rn, ic))
                        if key in rd:
                            rdv = rd[key]
                            # print_error replaces the Python 2 print statements, which are a syntax error on
                            # Python 3 and were inconsistent with the missing-key branch below.
                            if rdv[0] is None:
                                rdv = (0, rdv[1])
                                print_error("WTH?")
                            if rdv[1] is None:
                                rdv = (rdv[0], 0)
                                print_error("WTH?")
                            rd_array[i, :2] = rdv
                        else:
                            print_error('WTH')
                            rd_array[i, :2] = [0, 0]

                    np.save(residue_depth_file, rd_array)
                surface_features = np.load(residue_depth_file)
                for i, res in enumerate(protein.residues):
                    res.add_feature(Features.RESIDUE_DEPTH, self._normalize(surface_features[i, :2]))
        print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
 def extract_feature(self):
     """
     Attach the D2_PLAIN_SHAPE_DISTRIBUTION feature to every residue of each unbound ligand and receptor.

     For a residue, the feature is whatever self._compute_distribution returns for the result of a
     NeighborSearch "R"-level query of radius self.radius around the residue's center. The per-protein
     matrix (one row per residue, self.number_of_bins columns) is cached as
     self._get_dir_name() + <protein name> + ".npy" and reloaded from there on later runs.
     """
     seed(self.seed)
     computed_in_row = 0
     started_at = datetime.now()
     print_info_nn(
         " >>> Adding D2 shape distribution for database {0} ... ".format(
             self._database.name))
     cache_dir = self._get_dir_name()
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)
     for protein_complex in self._database.complexes.values():
         unbound = protein_complex.unbound_formation
         for protein in (unbound.ligand, unbound.receptor):
             cache_stem = cache_dir + protein.name
             cache_path = cache_stem + ".npy"
             if not os.path.exists(cache_path):
                 # Progress output: the 16th name in a row is emitted with print_info, the rest with
                 # print_info_nn.
                 computed_in_row += 1
                 if computed_in_row > 15:
                     computed_in_row = 0
                     print_info("{0}".format(protein.name))
                 else:
                     print_info_nn("{0}, ".format(protein.name))
                 searcher = NeighborSearch(protein.atoms)
                 histograms = np.zeros(
                     (len(protein.residues), self.number_of_bins))
                 for row, residue in enumerate(protein.residues):
                     neighbours = searcher.search(
                         residue.center, self.radius, "R")
                     histograms[row, :] = self._compute_distribution(
                         neighbours)
                 # np.save appends the ".npy" extension to the stem.
                 np.save(cache_stem, histograms)
             histograms = np.load(cache_path)
             for row, residue in enumerate(protein.residues):
                 residue.add_feature(
                     Features.D2_PLAIN_SHAPE_DISTRIBUTION,
                     histograms[row, :])
     elapsed = datetime.now() - started_at
     print_info("took {0} seconds.".format(elapsed.seconds))
Example #15
0
 def extract_feature(self):
     """
     Attach the RESIDUE_NEIGHBOURHOOD feature to every residue of each unbound ligand and receptor.

     Residue j is a neighbour of residue i when the smallest pairwise distance between their coordinates
     (as returned by get_coordinates()) is at most 7.5 (presumably angstroms -- confirm against the
     coordinate source). Row i of the per-protein array lists the indices of the neighbours of residue i in
     increasing order, padded on the right with -1 up to the longest neighbour list. The array is cached as
     self._get_dir_name() + <protein name> + ".npy" and reloaded from there on later runs.
     """
     distance_threshold = 7.5
     counter = 0
     print_info_nn(" >>> Adding Residue Neighbourhood ... ")
     overall_time = datetime.now()
     if not os.path.exists(self._get_dir_name()):
         os.makedirs(self._get_dir_name())
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
         for protein in proteins:
             residue_neighbourhood_file = self._get_dir_name() + protein.name
             if not os.path.exists(residue_neighbourhood_file + ".npy"):
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 # Fetch each residue's coordinates once instead of once per residue pair.
                 coordinates = [residue.get_coordinates() for residue in protein.residues]
                 # The comparison deliberately includes i == j, so a residue lists itself as a neighbour.
                 neighbourhood = [
                     [j for j, other_coordinates in enumerate(coordinates)
                      if cdist(query_coordinates, other_coordinates).min() <= distance_threshold]
                     for query_coordinates in coordinates
                 ]
                 max_length = max(len(neighbours) for neighbours in neighbourhood) if neighbourhood else 0
                 # -1 marks the unused tail of rows shorter than the longest neighbour list.
                 neighbourhood_array = -np.ones((len(protein.residues), max_length))
                 for i, neighbours in enumerate(neighbourhood):
                     neighbourhood_array[i, :len(neighbours)] = neighbours
                 np.save(residue_neighbourhood_file, neighbourhood_array)
             neighbourhood_array = np.load(residue_neighbourhood_file + ".npy")
             for index, residue in enumerate(protein.residues):
                 residue.add_feature(Features.RESIDUE_NEIGHBOURHOOD, neighbourhood_array[index, :])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
 def extract_feature(self):
     """
     Attach ACCESSIBLE_SURFACE_AREA, RELATIVE_ACCESSIBLE_SURFACE_AREA, PHI and PSI to every residue of each
     unbound ligand and receptor, using DSSP.

     DSSP is run once per protein on self._database.directory + pdb_directory + <protein name> + ".pdb".
     The result is cached as a (number of residues x 6) array in
     self.__get_dir_name() + <protein name> + ".npy". Columns 2..5 hold elements 2 onward of the DSSP entry
     of the residue (the four features above, in that order); columns 0 and 1 are unused and stay 0.
     Residues without a DSSP entry get 0 for all four values.
     """
     print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
     overall_time = datetime.now()
     if not os.path.exists(self.__get_dir_name()):
         # makedirs also creates missing parent directories, matching the other feature extractors.
         os.makedirs(self.__get_dir_name())
     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
         for protein in proteins:
             dssp_file = self.__get_dir_name() + protein.name + ".npy"
             if not os.path.exists(dssp_file):
                 print_info_nn("... running DSSP for protein " + protein.name)
                 start_time = datetime.now()
                 dssp = DSSP(protein.structure[0], self._database.directory + pdb_directory + protein.name + ".pdb")
                 # Zero-initialised: np.ndarray would leave the never-written columns 0-1 (and any row not
                 # reached by the loop) as uninitialised memory that then ends up in the cache file.
                 dssp_array = np.zeros((len(protein.residues), 6))
                 for (i, res) in enumerate(protein.biopython_residues):
                     (_, _, cid, rid) = res.get_full_id()
                     key = (cid, rid)
                     if key in dssp:
                         dssp_array[i, 2:] = (dssp[key])[2:]
                     else:
                         # No DSSP entry for this residue: fall back to zeros.
                         dssp_array[i, 2:] = [0, 0, 0, 0]
                 np.save(dssp_file, dssp_array)
                 print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
             dssp_values = np.load(dssp_file)
             for i, res in enumerate(protein.residues):
                 # Row layout: (_, s, ASA, rASA, phi, psi)
                 res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, dssp_values[i, 2])
                 res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, dssp_values[i, 3])
                 res.add_feature(Features.PHI, dssp_values[i, 4])
                 res.add_feature(Features.PSI, dssp_values[i, 5])
     print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
Example #17
0
    def __read_pdb_files(self):
        """
        Build self.complexes from the PDB files found in self.directory + pdb_directory.

        Every file matching "*_l_b.pdb" (bound ligand) names one complex; the bound receptor, unbound ligand
        and unbound receptor are read from the sibling files ending in "_r_b.pdb", "_l_u.pdb" and
        "_r_u.pdb". Files are processed in sorted order and PDBConstructionWarning is silenced while they are
        parsed.
        """
        print_info("Parsing the pdb files in directory {0} ....".format(
            os.path.abspath(self.directory)))
        ligand_bound_paths = sorted(
            glob.glob(self.directory + pdb_directory + "*_l_b.pdb"))
        processed = 0
        for ligand_bound_path in ligand_bound_paths:
            complex_name = basename(ligand_bound_path).replace("_l_b.pdb", "")
            # The three companion files share the ligand file's path up to the suffix.
            receptor_bound_path, ligand_unbound_path, receptor_unbound_path = [
                ligand_bound_path.replace("_l_b.pdb", suffix)
                for suffix in ("_r_b.pdb", "_l_u.pdb", "_r_u.pdb")
            ]

            print_info("Reading complex " + complex_name)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PDBConstructionWarning)
                ligand_bound = Protein(*read_pdb_file(ligand_bound_path))
                receptor_bound = Protein(*read_pdb_file(receptor_bound_path))
                ligand_unbound = Protein(*read_pdb_file(ligand_unbound_path))
                receptor_unbound = Protein(
                    *read_pdb_file(receptor_unbound_path))
                bound_pair = ProteinPair(ligand_bound, receptor_bound)
                unbound_pair = ProteinPair(ligand_unbound, receptor_unbound)
                self.complexes[complex_name] = ProteinComplex(
                    complex_name, unbound_pair, bound_pair)

            processed += 1
        print_info("Total number of complexes processed : " + str(processed))
    def extract_feature(self):
        """
        Attach the D1_SURFACE_SHAPE_DISTRIBUTION feature to every residue of each unbound ligand and receptor.

        For residue i the distribution is self._compute_distribution(nearby_residues, center), where center
        is the center of residue i and nearby_residues is the Biopython residue i itself followed by every
        residue returned by the NeighborSearch query (radius self.radius, level "R") around that center whose
        RELATIVE_ACCESSIBLE_SURFACE_AREA feature is at least self.rASA_threshold. That feature must therefore
        already be attached to the residues before this extractor runs.

        The per-protein matrix (one row per residue, self.number_of_bins + 1 columns) is cached as
        self._get_dir_name() + <protein name> + ".npy". Cache files written before the center fix below keep
        their old contents until they are deleted.
        """
        seed(self.seed)
        print_info_nn(" >>> Adding D1 surface shape distribution for database {0} ... ".format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        if not os.path.exists(self._get_dir_name()):
            os.makedirs(self._get_dir_name())
        for complex_name in self._database.complexes.keys():
            protein_complex = self._database.complexes[complex_name]
            proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
            for protein in proteins:
                shape_dist_file = self._get_dir_name() + protein.name
                if not os.path.exists(shape_dist_file + ".npy"):
                    counter += 1
                    if counter <= 15:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        counter = 0
                        print_info("{0}".format(protein.name))
                    atoms = protein.atoms
                    neighbour_search = NeighborSearch(atoms)
                    distributions = np.zeros((len(protein.residues), self.number_of_bins + 1))
                    for i, query_residue in enumerate(protein.residues):
                        nearby_residues = [protein.biopython_residues[i]]
                        candidates = neighbour_search.search(query_residue.center, self.radius, "R")
                        for candidate in candidates:
                            try:
                                candidate_index = protein.biopython_residues.index(candidate)
                            except ValueError:
                                # The search returned a residue this protein does not track.
                                continue
                            # Deliberately a separate name: rebinding the query residue here used to make
                            # the center passed below belong to the last candidate examined.
                            candidate_residue = protein.residues[candidate_index]
                            rasa = candidate_residue.get_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA)
                            if rasa >= self.rASA_threshold:
                                nearby_residues.append(candidate)
                        distributions[i, :] = self._compute_distribution(nearby_residues, query_residue.center)
                    np.save(shape_dist_file, distributions)
                distributions = np.load(shape_dist_file + ".npy")
                for i in range(len(protein.residues)):
                    protein.residues[i].add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, distributions[i, :])
        print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
Example #19
0
    def _load(self, file_name=None):
        """
        Restore the object model from a pickle file.

        The pickle holds a 6-tuple that is unpacked into self.directory, self.complexes, self.residues,
        self.complexes_example_range, self.examples and self.example_complex.

        :param file_name: path of the pickle file; when None,
            self.directory + pickle_directory + dbd4_object_model_file is used.
        """

        if file_name is None:
            object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
        else:
            object_model_file_name = file_name

        # Binary mode matches the "wb" mode the file is written with; the with-block closes the file even
        # when unpickling fails.
        # NOTE: unpickling can execute arbitrary code -- only load object-model files from a trusted source.
        with open(object_model_file_name, "rb") as f:
            print_info_nn("Loading the object model from {0} ... ".format(
                object_model_file_name))
            start_time = datetime.now()
            (self.directory, self.complexes, self.residues,
             self.complexes_example_range, self.examples,
             self.example_complex) = cPickle.load(f)
        gc.collect()
        print_info("took {0} seconds.".format(
            (datetime.now() - start_time).seconds))
Example #20
0
    def _save(self, file_name=None):
        """
        Write the object model to a pickle file.

        The pickle holds the 6-tuple (self.directory, self.complexes, self.residues,
        self.complexes_example_range, self.examples, self.example_complex).
        The directory self.directory + pickle_directory is created when it does not exist, whether or not
        file_name points into it.

        :param file_name: destination path; when None,
            self.directory + pickle_directory + dbd4_object_model_file is used.
        """
        if not os.path.exists(self.directory + pickle_directory):
            os.mkdir(self.directory + pickle_directory)
        if file_name is None:
            object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
        else:
            object_model_file_name = file_name

        # The with-block closes (and flushes) the file even when pickling fails part-way.
        with open(object_model_file_name, "wb") as f:
            print_info_nn("Saving the object model into {0} ... ".format(
                object_model_file_name))
            start_time = datetime.now()
            cPickle.dump((self.directory, self.complexes, self.residues,
                          self.complexes_example_range, self.examples,
                          self.example_complex), f)
        print_info("took {0} seconds.".format(
            (datetime.now() - start_time).seconds))
Example #21
0
    def _load(self, file_name=None):
        """
        Load the attributes of the class from a pickle file: self.directory, self.complexes, self.residues,
        self.complexes_example_range, self.examples and self.example_complex, stored together as one tuple.

        :param file_name: path of the pickle file; when None,
            self.directory + pickle_directory + dbd4_object_model_file is used.
        """

        if file_name is None:
            object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
        else:
            object_model_file_name = file_name

        # Opened in binary mode (the file is written with "wb") and inside a with-block so the handle is
        # released even if cPickle.load raises.
        # NOTE: a pickle can run arbitrary code while loading; never point this at an untrusted file.
        with open(object_model_file_name, "rb") as f:
            print_info_nn("Loading the object model from {0} ... ".format(object_model_file_name))
            start_time = datetime.now()
            (self.directory,
             self.complexes,
             self.residues,
             self.complexes_example_range,
             self.examples,
             self.example_complex) = cPickle.load(f)
        gc.collect()
        print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
Example #22
0
    def _save(self, file_name=None):
        """
        Save the attributes of the class in pickle format: self.directory, self.complexes, self.residues,
        self.complexes_example_range, self.examples and self.example_complex are dumped together as one
        tuple. The directory self.directory + pickle_directory is created first when it is missing.

        :param file_name: destination path; when None,
            self.directory + pickle_directory + dbd4_object_model_file is used.
        """
        if not os.path.exists(self.directory + pickle_directory):
            os.mkdir(self.directory + pickle_directory)
        if file_name is None:
            object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
        else:
            object_model_file_name = file_name

        # with-block: the handle is closed even when cPickle.dump raises.
        with open(object_model_file_name, "wb") as f:
            print_info_nn("Saving the object model into {0} ... ".format(object_model_file_name))
            start_time = datetime.now()
            cPickle.dump((self.directory,
                          self.complexes,
                          self.residues,
                          self.complexes_example_range,
                          self.examples,
                          self.example_complex), f)
        print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
    def extract_feature(self):
        """
        Attach the B_VALUE feature to every residue of each unbound ligand and receptor.

        The value of a residue is the largest get_bfactor() among the atoms of the matching Biopython
        residue. The per-protein vector is cached as self._get_dir_name() + <protein name> + ".npy" and
        reloaded from there on later runs.
        """
        computed_in_row = 0
        started_at = datetime.now()
        # NOTE(review): the message has no placeholder, so the database name passed to format() is ignored.
        print_info_nn(" >>> Adding B Factor ... ".format(self._database.name))
        cache_dir = self._get_dir_name()
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        for protein_complex in self._database.complexes.values():
            unbound = protein_complex.unbound_formation
            for protein in (unbound.ligand, unbound.receptor):
                cache_stem = cache_dir + protein.name
                cache_path = cache_stem + ".npy"
                if not os.path.exists(cache_path):
                    # Progress output: the 16th name in a row is emitted with print_info, the rest with
                    # print_info_nn.
                    computed_in_row += 1
                    if computed_in_row > 15:
                        computed_in_row = 0
                        print_info("{0}".format(protein.name))
                    else:
                        print_info_nn("{0}, ".format(protein.name))

                    b_factors = np.zeros(len(protein.residues))
                    for position, bio_residue in enumerate(protein.biopython_residues):
                        b_factors[position] = max(
                            atom.get_bfactor() for atom in bio_residue)

                    # np.save appends the ".npy" extension to the stem.
                    np.save(cache_stem, b_factors)
                b_factors = np.load(cache_path)
                for position, residue in enumerate(protein.residues):
                    residue.add_feature(Features.B_VALUE, b_factors[position])
        elapsed = datetime.now() - started_at
        print_info("took {0} seconds.".format(elapsed.seconds))
Example #24
0
    def __read_pdb_files(self):
        """
        Populate self.complexes from the PDB files in self.directory + pdb_directory.

        One complex is created per "*_l_b.pdb" file (taken in sorted order). Its name is the file name
        without that suffix, and its four proteins come from the files with the suffixes "_l_b.pdb",
        "_r_b.pdb", "_l_u.pdb" and "_r_u.pdb". PDBConstructionWarning is ignored during parsing.
        """
        pdb_root = os.path.abspath(self.directory)
        print_info("Parsing the pdb files in directory {0} ....".format(pdb_root))
        ligand_bound_files = glob.glob(self.directory + pdb_directory + "*_l_b.pdb")
        ligand_bound_files.sort()
        counter = 0
        for ligand_bound_file in ligand_bound_files:
            complex_name = basename(ligand_bound_file).replace("_l_b.pdb", "")

            def sibling(suffix):
                # Path of the companion file that differs from the bound-ligand file only in its suffix.
                return ligand_bound_file.replace("_l_b.pdb", suffix)

            print_info("Reading complex " + complex_name)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PDBConstructionWarning)
                # Files are read in the order: bound ligand, bound receptor, unbound ligand, unbound receptor.
                ligand_bound = Protein(*read_pdb_file(ligand_bound_file))
                receptor_bound = Protein(*read_pdb_file(sibling("_r_b.pdb")))
                ligand_unbound = Protein(*read_pdb_file(sibling("_l_u.pdb")))
                receptor_unbound = Protein(*read_pdb_file(sibling("_r_u.pdb")))
                self.complexes[complex_name] = ProteinComplex(
                    complex_name,
                    ProteinPair(ligand_unbound, receptor_unbound),
                    ProteinPair(ligand_bound, receptor_bound))

            counter += 1
        print_info("Total number of complexes processed : " + str(counter))
Example #25
0
def main():
    """
    Run one SVM experiment per configured feature set on DBD4 and save the collected results.

    A DBD4 database is built with size=20000, ratio=-1 and seed 1, mtrand is seeded with the same value,
    and every feature set is run through Experiment(..., Classifier.SVM).run with 20 bins, radius 15,
    gamma 0.5, 5 folds and rASA 0.5. The pyml_result of each experiment is handed to save_results.
    """
    print_info("Starting the experiment")
    started_at = datetime.now()
    random_seed = 1
    sample_count = 20000  # an earlier configuration used 5000
    dbd4 = DBD4(size=sample_count, ratio=-1, seed=random_seed)
    mtrand.seed(random_seed)
    # Only one feature set is currently active. Sets that were tried before and are disabled for now:
    #   WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX + WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX
    #   WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX + D2_PLAIN_SHAPE_DISTRIBUTION
    #   WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX + D1_PLAIN_SHAPE_DISTRIBUTION
    #   WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX + RELATIVE_ACCESSIBLE_SURFACE_AREA
    #       + D2_SURFACE_SHAPE_DISTRIBUTION
    #   WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX + D2_CATEGORY_SHAPE_DISTRIBUTION
    #   WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX + HALF_SPHERE_EXPOSURE + SECONDARY_STRUCTURE
    #       + WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX + POSITION_SPECIFIC_SCORING_MATRIX
    #       + POSITION_SPECIFIC_FREQUENCY_MATRIX + RELATIVE_ACCESSIBLE_SURFACE_AREA
    #       + D2_SURFACE_SHAPE_DISTRIBUTION + RESIDUE_DEPTH
    #       (with PROTRUSION_INDEX, B_VALUE, PHI, PSI, D1_SURFACE_SHAPE_DISTRIBUTION and
    #        D2_PLAIN_SHAPE_DISTRIBUTION switched off inside that set)
    feature_sets = [
        [
            Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
            Features.RELATIVE_ACCESSIBLE_SURFACE_AREA,
            Features.D1_SURFACE_SHAPE_DISTRIBUTION,
        ],
    ]
    run_options = dict(
        number_of_bins=20,
        radius=15,
        number_of_samples=-1,
        seed=random_seed,
        gamma=0.5,
        save=True,
        folds=5,
        rASA=.5,
    )
    results = []
    for feature_set in feature_sets:
        print_special("Feature set {0}".format(feature_set))
        experiment = Experiment(feature_set, dbd4, Classifier.SVM)
        experiment.run(**run_options)
        results.append(experiment.pyml_result)
        # Elapsed time is measured from the start of main, not from the start of this feature set.
        elapsed = datetime.now() - started_at
        print_info("Took {0} seconds.".format(elapsed.seconds))
    save_results(sample_count, results, feature_sets)
Example #26
0
    def __extract_examples(self):
        """
        Collect all positive and negative examples of the DBD4 dataset into self.examples.

        In a protein complex with receptor R and ligand L, a residue pair (r', r) with r' on L and r on R is
        a positive example (label +1) when, in the bound form, the smallest distance between their atoms is
        below self.interaction_threshold; every other pair is a negative example (label -1). Only pairs in
        which both bound residues have an unbound counterpart are kept, because the bound and unbound
        formations may have slightly different residues. Each example is stored as
        (unbound ligand residue index, unbound receptor residue index, label).

        Besides self.examples this fills:
          * self.complexes_example_range[complex name] = (start, start + #positives, start + #all), the
            slice of self.examples that belongs to the complex (positives first, then negatives);
          * self.example_complex["<ligand index>_<receptor index>"] = complex name.
        """
        print_info(
            "Finding the positive and negative examples in DBD4 ... {0}".
            format(self.positives_size))
        start_time = datetime.now()
        counter = 1
        start_index = 0
        neg_no = 0
        pos_no = 0
        for complex_name in self.complexes.keys():
            print_info_nn("{0}/{1}... processing complex {2}".format(
                counter, len(self.complexes), complex_name))
            protein_complex = self.complexes[complex_name]
            bound_ligand = protein_complex.bound_formation.ligand
            bound_receptor = protein_complex.bound_formation.receptor
            bound_ligand_residues = bound_ligand.residues
            bound_receptor_residues = bound_receptor.residues
            # Per-complex invariants, looked up once instead of once per residue pair.
            ligand_b2u = protein_complex.ligand_bound_to_unbound
            receptor_b2u = protein_complex.receptor_bound_to_unbound
            receptor_atoms = [
                [atom.get_coord() for atom in bio_residue.get_list()]
                for bio_residue in bound_receptor.biopython_residues
            ]
            pos = []
            neg = []
            for i, ligand_bio_residue in enumerate(
                    bound_ligand.biopython_residues):
                ligand_residue = bound_ligand_residues[i]
                # Without an unbound counterpart no pair with this ligand residue can become an example.
                if ligand_residue not in ligand_b2u:
                    continue
                l_atoms = [
                    atom.get_coord() for atom in ligand_bio_residue.get_list()
                ]
                for j, r_atoms in enumerate(receptor_atoms):
                    receptor_residue = bound_receptor_residues[j]
                    if receptor_residue not in receptor_b2u:
                        continue
                    # The index lookups stay inside the pair loop, ligand first, so they happen in the
                    # same order as before.
                    unbound_ligand_res_index = self.__get_residue_index(
                        ligand_b2u[ligand_residue])
                    unbound_receptor_res_index = self.__get_residue_index(
                        receptor_b2u[receptor_residue])
                    # The distance matrix is only needed for pairs that are actually kept.
                    if cdist(l_atoms,
                             r_atoms).min() < self.interaction_threshold:
                        pos.append((unbound_ligand_res_index,
                                    unbound_receptor_res_index, +1))
                    else:
                        neg.append((unbound_ligand_res_index,
                                    unbound_receptor_res_index, -1))
            self.examples.extend(pos)
            self.examples.extend(neg)
            pos_no += len(pos)
            neg_no += len(neg)
            self.complexes_example_range[complex_name] = (start_index,
                                                          start_index +
                                                          len(pos),
                                                          start_index +
                                                          len(neg) + len(pos))
            print_info(" ( {0:03d}/{1:05d} ) -{2}".format(
                len(pos), len(neg),
                self.complexes_example_range[complex_name]))
            start_index += len(pos) + len(neg)
            counter += 1
            for e in pos + neg:
                self.example_complex["{0}_{1}".format(e[0],
                                                      e[1])] = complex_name

        print_info("Finding examples in DBD4 took " +
                   str((datetime.now() - start_time).seconds) + " seconds. ")
        print_info("The total number of examples found: " +
                   str(pos_no + neg_no))
def main():
    """Write one PDF per complex illustrating a sample residue pair of each class.

    For every complex in a freshly built ``DBD4`` database the bound ligand and
    receptor residues are scanned pairwise.  A pair whose minimum inter-atomic
    distance is below ``dbd4.interaction_threshold`` counts as interacting.
    The ``index``-th interacting pair (``index`` drawn from ``randint(1, 30)``)
    and the first non-interacting pair are each rendered as three PDF pages:
    two 3D scatter plots and one plot of the distance series returned by
    ``get_coords``.  Scanning stops once both kinds have been rendered.

    Pairs whose bound residues have no unbound counterpart are skipped.
    """
    seed = 1
    number_of_samples = 20000
    dbd4 = DBD4(size=number_of_samples, ratio=1, thresh=6, seed=seed)
    SecondaryStructureExtractor(dbd4).extract_feature()

    def save_scatter(pdf, ligand_points, receptor_points, title):
        """Append a 3D scatter page: ligand points in red, receptor in blue."""
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(ligand_points[:, 0],
                   ligand_points[:, 1],
                   ligand_points[:, 2],
                   c='r')
        ax.scatter(receptor_points[:, 0],
                   receptor_points[:, 1],
                   receptor_points[:, 2],
                   c='b')
        plt.title(title)
        pdf.savefig()
        plt.close()

    def save_pair_report(pdf, l_search, r_search, ligand, receptor,
                         l_residue, r_residue, i, j, label):
        """Append the three pages describing one ligand/receptor residue pair.

        ``label`` is the title prefix ("Interacting" / "Non-Interacting").
        """
        # NOTE(review): the "b_" series are computed from the *unbound*
        # structures as well; only the last get_coords flag differs.  The
        # page titles suggest True restricts to surface atoms -- confirm
        # against get_coords.
        b_l_points, b_l_dist = get_coords(l_search, ligand, l_residue, 0.5,
                                          False)
        b_r_points, b_r_dist = get_coords(r_search, receptor, r_residue, 0.5,
                                          False)
        u_l_points, u_l_dist = get_coords(l_search, ligand, l_residue, 0.5,
                                          True)
        u_r_points, u_r_dist = get_coords(r_search, receptor, r_residue, 0.5,
                                          True)

        save_scatter(pdf, b_l_points, b_r_points,
                     "{0} Residues Bound Conformation".format(label))
        save_scatter(pdf, u_l_points, u_r_points,
                     "{0} Surface Residues Bound Conformation".format(label))

        plt.figure()
        # Plot all four series, in the order of the legend labels; previously
        # only the two "u_" series were drawn and were therefore labelled as
        # the bound ligand/receptor.
        plt.plot(b_l_dist)
        plt.plot(b_r_dist)
        plt.plot(u_l_dist)
        plt.plot(u_r_dist)
        plt.legend([
            "bound ligand {0}".format(i),
            "bound receptor {0}".format(j),
            "unbound ligand", "unbound receptor"
        ])
        pdf.savefig()
        plt.close()

    for complex_name in dbd4.complexes:
        print_info(complex_name)
        c = dbd4.complexes[complex_name]
        b_ligand, b_receptor = (c.bound_formation.ligand,
                                c.bound_formation.receptor)
        u_ligand, u_receptor = (c.unbound_formation.ligand,
                                c.unbound_formation.receptor)
        b_ligand_bio_residues = b_ligand.biopython_residues
        b_receptor_bio_residues = b_receptor.biopython_residues
        u_l_ns = NeighborSearch(u_ligand.atoms)
        u_r_ns = NeighborSearch(u_receptor.atoms)
        # 1-based rank of the interacting pair that gets plotted.
        index = randint(1, 30)
        positives = 0
        lb2u = c.ligand_bound_to_unbound
        rb2u = c.receptor_bound_to_unbound
        p = False  # an interacting pair has been rendered
        n = False  # a non-interacting pair has been rendered
        with PdfPages('{0}/geometry/figures/{1}.pdf'.format(
                reports_directory, complex_name)) as pdf:
            try:
                for i, l_bio_residue in enumerate(b_ligand_bio_residues):
                    l_residue = b_ligand.residues[i]
                    if l_residue not in lb2u:
                        continue
                    # Ligand coordinates are invariant across the inner loop.
                    l_coords = [
                        atom.get_coord() for atom in l_bio_residue.get_list()
                    ]
                    for j, r_bio_residue in enumerate(
                            b_receptor_bio_residues):
                        r_residue = b_receptor.residues[j]
                        if r_residue not in rb2u:
                            continue
                        if p and n:
                            # Both examples are done; leave the nested loops
                            # before paying for another distance matrix.
                            print("getting out of loop...")
                            raise GetOutOfLoop
                        r_coords = [
                            atom.get_coord()
                            for atom in r_bio_residue.get_list()
                        ]
                        dist_mat = cdist(l_coords, r_coords)
                        if dist_mat.min() < dbd4.interaction_threshold:
                            if p:
                                continue
                            positives += 1
                            if positives != index:
                                continue
                            save_pair_report(pdf, u_l_ns, u_r_ns, u_ligand,
                                             u_receptor, lb2u[l_residue],
                                             rb2u[r_residue], i, j,
                                             "Interacting")
                            p = True
                        else:
                            if n:
                                continue
                            save_pair_report(pdf, u_l_ns, u_r_ns, u_ligand,
                                             u_receptor, lb2u[l_residue],
                                             rb2u[r_residue], i, j,
                                             "Non-Interacting")
                            n = True

            except GetOutOfLoop:
                pass
 def extract_feature(self):
     """Attach the HALF_SPHERE_EXPOSURE feature to every unbound residue.

     For each ligand and receptor of every complex in ``self._database`` a
     per-protein array is loaded from ``<dir><protein.name>.npy`` (``<dir>``
     being ``self._get_dir_name()``); it is computed and saved first when the
     file does not exist yet.  Row ``i`` of the array is added to
     ``protein.residues[i]``.

     The array has one row per residue and ``2 * len(standard_aa_names)``
     columns: per-amino-acid counts for the "up" half followed by the "down"
     half.  A neighbour counts as "up" when the angle between ``u[1]`` and the
     vector from ``u[0]`` to the neighbour's CA atom is below pi/2, where
     ``u`` is the result of ``self.get_side_chain_vector``.  The residue's
     own type is counted once in both halves, and each half is divided by
     ``1 + number of neighbours in that half``.  Residues for which no
     side-chain vector is available get an all-NaN row.
     """
     counter = 0
     overall_time = datetime.now()
     number_of_amino_acids = len(standard_aa_names)
     # The message previously had no "{0}" placeholder, so the database name
     # passed to format() was silently dropped.
     print_info_nn(
         " >>> Adding Half Surface Exposure for database {0} ... ".format(
             self._database.name))
     cache_dir = self._get_dir_name()
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)
     for complex_name in self._database.complexes:
         protein_complex = self._database.complexes[complex_name]
         proteins = [
             protein_complex.unbound_formation.ligand,
             protein_complex.unbound_formation.receptor
         ]
         for protein in proteins:
             hse_file = cache_dir + protein.name
             if not os.path.exists(hse_file + ".npy"):
                 # Progress output: names on one line, wrapped after 16.
                 counter += 1
                 if counter <= 15:
                     print_info_nn("{0}, ".format(protein.name))
                 else:
                     counter = 0
                     print_info("{0}".format(protein.name))
                 bio_residues = protein.biopython_residues
                 number_of_residues = len(bio_residues)
                 # un/dn: neighbour counts in the up/down half per residue.
                 un = np.zeros(number_of_residues)
                 dn = np.zeros(number_of_residues)
                 # uc/dc: per-amino-acid counts, shape (amino acids, residues).
                 uc = np.zeros((number_of_amino_acids, number_of_residues))
                 dc = np.zeros((number_of_amino_acids, number_of_residues))
                 for index, residue in enumerate(bio_residues):
                     u = self.get_side_chain_vector(residue)
                     if u is None:
                         # No usable side-chain vector: mark the row missing.
                         un[index] = np.nan
                         dn[index] = np.nan
                         uc[:, index] = np.nan
                         dc[:, index] = np.nan
                         continue
                     # The residue itself contributes to both halves.
                     own_type = self._residue_index_table[
                         residue.get_resname()]
                     uc[own_type, index] += 1
                     dc[own_type, index] += 1
                     neighbours_indices = protein.residues[index].get_feature(
                         Features.RESIDUE_NEIGHBOURHOOD)
                     for neighbour_index in neighbours_indices:
                         # -1 ends the list of valid neighbour indices.
                         if neighbour_index == -1:
                             break
                         neighbour_residue = bio_residues[int(neighbour_index)]
                         if not (is_aa(neighbour_residue)
                                 and neighbour_residue.has_id('CA')):
                             continue
                         neighbour_vector = neighbour_residue[
                             'CA'].get_vector()
                         neighbour_type = self._residue_index_table[
                             neighbour_residue.get_resname()]
                         angle = u[1].angle(neighbour_vector - u[0])
                         if angle < np.pi / 2.0:
                             un[index] += 1
                             uc[neighbour_type, index] += 1
                         else:
                             dn[index] += 1
                             dc[neighbour_type, index] += 1
                 # Normalise per residue, then transpose to (residues, types).
                 uc = (uc / (1.0 + un)).T
                 dc = (dc / (1.0 + dn)).T
                 hse_array = np.hstack((uc, dc))
                 np.save(hse_file, hse_array)
             hse = np.load(hse_file + ".npy")
             for i, protein_residue in enumerate(protein.residues):
                 protein_residue.add_feature(Features.HALF_SPHERE_EXPOSURE,
                                             hse[i, :])
     print_info("took {0} seconds.".format(
         (datetime.now() - overall_time).seconds))
def main():
    """Write one PDF per complex illustrating a sample residue pair of each class.

    For every complex in a freshly built ``DBD4`` database the bound ligand and
    receptor residues are scanned pairwise.  A pair whose minimum inter-atomic
    distance is below ``dbd4.interaction_threshold`` counts as interacting.
    The ``index``-th interacting pair (``index`` drawn from ``randint(1, 30)``)
    and the first non-interacting pair are each rendered as three PDF pages:
    two 3D scatter plots and one plot of the distance series returned by
    ``get_coords``.  Scanning stops once both kinds have been rendered.

    Pairs whose bound residues have no unbound counterpart are skipped.
    """
    seed = 1
    number_of_samples = 20000
    dbd4 = DBD4(size=number_of_samples, ratio=1, thresh=6, seed=seed)
    SecondaryStructureExtractor(dbd4).extract_feature()

    def save_scatter(pdf, ligand_points, receptor_points, title):
        """Append a 3D scatter page: ligand points in red, receptor in blue."""
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(ligand_points[:, 0], ligand_points[:, 1], ligand_points[:, 2], c='r')
        ax.scatter(receptor_points[:, 0], receptor_points[:, 1], receptor_points[:, 2], c='b')
        plt.title(title)
        pdf.savefig()
        plt.close()

    def save_pair_report(pdf, l_search, r_search, ligand, receptor, l_residue, r_residue, i, j, label):
        """Append the three pages describing one ligand/receptor residue pair.

        ``label`` is the title prefix ("Interacting" / "Non-Interacting").
        """
        # NOTE(review): the "b_" series are computed from the *unbound*
        # structures as well; only the last get_coords flag differs.  The
        # page titles suggest True restricts to surface atoms -- confirm
        # against get_coords.
        b_l_points, b_l_dist = get_coords(l_search, ligand, l_residue, 0.5, False)
        b_r_points, b_r_dist = get_coords(r_search, receptor, r_residue, 0.5, False)
        u_l_points, u_l_dist = get_coords(l_search, ligand, l_residue, 0.5, True)
        u_r_points, u_r_dist = get_coords(r_search, receptor, r_residue, 0.5, True)

        save_scatter(pdf, b_l_points, b_r_points, "{0} Residues Bound Conformation".format(label))
        save_scatter(pdf, u_l_points, u_r_points, "{0} Surface Residues Bound Conformation".format(label))

        plt.figure()
        # Plot all four series, in the order of the legend labels; previously
        # only the two "u_" series were drawn and were therefore labelled as
        # the bound ligand/receptor.
        plt.plot(b_l_dist)
        plt.plot(b_r_dist)
        plt.plot(u_l_dist)
        plt.plot(u_r_dist)
        plt.legend(["bound ligand {0}".format(i), "bound receptor {0}".format(j), "unbound ligand",
                    "unbound receptor"])
        pdf.savefig()
        plt.close()

    for complex_name in dbd4.complexes:
        print_info(complex_name)
        c = dbd4.complexes[complex_name]
        b_ligand, b_receptor = (c.bound_formation.ligand, c.bound_formation.receptor)
        u_ligand, u_receptor = (c.unbound_formation.ligand, c.unbound_formation.receptor)
        b_ligand_bio_residues = b_ligand.biopython_residues
        b_receptor_bio_residues = b_receptor.biopython_residues
        u_l_ns = NeighborSearch(u_ligand.atoms)
        u_r_ns = NeighborSearch(u_receptor.atoms)
        # 1-based rank of the interacting pair that gets plotted.
        index = randint(1, 30)
        positives = 0
        lb2u = c.ligand_bound_to_unbound
        rb2u = c.receptor_bound_to_unbound
        p = False  # an interacting pair has been rendered
        n = False  # a non-interacting pair has been rendered
        with PdfPages('{0}/geometry/figures/{1}.pdf'.format(reports_directory, complex_name)) as pdf:
            try:
                for i, l_bio_residue in enumerate(b_ligand_bio_residues):
                    l_residue = b_ligand.residues[i]
                    if l_residue not in lb2u:
                        continue
                    # Ligand coordinates are invariant across the inner loop.
                    l_coords = [atom.get_coord() for atom in l_bio_residue.get_list()]
                    for j, r_bio_residue in enumerate(b_receptor_bio_residues):
                        r_residue = b_receptor.residues[j]
                        if r_residue not in rb2u:
                            continue
                        if p and n:
                            # Both examples are done; leave the nested loops
                            # before paying for another distance matrix.
                            print("getting out of loop...")
                            raise GetOutOfLoop
                        r_coords = [atom.get_coord() for atom in r_bio_residue.get_list()]
                        dist_mat = cdist(l_coords, r_coords)
                        if dist_mat.min() < dbd4.interaction_threshold:
                            if p:
                                continue
                            positives += 1
                            if positives != index:
                                continue
                            save_pair_report(pdf, u_l_ns, u_r_ns, u_ligand, u_receptor, lb2u[l_residue],
                                             rb2u[r_residue], i, j, "Interacting")
                            p = True
                        else:
                            if n:
                                continue
                            save_pair_report(pdf, u_l_ns, u_r_ns, u_ligand, u_receptor, lb2u[l_residue],
                                             rb2u[r_residue], i, j, "Non-Interacting")
                            n = True

            except GetOutOfLoop:
                pass