def __compute_profiles(self, db='nr', niter=3):
    """Add PSSM/PSFM profile features to the residues of every unbound protein.

    For the ligand and the receptor of each complex in ``self._database`` the
    PSI-BLAST executable is invoked (only when ``<name>.mat`` is not already
    present in the PSSM directory), the resulting ASCII matrix file is parsed,
    and four normalised per-residue features are attached: the scoring matrix
    column, the frequency matrix column, and the two matrices returned by
    ``__get_wpsm`` for them.

    :param db: value passed to the ``-db`` option of the executable.
    :param niter: value passed to the ``-num_iterations`` option.
    """
    print_info_nn(" >>> Adding the profile features for dataset {0} ...".format(self._database.name))
    start_time = datetime.now()
    for protein_complex in self._database.complexes.values():
        unbound = protein_complex.unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            fasta_file = self._database.directory + unbound_sequence_directory + protein.name + ".fasta"
            output_file = self._database.directory + pssm_directory + protein.name
            # The .mat file doubles as a cache marker: the search is skipped when it exists.
            if not os.path.exists(output_file + ".mat"):
                print_info("... processing protein {0} ... ".format(protein.name))
                # Two shell lines: change into the database folder, then run the search.
                command = "cd {4} \n " \
                          "{5} " \
                          "-query {0} -db {1} -out {2}.psi.txt -num_iterations {3} -out_ascii_pssm {2}.mat" \
                    .format(fasta_file, db, output_file, niter, psiblast_db_folder, psiblast_executable)
                print_info(command)
                error_code = os.system(command)
                if error_code == 0:
                    print_info('Successful!')
                else:
                    print_error('Failed with error code {0}'.format(error_code))
            # NOTE: parsing is attempted even after a failed run, exactly as before;
            # a missing .mat file therefore surfaces as an error raised by the parser.
            pssm, psfm, _ = ProfileExtractor.__parse_pssm_file(output_file + ".mat")
            wpssm = ProfileExtractor.__get_wpsm(pssm)
            wpsfm = ProfileExtractor.__get_wpsm(psfm)
            for i, res in enumerate(protein.residues):
                # Column i of each matrix belongs to the i-th residue.
                res.add_feature(Features.POSITION_SPECIFIC_SCORING_MATRIX, self._normalize(pssm[:, i]))
                res.add_feature(Features.POSITION_SPECIFIC_FREQUENCY_MATRIX, self._normalize(psfm[:, i]))
                res.add_feature(Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX, self._normalize(wpssm[:, i]))
                res.add_feature(Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX, self._normalize(wpsfm[:, i]))
    # timedelta.seconds discards whole days; total_seconds() keeps the report
    # correct for runs that last longer than 24 hours.
    elapsed = int((datetime.now() - start_time).total_seconds())
    print_info("took {0} seconds.".format(elapsed))
def make_psaia_dict(filename):
    """Parse a PSAIA table file into a dictionary keyed by residue.

    Lines with fewer than five whitespace-separated fields, and the column
    header line whose first field is ``chain``, are skipped. Every other line
    contributes one entry.

    :param filename: path of the PSAIA table to read.
    :return: dict mapping ``(chain_id, residue_id)`` (both strings; a chain id
        of ``*`` is stored as a single space) to the tuple
        ``(casa, rasa, rrasa, rdpx, rcx, rhph)`` of numpy arrays built from
        fields 1-5, 8-12, 13-17, 18-23, 24-29 and the last field respectively.
    :raises Exception: any error hit while reading or parsing is logged with
        the file name and line number and then re-raised unchanged.
    """

    def to_floats(fields):
        # Build a real list: a lazy ``map`` object would be wrapped by numpy
        # as a 0-d object array on Python 3 instead of a float vector.
        return np.array([float(field) for field in fields])

    psaia = {}
    line_number = 0
    try:
        # The context manager closes the handle even when parsing fails.
        with open(filename, "r") as handle:
            for line_number, line in enumerate(handle, 1):
                line_parts = line.split()
                # the line containing 'chain' is the last line before real data starts
                if len(line_parts) < 5 or line_parts[0] == 'chain':
                    continue
                protein_id = line_parts[0]
                if protein_id == '*':
                    protein_id = ' '
                residue_id = (protein_id, line_parts[6])
                casa = to_floats(line_parts[1:6])
                rasa = to_floats(line_parts[8:13])
                rrasa = to_floats(line_parts[13:18])
                rdpx = to_floats(line_parts[18:24])
                rcx = to_floats(line_parts[24:30])
                rhph = np.array(float(line_parts[-1]))
                psaia[residue_id] = (casa, rasa, rrasa, rdpx, rcx, rhph)
    except Exception as e:
        print_error('Error Processing psaia file {0}: {1}'.format(filename, e))
        print_error('Error occurred while processing line: {0}'.format(line_number))
        # Bare raise keeps the original traceback intact.
        raise
    return psaia
def make_psaia_dict(filename):
    """Read a PSAIA table file and index its rows by residue.

    Rows with fewer than five whitespace-separated fields are ignored, as is
    the column header row (first field ``chain``). All remaining rows are
    converted to numeric arrays.

    :param filename: path of the PSAIA table to read.
    :return: dict whose keys are ``(chain_id, residue_id)`` string pairs (a
        ``*`` chain id becomes a single space) and whose values are tuples
        ``(casa, rasa, rrasa, rdpx, rcx, rhph)`` of numpy arrays taken from
        fields 1-5, 8-12, 13-17, 18-23, 24-29 and the final field.
    :raises Exception: the error encountered while reading or parsing, after
        the file name and the offending line number have been logged.
    """

    def to_floats(fields):
        # A list comprehension (not ``map``) so numpy receives a sequence and
        # produces a float vector on Python 3 as well as Python 2.
        return np.array([float(field) for field in fields])

    psaia = {}
    line_number = 0
    try:
        # ``with`` guarantees the file is closed, including on parse errors.
        with open(filename, "r") as handle:
            for line_number, line in enumerate(handle, 1):
                line_parts = line.split()
                # the line containing 'chain' is the last line before real data starts
                if len(line_parts) < 5 or line_parts[0] == 'chain':
                    continue
                protein_id = line_parts[0]
                if protein_id == '*':
                    protein_id = ' '
                residue_id = (protein_id, line_parts[6])
                casa = to_floats(line_parts[1:6])
                rasa = to_floats(line_parts[8:13])
                rrasa = to_floats(line_parts[13:18])
                rdpx = to_floats(line_parts[18:24])
                rcx = to_floats(line_parts[24:30])
                rhph = np.array(float(line_parts[-1]))
                psaia[residue_id] = (casa, rasa, rrasa, rdpx, rcx, rhph)
    except Exception as e:
        print_error('Error Processing psaia file {0}: {1}'.format(filename, e))
        print_error(
            'Error occurred while processing line: {0}'.format(line_number))
        # Re-raise without rebinding so the original traceback is preserved.
        raise
    return psaia
def get_vector_form(self, features):
    """Concatenate the stored vectors of the requested features.

    When any requested feature is absent from ``self.get_computed_features()``
    an error is logged and ``None`` is returned. Otherwise the vectors are
    joined with ``np.hstack``; each vector is placed in front of the ones
    gathered so far, so the result lists the features in reverse request
    order. A single requested feature is returned as stored, and an empty
    request yields ``None``.

    :param features: iterable of feature identifiers to combine.
    :return: the combined vector, or ``None``.
    """
    missing = set(features) - set(self.get_computed_features())
    if missing:
        print_error(
            "Following features {0} is still not computed for this residue {1}".format(missing, self))
        return None

    stacked = None
    for name in features:
        piece = self.computed_features[name]
        # Prepend the new piece to whatever has been accumulated.
        stacked = piece if stacked is None else np.hstack((piece, stacked))
    return stacked
def extract_feature(self):
    """Add the residue-depth feature to every unbound protein of the database.

    For the ligand and the receptor of each complex, a two-column array with
    one row per residue is computed with ``ResidueDepth`` and saved as
    ``<name>.npy`` in the directory returned by ``self._get_dir_name()``
    (created if missing). An existing file is reused instead of recomputing.
    The array is then loaded and each row, after normalisation, is attached to
    the matching residue as ``Features.RESIDUE_DEPTH``.
    """
    print_info_nn(" >>> Adding residue depth for database {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    counter = 0
    output_dir = self._get_dir_name()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for protein_complex in self._database.complexes.values():
        unbound = protein_complex.unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            residue_depth_file = output_dir + protein.name + ".npy"
            if not os.path.exists(residue_depth_file):
                # Progress output: names are printed on one line and the line
                # is ended once the counter passes 15.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                rd = ResidueDepth(protein.structure[0], pdb_file)
                # Zero-filled rather than uninitialised, so a row that is never
                # written below is saved as zeros instead of arbitrary memory.
                rd_array = np.zeros((len(protein.residues), 2))
                for i, res in enumerate(protein.biopython_residues):
                    (_, _, c, (h, rn, ic)) = res.get_full_id()
                    key = (c, (h, rn, ic))
                    if key in rd:
                        first, second = rd[key]
                        # A missing component is replaced by 0 and reported.
                        # print(...) with one argument prints the same text on
                        # Python 2 and is valid syntax on Python 3.
                        if first is None:
                            first = 0
                            print("WTH?")
                        if second is None:
                            second = 0
                            print("WTH?")
                        rd_array[i, :2] = (first, second)
                    else:
                        print_error('WTH')
                        rd_array[i, :2] = [0, 0]
                np.save(residue_depth_file, rd_array)
            surface_features = np.load(residue_depth_file)
            for i, res in enumerate(protein.residues):
                res.add_feature(Features.RESIDUE_DEPTH, self._normalize(surface_features[i, :2]))
    # timedelta.seconds discards whole days; total_seconds() does not.
    elapsed = int((datetime.now() - overall_time).total_seconds())
    print_info("took {0} seconds.".format(elapsed))
def get_feature(self, feature):
    """Return the stored value of ``feature``.

    :param feature: identifier looked up in ``self.computed_features``.
    :return: the stored value, or ``None`` (after logging an error) when the
        feature has not been computed.
    """
    if feature in self.computed_features:
        return self.computed_features[feature]
    print_error("Feature {0} is not computed!".format(feature))
    return None