Ejemplo n.º 1
0
 def __compute_profiles(self, db='nr', niter=3):
     """Attach profile-matrix features to every unbound ligand/receptor residue.

     For each protein, the matrix file ``<pssm dir>/<protein.name>.mat`` is
     generated first when it does not exist yet, by running the executable
     named by ``psiblast_executable`` through the shell from inside
     ``psiblast_db_folder``. The file is then parsed and four features are
     added to each residue: the two parsed matrices and their windowed
     variants, each passed through ``self._normalize`` one column at a time.

     :param db: value passed to the executable's ``-db`` option.
     :param niter: value passed to the executable's ``-num_iterations`` option.
     """
     print_info_nn(" >>> Adding the profile features for dataset {0} ...".format(self._database.name))
     started_at = datetime.now()
     for complex_key in self._database.complexes.keys():
         formation = self._database.complexes[complex_key].unbound_formation
         for chain in (formation.ligand, formation.receptor):
             query_path = self._database.directory + unbound_sequence_directory + chain.name + ".fasta"
             profile_base = self._database.directory + pssm_directory + chain.name
             matrix_path = profile_base + ".mat"
             if not os.path.exists(matrix_path):
                 print_info("... processing protein {0} ...    ".format(chain.name))
                 # Two shell lines: change directory first, then run the search.
                 template = ("cd {4} \n {5} "
                             "-query {0} -db {1} -out {2}.psi.txt "
                             "-num_iterations {3} -out_ascii_pssm {2}.mat")
                 shell_command = template.format(
                     query_path, db, profile_base, niter, psiblast_db_folder, psiblast_executable)
                 print_info(shell_command)
                 status = os.system(shell_command)
                 if status != 0:
                     print_error('Failed with error code {0}'.format(status))
                 else:
                     print_info('Successful!')
             # NOTE: parsing is attempted even when the command above failed.
             pssm, psfm, _info = ProfileExtractor.__parse_pssm_file(matrix_path)
             wpssm = ProfileExtractor.__get_wpsm(pssm)
             wpsfm = ProfileExtractor.__get_wpsm(psfm)
             # (feature key, matrix) pairs, in the order they are attached.
             profile_features = (
                 (Features.POSITION_SPECIFIC_SCORING_MATRIX, pssm),
                 (Features.POSITION_SPECIFIC_FREQUENCY_MATRIX, psfm),
                 (Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX, wpssm),
                 (Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX, wpsfm),
             )
             for column, residue in enumerate(chain.residues):
                 for feature_key, matrix in profile_features:
                     residue.add_feature(feature_key, self._normalize(matrix[:, column]))
     print_info("took {0} seconds.".format((datetime.now() - started_at).seconds))
Ejemplo n.º 2
0
def make_psaia_dict(filename):
    """Parse a whitespace-separated PSAIA table into a per-residue dictionary.

    Lines with fewer than five tokens, and the header line whose first token
    is ``'chain'``, are skipped. Every other line yields one entry keyed by
    ``(chain_id, residue_token)``, where ``chain_id`` is token 0 (``'*'`` is
    mapped to a single space) and ``residue_token`` is token 6, kept as a
    string.

    :param filename: path of the PSAIA output file to read.
    :return: dict mapping ``(chain_id, residue_token)`` to the tuple
        ``(casa, rasa, rrasa, rdpx, rcx, rhph)``. The first five are 1-d float
        arrays built from tokens 1:6, 8:13, 13:18, 18:24 and 24:30; ``rhph``
        is a 0-d float array holding the last token.
    :raises Exception: any error raised while opening or parsing the file is
        logged with the current line number and then re-raised unchanged.
    """
    psaia = {}
    line_number = 0  # stays 0 when the file cannot even be opened
    try:
        # The context manager closes the file even when parsing fails.
        with open(filename, "r") as psaia_file:
            for line_number, line in enumerate(psaia_file, 1):
                line_parts = line.split()
                # the line containing 'chain' is the last line before real data starts
                if len(line_parts) < 5 or line_parts[0] == 'chain':
                    continue
                protein_id = line_parts[0]
                if protein_id == '*':
                    protein_id = ' '
                residue_id = (protein_id, line_parts[6])
                # List comprehensions instead of map(): a lazy map object
                # would become a 0-d object array on Python 3.
                casa = np.array([float(value) for value in line_parts[1:6]])
                rasa = np.array([float(value) for value in line_parts[8:13]])
                rrasa = np.array([float(value) for value in line_parts[13:18]])
                rdpx = np.array([float(value) for value in line_parts[18:24]])
                rcx = np.array([float(value) for value in line_parts[24:30]])
                rhph = np.array(float(line_parts[-1]))
                psaia[residue_id] = (casa, rasa, rrasa, rdpx, rcx, rhph)
    except Exception as e:
        print_error('Error Processing psaia file {0}: {1}'.format(filename, e))
        print_error('Error occurred while processing line: {0}'.format(line_number))
        # Bare raise keeps the original traceback intact.
        raise
    return psaia
Ejemplo n.º 3
0
def make_psaia_dict(filename):
    """Parse a whitespace-separated PSAIA table into a per-residue dictionary.

    Lines with fewer than five tokens, and the header line whose first token
    is ``'chain'``, are skipped. Every other line yields one entry keyed by
    ``(chain_id, residue_token)``, where ``chain_id`` is token 0 (``'*'`` is
    mapped to a single space) and ``residue_token`` is token 6, kept as a
    string.

    :param filename: path of the PSAIA output file to read.
    :return: dict mapping ``(chain_id, residue_token)`` to the tuple
        ``(casa, rasa, rrasa, rdpx, rcx, rhph)``. The first five are 1-d float
        arrays built from tokens 1:6, 8:13, 13:18, 18:24 and 24:30; ``rhph``
        is a 0-d float array holding the last token.
    :raises Exception: any error raised while opening or parsing the file is
        logged with the current line number and then re-raised unchanged.
    """
    psaia = {}
    line_number = 0  # stays 0 when the file cannot even be opened
    try:
        # The context manager closes the file even when parsing fails.
        with open(filename, "r") as psaia_file:
            for line_number, line in enumerate(psaia_file, 1):
                line_parts = line.split()
                # the line containing 'chain' is the last line before real data starts
                if len(line_parts) < 5 or line_parts[0] == 'chain':
                    continue
                protein_id = line_parts[0]
                if protein_id == '*':
                    protein_id = ' '
                residue_id = (protein_id, line_parts[6])
                # List comprehensions instead of map(): a lazy map object
                # would become a 0-d object array on Python 3.
                casa = np.array([float(value) for value in line_parts[1:6]])
                rasa = np.array([float(value) for value in line_parts[8:13]])
                rrasa = np.array([float(value) for value in line_parts[13:18]])
                rdpx = np.array([float(value) for value in line_parts[18:24]])
                rcx = np.array([float(value) for value in line_parts[24:30]])
                rhph = np.array(float(line_parts[-1]))
                psaia[residue_id] = (casa, rasa, rrasa, rdpx, rcx, rhph)
    except Exception as e:
        print_error('Error Processing psaia file {0}: {1}'.format(filename, e))
        print_error(
            'Error occurred while processing line: {0}'.format(line_number))
        # Bare raise keeps the original traceback intact.
        raise
    return psaia
Ejemplo n.º 4
0
    def get_vector_form(self, features):
        """Concatenate the stored vectors of the requested features.

        The vectors are stacked horizontally in *reverse* request order: each
        newly visited feature is placed in front of what has been accumulated
        so far.

        :param features: iterable of feature keys to include.
        :return: the stacked vector; ``None`` when ``features`` is empty or
            when any requested feature is missing from
            ``self.get_computed_features()`` (an error is logged in that case).
        """
        missing = set(features) - set(self.get_computed_features())
        if missing:
            print_error(
                "Following features {0} is still not computed for this residue {1}".format(missing, self))
            return None
        stacked = None
        for feature_key in features:
            current = self.computed_features[feature_key]
            # First vector seeds the result; later ones are prepended.
            stacked = current if stacked is None else np.hstack((current, stacked))
        return stacked
    def extract_feature(self):
        """Attach a residue-depth feature to every unbound ligand/receptor residue.

        For each protein the depth table is read from
        ``<self._get_dir_name()><protein.name>.npy``. When that file does not
        exist it is first computed with ``ResidueDepth`` from the protein's
        first structure model and its PDB file, then saved. Each residue then
        receives ``Features.RESIDUE_DEPTH`` set to the normalized two-value
        row for its index.

        Missing or ``None`` depth values are stored as 0 and reported.
        """
        print_info_nn(" >>> Adding residue depth for database {0} ... ".format(self._database.name))
        overall_time = datetime.now()
        counter = 0
        cache_dir = self._get_dir_name()
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        for complex_name in self._database.complexes.keys():
            protein_complex = self._database.complexes[complex_name]
            proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
            for protein in proteins:
                residue_depth_file = cache_dir + protein.name + ".npy"
                if not os.path.exists(residue_depth_file):
                    # Progress output: names on one line, with a line break
                    # once the counter passes 15.
                    counter += 1
                    if counter <= 15:
                        print_info_nn("{0}, ".format(protein.name))
                    else:
                        counter = 0
                        print_info("{0}".format(protein.name))

                    pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                    rd = ResidueDepth(protein.structure[0], pdb_file)
                    # Zero-initialised so a row the loop never fills is saved
                    # as 0 rather than as uninitialised memory.
                    rd_array = np.zeros((len(protein.residues), 2))
                    for (i, res) in enumerate(protein.biopython_residues):
                        (_, _, c, (h, rn, ic)) = res.get_full_id()
                        key = (c, (h, rn, ic))
                        if key in rd:
                            rdv = rd[key]
                            # Replace a missing component with 0 and report it.
                            if rdv[0] is None:
                                rdv = (0, rdv[1])
                                print("WTH?")
                            if rdv[1] is None:
                                rdv = (rdv[0], 0)
                                print("WTH?")
                            rd_array[i, :2] = rdv
                        else:
                            # No depth entry at all for this residue.
                            print_error('WTH')
                            rd_array[i, :2] = [0, 0]

                    np.save(residue_depth_file, rd_array)
                surface_features = np.load(residue_depth_file)
                for i, res in enumerate(protein.residues):
                    res.add_feature(Features.RESIDUE_DEPTH, self._normalize(surface_features[i, :2]))
        print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
Ejemplo n.º 6
0
 def get_feature(self, feature):
     """Return the stored value for ``feature``.

     :param feature: key to look up in ``self.computed_features``.
     :return: the stored value, or ``None`` (after logging an error) when
         the feature has not been computed.
     """
     if feature in self.computed_features:
         return self.computed_features[feature]
     print_error("Feature {0} is not computed!".format(feature))
     return None