def __init__(self, parameters, observer):
    """
    Builds the trajectory handler: expands the file lists declared in the
    script, loads and merges every trajectory, and caches the merged
    coordinates together with their dimensions.

    @param parameters: Script parameters (dict-like). "data"->"files" is
    rewritten in place with the expanded file list.
    @param observer: Observer forwarded to the parent class; used for
    progress ("Loading") and fatal ("SHUTDOWN") notifications.
    """
    super(TrajectoryHandler, self).__init__(observer)
    self.parameters = parameters
    data_params = parameters["data"]
    matrix_params = data_params["matrix"]["parameters"]
    # Store the expanded list back so the script section reflects the real files.
    data_params["files"] = self.expand_file_lists(data_params["files"])
    self.files = data_params["files"]
    self.pdbs = []
    if len(self.files) == 0:
        common.print_and_flush("[ERROR] no pdbs. Exiting...\n")
        self.notify("SHUTDOWN", "No pdbs defined in script.")
        exit()
    self.notify("Loading", "Loading Trajectories")
    # Bookmarking structure: last used pdb plus cached atom selections.
    self.bookmarking = {
        "pdb": None,
        "selections": {}
    }
    merged_coordsets = self.getMergedStructure().getCoordsets()
    self.coordsets = merged_coordsets
    # shape[0] = conformations, shape[1] = atoms
    self.number_of_conformations = merged_coordsets.shape[0]
    self.number_of_atoms = merged_coordsets.shape[1]
    self.handle_selection_parameters(matrix_params)
def close(self):
    """
    Must return the result of merging all the previously loaded data
    pieces as a Data Object. If the number of loaded elements is 0 the
    program must exit (we cannot perform any useful analysis without
    data!).
    """
    # Guard clause: with data loaded there is nothing to abort.
    if self.number_of_elements != 0:
        return
    common.print_and_flush("[ERROR DataLoader:close] No loaded data. Exiting...\n")
    exit()
def close(self):
    """
    Must return the result of merging all the previously loaded data
    pieces as a Data Object. If the number of loaded elements is 0 the
    program must exit (we cannot perform any useful analysis without
    data!).
    """
    nothing_was_loaded = (self.number_of_elements == 0)
    if nothing_was_loaded:
        common.print_and_flush(
            "[ERROR DataLoader:close] No loaded data. Exiting...\n")
        exit()
def check_extension(self, ext):
    """
    Helper function to check if the file extension is allowed. If not it
    shuts down the program.

    @param ext: The extension string (with the separating period!), e.g.
    as returned by os.path.splitext.

    @return: Nothing (exits if the condition is not fulfilled)
    """
    # 'ext not in' is the idiomatic membership test; a tuple suffices for
    # a fixed pair of literals.
    if ext not in (".dcd", ".pdb"):
        common.print_and_flush(
            "[ERROR] pyProCT cannot read this file format.\n")
        self.notify("SHUTDOWN", "Wrong file format.")
        exit()
def do_graph(clustering, num_elems_of_traj_2, std_deviations, filename):
    """
    Renders a clustering as a state graph image by writing a graphviz
    description to a temporary file and delegating to the external 'dot'
    tool.

    @param clustering: The clustering whose nodes/edges will be drawn.
    @param num_elems_of_traj_2: Per-cluster count of trajectory-2 elements
    (used for node labelling).
    @param std_deviations: Per-cluster distance standard deviations
    (used for node labelling).
    @param filename: Path of the png file to generate.
    """
    graph = digraph()
    labels = populate_nodes_with_labels(clustering, num_elems_of_traj_2, std_deviations, graph)
    prob_matrix = calculate_probability_matrix(clustering)
    add_graph_edges(graph, labels, clustering, prob_matrix)
    # 'with' guarantees the dot description is flushed and closed before
    # the external process reads it (the original leaked the handle on error).
    with open("tmp_dot", "w") as tmp_file:
        tmp_file.write(write(graph))
    common.print_and_flush("delegating to dot...")
    # NOTE(review): shell command built by concatenation; acceptable for
    # internally generated paths, but 'filename' must not come from
    # untrusted input.
    os.system("cat tmp_dot|dot -Tpng -o " + filename + ";rm tmp_dot")
def check_extension(self, ext):
    """
    Helper function to check if the file extension is allowed. If not it
    shuts down the program.

    @param ext: The extension string (with the separating period!), e.g.
    as returned by os.path.splitext.

    @return: Nothing (exits if the condition is not fulfilled)
    """
    # Idiomatic membership test ('x not in ...') over a literal tuple.
    if ext not in (".dcd", ".pdb"):
        common.print_and_flush(
            "[ERROR] pyProCT cannot read this file format.\n")
        self.notify("SHUTDOWN", "Wrong file format.")
        exit()
def purge_mixed_clusters_and_do_graph(mixed, pure_clusters_traj1, condensed_distance_matrix, std_devs_from_A, path):
    """
    Removes every trajectory-2 element from the mixed clusters, extends
    'std_devs_from_A' (mutated in place!) with the statistics of the pure
    trajectory-1 clusters, and draws the resulting state graph.

    @param mixed: List of (cluster, elems_in_traj1, elems_in_traj2) tuples.
    @param pure_clusters_traj1: Clusters that only hold traj. 1 elements.
    @param condensed_distance_matrix: Pairwise distances (condensed form).
    @param std_devs_from_A: List of per-cluster distance std. deviations;
    extended in place with the values for the pure traj. 1 clusters.
    @param path: Output path of the generated graph image.
    """
    common.print_and_flush("Purging clusters...")
    # Rebuild each mixed cluster keeping only its trajectory-1 elements,
    # remembering how many trajectory-2 elements it had.
    purged = []
    num_elems_of_traj_2 = []
    for _, elems_in_traj1, elems_in_traj2 in mixed:
        num_elems_of_traj_2.append(len(elems_in_traj2))
        purged.append(Cluster(prototype=None, elements=elems_in_traj1))
    # Pure traj. 1 clusters are kept as-is; by definition they contain no
    # trajectory-2 elements, hence the zero padding.
    purged.extend(pure_clusters_traj1)
    num_elems_of_traj_2.extend([0] * len(pure_clusters_traj1))
    # Calculate statistics for the remaining (pure) clusters.
    for pure_cluster in pure_clusters_traj1:
        medoid = pure_cluster.calculate_medoid(condensed_distance_matrix)
        std_devs_from_A.append(get_distance_std_dev_for_elems(pure_cluster.all_elements, medoid, condensed_distance_matrix))
    common.print_and_flush("Done.\n")
    common.print_and_flush("Trying to draw state graph...")
    do_graph(Clustering(purged, sort=False), num_elems_of_traj_2, std_devs_from_A, path)
    common.print_and_flush("Done.\n")
def __init__(self, parameters, observer):
    """
    Class creator. It parses the needed files and extracts info and coordinates.

    @param parameters: Script parameters object; "data"->"files" is
    overwritten in place with the expanded file list.
    @param observer: Observer handed over to the parent class; used for
    progress ("Loading") and fatal ("SHUTDOWN") notifications.
    """
    super(TrajectoryHandler, self).__init__(observer)
    print "Reading conformations..."
    # Silence ProDy's own console output so only pyProCT messages show.
    prody.confProDy(verbosity="none")
    self.parameters = parameters
    # Matrix creation options; an empty parameters object is a valid default.
    matrix_parameters = parameters.get_value(
        "data.matrix.parameters", default_value=ProtocolParameters.empty())
    # Normalize the declared files through expand_file_lists (rewrites the
    # parameters in place).
    parameters["data"]["files"] = self.expand_file_lists(
        parameters["data"]["files"])
    self.files = parameters["data"]["files"]
    self.pdbs = []
    if len(self.files) == 0:
        common.print_and_flush("[ERROR] no pdbs. Exiting...\n")
        self.notify("SHUTDOWN", "No pdbs defined in script.")
        exit()
    self.notify("Loading", "Loading Trajectories")
    # Bookmarking structure: last used pdb plus cached atom selections.
    self.bookmarking = {"pdb": None, "selections": {}}
    merged_structure = self.getMergedStructure()
    self.coordsets = merged_structure.getCoordsets()
    # shape[0] = number of conformations, shape[1] = number of atoms
    self.number_of_conformations = self.coordsets.shape[0]
    self.number_of_atoms = self.coordsets.shape[1]
    self.handle_selection_parameters(matrix_parameters)
    print "%d conformations of %d atoms were read." % (
        merged_structure.numCoordsets(), merged_structure.numAtoms())
def load_data_from_source(self, source):
    """
    Loads a structure file (pdb or dcd) and updates source info.

    :param source: A DataSource object with one of these sets of keywords:

    - For 'pdb' files: { "source": ... , "base_selection": ... }
    Where 'source' contains the path of the pdb file we want to load.

    - For 'dcd' files: { "source": ..., "atoms_source": ..., "base_selection": ... }
    Where 'source' contains the path of the 'dcd' file we want to load and
    'atoms_source' the path of the pdb file holding the atomic information.

    In both cases 'base_selection' is a ProDy selection string that performs an
    initial selection of the atoms. This is useful when we want to load more than
    one file with different number of atoms: its goal is to allow the selection of
    the common atoms. It is up to the user to maintain a 1 to 1 mapping between
    the atoms of each of the files.

    The source object will be enriched with the number of conformations and
    atoms of the loaded structure ensemble.

    :return: A tuple (prody structure with the loaded ensemble, number of coordsets).
    """
    _, ext = os.path.splitext(source.get_path())
    if ext == ".dcd":
        # dcd files carry no atomic data: read it from the companion pdb ...
        structure = prody.parsePDB(source.get_info("atoms_source"))
        # ... keep only the atomic information ...
        removeAllCoordsetsFromStructure(structure)
        dcd_data = prody.DCDFile(source.get_path())
        coordsets = dcd_data.getCoordsets()
        # ... and append every dcd frame to it.
        for coordset in coordsets:
            structure.addCoordset(coordset)
    elif ext == ".pdb":
        structure = prody.parsePDB(source.get_path())
    else:
        # Fixed typo "hot to load" -> "how to load" (consistent with the
        # sibling loader's message).
        print("[ERROR][ProteinStructureEnsembleData::get_structure] pyProCT does not know how to load the file %s (unknown extension '%s')" % (source.get_path(), ext))
        exit()
    if source.has_info("base_selection"):
        # select() returns None for an improductive selection; this must be
        # checked BEFORE calling copy(), otherwise an AttributeError is
        # raised and the friendly error below is never shown.
        selection = structure.select(source.get_info("base_selection"))
        if selection is None:
            common.print_and_flush("[ERROR ProteinStructureEnsembleData::get_structure] Improductive base selection (%s). Exiting...\n" % source.get_info("base_selection"))
            exit()
        structure = selection.copy()
    source.add_info("number_of_conformations", structure.numCoordsets())
    source.add_info("number_of_atoms", structure.numAtoms())
    # Per-source model numbering and remark bookkeeping.
    self.model_numbers.extend(self.get_model_numbers(source, structure.numCoordsets()))
    self.model_remarks.extend(self.get_remarks(source, structure.numCoordsets()))
    return structure, structure.numCoordsets()
def get_structure(self, file_info):
    """
    Loads a structure file (pdb or dcd) and fills its structure_info data
    for logging.

    @param file_info: Either a string containing the path of a pdb file, or
    a dictionary with this structure:
    'pdb' files: { "file": ... , "base_selection": ... }
    Where 'file' contains the path of the pdb file we want to load.
    'dcd' files: { "file": ..., "atoms_file": ..., "base_selection": ... }
    Where 'file' contains the path of the 'dcd' file we want to load and
    'atoms_file' the path of the pdb file containing the atomic information.
    In both cases 'base_selection' is a ProDy selection string that performs
    an initial selection of the atoms (useful to pick the common atoms when
    loading files with different numbers of atoms; keeping a 1 to 1 mapping
    between the atoms of the files is up to the user).

    @return: A tuple containing the structure object and a structure_info
    dictionary.
    """
    structure_info = {
        "source": "",
        "source of atoms": "",
        "base selection": "",
        "number of conformations": "",
        "number of atoms": ""
    }
    if isinstance(file_info, basestring):
        # A bare path: only pdb files can be passed this way, as dcd files
        # also need a companion pdb with the atomic information.
        path = file_info
        structure_info["source"] = path
        _, ext = os.path.splitext(path)  # 'name' was unused
        self.check_extension(ext)
        if ext == ".dcd":
            common.print_and_flush(
                "[ERROR TrajectoryHandler::get_structure] Path format can only be used with pdb files. Exiting...\n")
            self.notify("SHUTDOWN", "Fatal error reading pdbs.")
            exit()
        else:
            structure = prody.parsePDB(path)
            structure_info["number of conformations"] = structure.numCoordsets()
            structure_info["number of atoms"] = structure.numAtoms()
            return structure, structure_info
    else:
        # {"file": , "base_selection": } object, plus "atoms_file" when the
        # file is a dcd file.
        path = file_info["file"]
        structure_info["source"] = path
        _, ext = os.path.splitext(path)
        self.check_extension(ext)
        if ext == ".dcd":
            structure_info["source of atoms"] = file_info["atoms_file"]
            structure = prody.parsePDB(file_info["atoms_file"])
            removeAllCoordsetsFromStructureLeavingFirst(structure)
            dcd_data = prody.DCDFile(path)
            coordsets = dcd_data.getCoordsets()
            for coordset in coordsets:
                structure.addCoordset(coordset)
        else:
            structure = prody.parsePDB(path)
        if "base_selection" in file_info and file_info["base_selection"] != "":
            # select() returns None when the selection matches no atoms; the
            # original code then crashed with an AttributeError on
            # numCoordsets(). Fail with a clear message instead.
            selection = structure.select(file_info["base_selection"])
            if selection is None:
                common.print_and_flush(
                    "[ERROR TrajectoryHandler::get_structure] Improductive base selection (%s). Exiting...\n" % file_info["base_selection"])
                self.notify("SHUTDOWN", "Fatal error reading pdbs.")
                exit()
            structure = selection
            structure_info["base selection"] = file_info["base_selection"]
        structure_info["number of conformations"] = structure.numCoordsets()
        structure_info["number of atoms"] = structure.numAtoms()
        return structure, structure_info
def load_data_from_source(self, source):
    """
    Loads a structure file (pdb or dcd) and updates source info.

    :param source: A DataSource object with one of these sets of keywords:

    - For 'pdb' files: { "source": ... , "base_selection": ... }
    Where 'source' contains the path of the pdb file we want to load.

    - For 'dcd' files: { "source": ..., "atoms_source": ..., "base_selection": ... }
    Where 'source' contains the path of the 'dcd' file we want to load and
    'atoms_source' the path of the pdb file holding the atomic information.

    In both cases 'base_selection' is a ProDy selection string that performs an
    initial selection of the atoms. This is useful when we want to load more than
    one file with different number of atoms: its goal is to allow the selection of
    the common atoms. It is up to the user to maintain a 1 to 1 mapping between
    the atoms of each of the files.

    The source object will be enriched with the number of conformations and
    atoms of the loaded structure ensemble.

    :return: A tuple (prody structure with the loaded ensemble, number of coordsets).
    """
    _, ext = os.path.splitext(source.get_path())
    if ext == ".dcd":
        # dcd files carry no atomic data: read it from the companion pdb ...
        structure = prody.parsePDB(source.get_info("atoms_source"))
        # ... keep only the atomic information ...
        removeAllCoordsetsFromStructure(structure)
        dcd_data = prody.DCDFile(source.get_path())
        coordsets = dcd_data.getCoordsets()
        # ... and append every dcd frame to it.
        for coordset in coordsets:
            structure.addCoordset(coordset)
    elif ext == ".pdb":
        structure = prody.parsePDB(source.get_path())
    else:
        print("[ERROR][ProteinStructureEnsembleData::get_structure] pyProCT does not know how to load the file %s (unknown extension '%s')" % (source.get_path(), ext))
        exit()
    if source.has_info("base_selection"):
        # select() returns None for an improductive selection; this must be
        # checked BEFORE calling copy(), otherwise an AttributeError is
        # raised and the friendly error below is never shown.
        selection = structure.select(source.get_info("base_selection"))
        if selection is None:
            common.print_and_flush("[ERROR ProteinStructureEnsembleData::get_structure] Improductive base selection (%s). Exiting...\n" % source.get_info("base_selection"))
            exit()
        structure = selection.copy()
    print("Loaded %d conformations with %d atoms from %s." % (structure.numCoordsets(), structure.numAtoms(), source.get_path()))
    source.add_info("number_of_conformations", structure.numCoordsets())
    source.add_info("number_of_atoms", structure.numAtoms())
    # Per-source model numbering and remark bookkeeping.
    self.model_numbers.extend(self.get_model_numbers(source, structure.numCoordsets()))
    self.model_remarks.extend(self.get_remarks(source, structure.numCoordsets()))
    return structure, structure.numCoordsets()
def test_print_and_flush(self):
    """
    print_and_flush must write the given text verbatim to the supplied
    handler.
    """
    sink = cStringIO.StringIO()
    print_and_flush("Hello", sink)
    self.assertEqual("Hello", sink.getvalue())
def get_structure(self, file_info):
    """
    Loads a structure file (pdb or dcd) and fills its structure_info data
    for logging.

    @param file_info: Either a string containing the path of a pdb file, or
    a dictionary with this structure:
    'pdb' files: { "file": ... , "base_selection": ... }
    Where 'file' contains the path of the pdb file we want to load.
    'dcd' files: { "file": ..., "atoms_file": ..., "base_selection": ... }
    Where 'file' contains the path of the 'dcd' file we want to load and
    'atoms_file' the path of the pdb file containing the atomic information.
    In both cases 'base_selection' is a ProDy selection string that performs
    an initial selection of the atoms (useful to pick the common atoms when
    loading files with different numbers of atoms; keeping a 1 to 1 mapping
    between the atoms of the files is up to the user).

    @return: A tuple containing the structure object and a structure_info
    dictionary.
    """
    structure_info = {
        "source": "",
        "source of atoms": "",
        "base selection": "",
        "number of conformations": "",
        "number of atoms": ""
    }
    if isinstance(file_info, basestring):
        # A bare path: only pdb files can be passed this way, as dcd files
        # also need a companion pdb with the atomic information.
        path = file_info
        structure_info["source"] = path
        _, ext = os.path.splitext(path)  # 'name' was unused
        self.check_extension(ext)
        if ext == ".dcd":
            common.print_and_flush(
                "[ERROR TrajectoryHandler::get_structure] Path format can only be used with pdb files. Exiting...\n")
            self.notify("SHUTDOWN", "Fatal error reading pdbs.")
            exit()
        else:
            structure = prody.parsePDB(path)
            structure_info["number of conformations"] = structure.numCoordsets()
            structure_info["number of atoms"] = structure.numAtoms()
            return structure, structure_info
    else:
        # {"file": , "base_selection": } object, plus "atoms_file" when the
        # file is a dcd file.
        path = file_info["file"]
        structure_info["source"] = path
        _, ext = os.path.splitext(path)
        self.check_extension(ext)
        if ext == ".dcd":
            structure_info["source of atoms"] = file_info["atoms_file"]
            structure = prody.parsePDB(file_info["atoms_file"])
            removeAllCoordsetsFromStructureLeavingFirst(structure)
            dcd_data = prody.DCDFile(path)
            coordsets = dcd_data.getCoordsets()
            for coordset in coordsets:
                structure.addCoordset(coordset)
        else:
            structure = prody.parsePDB(path)
        if "base_selection" in file_info and file_info["base_selection"] != "":
            # select() returns None when the selection matches no atoms; the
            # original code then crashed with an AttributeError on
            # numCoordsets(). Fail with a clear message instead.
            selection = structure.select(file_info["base_selection"])
            if selection is None:
                common.print_and_flush(
                    "[ERROR TrajectoryHandler::get_structure] Improductive base selection (%s). Exiting...\n" % file_info["base_selection"])
                self.notify("SHUTDOWN", "Fatal error reading pdbs.")
                exit()
            structure = selection
            structure_info["base selection"] = file_info["base_selection"]
        structure_info["number of conformations"] = structure.numCoordsets()
        structure_info["number of atoms"] = structure.numAtoms()
        return structure, structure_info