def __init__(self, Metric, projfn="my_project.yaml", clcenterfn="centers.txt",
             clusterfn="clusters.txt", stepsize=None, timestep=None,
             flag_nopreprocess=False):
    '''Load the project information and the cluster output files.

    Parameters
    ----------
    Metric : class
        Metric class used for distance computations; stored on the
        instance, not instantiated here.
    projfn : str, optional
        Path to the YAML project file.
    clcenterfn : str, optional
        Path to the cluster-centers text file; column 1 holds the
        center frame IDs.
    clusterfn : str, optional
        Path to the cluster-assignment text file.
    stepsize : optional
        Stored as-is on the instance.
    timestep : optional
        Stored as-is on the instance.
    flag_nopreprocess : bool, optional
        Not referenced in this constructor; kept for interface
        compatibility with callers.
    '''
    prj = Project(existing_project_file=projfn)
    trajs_lengths = np.array(prj.get_trajectory_lengths())
    # frame numbers always correspond to full trajectory irrespective of stride
    # BUGFIX: the np.int alias was removed in NumPy 1.24 and raised an
    # AttributeError here; the builtin int is the documented replacement.
    self.trajs_lengths = trajs_lengths.astype(int)
    self.trajnames = prj.get_trajectory_filepaths()
    self.ntrajs = len(self.trajnames)
    self.stride = prj.get_stride()
    # Number of frames actually kept per trajectory after striding
    # (float division forced by the * 1. so the ceiling is correct).
    self.frames_per_traj = np.ceil(self.trajs_lengths / (self.stride * 1.))
    self.ndim = prj.get_number_dimensions()
    self.trajectory_type = prj.get_trajectory_type()
    self.grof = prj.gro_filepath
    self.tprf = prj.tpr_filepath
    self.ndxf = prj.ndx_filepath
    self.Metric = Metric
    # Get input data: column 1 of the centers file holds the center frame IDs.
    centids = txtreader.readcols(clcenterfn)
    self.centids = centids[:, 1]
    self.cltags = txtreader.readcols(clusterfn)
    self.assignments = self._get_assignments()
    # Cluster sizes; frames tagged -1 (unassigned) are excluded from the count.
    self.nodesizes = np.bincount(self.assignments[self.assignments > -1])
    self.stepsize = stepsize
    self.timestep = timestep
    logger.debug("Dimensionality %d", self.ndim)
def cluster(Metric, project_filepath, cutoff, checkpoint_filepath=None,
            flag_nopreprocess=False):
    """ Cluster data

    Parameters
    ----------
    Metric : class
        Metric class used to compute inter-frame distances; instantiated
        on rank 0 only and broadcast to all other ranks.
    project_filepath : string
        The path to the YAML project file.
    cutoff : float
        The cutoff distance passed to the Metric class.
    checkpoint_filepath : string, optional
        If given, the all-to-all neighbour counting is skipped and
        clustering resumes from this checkpoint file.
    flag_nopreprocess : bool, optional
        Switches off preprocessing.

    Returns
    -------
    clusters : dict
        A map (dictionary) from cluster center vertices C to lists of
        vertices, where the lists represent the set of vertices belonging
        to the cluster with center C, i.e. a map: FrameID -> [FrameID]
    """
    # ================================================================
    # Instantiation of helper classes.

    # Initialize MPI.
    comm = MPI.COMM_WORLD
    mpi_size = comm.Get_size()
    my_rank = comm.Get_rank()
    comm.Barrier()

    # Say hello.
    print0(rank=my_rank, msg="Initialized MPI.")
    logger.debug("Hello, from node %s", my_rank)
    logger.info("Reading project file at node %s", my_rank)
    project = Project(existing_project_file=project_filepath)

    logger.debug("Initializing manager at node %s", my_rank)
    manager = Loadmanager(project.get_trajectory_lengths(),
                          project.get_trajectory_filepaths(),
                          mpi_size, my_rank)

    # The metric has to be instantiated by only one MPI process as it needs
    # user input, i.e. index groups.
    if my_rank == 0:
        metric = Metric(tpr_filepath=project.get_tpr_filepath(),
                        stx_filepath=project.get_gro_filepath(),
                        ndx_filepath=project.get_ndx_filepath(),
                        number_dimensions=project.get_number_dimensions())
        # Since we have to broadcast it we need to destroy all pointers to arrays.
        metric.destroy_pointers()
        logger.debug("Metric initialized at node 0")
    else:
        metric = None
    metric = comm.bcast(metric, root=0)
    print0(rank=my_rank, msg="metric object broadcasted.")
    # Recreate all pointers in the object's instance.
    metric.create_pointers()

    manager.do_partition()

    # Take work share.
    my_partition = manager.myworkshare
    (my_trajectory_filepaths, my_trajectory_lengths,
     my_trajectory_ID_offsets, my_trajectory_ID_ranges) = \
        map(list, my_partition)

    logger.info("Reading trajectories at %s", my_rank)
    my_frames = Framecollection.from_files(
        stride=project.get_stride(),
        trajectory_type=project.get_trajectory_type(),
        trajectory_globalID_offsets=my_trajectory_ID_offsets,
        trajectory_filepath_list=my_trajectory_filepaths,
        trajectory_length_list=my_trajectory_lengths,
    )

    if not flag_nopreprocess:
        # ----------------------------------------------------------------
        # Preprocess trajectories (modifying them in-place).

        # Metric preprocessing.
        logger.debug(" Preprocessing trajectories at rank %s", my_rank)
        metric.preprocess(
            frame_array_pointer=my_frames.get_first_frame_pointer(),
            number_frames=my_frames.number_frames,
            number_atoms=my_frames.number_atoms)
    else:
        print0(rank=my_rank, msg=" Will not preprocess trajectories")

    # ================================================================
    # Initial round of all-to-all neighbour counting.

    # Count the number of neighbours for all frames.
    # If frames are vertices and edges join frames having rmsd within the
    # cutoff, then we compute and record the degree of each vertex.
    if checkpoint_filepath is None:
        print0(rank=my_rank, msg="Counting 'neighbours' for all frames.")
        my_neighbour_count = allToAll_neighbourCount(
            cutoff, comm, mpi_size, my_rank, metric, my_frames, manager)
        # :: Map Integer Integer

        print0(rank=my_rank, msg="Synchronizing neighbour counts.")
        neighbour_count_recvList = comm.allgather(my_neighbour_count)
        # Merge the per-rank partial counts into one global tally.
        neighbour_counts = {}
        for node_neighbour_counts in neighbour_count_recvList:
            for frameID in node_neighbour_counts:
                try:
                    neighbour_counts[frameID] += node_neighbour_counts[frameID]
                except KeyError:
                    neighbour_counts[frameID] = node_neighbour_counts[frameID]
    else:
        print0(rank=my_rank, msg="Using checkpoint file.")
        # daura_clustering reconstructs the counts from the checkpoint.
        neighbour_counts = None

    print0(rank=my_rank, msg="Start clustering.")
    T = time()
    clusters = daura_clustering(neighbour_counts, cutoff, comm, mpi_size,
                                my_rank, manager, metric, my_frames,
                                checkpoint_filepath)
    print0(rank=my_rank, msg=" Finished ... Total time: {0}".format(time() - T))
    return clusters