Beispiel #1
0
def main(argv=None):
    """Reassign trajectory frames to a pickled set of cluster centers.

    Loads the centers pickled at ``args.centers``, reassigns every frame of
    ``args.trajectories`` to them, and writes the resulting distances and
    assignments to ``args.distances`` and ``args.assignments``.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments, forwarded to ``process_command_line``.

    Returns
    -------
    int
        Always ``0``.
    """
    import sys

    args = process_command_line(argv)

    tick = time.perf_counter()

    # NOTE(review): unpickling can execute arbitrary code; the centers file
    # must come from a trusted source.
    with open(args.centers, 'rb') as f:
        centers = concatenate_trjs(
            pickle.load(f), args.atoms,
            enspara.util.parallel.auto_nprocs())
    logger.info('Loaded %s centers with %s atoms using selection "%s" '
                'in %.1f seconds.',
                len(centers), centers.n_atoms, args.atoms,
                time.perf_counter() - tick)

    # The same atom selection is applied to every topology.
    assig, dist = reassign(
        args.topologies, args.trajectories, [args.atoms]*len(args.topologies),
        centers=centers, frac_mem=args.mem_fraction)

    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS, so
    # the divisor that converts it to GB depends on the platform.
    maxrss_per_gb = 1024**3 if sys.platform == 'darwin' else 1024**2
    mem_highwater = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # `tick` is not reset above, so this elapsed time includes center loading.
    logger.info(
        "Finished reassignments in %.1f seconds. Process memory high-water "
        "mark was %.2f GB (total RAM is %.2f GB).",
        time.perf_counter() - tick,
        mem_highwater / maxrss_per_gb,
        psutil.virtual_memory().total / 1024**3)

    ra.save(args.distances, dist)
    ra.save(args.assignments, assig)

    logger.info("Wrote distances at %s.", args.distances)
    logger.info("Wrote assignments at %s.", args.assignments)

    return 0
Beispiel #2
0
 def run(self):
     """Load trajectories, cluster them, and write the results to disk.

     Reads ``self.trj_filenames`` (restricted to ``self.atom_indices_vals``
     for clustering), fits ``self.base_clust_obj`` and writes:

     * ``./data/assignments.h5`` and ``./data/distances.h5``
     * ``./data/centers.xtc`` (atom-sliced centers, superposed on the first)
     * ``./data/full_centers.xtc`` (only when ``self.mem_efficient`` is
       false, because only then are the full-atom coordinates in memory)
     * ``./data/unique_states.npy``
     * ``./centers.pdb`` and ``./prot_masses.pdb`` (only when
       ``self.build_full`` is true)

     When ``self.build_full`` is false, the previously written
     ``./data/centers.xtc`` is loaded and passed to the clusterer as
     ``init_centers``.
     """
     # load and concat trjs
     if self.mem_efficient:
         # Only the selected atoms are read, so full-atom coordinates are
         # never held in memory on this path.
         trj_lengths, xyzs = load_as_concatenated(
             filenames=self.trj_filenames,
             processes=self.n_procs,
             top=self.base_struct_md,
             atom_indices=self.atom_indices_vals)
         trjs_sub = md.Trajectory(
             xyzs,
             self.base_struct_md.atom_slice(
                 self.atom_indices_vals).topology)
     else:
         trj_lengths, xyzs = load_as_concatenated(
             filenames=self.trj_filenames,
             processes=self.n_procs,
             top=self.base_struct_md)
         trjs = md.Trajectory(xyzs, self.base_struct_md.topology)
         trjs_sub = trjs.atom_slice(self.atom_indices_vals)
     # determine if rebuilding all msm stuff
     if self.build_full:
         base_struct_centers = self.base_struct_md.atom_slice(
             self.atom_indices_vals)
         base_struct_centers.save_pdb("./centers.pdb")
         self.base_struct_md.save_pdb("./prot_masses.pdb")
         init_centers = None
     else:
         init_centers = md.load("./data/centers.xtc", top="./centers.pdb")
     # fit data with base clustering object
     self.base_clust_obj.fit(trjs_sub, init_centers=init_centers)
     _, distances, assignments, _ = \
         self.base_clust_obj.result_.partition(trj_lengths)
     # save data
     ra.save("./data/assignments.h5", assignments)
     ra.save("./data/distances.h5", distances)
     trjs_sub = trjs_sub[self.base_clust_obj.center_indices_]
     trjs_sub.superpose(trjs_sub[0])
     trjs_sub.save_xtc("./data/centers.xtc")
     if not self.mem_efficient:
         full_centers = trjs[self.base_clust_obj.center_indices_]
         full_centers.superpose(self.base_struct_md)
         full_centers.save_xtc("./data/full_centers.xtc")
     # save states
     n_states = len(self.base_clust_obj.center_indices_)
     unique_states = np.arange(n_states)
     if init_centers is not None:
         # Keep only the states added beyond the initial centers. Slicing
         # from the front is used because a negative slice of length zero
         # (`[-0:]`) would return every state when nothing new was added.
         unique_states = unique_states[len(init_centers):]
     np.save("./data/unique_states.npy", unique_states)
Beispiel #3
0
def write_assignments_and_distances_with_reassign(result, args):
    """Write distances and assignments, reassigning all frames if needed.

    * ``args.subsample == 1``: the arrays already on ``result`` are saved
      as they are.
    * otherwise, unless ``args.no_reassign`` is set: frames are reassigned
      to ``result.centers`` via ``reassign`` and those results are saved.
    * otherwise: nothing is written.

    Output paths are ``args.distances`` and ``args.assignments``.
    """
    # No subsampling: the clustering result can be written directly.
    if args.subsample == 1:
        logger.debug("Subsampling was 1, not reassigning.")
        ra.save(args.distances, result.distances)
        ra.save(args.assignments, result.assignments)
        return

    # The user explicitly opted out of reassignment.
    if args.no_reassign:
        logger.debug("Got --no-reassign, not doing reassigment")
        return

    logger.debug("Reassigning data from subsampling of %s", args.subsample)
    new_assignments, new_distances = reassign(
        args.topologies, args.trajectories, args.atoms,
        centers=result.centers)

    ra.save(args.distances, new_distances)
    ra.save(args.assignments, new_assignments)
Beispiel #4
0
def main(argv=None):
    """Cluster a feature array and write the clustering results to disk.

    Loads features from ``args.features``, clusters them with the algorithm
    named by ``args.cluster_algorithm`` (``'khybrid'`` or ``'kcenters'``),
    and writes distances, assignments, cluster centers and center indices
    to the paths given on ``args``.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments, forwarded to ``process_command_line``.

    Returns
    -------
    int
        Always ``0``.

    Raises
    ------
    ValueError
        If ``args.cluster_algorithm`` is not a supported algorithm.
    """
    args = process_command_line(argv)

    # First attempt the load with `keys=...`; if the file is rejected as
    # invalid for that call, fall back to a plain load.
    try:
        features = ra.load(args.features, keys=...)
    except exception.DataInvalid:
        features = ra.load(args.features)

    logger.info("Loaded data from %s with shape %s", args.features,
                features.shape)

    if args.cluster_algorithm == 'khybrid':
        clustering = KHybrid(metric=args.cluster_distance,
                             cluster_radius=args.cluster_radius,
                             kmedoids_updates=args.kmedoids_updates)
    elif args.cluster_algorithm == 'kcenters':
        clustering = KCenters(cluster_radius=args.cluster_radius,
                              metric=args.cluster_distance)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "Unsupported cluster algorithm: %r" % (args.cluster_algorithm,))

    logger.info("Clustering with %s", clustering)

    # Cluster on the flat underlying array; per-trajectory structure is
    # restored afterwards by partitioning on the original lengths.
    clustering.fit(features._data)

    result = clustering.result_.partition(features.lengths)
    del features

    ra.save(args.distances, result.distances)
    logger.info("Wrote distances with shape %s to %s", result.distances.shape,
                args.distances)

    ra.save(args.assignments, result.assignments)
    logger.info("Wrote assignments with shape %s to %s",
                result.assignments.shape, args.assignments)

    ra.save(args.cluster_centers, result.centers)
    logger.info("Wrote cluster_centers with shape %s to %s",
                result.centers.shape, args.cluster_centers)

    # Use a context manager so the file is flushed and closed promptly.
    with open(args.center_indices, 'wb') as f:
        pickle.dump(result.center_indices, f)
    logger.info("Wrote %s center_indices to %s",
                len(result.center_indices), args.center_indices)

    return 0
Beispiel #5
0
def main(argv=None):
    '''Run MPI-distributed k-centers (plus optional k-medoids) clustering.

    Each rank loads its share of ``args.trajectories``, the ranks cluster
    jointly with ``kcenters_mpi`` using ``md.rmsd``, and then run
    ``args.kmedoids_iters`` rounds of ``_kmedoids_pam_update``. The per-rank
    distance and assignment arrays are reassembled, and rank 0 writes the
    center indices, the optional distances/assignments, and the pickled
    center structures.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments, forwarded to ``process_command_line``.

    Returns
    -------
    int
        Always ``0``.

    Raises
    ------
    NotImplementedError
        If more than one cluster radius is given.
    '''
    import sys

    args = process_command_line(argv)

    top = md.load(args.topology).top
    atom_ids = top.select(args.selection)

    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS, so
    # the divisor that converts it to GB depends on the platform.
    maxrss_per_gb = 1024**3 if sys.platform == 'darwin' else 1024**2

    logging.info("Running with %s total workers.", MPI_SIZE)

    logging.info(
        "Loading trajectories [%s::%s]; selection == '%s' w/ "
        "subsampling %s", MPI_RANK, MPI_SIZE, args.selection, args.subsample)

    with timed("load_as_striped took %.2f sec", logging.info):
        global_lengths, my_xyz = mpi.io.load_as_striped(
            filenames=args.trajectories,
            top=top,
            atom_indices=atom_ids,
            stride=args.subsample,
            processes=args.processes)

    # Replace the loaded array with a fresh copy and drop the original.
    with timed("Turned over array in %.2f min", logging.info):
        xyz = my_xyz.copy()
        del my_xyz
        my_xyz = xyz

    logging.info("Loaded %s frames in %s trjs (%.2fG).", len(my_xyz),
                 len(args.trajectories) // MPI_SIZE,
                 my_xyz.data.nbytes / 1024**3)

    trjs = md.Trajectory(my_xyz, topology=top.subset(atom_ids))

    logging.info(
        "Beginning kcenters clustering with memory footprint of %.2fG "
        "RAM (coords are %.2fG; total RAM is %.2fG)",
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / maxrss_per_gb,
        trjs.xyz.nbytes / 1024**3,
        psutil.virtual_memory().total / 1024**3)

    if len(args.cluster_radii) > 1:
        raise NotImplementedError(
            "Multiple cluster radii are not yet supported")

    tick = time.perf_counter()
    local_dists, local_assigs, local_ctr_inds = kcenters_mpi(
        trjs, md.rmsd, dist_cutoff=args.cluster_radii[0])
    tock = time.perf_counter()

    logging.info(
        "Finished kcenters clustering using %.2fG RAM (coords are "
        "%.2fG) in %.2f min.",
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / maxrss_per_gb,
        trjs.xyz.nbytes / 1024**3, (tock - tick) / 60)

    for i in range(args.kmedoids_iters):
        with timed("KMedoids iteration {i} took %.2f sec".format(i=i),
                   logging.info):
            local_ctr_inds, local_dists, local_assigs = _kmedoids_pam_update(
                X=trjs,
                metric=md.rmsd,
                medoid_inds=local_ctr_inds,
                assignments=local_assigs,
                distances=local_dists,
                random_state=args.random_state)

    # These calls run on every rank, before the rank-0-only output section.
    with timed("Reassembled dist and assign arrays in %.2f sec", logging.info):
        all_dists = mpi.ops.assemble_striped_ragged_array(
            local_dists, global_lengths)
        all_assigs = mpi.ops.assemble_striped_ragged_array(
            local_assigs, global_lengths)

        ctr_inds = mpi.ops.convert_local_indices(local_ctr_inds,
                                                 global_lengths)
        ctr_inds = partition_indices(ctr_inds, global_lengths)

    if MPI_RANK == 0:
        logging.info("Dumping center indices to %s", args.center_indices)

        # Frame indices are multiplied by the stride before being written.
        with open(args.center_indices, 'wb') as f:
            pickle.dump([(trj, frame * args.subsample)
                         for trj, frame in ctr_inds], f)

        if args.distances:
            ra.save(args.distances,
                    ra.RaggedArray(all_dists, lengths=global_lengths))
        if args.assignments:
            ra.save(args.assignments,
                    ra.RaggedArray(all_assigs, lengths=global_lengths))

        # Reuse the topology loaded at the top instead of re-reading the
        # topology file from disk.
        centers = load_frames(args.trajectories,
                              ctr_inds,
                              stride=args.subsample,
                              top=top)

        with open(args.center_structures, 'wb') as f:
            pickle.dump(centers, f)
        logging.info("Wrote %s centers to %s", len(centers),
                     args.center_structures)

    return 0
Beispiel #6
0
def main(argv=None):
    '''Run the RMSD clustering app.

    Loads the trajectories named on the command line, clusters them with
    ``args.Clusterer`` using ``md.rmsd``, pickles the cluster-center
    structures to ``args.centers``, and writes distances and assignments.

    Distances and assignments are written as follows:

    * ``args.subsample == 1``: the clustering result is saved directly.
    * otherwise, unless ``args.no_reassign`` is set: all frames are
      reassigned to the centers and those results are saved.
    * otherwise: no distances or assignments are written.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments, forwarded to ``process_command_line``.

    Returns
    -------
    int
        Always ``0``.
    '''
    args = process_command_line(argv)

    targets = {
        os.path.basename(topf): "%s xtcs" % len(trjfs)
        for topf, trjfs in zip(args.topologies, args.trajectories)
    }
    logger.info("Beginning RMSD Clustering app. Operating on targets:\n%s",
                json.dumps(targets, indent=4))

    tick = time.perf_counter()
    lengths, xyz, select_top = load(args.topologies,
                                    args.trajectories,
                                    selections=args.atoms,
                                    stride=args.subsample,
                                    processes=args.processes)

    logger.info(
        "Loading finished in %.1f s. Clustering using %s atoms matching '%s'.",
        time.perf_counter() - tick, xyz.shape[1], args.atoms)

    clustering = args.Clusterer(metric=md.rmsd,
                                n_clusters=args.n_clusters,
                                cluster_radius=args.rmsd_cutoff)

    # md.rmsd requires an md.Trajectory object, so wrap `xyz` in
    # the topology.
    clustering.fit(md.Trajectory(xyz=xyz, topology=select_top))

    logger.info("Clustered %s frames into %s clusters in %s seconds.",
                sum(lengths), len(clustering.centers_), clustering.runtime_)

    result = clustering.result_.partition(lengths)

    outdir = os.path.dirname(args.centers)
    logger.info("Saving cluster centers at %s", outdir)

    # dirname is '' when args.centers is a bare filename; makedirs('')
    # raises, so only create the directory when there is one to create.
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    centers = load_asymm_frames(result.center_indices, args.trajectories,
                                args.topologies, args.subsample)
    with open(args.centers, 'wb') as f:
        pickle.dump(centers, f)

    if args.subsample == 1:
        logger.debug("Subsampling was 1, not reassigning.")
        ra.save(args.distances, result.distances)
        ra.save(args.assignments, result.assignments)
    elif not args.no_reassign:
        # Only a subsample was clustered, so assign every frame to the
        # centers that were found.
        logger.debug("Reassigning data from subsampling of %s", args.subsample)
        assig, dist = reassign(args.topologies,
                               args.trajectories,
                               args.atoms,
                               centers=result.centers)

        ra.save(args.distances, dist)
        ra.save(args.assignments, assig)
    else:
        logger.debug("Got --no-reassign, not doing reassigment")

    logger.info("Success! Data can be found in %s.",
                os.path.dirname(args.distances))

    return 0
Beispiel #7
0
def entry_point():
    """Load trajectories, cluster them with k-centers, then fit an MSM.

    The three stages run unconditionally, in order:

    1. Load every ``./trajectories/*.xtc`` file, restricted to the atoms
       matching ``"backbone and resid 72 to 87"`` in ``prot_masses.pdb``.
    2. Cluster with ``cluster.KCenters`` and write assignments, distances,
       center structures and center indices under ``./data``.
    3. Re-read the assignments, fit an ``MSM``, and write its transition
       counts, transition probabilities and equilibrium populations.
    """
    # ---- stage 1: loading -------------------------------------------------
    xtc_paths = np.sort(
        list(map(os.path.abspath, glob.glob("./trajectories/*.xtc"))))

    print("obtained filenames!")

    # Atom selection used for clustering.
    selection_struct = md.load("prot_masses.pdb")
    atom_ids = selection_struct.topology.select(
        "backbone and resid 72 to 87")

    # Reference structure used as the topology while loading.
    reference = md.load("./prot_masses.pdb")

    print("about to load!!")
    sliced_reference = reference.atom_slice(atom_ids)
    trj_lengths, coords = load_as_concatenated(
        filenames=xtc_paths,
        atom_indices=atom_ids,
        processes=48,
        top=reference)
    trjs_sub = md.Trajectory(xyz=coords, topology=sliced_reference.top)
    del coords

    # ---- stage 2: clustering ----------------------------------------------
    max_clusters = 10000
    radius = 0.01
    kcenters = cluster.KCenters(
        metric=md.rmsd, cluster_radius=radius, n_clusters=max_clusters)
    kcenters.fit(trjs_sub)

    partitioned = kcenters.result_.partition(trj_lengths)
    _ctr_inds, part_distances, part_assignments, _ctrs = partitioned

    ra.save("./data/assignments.h5", part_assignments)
    ra.save("./data/distances.h5", part_distances)

    center_frames = trjs_sub[kcenters.center_indices_]
    center_frames.save_xtc("./data/centers.xtc")
    np.save("./data/center_indices.npy", kcenters.center_indices_)

    print("Done clustering!")

    # ---- stage 3: MSM -----------------------------------------------------
    lag = 10
    saved_assignments = ra.load("./data/assignments.h5")
    observed_states = np.unique(np.concatenate(saved_assignments))

    # Prior pseudocount is one over the number of observed states.
    pseudocount = 1 / observed_states.shape[0]
    normalizer = partial(builders.normalize, prior_counts=pseudocount)

    model = MSM(lag_time=lag, method=normalizer)
    model.fit(saved_assignments)

    np.save("./data/tcounts.npy", model.tcounts_)
    np.save("./data/tprobs.npy", model.tprobs_)
    np.save("./data/populations.npy", model.eq_probs_)

    print("Done MSM!")