Example #1
def load_trjs_or_features(args):

    if args.features:
        with timed("Loading features took %.1f s.", logger.info):
            lengths, data = load_features(args.features, stride=args.subsample)
    else:
        assert args.trajectories
        assert len(args.trajectories) == len(args.topologies)

        targets = {
            os.path.basename(topf): "%s files" % len(trjfs)
            for topf, trjfs in zip(args.topologies, args.trajectories)
        }
        logger.info("Beginning clustering; targets:\n%s",
                    json.dumps(targets, indent=4))

        with timed("Loading trajectories took %.1f s.", logger.info):
            lengths, xyz, select_top = load_trajectories(
                args.topologies,
                args.trajectories,
                selections=args.atoms,
                stride=args.subsample,
                processes=auto_nprocs())

        logger.info("Clustering using %s atoms matching '%s'.", xyz.shape[1],
                    args.atoms)

        # md.rmsd requires an md.Trajectory object, so wrap `xyz` in
        # the topology.
        data = md.Trajectory(xyz=xyz, topology=select_top)

    return lengths, data
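
A minimal invocation sketch for the loader above. The argparse attribute names mirror those referenced in the function; the file paths, selection string, and stride are placeholders, and the module-level imports from Example #1 are assumed to be in place.

from argparse import Namespace

# Hypothetical arguments; paths and the atom selection are placeholders.
args = Namespace(
    features=None,                      # set to ['features.h5'] to take the feature branch
    trajectories=[['traj-0.xtc', 'traj-1.xtc']],
    topologies=['protein.pdb'],
    atoms='name CA',
    subsample=10)

lengths, data = load_trjs_or_features(args)
# `lengths` holds per-trajectory frame counts; `data` is an md.Trajectory
# here, or a feature array when args.features is given.
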
Example #2
def main(argv=None):

    args = process_command_line(argv)

    # note that in MPI mode, lengths will be global, whereas data will
    # be local (i.e. only this node's data).
    lengths, data = load_trjs_or_features(args)

    kwargs = {}
    if args.cluster_iterations is not None:
        kwargs['kmedoids_updates'] = int(args.cluster_iterations)

    clustering = args.Clusterer(metric=args.cluster_distance,
                                n_clusters=args.cluster_number,
                                cluster_radius=args.cluster_radius,
                                mpi_mode=mpi_mode,
                                **kwargs)

    clustering.fit(data)
    # release the RAM held by the trajectories (we don't need it anymore)
    del data

    logger.info("Clustered %s frames into %s clusters in %s seconds.",
                sum(lengths), len(clustering.centers_), clustering.runtime_)

    result = clustering.result_
    if mpi_mode:
        local_ctr_inds, local_dists, local_assigs = \
            result.center_indices, result.distances, result.assignments

        with timed("Reassembled dist and assign arrays in %.2f sec",
                   logger.info):
            all_dists = mpi.ops.assemble_striped_ragged_array(
                local_dists, lengths)
            all_assigs = mpi.ops.assemble_striped_ragged_array(
                local_assigs, lengths)
            ctr_inds = mpi.ops.convert_local_indices(local_ctr_inds, lengths)

        result = ClusterResult(center_indices=ctr_inds,
                               distances=all_dists,
                               assignments=all_assigs,
                               centers=result.centers)
    result = result.partition(lengths)

    if mpi.rank() == 0:
        with timed("Wrote center indices in %.2f sec.", logger.info):
            write_centers_indices(args.center_indices,
                                  [(t, f * args.subsample)
                                   for t, f in result.center_indices])
        with timed("Wrote center structures in %.2f sec.", logger.info):
            write_centers(result, args)
        write_assignments_and_distances_with_reassign(result, args)

    mpi.comm.barrier()

    logger.info("Success! Data can be found in %s.",
                os.path.dirname(args.distances))

    return 0
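
A conceptual, pure-numpy sketch of the "striped" layout that the reassembly step above undoes: each MPI rank holds every n-th trajectory, and reassembly interleaves the local pieces back into global trajectory order. This is an illustration of the idea only, not enspara's mpi.ops implementation.

import numpy as np

# Three trajectories striped across two ranks (rank r holds trajectories
# r, r + n_ranks, ...), then reassembled into global order.
lengths = [4, 2, 3]                  # global per-trajectory frame counts
n_ranks = 2
rank0 = [np.zeros(4), np.zeros(3)]   # trajectories 0 and 2
rank1 = [np.ones(2)]                 # trajectory 1

reassembled = [None] * len(lengths)
for rank, local in enumerate([rank0, rank1]):
    for j, arr in enumerate(local):
        reassembled[rank + j * n_ranks] = arr

print([len(a) for a in reassembled])  # [4, 2, 3] -- original order restored
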
Example #3
def load(topologies, trajectories, selections, stride, processes):

    for top, selection in zip(topologies, selections):
        sentinel_trj = md.load(top)
        try:
            # no-op, but fails fast if the selection doesn't match (args.atoms)
            sentinel_trj.top.select(selection)
        except Exception:
            raise exception.ImproperlyConfigured(
                ("The provided selection '{s}' didn't match the topology "
                 "file, {t}").format(s=selection, t=top))

    flat_trjs = []
    configs = []
    n_inds = None

    for topfile, trjset, selection in zip(topologies, trajectories,
                                          selections):
        top = md.load(topfile).top
        indices = top.select(selection)

        if n_inds is not None:
            if n_inds != len(indices):
                raise exception.ImproperlyConfigured(
                    ("Selection on topology %s selected %s atoms, but "
                     "other selections selected %s atoms.") %
                    (topfile, len(indices), n_inds))
        n_inds = len(indices)

        for trj in trjset:
            flat_trjs.append(trj)
            configs.append({
                'top': top,
                'stride': stride,
                'atom_indices': indices,
            })

    logger.info(
        "Loading %s trajectories with %s atoms using %s processes "
        "(subsampling %s)", len(flat_trjs), len(top.select(selection)),
        processes, stride)
    assert len(top.select(selection)) > 0, "No atoms selected for clustering"

    with timed("Loading took %.1f sec", logger.info):
        lengths, xyz = load_as_concatenated(flat_trjs,
                                            args=configs,
                                            processes=processes)

    with timed("Turned over array in %.2f min", logging.info):
        tmp_xyz = xyz.copy()
        del xyz
        xyz = tmp_xyz

    logger.info("Loaded %s frames.", len(xyz))

    return lengths, xyz, top.subset(top.select(selection))
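
A hypothetical call sketch for the loader above; the paths and the selection string are placeholders.

lengths, xyz, select_top = load(
    topologies=['protein.pdb'],
    trajectories=[['traj-0.xtc', 'traj-1.xtc']],
    selections=['name CA'],
    stride=10,
    processes=4)
# xyz has shape (n_frames, n_selected_atoms, 3); select_top is the topology
# restricted to the selected atoms.
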
Example #4
def load_features(features, stride):
    if len(features) == 1:
        with timed("Loading features took %.1f s.", logger.info):
            try:
                data = ra.load(features[0])
            except tables.exceptions.NoSuchNodeError:
                data = ra.load(features[0], keys=...)

        lengths = data.lengths
        data = data._data
    else:  # and len(features) > 1
        with timed("Loading features took %.1f s.", logger.info):
            lengths, data = mpi.io.load_npy_as_striped(features, stride)

        with timed("Turned over array in %.2f min", logger.info):
            tmp_data = data.copy()
            del data
            data = tmp_data

    return lengths, data
Example #5
def load_features(features, stride):
    try:
        if len(features) == 1:
            with timed("Loading features took %.1f s.", logger.info):
                lengths, data = mpi.io.load_h5_as_striped(features[0], stride)

        else:  # and len(features) > 1
            with timed("Loading features took %.1f s.", logger.info):
                lengths, data = mpi.io.load_npy_as_striped(features, stride)

        with timed("Turned over array in %.2f min", logger.info):
            tmp_data = data.copy()
            del data
            data = tmp_data
    except MemoryError:
        logger.error(
            "Ran out of memory trying to allocate features array"
            " from file %s", features[0])
        raise

    logger.info("Loaded %s trajectories with %s frames with stride %s.",
                len(lengths), len(data), stride)

    return lengths, data
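
A small sketch of the lengths/data pairing that load_features returns: a flat feature array plus per-trajectory lengths can be wrapped as a ragged array, as the examples above do with ra.RaggedArray. The import path is an assumption; the constructor call mirrors its use in Example #7.

import numpy as np
from enspara import ra   # import path assumed

lengths = [5, 3]
data = np.random.rand(sum(lengths), 10)     # 8 frames x 10 features

features = ra.RaggedArray(data, lengths=lengths)
print(features.lengths)                     # [5, 3]
print(features[0].shape)                    # (5, 10): first trajectory's features
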
Example #6
def main(argv=None):
    """Run the driver script for this module. This code only runs if we're
    being run as a script. Otherwise, it's silent and just exposes methods.
    """
    args = process_command_line(argv)

    trj_list = load_trajs(args)

    with timed("Calculating CARDS correlations took %.1f s.", logger.info):
        ss_mi, dd_mi, sd_mi, ds_mi, inds = cards(trj_list, args.buffer_size,
                                                 args.processes)

    logger.info("Completed correlations. ")

    save_cards(ss_mi, dd_mi, sd_mi, ds_mi, args.matrices)
    np.savetxt(args.indices, inds, delimiter=",")

    logger.info("Saved dihedral indices as %s", args.indices)

    return 0
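
A brief sketch of reading back the dihedral index table written above with np.savetxt; the file name is a placeholder.

import numpy as np

# np.savetxt writes floats, so cast back to int for use as indices.
inds = np.loadtxt('indices.csv', delimiter=",").astype(int)
print(inds.shape)
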
Example #7
def main(argv=None):
    '''Run the driver script for this module. This code only runs if we're
    being run as a script. Otherwise, it's silent and just exposes methods.'''
    args = process_command_line(argv)

    top = md.load(args.topology).top
    atom_ids = top.select(args.selection)

    logging.info("Running with %s total workers.", MPI_SIZE)

    logging.info(
        "Loading trajectories [%s::%s]; selection == '%s' w/ "
        "subsampling %s", MPI_RANK, MPI_SIZE, args.selection, args.subsample)

    with timed("load_as_concatenated took %.2f sec", logging.info):
        global_lengths, my_xyz = mpi.io.load_as_striped(
            filenames=args.trajectories,
            top=top,
            atom_indices=atom_ids,
            stride=args.subsample,
            processes=args.processes)

    with timed("Turned over array in %.2f min", logging.info):
        xyz = my_xyz.copy()
        del my_xyz
        my_xyz = xyz

    logging.info("Loaded %s frames in %s trjs (%.2fG).", len(my_xyz),
                 len(args.trajectories) // MPI_SIZE,
                 my_xyz.data.nbytes / 1024**3)

    trjs = md.Trajectory(my_xyz, topology=top.subset(atom_ids))

    logging.info(
        "Beginning kcenters clustering with memory footprint of %.2fG "
        "RAM (coords are %.2fG; total VRAM is %.2fG)",
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
        trjs.xyz.nbytes / 1024**3,
        psutil.virtual_memory().total / 1024**3)

    if len(args.cluster_radii) > 1:
        raise NotImplementedError(
            "Multiple cluster radii are not yet supported")

    tick = time.perf_counter()
    local_dists, local_assigs, local_ctr_inds = kcenters_mpi(
        trjs, md.rmsd, dist_cutoff=args.cluster_radii[0])
    tock = time.perf_counter()

    logging.info(
        "Finished kcenters clustering using %.2fG RAM (coords are "
        "%.2fG) in %.2f min.",
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
        trjs.xyz.nbytes / 1024**3, (tock - tick) / 60)

    for i in range(args.kmedoids_iters):
        with timed("KMedoids iteration {i} took %.2f sec".format(i=i),
                   logging.info):
            local_ctr_inds, local_dists, local_assigs = _kmedoids_pam_update(
                X=trjs,
                metric=md.rmsd,
                medoid_inds=local_ctr_inds,
                assignments=local_assigs,
                distances=local_dists,
                random_state=args.random_state)

    with timed("Reassembled dist and assign arrays in %.2f sec", logging.info):
        all_dists = mpi.ops.assemble_striped_ragged_array(
            local_dists, global_lengths)
        all_assigs = mpi.ops.assemble_striped_ragged_array(
            local_assigs, global_lengths)

        ctr_inds = mpi.ops.convert_local_indices(local_ctr_inds,
                                                 global_lengths)
        ctr_inds = partition_indices(ctr_inds, global_lengths)

    if MPI_RANK == 0:
        logging.info("Dumping center indices to %s", args.center_indices)

        with open(args.center_indices, 'wb') as f:
            pickle.dump([(trj, frame * args.subsample)
                         for trj, frame in ctr_inds], f)

        if args.distances:
            ra.save(args.distances,
                    ra.RaggedArray(all_dists, lengths=global_lengths))
        if args.assignments:
            ra.save(args.assignments,
                    ra.RaggedArray(all_assigs, lengths=global_lengths))

        centers = load_frames(args.trajectories,
                              ctr_inds,
                              stride=args.subsample,
                              top=md.load(args.topology).top)

        with open(args.center_structures, 'wb') as f:
            pickle.dump(centers, f)
        logging.info("Wrote %s centers to %s", len(centers),
                     args.center_structures)

    return 0
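
A conceptual sketch of what partition_indices does above: it maps indices into the concatenated frame array back to (trajectory, frame) pairs using the global lengths. This is an illustration only, not enspara's implementation.

import numpy as np

def partition_indices_sketch(indices, lengths):
    # offsets[k] is the first concatenated index belonging to trajectory k
    offsets = np.cumsum([0] + list(lengths))
    pairs = []
    for idx in indices:
        trj = int(np.searchsorted(offsets, idx, side='right')) - 1
        pairs.append((trj, int(idx - offsets[trj])))
    return pairs

print(partition_indices_sketch([0, 5, 8], lengths=[4, 3, 3]))
# [(0, 0), (1, 1), (2, 1)]
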
Example #8
def reassign(topologies, trajectories, atoms, centers, frac_mem=0.5):
    """Reassign a set of trajectories based on a subset of atoms and centers.

    Parameters
    ----------
    topologies : list
        List of topologies corresponding to the trajectories to be
        reassigned.
    trajectories : list of lists
        List of lists of trajectories to be loaded in batches and
        reassigned.
    atoms : list
        List of MDTraj atom query strings. Each string is applied to the
        corresponding topology to choose which atoms will be used for
        the reassignment.
    centers : md.Trajectory or list of trajectories
        The atoms representing the centers to reassign to.
    frac_mem : float, default=0.5
        The fraction of main RAM to use for trajectories. A lower number
        will mean more batches.
    """

    n_procs = enspara.util.parallel.auto_nprocs()

    # check input validity
    if len(topologies) != len(trajectories):
        raise enspara.exception.ImproperlyConfigured(
            "Number of topologies (%s) didn't match number of sets of "
            "trajectories (%s)." % (len(topologies), len(trajectories)))
    if len(topologies) != len(atoms):
        raise enspara.exception.ImproperlyConfigured(
            "Number of topologies (%s) didn't match number of atom selection "
            "strings (%s)." % (len(topologies), len(atoms)))

    # iteration across md.Trajectory is insanely slow. Do it only once here.
    if isinstance(centers, md.Trajectory):
        tick = time.perf_counter()
        logger.info('Centers are an md.Trajectory. Creating trj-list to '
                    'avoid repeated iteration.')
        # using in-place copies to reduce memory usage (and for speed)
        centers = [centers.slice(i, copy=False) for i in range(len(centers))]
        logger.info('Built trj list in %.1f seconds.',
                    time.perf_counter() - tick)

    # precenter centers (there will be many RMSD calcs here)
    for c in centers:
        c.center_coordinates()

    with timed("Reassignment took %.1f seconds.", logger.info):
        # build flat list of targets
        targets = []
        for topfile, trjfiles, selection in zip(topologies, trajectories,
                                                atoms):
            t = md.load(topfile).top
            atom_ids = t.select(selection)
            for trjfile in trjfiles:
                assert os.path.exists(trjfile)
                targets.append((trjfile, t, atom_ids))

        # determine trajectory length
        tick_sounding = time.perf_counter()
        logger.info("Sounding dataset of %s trajectories and %s topologies.",
                    sum(len(t) for t in trajectories), len(topologies))

        lengths = Parallel(n_jobs=n_procs)(
            delayed(sound_trajectory)(f) for f, _, _ in targets)

        logger.info("Sounded %s trajectories with %s frames (median length "
                    "%i frames) in %.1f seconds.",
                    len(lengths), sum(lengths), np.median(lengths),
                    time.perf_counter() - tick_sounding)

        assignments, distances = batch_reassign(
            targets, centers, lengths, frac_mem=frac_mem, n_procs=n_procs)

    if all([len(assignments[0]) == len(a) for a in assignments]):
        logger.info("Trajectory lengths are homogenous. Output will "
                    "be np.ndarrays.")
        assert all([len(distances[0]) == len(d) for d in distances])
        return np.array(assignments), np.array(distances)
    else:
        logger.info("Trajectory lengths are heterogenous. Output will "
                    "be ra.RaggedArrays.")
        return ra.RaggedArray(assignments), ra.RaggedArray(distances)
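
A hypothetical usage sketch for reassign; the file names and selection strings are placeholders, and `centers` would typically come from an earlier clustering run (Example #7 pickles its centers this way).

import pickle

with open('centers.pickle', 'rb') as f:
    centers = pickle.load(f)          # md.Trajectory or list of trajectories

assignments, distances = reassign(
    topologies=['protein.pdb'],
    trajectories=[['traj-0.xtc', 'traj-1.xtc']],
    atoms=['name CA'],
    centers=centers,
    frac_mem=0.25)
# np.ndarrays when all trajectories have the same length, otherwise
# ra.RaggedArrays (see the docstring and the final check above).
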
Example #9
def batch_reassign(targets, centers, lengths, frac_mem, n_procs=None):

    example_center = centers[0]

    DTYPE_BYTES = 4
    batch_size, batch_gb = determine_batch_size(
        example_center.n_atoms, DTYPE_BYTES, frac_mem)

    logger.info(
        'Batch max size set to %s frames (~%.2f GB, %.1f%% of total RAM).',
        batch_size, batch_gb, frac_mem*100)

    if batch_size < max(lengths):
        raise enspara.exception.ImproperlyConfigured(
            'Batch size of %s was smaller than largest file (size %s).' %
            (batch_size, max(lengths)))

    batches = compute_batches(lengths, batch_size)

    assignments = []
    distances = []

    for i, batch_indices in enumerate(batches):
        tick = time.perf_counter()
        logger.info("Starting batch %s of %s", i+1, len(batches))
        batch_targets = [targets[i] for i in batch_indices]

        with timed("Loaded frames for batch in %.1f seconds", logger.info):
            batch_lengths, xyz = load_as_concatenated(
                [tfile for tfile, top, aids in batch_targets],
                lengths=[lengths[i] for i in batch_indices],
                args=[{'top': top, 'atom_indices': aids}
                      for t, top, aids in batch_targets],
                processes=n_procs)

        # mdtraj loads as float32, and load_as_concatenated should thus
        # also load as float32. This should _never_ be hit, but there might be
        # some platform-specific situation where double != float64?
        assert xyz.dtype.itemsize == DTYPE_BYTES

        trj = md.Trajectory(xyz, topology=example_center.top)

        with timed("Precentered trajectories in %.1f seconds", logger.debug):
            trj.center_coordinates()

        with timed("Assigned trajectories in %.1f seconds", logger.debug):
            batch_assignments, batch_distances = assign_to_nearest_center(
                    trj, centers, partial(md.rmsd, precentered=True))

        # clear memory of xyz and trj to allow cleanup to deallocate
        # these large arrays; may help with memory high-water mark
        with timed("Cleared array from memory in %.1f seconds", logger.debug):
            xyz_size = xyz.size
            del trj, xyz

        assignments.extend(partition_list(batch_assignments, batch_lengths))
        distances.extend(partition_list(batch_distances, batch_lengths))

        logger.info(
            "Finished batch %s of %s in %.1f seconds. Coordinates array had "
            "memory footprint of %.2f GB (of memory high-water mark %.2f/%.2f "
            "GB).",
            i+1, len(batches), time.perf_counter() - tick,
            xyz_size * DTYPE_BYTES / 1024**3,
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
            psutil.virtual_memory().total / 1024**3)

    return assignments, distances
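
A sketch of the arithmetic a helper like determine_batch_size presumably performs: with float32 coordinates, each frame costs n_atoms * 3 * 4 bytes, so the batch size is the allowed fraction of total RAM divided by the per-frame cost. This is an assumption about the helper used above, not its actual implementation.

import psutil

def determine_batch_size_sketch(n_atoms, dtype_bytes, frac_mem):
    # assumed arithmetic; the real determine_batch_size may differ
    frame_bytes = n_atoms * 3 * dtype_bytes           # xyz coords per frame
    budget_bytes = psutil.virtual_memory().total * frac_mem
    batch_size = int(budget_bytes // frame_bytes)     # frames per batch
    batch_gb = batch_size * frame_bytes / 1024**3
    return batch_size, batch_gb

print(determine_batch_size_sketch(n_atoms=5000, dtype_bytes=4, frac_mem=0.5))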