Example 1
def get_running_jobs():
    """Finds jobs that are currently running"""
    # pre-declare so the except block below can log whatever was captured
    # before the failure instead of raising a NameError of its own
    squeue_output = job_listing_information = running_jobs = None
    try:
        squeue_output = tools.run_commands('squeue')[0]
        # drop the trailing empty string produced by the final newline
        job_listing_information = squeue_output.split("\n")[:-1]
        # first whitespace-delimited field of each squeue line is the job id
        running_jobs = ra.RaggedArray(
            [s.split() for s in job_listing_information])[:, 0]
        if running_jobs[0] != 'JOBID':
            raise UnexpectedResult('slurm queue wrapper failed to parse jobs!')
        else:
            # strip the header row, keeping only the job ids
            running_jobs = running_jobs[1:]
    except Exception:
        logger.log("an error has occurred while finding jobs...")
        logger.log("for error checking purposes: ")
        logger.log(squeue_output)
        logger.log(job_listing_information)
        logger.log(running_jobs)
        raise UnexpectedResult('slurm queue wrapper failed to parse jobs!')
    return np.array(running_jobs)
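
The interesting move in this example is wrapping the split squeue lines in a RaggedArray and slicing column 0, which extracts the first whitespace-delimited field (the JOBID column) even though the rows have different numbers of fields. A minimal, self-contained sketch of the same idea, assuming enspara is installed and its ragged-array module is imported as ra; the squeue text below is made up for illustration:

import numpy as np
from enspara import ra

# made-up squeue output: a header row plus two job rows with
# differing field counts, mimicking what 'squeue' prints
squeue_output = (
    "JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)\n"
    "12345 gpu       md1  abc  R  1:02 1     node01\n"
    "12346 cpu       md2  abc  PD 0:00 1     (Priority) extra\n"
)

rows = [line.split() for line in squeue_output.split("\n")[:-1]]
first_column = ra.RaggedArray(rows)[:, 0]   # first field of every row

assert first_column[0] == 'JOBID'
job_ids = np.array(first_column[1:])        # drop the header row
print(job_ids)                              # ['12345' '12346']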
Example 2
def nonzero_mi_ra():
    # ragged-array counterpart of the nonzero_mi_np fixture: the same data,
    # viewed as four trajectories of 1000, 2000, 5000 and 2000 frames
    data, n_states = nonzero_mi_np()
    a = ra.RaggedArray(array=data[0], lengths=[1000, 2000, 5000, 2000])
    return a, n_states
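
This fixture leans on the RaggedArray constructor's ability to present one flat array as several rows of the given lengths. A small illustrative sketch under the same assumption that enspara is importable as ra; the random data and shapes here are arbitrary:

import numpy as np
from enspara import ra

flat = np.random.rand(10000, 5)            # 10000 frames, 5 features each
a = ra.RaggedArray(array=flat, lengths=[1000, 2000, 5000, 2000])

# each top-level index is one "trajectory" of the stated length
print(len(a[0]), len(a[1]), len(a[2]), len(a[3]))   # 1000 2000 5000 2000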
Example 3
def main(argv=None):
    '''Run the driver script for this module. This code only runs if we're
    being run as a script. Otherwise, it's silent and just exposes methods.'''
    args = process_command_line(argv)

    top = md.load(args.topology).top
    atom_ids = top.select(args.selection)

    logging.info("Running with %s total workers.", MPI_SIZE)

    logging.info(
        "Loading trajectories [%s::%s]; selection == '%s' w/ "
        "subsampling %s", MPI_RANK, MPI_SIZE, args.selection, args.subsample)

    with timed("load_as_concatenated took %.2f sec", logging.info):
        global_lengths, my_xyz = mpi.io.load_as_striped(
            filenames=args.trajectories,
            top=top,
            atom_indices=atom_ids,
            stride=args.subsample,
            processes=args.processes)

    with timed("Turned over array in %.2f min", logging.info):
        xyz = my_xyz.copy()
        del my_xyz
        my_xyz = xyz

    logging.info("Loaded %s frames in %s trjs (%.2fG).", len(my_xyz),
                 len(args.trajectories) // MPI_SIZE,
                 my_xyz.data.nbytes / 1024**3)

    trjs = md.Trajectory(my_xyz, topology=top.subset(atom_ids))

    logging.info(
        "Beginning kcenters clustering with memory footprint of %.2fG "
        "RAM (coords are %.2fG; total system memory is %.2fG)",
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
        trjs.xyz.nbytes / 1024**3,
        psutil.virtual_memory().total / 1024**3)

    if len(args.cluster_radii) > 1:
        raise NotImplementedError(
            "Multiple cluster radii are not yet supported")

    tick = time.perf_counter()
    local_dists, local_assigs, local_ctr_inds = kcenters_mpi(
        trjs, md.rmsd, dist_cutoff=args.cluster_radii[0])
    tock = time.perf_counter()

    logging.info(
        "Finished kcenters clustering using %.2fG RAM (coords are "
        "%.2fG) in %.2f min.",
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024**2,
        trjs.xyz.nbytes / 1024**3, (tock - tick) / 60)

    for i in range(args.kmedoids_iters):
        with timed("KMedoids iteration {i} took %.2f sec".format(i=i),
                   logging.info):
            local_ctr_inds, local_dists, local_assigs = _kmedoids_pam_update(
                X=trjs,
                metric=md.rmsd,
                medoid_inds=local_ctr_inds,
                assignments=local_assigs,
                distances=local_dists,
                random_state=args.random_state)

    with timed("Reassembled dist and assign arrays in %.2f sec", logging.info):
        all_dists = mpi.ops.assemble_striped_ragged_array(
            local_dists, global_lengths)
        all_assigs = mpi.ops.assemble_striped_ragged_array(
            local_assigs, global_lengths)

        ctr_inds = mpi.ops.convert_local_indices(local_ctr_inds,
                                                 global_lengths)
        ctr_inds = partition_indices(ctr_inds, global_lengths)

    if MPI_RANK == 0:
        logging.info("Dumping center indices to %s", args.center_indices)

        with open(args.center_indices, 'wb') as f:
            pickle.dump([(trj, frame * args.subsample)
                         for trj, frame in ctr_inds], f)

        if args.distances:
            ra.save(args.distances,
                    ra.RaggedArray(all_dists, lengths=global_lengths))
        if args.assignments:
            ra.save(args.assignments,
                    ra.RaggedArray(all_assigs, lengths=global_lengths))

        centers = load_frames(args.trajectories,
                              ctr_inds,
                              stride=args.subsample,
                              top=md.load(args.topology).top)

        with open(args.center_structures, 'wb') as f:
            pickle.dump(centers, f)
        logging.info("Wrote %s centers to %s", len(centers),
                     args.center_structures)

    return 0
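
Several steps in this driver are wrapped in timed(...), a context manager that takes a printf-style message and a logging callable and reports the elapsed wall time when the block exits. The helper itself is not shown in this example; the following is only a stand-in sketch that matches how it is called above, not enspara's actual implementation (and it always reports seconds, regardless of the unit named in the message):

import time
import logging
from contextlib import contextmanager

@contextmanager
def timed(msg, logger_func):
    """Log `msg % elapsed_seconds` via `logger_func` when the block exits."""
    tick = time.perf_counter()
    try:
        yield
    finally:
        logger_func(msg, time.perf_counter() - tick)

# usage mirroring the driver script above
logging.basicConfig(level=logging.INFO)
with timed("load took %.2f sec", logging.info):
    time.sleep(0.1)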
Example 4
def reassign(topologies, trajectories, atoms, centers, frac_mem=0.5):
    """Reassign a set of trajectories based on a subset of atoms and centers.

    Parameters
    ----------
    topologies : list
        List of topologies corresponding to the trajectories to be
        reassigned.
    trajectories : list of lists
        List of lists of trajectories to be loaded in batches and
        reassigned.
    atoms : list
        List of MDTraj atom query strings. Each string is applied to the
        corresponding topology to choose which atoms will be used for
        the reassignment.
    centers : md.Trajectory or list of trajectories
        The atoms representing the centers to reassign to.
    frac_mem : float, default=0.5
        The fraction of main RAM to use for trajectories. A lower number
        will mean more batches.
    """

    n_procs = enspara.util.parallel.auto_nprocs()

    # check input validity
    if len(topologies) != len(trajectories):
        raise enspara.exception.ImproperlyConfigured(
            "Number of topologies (%s) didn't match number of sets of "
            "trajectories (%s)." % (len(topologies), len(trajectories)))
    if len(topologies) != len(atoms):
        raise enspara.exception.ImproperlyConfigured(
            "Number of topologies (%s) didn't match number of atom selection "
            "strings (%s)." % (len(topologies), len(atoms)))

    # iteration across md.Trajectory is insanely slow. Do it only once here.
    if isinstance(centers, md.Trajectory):
        tick = time.perf_counter()
        logger.info('Centers are an md.Trajectory. Creating trj-list to '
                    'avoid repeated iteration.')
        # using in-place copies to reduce memory usage (and for speed)
        centers = [centers.slice(i, copy=False) for i in range(len(centers))]
        logger.info('Built trj list in %.1f seconds.',
                    time.perf_counter() - tick)

    # precenter centers (there will be many RMSD calcs here)
    for c in centers:
        c.center_coordinates()

    with timed("Reassignment took %.1f seconds.", logger.info):
        # build flat list of targets
        targets = []
        for topfile, trjfiles, atom_sel in zip(
                topologies, trajectories, atoms):
            t = md.load(topfile).top
            atom_ids = t.select(atom_sel)
            for trjfile in trjfiles:
                assert os.path.exists(trjfile)
                targets.append((trjfile, t, atom_ids))

        # determine trajectory length
        tick_sounding = time.perf_counter()
        logger.info("Sounding dataset of %s trajectories and %s topologies.",
                    sum(len(t) for t in trajectories), len(topologies))

        lengths = Parallel(n_jobs=n_procs)(
            delayed(sound_trajectory)(f) for f, _, _ in targets)

        logger.info("Sounded %s trajectories with %s frames (median length "
                    "%i frames) in %.1f seconds.",
                    len(lengths), sum(lengths), np.median(lengths),
                    time.perf_counter() - tick_sounding)

        assignments, distances = batch_reassign(
            targets, centers, lengths, frac_mem=frac_mem, n_procs=n_procs)

    if all(len(assignments[0]) == len(a) for a in assignments):
        logger.info("Trajectory lengths are homogeneous. Output will "
                    "be np.ndarrays.")
        assert all(len(distances[0]) == len(d) for d in distances)
        return np.array(assignments), np.array(distances)
    else:
        logger.info("Trajectory lengths are heterogeneous. Output will "
                    "be ra.RaggedArrays.")
        return ra.RaggedArray(assignments), ra.RaggedArray(distances)
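
The closing branch chooses the return type from the trajectory lengths: plain ndarrays when every trajectory has the same number of frames, RaggedArrays otherwise. A condensed sketch of that convention, using a hypothetical helper name (pack_results) and assuming enspara is importable as ra:

import numpy as np
from enspara import ra

def pack_results(assignments, distances):
    """Return ndarrays when all rows match in length, RaggedArrays otherwise."""
    if all(len(assignments[0]) == len(a) for a in assignments):
        return np.array(assignments), np.array(distances)
    return ra.RaggedArray(assignments), ra.RaggedArray(distances)

# homogeneous lengths -> plain ndarrays
a, d = pack_results([[0, 1], [1, 1]], [[0.1, 0.2], [0.3, 0.4]])
print(type(a).__name__)     # ndarray

# heterogeneous lengths -> ragged arrays
a, d = pack_results([[0, 1, 2], [1]], [[0.1, 0.2, 0.3], [0.4]])
print(type(a).__name__)     # RaggedArray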