Example #1
    def featurize(self,
                  mols,
                  parallel=False,
                  client_kwargs=None,
                  view_flags=None,
                  verbosity=None,
                  log_every_n=1000):
        """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
        if self.conformers and isinstance(mols, types.GeneratorType):
            mols = list(mols)
        assert verbosity in [None, "low", "high"]

        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill so bound methods can be serialized
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(self._featurize, mols, block=False)
            features = call.get()

            # get output from engines
            call.display_outputs()

        else:
            features = []
            for i, mol in enumerate(mols):
                if verbosity is not None and i % log_every_n == 0:
                    log("Featurizing %d / %d" % (i, len(mols)))
                if mol is not None:
                    features.append(self._featurize(mol))
                else:
                    features.append(np.array([]))

        if self.conformers:
            features = self.conformer_container(mols, features)
        else:
            features = np.asarray(features)
        return features
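
A minimal usage sketch for this method (the featurizer subclass and SMILES strings are illustrative, not from the source); parallel featurization assumes an ipcluster is already running, e.g. started with "ipcluster start -n 4":

# Hypothetical usage; CircularFingerprint stands in for any concrete
# subclass of this featurizer that implements _featurize.
from rdkit import Chem

mols = [Chem.MolFromSmiles(smiles) for smiles in ['CCO', 'c1ccccc1']]
featurizer = CircularFingerprint()  # hypothetical concrete featurizer
features = featurizer.featurize(mols, parallel=False, verbosity='low')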
Example #2
  def featurize(self, mols, parallel=False, client_kwargs=None,
                view_flags=None, verbosity=None, log_every_n=1000):
    """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
    if self.conformers and isinstance(mols, types.GeneratorType):
      mols = list(mols)
    assert verbosity in [None, "low", "high"]

    if parallel:
      from IPython.parallel import Client

      if client_kwargs is None:
          client_kwargs = {}
      if view_flags is None:
          view_flags = {}
      client = Client(**client_kwargs)
      client.direct_view().use_dill()  # use dill so bound methods can be serialized
      view = client.load_balanced_view()
      view.set_flags(**view_flags)
      call = view.map(self._featurize, mols, block=False)
      features = call.get()

      # get output from engines
      call.display_outputs()

    else:
      features = []
      for i, mol in enumerate(mols):
        if verbosity is not None and i % log_every_n == 0:
          log("Featurizing %d / %d" % (i, len(mols)))
        if mol is not None:
          features.append(self._featurize(mol))
        else:
          features.append(np.array([]))

    if self.conformers:
      features = self.conformer_container(mols, features)
    else:
      features = np.asarray(features)
    return features
Example #3
    def featurize(self,
                  mols,
                  parallel=False,
                  client_kwargs=None,
                  view_flags=None):
        """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
        if self.conformers and isinstance(mols, types.GeneratorType):
            mols = list(mols)

        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill so bound methods can be serialized
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(self._featurize, mols, block=False)
            features = call.get()

            # get output from engines
            call.display_outputs()

        else:
            features = [self._featurize(mol) for mol in mols]

        if self.conformers:
            features = self.conformer_container(mols, features)
        else:
            features = np.asarray(features)
        return features
Example #4
def get_map(cluster_id=None):
    """
    Get the proper mapping function.

    Parameters
    ----------
    cluster_id : str, optional
        IPython.parallel cluster ID.
    """
    if cluster_id is not None:
        from IPython.parallel import Client  # deferred import
        client = Client(cluster_id=cluster_id)
        client.direct_view().use_dill()
        view = client.load_balanced_view()
        return view.map_sync
    else:
        return map
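
A hedged usage sketch (the worker function is hypothetical): get_map lets the same driver code run serially or on a cluster, depending on whether a cluster ID is supplied.

def square(x):  # hypothetical worker
    return x * x

mapper = get_map()  # no cluster_id: falls back to the builtin map
results = list(mapper(square, range(10)))
# mapper = get_map(cluster_id='my-cluster')  # assumes that ipcluster is running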
Example #5
    def __init__(self, config_filename=None, profile=None, seed=None, sshkey=None, packer='json'):
        """Initialize a IPClusterEngine

        Do IPython.parallel operations to set up cluster and generate mapper.

        """
        super(IPClusterEngine, self).__init__(seed=seed)
        rc = Client(config_filename, profile=profile, sshkey=sshkey, packer=packer)
        # FIXME: add a warning if the environment in the direct view is not
        #        'empty'? Otherwise, this might become dependent on an object
        #        created in the environment of a prior run.
        dview = rc.direct_view()
        lview = rc.load_balanced_view()
        with dview.sync_imports(local=True):
            import crosscat
        mapper = lambda f, tuples: self.lview.map(f, *tuples)
        # if you're trying to debug issues, consider clearing to start fresh
        # rc.clear(block=True)
        #
        self.rc = rc
        self.dview = dview
        self.lview = lview
        self.mapper = mapper
        self.do_initialize = None
        self.do_analyze = None
        return
Example #6
import logging
import os

import pandas as pd
from IPython.parallel import Client

def remove_duplicates(df):
    logging.info('Removing duplicates.')

    image_names = df.image_name.unique()

    def process_image_name(image_name):
        data = df[df.image_name == image_name]
        data = remove_duplicates_from_image_name_data(data)
        data.to_hdf(get_temp_fname(image_name), 'df')

    # parallel approach; you need to launch an ipcluster/controller for this to work!
    c = Client()
    dview = c.direct_view()
    dview.push({'remove_duplicates_from_image_name_data':
                remove_duplicates_from_image_name_data,
                'data_root': data_root})
    lbview = c.load_balanced_view()
    lbview.map_sync(process_image_name, image_names)

    df = []
    for image_name in image_names:
        try:
            df.append(pd.read_hdf(get_temp_fname(image_name), 'df'))
        except OSError:
            continue
        else:
            os.remove(get_temp_fname(image_name))
    df = pd.concat(df, ignore_index=True)
    logging.info('Duplicates removal complete.')
    return df
Example #7
import argparse
import logging
import os
import sys
import time

from IPython.parallel import Client

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('db_fname',
                        help="Provide the filename of the HDF database "
                             "file here.")
    args = parser.parse_args()

    image_names = get_image_names_from_db(args.db_fname)
    logging.info('Found {} image_names'.format(len(image_names)))

    c = Client()
    dview = c.direct_view()
    lbview = c.load_balanced_view()

    dview.push({'do_clustering': do_clustering,
                'dbfile': args.db_fname})
    results = lbview.map_async(process_image_name, image_names)
    dirname = os.path.join(os.environ['HOME'], 'data/planet4/catalog_2_and_3')
    while not results.ready():
        print("{:.1f} %".format(100 * results.progress / len(image_names)))
        sys.stdout.flush()
        time.sleep(10)
    for res in results.result:
        print(res)
    logging.info('Catalog production done. Results in {}.'.format(dirname))
Example #8
    def featurize(self, mols, parallel=False, client_kwargs=None,
                  view_flags=None):
        """
        Calculate features for molecules.

        Parameters
        ----------
        mols : iterable
            RDKit Mol objects.
        parallel : bool, optional
            Whether to featurize molecules in parallel using
            IPython.parallel (default False).
        client_kwargs : dict, optional
            Keyword arguments for IPython.parallel Client.
        view_flags : dict, optional
            Flags for IPython.parallel LoadBalancedView.
        """
        if self.conformers and isinstance(mols, types.GeneratorType):
            mols = list(mols)

        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill so bound methods can be serialized
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(self._featurize, mols, block=False)
            features = call.get()

            # get output from engines
            call.display_outputs()

        else:
            features = [self._featurize(mol) for mol in mols]

        if self.conformers:
            features = self.conformer_container(mols, features)
        else:
            features = np.asarray(features)
        return features
Example #9
    def featurize(self,
                  mols,
                  parallel=False,
                  client_kwargs=None,
                  view_flags=None):
        """
        Calculate features for molecules.

        Parameters
        ----------
        mols : iterable
            RDKit Mol objects.
        parallel : bool, optional (default False)
            Featurize molecules in parallel using IPython.parallel.
        client_kwargs : dict, optional
            Keyword arguments for IPython.parallel Client.
        view_flags : dict, optional
            Flags for IPython.parallel LoadBalancedView.
        """
        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill so bound methods can be serialized
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(self._featurize,
                            np.array_split(mols, len(client.direct_view())),
                            block=False)
            features = call.get()
            features = np.concatenate(features)

            # get output from engines
            call.display_outputs()

        else:
            features = self._featurize(mols)

        return np.asarray(features)
Example #10
File: dragon.py  Project: rbharath/vs-utils
    def featurize(self, mols, parallel=False, client_kwargs=None,
                  view_flags=None):
        """
        Calculate features for molecules.

        Parameters
        ----------
        mols : iterable
            RDKit Mol objects.
        parallel : bool, optional (default False)
            Featurize molecules in parallel using IPython.parallel.
        client_kwargs : dict, optional
            Keyword arguments for IPython.parallel Client.
        view_flags : dict, optional
            Flags for IPython.parallel LoadBalancedView.
        """
        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill so bound methods can be serialized
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(
                self._featurize,
                np.array_split(mols, len(client.direct_view())), block=False)
            features = call.get()
            features = np.concatenate(features)

            # get output from engines
            call.display_outputs()

        else:
            features = self._featurize(mols)

        return np.asarray(features)
Example #11
import sys
import time

import pexpect

class IPythonParallelMap(object):
    """ Class to handle the creation and management of cluster
    resources, typically on IRP, through IPython's parallel
    implementation. """
    def __init__(self, nodes, irp=True, debug=False):
        """ if SSH, Open a connection to IRP and start the IPCluster
        daemon """
        self.nodes = nodes
        self.irp = irp
        if self.irp:
            self.child = pexpect.spawn('ssh [email protected]')
            if debug: self.child.logfile = sys.stdout
            time.sleep(0.2)
            self.child.sendline('cd /home/psj/Documents/IPClusterLogs')
            self.child.sendline('ipcluster start --profile=pbs -n ' +
                                str(nodes) + ' --daemonize')
        else:
            self.child = pexpect.spawn('ipcluster start -n ' + str(nodes) +
                                       ' --daemonize')

    def close(self):
        """ Close the IPCluster, delete jobs, and logout of SSH """
        if self.irp: self.child.sendline('ipcluster stop --profile=pbs')
        else: self.child.sendline('ipcluster stop')
        time.sleep(0.5)
        if self.irp: self.child.sendline('qdel all')
        time.sleep(0.1)
        self.child.sendline('logout')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.close()

    def connect_client(self):
        """ Connect the current client to the running engine """
        from IPython.parallel import Client

        if self.irp: self.client = Client(profile='pbs')
        else: self.client = Client()

        assert len(self.client.ids) == self.nodes
        self.lview = self.client.load_balanced_view()
        self.dview = self.client.direct_view()

    def __call__(self, *args, **kwargs):
        """ Map function call to parallel view """
        results = self.lview.map(*args, balanced=True, **kwargs)
        return results.get()
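
A hedged usage sketch (the node count and worker function are illustrative): used as a context manager, the cluster is shut down even if the mapped work raises. In practice the engines may need extra time to register before the assertion in connect_client passes.

def cube(x):  # hypothetical worker
    return x ** 3

with IPythonParallelMap(nodes=4, irp=False) as pool:
    time.sleep(30)         # crude wait for the engines to come up
    pool.connect_client()  # builds the load-balanced and direct views
    results = pool(cube, range(32))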
Example #12
from time import sleep

from IPython.parallel import Client

def get_client_and_view(verbose=True):
    c = None
    v = None
    while c is None:
        try:
            # profile_minspan_dir is defined elsewhere in the module
            c = Client(profile_dir=profile_minspan_dir)
        except Exception:
            sleep(5)
    while len(c.ids) == 0:
        if verbose:
            print "waiting for connections"
        sleep(5)
    while v is None:
        try:
            v = c.direct_view()
        except Exception, e:
            print e
            sleep(5)
    return c, v
Example #14
def remove_duplicates_from_file(dbname, do_odo=False):
    logging.info('Removing duplicates.')

    image_names = get_image_names(dbname)

    def process_image_name(image_name):
        import pandas as pd
        data = pd.read_hdf(dbname, 'df', where='image_name==' + image_name)
        data = remove_duplicates_from_image_name_data(data)
        data.to_hdf(get_temp_fname(image_name), 'df')

    # parallel approach; you need to launch an ipcluster/controller for this to work!
    c = Client()
    dview = c.direct_view()
    dview.push({'remove_duplicates_from_image_name_data':
                remove_duplicates_from_image_name_data,
                'data_root': data_root,
                'get_temp_fname': get_temp_fname})
    lbview = c.load_balanced_view()
    logging.info('Starting parallel processing.')
    lbview.map_sync(process_image_name, image_names)
    logging.info('Done clean up. Now concatenating results.')

    merge_temp_files(dbname, image_names, do_odo)
Example #15
#####################
### This Analysis ###
#####################
hconst = 50.  # assume a surface layer of constant depth (50 m)
anal_name = 'hires'

##############################
## Set Up Parallel Engines ###
##############################

# give engines time to load
time.sleep(20)

c = Client()
dview = c.direct_view()
lview = c.load_balanced_view()

with dview.sync_imports():
    import numpy
    from watermasstools import pop_model, transformation

#####################################
## Define Regions for Calculation ###
#####################################

# where to find the data
ddir = '/glade/scratch/enewsom/HR_ANALYSIS/DATA/HRfiles/'
fprefix = 'HRC06.br.pop.h1'
fnames = []
for year in xrange(147,168):
Example #16
from __future__ import division
from IPython.parallel import Client
from IPython.parallel.util import interactive

# TODO may be able to use IPython.parallel.Reference as in
# http://minrk.github.com/scipy-tutorial-2011/basic_remote.html

# the ipcluster should be set up before this file is imported
c = Client()
dv = c.direct_view()
dv.execute('import pyhsmm')
lbv = c.load_balanced_view()

# these dicts need to be populated by hand before calling build_states*,
# both locally (in this module) and in the ipython top-level module on every
# engine
# the second dict only needs to be used when calling build_states_changepoints
alldata = {}
allchangepoints = {}

# these functions are run on the engines, and expect the alldata (and
# allchangepoints) global(s), as well as the current model hsmm_subhmms_model,
# to be present in the ipython global frame
@lbv.parallel(block=True)
@interactive
def build_states(data_id):
    global global_model
    global alldata

    # adding the data to the pushed global model will build a substates object
    # and resample the states given the parameters in the model
Example #17
from IPython.parallel import Client
from numpy import array, savez, percentile, nan

from arch.compat.python import range, lmap

# Time in seconds to sleep before checking if ready
SLEEP = 10
# Number of repetitions
EX_NUM = 500
# Number of simulations per exercise
EX_SIZE = 200000
# Approximately controls memory use, in MiB
MAX_MEMORY_SIZE = 100

rc = Client()
dview = rc.direct_view()
with dview.sync_imports():
    from numpy import ones, vstack, arange, cumsum, sum, dot, zeros
    from numpy.random import RandomState, seed, random_integers
    from numpy.linalg import pinv


def clear_cache(client, view):
    """Cache-clearing function from mailing list"""
    assert not client.outstanding, "don't clear history when tasks are outstanding"
    client.purge_results('all')  # clears controller
    client.results.clear()
    client.metadata.clear()
    view.results.clear()
    client.history = []
    view.history = []
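
A hedged sketch of how clear_cache might be called between simulation batches (the mapped work is a placeholder) so the controller and client do not accumulate result state in memory:

# Hypothetical batch loop; rc and dview come from the setup above.
for batch in range(EX_NUM):
    out = dview.map_sync(lambda x: x + 1, range(1000))  # placeholder work
    clear_cache(rc, dview)  # safe here: map_sync leaves nothing outstanding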
Example #18
import os.path as op
import shutil
import time
from itertools import repeat

import numpy as np
from joblib import Memory
from IPython.parallel import Client
from six import print_  # assumed source of print_; any py2/3 print shim works

class ParallelCache(object):
    def __init__(self, cluster_profile=None, cachedir=None, purge=False,
                 idle_timeout=None, shutdown=False, retries=3, poll_interval=10,
                 verbose=5, **kwargs):
        self._purge = purge
        self._idle_timeout = idle_timeout
        self._shutdown = shutdown
        self._retries = retries
        self._poll_interval = poll_interval
        self._verbose = verbose
        self._execution_times = None
        if cluster_profile is not None:
            self._ip_client = Client(profile=cluster_profile, **kwargs)
        else:
            self._ip_client = None

        if cachedir is not None:
            self._memory = Memory(cachedir=cachedir, verbose=verbose)
        else:
            self._memory = None

    def map(self, f, *sequences, **kwargs):
        # make sure all sequences have the same length
        n_jobs = None
        my_seqs = []
        for ii, seq in enumerate(sequences):
            try:
                this_n_elems = len(seq)
                if n_jobs is None:
                    n_jobs = this_n_elems
                if this_n_elems != n_jobs:
                    raise ValueError('All sequences must have the same length; '
                                     'sequence at position %d has length %d'
                                     % (ii + 1, this_n_elems))
                my_seqs.append(seq)
            except TypeError:
                # we allow passing ints etc, convert them to a sequence
                my_seqs.append(repeat(seq))

        t_start = time.time()
        if self._ip_client is None:
            if self._verbose >= 1:
                tmp = 'without' if self._memory is None else 'with'
                print_('Running %d jobs locally %s caching..' % (n_jobs, tmp))
            out = list()
            my_fun = f if self._memory is None else self._memory.cache(f)
            for this_args in zip(*my_seqs):
                out.append(my_fun(*this_args, **kwargs))
        elif self._ip_client is not None and self._memory is None:
            if self._verbose >= 1:
                print('Running %d jobs on cluster without caching..' % n_jobs)
            out = [None] * n_jobs
            lbview = self._ip_client.load_balanced_view()
            tasks = list()
            for this_args in zip(*my_seqs):
                tasks.append(lbview.apply(f, *this_args, **kwargs))
            # wait for tasks to complete
            result_retrieved = [False] * len(tasks)
            execution_times = [None] * len(tasks)
            retry_no = np.zeros(len(tasks), dtype=np.int)
            last_print = 0
            last_idle_check = time.time()
            idle_times = {}
            while True:
                for ii, task in enumerate(tasks):
                    if not result_retrieved[ii] and task.ready():
                        if task.successful():
                            out[ii] = task.get()
                            execution_times[ii] = task.serial_time
                            result_retrieved[ii] = True
                        else:
                            # task failed for some reason, re-run it
                            if retry_no[ii] < self._retries:
                                if self._verbose > 3:
                                    print ('\nTask %d failed, re-running (%d / %d)'
                                           % (ii, retry_no[ii] + 1,
                                              self._retries))
                                this_args = zip(*my_seqs)[ii]
                                new_task = lbview.apply(f, *this_args, **kwargs)
                                tasks[ii] = new_task
                                retry_no[ii] += 1
                            else:
                                msg = ('\nTask %d failed %d times. Stopping'
                                       % (ii, self._retries + 1))
                                print msg
                                # this will throw an exception
                                task.get()
                                raise RuntimeError(msg)
                        if self._purge:
                            _purge_results(self._ip_client, task)

                n_completed = int(np.sum(result_retrieved))
                progress = n_completed / float(n_jobs)
                # print progress in 10% increments
                this_print = int(np.floor(progress * 10))
                if self._verbose >= 1 and this_print != last_print:
                    print_(' %d%%' % (100 * progress), end='')
                    last_print = this_print
                if n_completed == n_jobs:
                    # we are done!
                    print_('')  # newline
                    break
                if self._idle_timeout is not None and time.time() > last_idle_check + 30:
                    now = time.time()
                    queue = self._ip_client.queue_status()
                    shutdown_eids = []
                    for eid in self._ip_client.ids:
                        if eid not in queue:
                            continue
                        if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                            # engine is idle
                            idle_time = idle_times.get(eid, None)
                            if idle_time is None:
                                # mark engine as idle
                                idle_times[eid] = now
                                continue
                            if now - idle_time > self._idle_timeout:
                                # shut down engine
                                shutdown_eids.append(eid)
                        elif eid in idle_times:
                            # engine has started running again
                            del idle_times[eid]

                    if len(shutdown_eids) > 0:
                        if self._verbose > 0:
                            print 'Shutting down engines: ', shutdown_eids
                        dv = self._ip_client.direct_view(shutdown_eids)
                        dv.shutdown()
                        for eid in shutdown_eids:
                            del idle_times[eid]
                    last_idle_check = now
                time.sleep(self._poll_interval)

            self._execution_times = execution_times
            if self._shutdown:
                self._shutdown_cluster()

        elif self._ip_client is not None and self._memory is not None:
            # now this is the interesting case..
            if self._verbose >= 1:
                print('Running %d jobs on cluster with caching..' % n_jobs)
            f_cache = self._memory.cache(f)
            lbview = None
            out = [None] * n_jobs
            execution_times = [None] * n_jobs
            task_info = list()

            n_cache = 0
            for ii, this_args in enumerate(zip(*my_seqs)):
                # get the cache directory
                out_dir, _ = f_cache.get_output_dir(*this_args, **kwargs)
                if op.exists(op.join(out_dir, 'output.pkl')):
                    out[ii] = f_cache.load_output(out_dir)
                    n_cache += 1
                    continue
                if lbview is None:
                    lbview = self._ip_client.load_balanced_view()
                task = lbview.apply(f, *this_args, **kwargs)
                task_info.append(dict(task=task, idx=ii, args=this_args))
            if self._verbose >= 1:
                print_('Loaded %d results from cache' % n_cache)

            # wait for tasks to complete
            last_print = 0
            last_idle_check = time.time()
            idle_times = {}
            result_retrieved = [False] * len(task_info)
            retry_no = np.zeros(len(task_info), dtype=np.int)
            failed_tasks = []
            while len(task_info) > 0:
                for ii, ti in enumerate(task_info):
                    if not result_retrieved[ii] and ti['task'].ready():
                        task = ti['task']
                        if task.successful():
                            this_out = task.get()
                            # cache the input and output
                            out_dir, _ = f_cache.get_output_dir(*ti['args'],
                                                                **kwargs)
                            f_cache._persist_output(this_out, out_dir)
                            f_cache._persist_input(out_dir, *ti['args'], **kwargs)
                            # insert result into output
                            out[ti['idx']] = this_out
                            execution_times[ti['idx']] = task.serial_time
                            result_retrieved[ii] = True
                        else:
                            if retry_no[ii] < self._retries:
                                if self._verbose > 3:
                                    print ('\nTask %d failed, re-running (%d / %d)'
                                           % (ii, retry_no[ii] + 1,
                                              self._retries))
                                new_task = lbview.apply(f, *ti['args'], **kwargs)
                                ti['task'] = new_task
                                retry_no[ii] += 1
                            else:
                                # task failed too many times, mark it as done
                                # but keep running
                                if self._verbose >= 1:
                                    print ('\nTask %d failed %d times.'
                                           % (ii, self._retries + 1))
                                failed_tasks.append(task)
                                result_retrieved[ii] = True

                    if self._purge:
                        _purge_results(self._ip_client, task)

                if self._idle_timeout is not None and time.time() > last_idle_check + 30:
                    now = time.time()
                    queue = self._ip_client.queue_status()
                    shutdown_eids = []
                    for eid in self._ip_client.ids:
                        if eid not in queue:
                            continue
                        if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                            # engine is idle
                            idle_time = idle_times.get(eid, None)
                            if idle_time is None:
                                # mark engine as idle
                                idle_times[eid] = now
                                continue
                            if now - idle_time > self._idle_timeout:
                                # shut down engine
                                shutdown_eids.append(eid)
                        elif eid in idle_times:
                            # engine has started running again
                            del idle_times[eid]

                    if len(shutdown_eids) > 0:
                        if self._verbose > 0:
                            print 'Shutting down engines: ', shutdown_eids
                        dv = self._ip_client.direct_view(shutdown_eids)
                        dv.shutdown()
                        for eid in shutdown_eids:
                            del idle_times[eid]

                        last_idle_check = now

                n_completed = int(np.sum(result_retrieved))
                progress = n_completed / float(n_jobs - n_cache)
                # print progress in 10% increments
                this_print = int(np.floor(progress * 10))
                if self._verbose >= 1 and this_print != last_print:
                    print_(' %d%% ' % (100 * progress), end='')
                    last_print = this_print
                if n_completed == n_jobs - n_cache:
                    # we are done!
                    print_('')  # newline
                    break
                time.sleep(self._poll_interval)

            if self._shutdown:
                self._shutdown_cluster()

            if len(failed_tasks) > 0:
                msg = ''
                for task in failed_tasks[:5]:
                    try:
                        task.get()
                    except Exception as e:
                        msg += str(e)
                raise RuntimeError('%d tasks failed:\n %s'
                                   % (len(failed_tasks), msg))

            self._execution_times = execution_times
        else:
            raise RuntimeError('Inconsistent client/cache configuration')

        if self._verbose >= 1:
            print_('Done (%0.1f seconds)' % (time.time() - t_start))

        return out

    def get_last_excecution_times(self):
        return self._execution_times

    def purge_results(self, f, *sequences, **kwargs):
        # make sure all sequences have the same length
        n_jobs = None
        my_seqs = []
        for ii, seq in enumerate(sequences):
            try:
                this_n_elems = len(seq)
                if n_jobs is None:
                    n_jobs = this_n_elems
                if this_n_elems != n_jobs:
                raise ValueError('All sequences must have the same length; '
                                 'sequence at position %d has length %d'
                                 % (ii + 1, this_n_elems))
                my_seqs.append(seq)
            except TypeError:
                # we allow passing ints etc, convert them to a sequence
                my_seqs.append(repeat(seq))

        f_cache = self._memory.cache(f)
        n_deleted = 0
        for this_args in zip(*my_seqs):
            out_dir, _ = f_cache.get_output_dir(*this_args, **kwargs)
            if op.exists(out_dir):
                shutil.rmtree(out_dir)
                n_deleted += 1
        print 'Purging cache: %d out of %d deleted' % (n_deleted, n_jobs)

    def _shutdown_cluster(self):
        # shut down all idle engines
        queue = self._ip_client.queue_status()
        shutdown_eids = []
        for eid in self._ip_client.ids:
            if eid not in queue:
                continue
            if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                shutdown_eids.append(eid)
        if len(shutdown_eids) > 0:
            if self._verbose > 0:
                print 'Shutting down engines: ', shutdown_eids
            dv = self._ip_client.direct_view(shutdown_eids)
            dv.shutdown()
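
A hedged usage sketch of ParallelCache (the function and cache directory are hypothetical): with cluster_profile=None the work runs locally, optionally cached through joblib.

def heavy(x, power=2):  # hypothetical expensive function
    return x ** power

pcache = ParallelCache(cluster_profile=None, cachedir='/tmp/pcache')
out = pcache.map(heavy, range(8), power=3)  # runs locally, with caching
# pcache = ParallelCache(cluster_profile='default')  # assumes a running ipcluster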
Example #19
"""
Start with `ipcluster`
"""
#------------------------------------------------------------
# Initialize the connection to the client
#------------------------------------------------------------
# Also look at StarCluster to use EC2 instances
#------------------------------------------------------------
from IPython.parallel import Client
client = Client()
queue = client.direct_view()
print "available workers: ", len(queue)

#------------------------------------------------------------
# Do some work in parallel
#------------------------------------------------------------
squared = queue.map_sync(lambda x: x**2, [1,2,3,4])
print squared
Example #20
def wmt_rho(aname,
            ddir,
            fprefix,
            years,
            pref=0,
            hconst=50.,
            task='calc_transformation_rates',
            monthly_mean=False,
            fsuffix=''):
    """Perform water mass analysis on a specific POP model run.
    aname - (string) the nickname of this specific analysis,
                used when saving data
        ddir - the directory where the run lives
        fprefix - the string that begins the file names
            (e.g. hybrid_v5_rel04_BC5_ne120_t12_pop62.pop.h.nday1)
        years - a list of years to analyze
        fsuffix - a trailing suffix (before .nc)
        pref - reference pressure for analysis
        hconst - depth of assumed surface layer
        monthly_mean - whether to calculate the
            transformation rate on the monthly mean instead of
            of the daily snapshots
    """

    ##############################
    ## Set Up Parallel Engines ###
    ##############################

    # give engines time to load
    time.sleep(20)

    c = Client()
    dview = c.direct_view()
    lview = c.load_balanced_view()

    with dview.sync_imports():
        import numpy
        from watermasstools import pop_model, transformation

    #####################################
    ## Define Regions for Calculation ###
    #####################################

    fnames = []
    for year in years:
        for month in xrange(1, 13):
            fname = '%s/%s.%04d-%02d%s.nc' % (ddir, fprefix, year, month,
                                              fsuffix)
            fnames.append(fname)
    for f in fnames:
        print f

    # load a test file
    p = pop_model.POPFile(fnames[0], pref=pref)

    # define basins
    if pref == 0:
        natl = transformation.WaterMassRegion(
            basin_names=['Atlantic Ocean', 'GIN Seas', 'Labrador Sea'],
            latmin=15)
        natl.initialize_mask(p)
        natl.calculate_rholevs(rhomin=1022,
                               rhomax=1028.5,
                               nlevs=120,
                               linear=True)

        npac = transformation.WaterMassRegion(basin_names=['Pacific Ocean'],
                                              latmin=15)
        npac.initialize_mask(p)
        npac.calculate_rholevs(rhomin=1020,
                               rhomax=1027,
                               nlevs=120,
                               linear=True)

        so = transformation.WaterMassRegion(basin_names=['Southern Ocean'],
                                            latmax=30)
        so.initialize_mask(p)
        so.calculate_rholevs(rhomin=1022, rhomax=1030, nlevs=120, linear=True)

        globe = transformation.WaterMassRegion()
        globe.initialize_mask(p)
        globe.calculate_rholevs(rhomin=1018,
                                rhomax=1030,
                                nlevs=120,
                                linear=True)
    elif pref == 2000:
        natl = transformation.WaterMassRegion(
            basin_names=['Atlantic Ocean', 'GIN Seas', 'Labrador Sea'],
            latmin=15)
        natl.initialize_mask(p)
        natl.calculate_rholevs(rhomin=1028,
                               rhomax=1037.8,
                               nlevs=120,
                               linear=True)

        npac = transformation.WaterMassRegion(basin_names=['Pacific Ocean'],
                                              latmin=15)
        npac.initialize_mask(p)
        npac.calculate_rholevs(rhomin=1027,
                               rhomax=1036.5,
                               nlevs=120,
                               linear=True)

        so = transformation.WaterMassRegion(basin_names=['Southern Ocean'],
                                            latmax=30)
        so.initialize_mask(p)
        so.calculate_rholevs(rhomin=1030,
                             rhomax=1037.8,
                             nlevs=120,
                             linear=True)

        globe = transformation.WaterMassRegion()
        globe.initialize_mask(p)
        globe.calculate_rholevs(rhomin=1026,
                                rhomax=1037.8,
                                nlevs=120,
                                linear=True)
    else:
        raise ValueError('Invalid pressure level %g specified' % pref)

    region_dict = {'natl': natl, 'npac': npac, 'so': so, 'globe': globe}

    # push to engines
    dview.push(dict(hconst=hconst, pref=pref, monthly_mean=monthly_mean))
    dview.push(region_dict)
    dview.execute(
        "region_dict = {'natl': natl, 'npac': npac, 'so': so, 'globe': globe}")
    # check
    dview.execute('a = region_dict.keys()[0]')
    a = dview.gather('a')
    for r in a.get():
        assert r == 'npac'

    #######################
    ## Apply on Engines ###
    #######################

    if task == 'calc_transformation_rates':
        mapfunc = calc_transformation_rates
        prefix = 'WMT'
    elif task == 'calc_Fd':
        mapfunc = calc_Fd
        prefix = 'FD'

    res = lview.map(mapfunc, fnames)

    while not res.ready():
        print 'progress %3.2f%%' % (100 * res.progress / float(len(res)))
        time.sleep(60)

    assert res.successful()

    ###################
    ## Save Results ###
    ###################

    all_res = dict()
    for k in region_dict:
        all_res[k] = []
    for r in res:
        for k in all_res:
            all_res[k].append(r[k])
    for k in all_res:
        all_res[k] = numpy.array(all_res[k])
        numpy.savez('../data/%s_%s_sigma%1d_hconst%03d_%s.npz' %
                    (prefix, aname, pref / 1000, hconst, k),
                    A=all_res[k],
                    rholevs=region_dict[k].rholevs)
Example #21
import os
import sys

from IPython.parallel import Client

# (because we do not "install" the llc module)
sys.path.append('..')
from llc import llc_model

base_dir = os.path.join(os.environ['LLC'], 'llc_1080')
LLC = llc_model.LLCModel1080(
#base_dir = os.path.join(os.environ['LLC'], 'llc_4320')
#LLC = llc_worker.LLCModel4320(
        data_dir = os.path.join(base_dir, 'run_day732_896'),
        grid_dir = os.path.join(base_dir, 'grid'))

# set this to True to use parallel execution
if False:
    # connect to ipcluster server
    c = Client(profile='default')
    dview = c.direct_view()
    lbv = c.load_balanced_view()
    mapfunc = lbv.map_async
else:
    # just use serial execution
    mapfunc = map
    
# this is where the work gets done
def work_on_tile(tile):
    # need to reimport the modules
    # there must be a cleaner way to do this, but I don't know how
    try:
        from llc import llc_model
    except ImportError:
        sys.path.append('..')
        from llc import llc_model
Example #22
File: parallel.py  Project: kmsquire/pyhsmm
from __future__ import division
from IPython.parallel import Client
from IPython.parallel.util import interactive

# NOTE: the ipcluster should be set up before this file is imported

c = Client()
dv = c.direct_view()
dv.execute('import pyhsmm')
lbv = c.load_balanced_view()

# this dict needs to be populated by hand before calling build_states*, both
# locally (in this module) and in the ipython top-level module on every engine
# NOTE: the data should probably be arrays with dtype=np.float64
alldata = {}


# this function is run on the engines, and expects the alldata global as well as
# the current model global_model to be present in the ipython global frame
@lbv.parallel(block=True)
@interactive
def build_states(data_id):
    global global_model
    global alldata

    # adding the data to the pushed global model will build a substates object
    # and resample the states given the parameters in the model
    global_model.add_data(alldata[data_id], initialize_from_prior=False)
    stateseq = global_model.states_list[-1].stateseq
    global_model.states_list = []
    return stateseq
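
A hedged driver sketch (construction of the model is elided; the data arrays are placeholders): after pushing the fitted model and the data dict to the engines, the decorated function can be mapped over data ids. With @lbv.parallel(block=True), .map distributes the calls across engines and blocks for the results.

import numpy as np

alldata[0] = np.random.randn(100, 2)  # placeholder observation sequences
alldata[1] = np.random.randn(100, 2)
dv.push(dict(alldata=alldata, global_model=global_model))  # global_model built elsewhere
stateseqs = build_states.map(list(alldata.keys()))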
Example #23
        raise ValueError("Cannot gather results from {0}".format(outfile))
    results = NumpyCache(outfile)
    brd = np.broadcast(pointing_indices, ndays, gmags, template_indices)
    results = np.array([results.get_row(key) for key in brd])
    return results.reshape(brd.shape + results.shape[-1:])


if __name__ == '__main__':
    parallel = True

    if parallel:
        # Need some imports on the engine
        from IPython.parallel import Client
        client = Client()

        dview = client.direct_view()
        with dview.sync_imports():
            from gatspy.periodic import (LombScargleMultiband,
                                         LombScargleMultibandFast,
                                         SuperSmootherMultiband)
    else:
        client = None

    template_indices = np.arange(2 * 23).reshape(2, 23).T
    pointing_indices = np.arange(1, 24)[:, None]
    ndays = np.array([90, 180, 365, 2 * 365, 5 * 365])[:, None, None]
    gmags = np.array([20, 21, 22, 23, 24.5])[:, None, None, None]

    kwargs = dict(pointing_indices=pointing_indices,
                  ndays=ndays,
                  gmags=gmags,
Example #24
    results = NumpyCache(outfile)
    brd = np.broadcast(pointing_indices, ndays, gmags, template_indices)
    results = np.array([results.get_row(key) for key in brd])
    return results.reshape(brd.shape + results.shape[-1:])


if __name__ == "__main__":
    parallel = True

    if parallel:
        # Need some imports on the engine
        from IPython.parallel import Client

        client = Client()

        dview = client.direct_view()
        with dview.sync_imports():
            from gatspy.periodic import LombScargleMultiband, LombScargleMultibandFast, SuperSmootherMultiband
    else:
        client = None

    template_indices = np.arange(2 * 23).reshape(2, 23).T
    pointing_indices = np.arange(1, 24)[:, None]
    ndays = np.array([90, 180, 365, 2 * 365, 5 * 365])[:, None, None]
    gmags = np.array([20, 21, 22, 23, 24.5])[:, None, None, None]

    kwargs = dict(
        pointing_indices=pointing_indices,
        ndays=ndays,
        gmags=gmags,
        template_indices=template_indices,
Example #25
DIR = os.getcwd()
OUTPUT_DIR = DIR + '/' + 'output/'
TMP_DIR = '/da/dmp/cb/clayie1/tmp/seed_graph/'
if not os.path.exists(OUTPUT_DIR):
    print('Making ' + OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR)
print('Output in ' + OUTPUT_DIR)
# PROXY = os.environ.get('http_proxy', 'http://eu-chbs-PROXY.eu.novartis.net:2011/')
PROXY = os.environ.get('http_proxy', 'http://nibr-proxy.global.nibr.novartis.net:2011/')

# <codecell>

# parallel support
from IPython.parallel import Client, error
cluster = Client() # default profile
dview = cluster.direct_view() # direct access/control (including push/pull)
lbview = cluster.load_balanced_view() # load balanced view for running jobs
cluster.ids
# nb: MPI clusters must be initiated - see: http://nbviewer.ipython.org/github/ipython/ipython/blob/master/examples/Parallel%20Computing/Using%20MPI%20with%20IPython%20Parallel.ipynb

# test
dview.execute('import os')
%px print("testing... pid: " + str(os.getpid()))

# <markdowncell>

# ## Collect UTR sequences from Biomart

# <markdowncell>

# ### Set up query