Example 1
def parallel_run():
    """
    Start parallel engines to run
    """
    from IPython.parallel import Client

    c = Client()   # here is where the client establishes the connection
    lv = c.load_balanced_view()   # this object represents the engines (workers)


    rays = []
    maxs=25
    bounding = AABA(xmin=0, ymin=0, zmin=0, xmax=maxs, ymax=maxs, zmax=maxs,)
    gridd = np.zeros((maxs,maxs,maxs))
    # spectrum for red to nir leaves
    red_nir_leaves = spectrum(np.array([0.5, 0.85]), np.array([0.1, 0.6]), np.array([0.5, 0.1]))
    # spectrum for soil
    red_nir_soil = spectrum(np.array([0.5, 0.85]), np.array([0.3, 0.4]), np.array([0.0, 0.0]))


    # scattering setup
    scatt = BRDSF(red_nir_leaves, 0.0)
    lf = leaf(55.0, 0.8) # leaf angle distribution and leaf area density


    tasks = []
    for x in xrange(maxs):
        for y in xrange(maxs):
            tasks.append(lv.apply(prun, x,y, maxs, gridd, scatt, red_nir_soil, bounding, lf))

    results = [task.get() for task in tasks]  # blocks until all results are back

    return results
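For reference, a minimal, self-contained sketch of the same submit-and-gather idiom with the ray-tracing specifics stripped out. It assumes an ipcluster is already running, and slow_square is a hypothetical stand-in for prun:

from IPython.parallel import Client

def slow_square(x):
    return x ** 2

c = Client()                      # connect to the running controller
lv = c.load_balanced_view()
tasks = [lv.apply(slow_square, x) for x in range(10)]   # asynchronous submission
results = [t.get() for t in tasks]                      # block on each result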
Example 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('db_fname',
                        help="Provide the filename of the HDF database "
                             "file here.")
    args = parser.parse_args()

    image_names = get_image_names_from_db(args.db_fname)
    logging.info('Found {} image_names'.format(len(image_names)))

    c = Client()
    dview = c.direct_view()
    lbview = c.load_balanced_view()

    dview.push({'do_clustering': do_clustering,
                'dbfile': args.db_fname})
    results = lbview.map_async(process_image_name, image_names)
    import time
    import sys
    import os
    dirname = os.path.join(os.environ['HOME'], 'data/planet4/catalog_2_and_3')
    while not results.ready():
        print("{:.1f} %".format(100 * results.progress / len(image_names)))
        sys.stdout.flush()
        time.sleep(10)
    for res in results.result:
        print(res)
    logging.info('Catalog production done. Results in {}.'.format(dirname))
Example 3
def remove_duplicates(df):
    logging.info('Removing duplicates.')

    image_names = df.image_name.unique()

    def process_image_name(image_name):
        data = df[df.image_name == image_name]
        data = remove_duplicates_from_image_name_data(data)
        data.to_hdf(get_temp_fname(image_name), 'df')

    # parallel approach: you need to launch an ipcluster/controller for this to work!
    c = Client()
    dview = c.direct_view()
    dview.push({'remove_duplicates_from_image_name_data':
                remove_duplicates_from_image_name_data,
                'data_root': data_root})
    lbview = c.load_balanced_view()
    lbview.map_sync(process_image_name, image_names)

    df = []
    for image_name in image_names:
        try:
            df.append(pd.read_hdf(get_temp_fname(image_name), 'df'))
        except OSError:
            continue
        else:
            os.remove(get_temp_fname(image_name))
    df = pd.concat(df, ignore_index=True)
    logging.info('Duplicates removal complete.')
    return df
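Example 3 depends on a get_temp_fname helper that is not shown. A plausible, purely hypothetical version, assuming data_root (pushed to the engines above) is a scratch directory:

import os

def get_temp_fname(image_name):
    # hypothetical: one temporary HDF5 file per image_name under data_root
    return os.path.join(data_root, '{}.hdf'.format(image_name))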
Example 4
def map(r, func, args=None, modules=None):
    """
	Before you run parallel.map, start your cluster (e.g. ipcluster start -n 4)
	
	map(r,func, args=None, modules=None):
	args=dict(arg0=arg0,...)
	modules='numpy, scipy'    
	
	examples:
	func= lambda x: numpy.random.rand()**2.
	z=parallel.map(r_[0:1000], func, modules='numpy, numpy.random')
	plot(z)
	
	A=ones((1000,1000));
	l=range(0,1000)
	func=lambda x : A[x,l]**2.
	z=parallel.map(r_[0:1000], func, dict(A=A, l=l))
	z=array(z)
	
	"""
    mec = Client()
    mec.clear()
    lview = mec.load_balanced_view()
    for k in mec.ids:
        mec[k].activate()
        if args is not None:
            mec[k].push(args)
        if modules is not None:
            mec[k].execute('import ' + modules)
    z = lview.map(func, r)
    out = z.get()
    return out
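A short usage sketch for the wrapper above, assuming the enclosing module is imported as parallel and the cluster was started first (ipcluster start -n 4):

import numpy
from numpy import r_

squares = parallel.map(r_[0:100], lambda x: x ** 2)
rands = parallel.map(r_[0:100], lambda x: numpy.random.rand(),
                     modules='numpy')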
Example 5
def ipythonMP(m):
    from IPython.parallel import Client

    cli = Client()
    dview = cli[:]
    lbview = cli.load_balanced_view()
    return dview.map_sync(factorize,range(m))
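factorize is not defined in Example 5; any picklable CPU-bound function will do. A simple trial-division version, for illustration only:

def factorize(n):
    # return the prime factors of n, smallest first
    factors = []
    d = 2
    while d * d <= n:
        while n % d == 0:
            factors.append(d)
            n //= d
        d += 1
    if n > 1:
        factors.append(n)
    return factors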
Example 6
    def test_run_from_multiple_files_without_cache_on_ipy_cluster(self):
        try:
            from IPython.parallel import Client
            client = Client()

            pool = client.load_balanced_view()
        except:
            raise unittest.SkipTest("Cluster connection failed")

        models = [self.transport]
        p = Point(self.start_lon, self.start_lat)
        model = IPythonClusterModelController(geometry=p,
                                              depth=self.start_depth,
                                              start=self.start_time,
                                              step=self.time_step,
                                              nstep=self.num_steps,
                                              npart=self.num_particles,
                                              models=models,
                                              use_bathymetry=False,
                                              use_shoreline=False,
                                              pool=pool)

        model.setup_run("/data/lm/tests/pws_das_2014*.nc")
        model.run(output_formats=self.output_formats, output_path=self.output_path)

        self.assertTrue(os.path.exists(os.path.join(self.output_path, "simple_trackline.geojson")))
        self.draw_trackline(os.path.join(self.output_path, "simple_trackline.geojson"))
        # Not a caching controller, no cache path should exist
        self.assertFalse(os.path.exists(self.cache_path))
Example 7
    def test_run_from_multiple_files_without_cache_on_ipy_cluster(self):
        try:
            from IPython.parallel import Client
            client = Client()

            pool = client.load_balanced_view()
        except:
            raise unittest.SkipTest("Cluster connection failed")

        models = [self.transport]
        p = Point(self.start_lon, self.start_lat)
        model = IPythonClusterModelController(geometry=p,
                                              depth=self.start_depth,
                                              start=self.start_time,
                                              step=self.time_step,
                                              nstep=self.num_steps,
                                              npart=self.num_particles,
                                              models=models,
                                              use_bathymetry=False,
                                              use_shoreline=False,
                                              pool=pool)

        model.setup_run("/data/lm/tests/pws_das_2014*.nc")
        model.run(output_formats=self.output_formats,
                  output_path=self.output_path)

        self.assertTrue(
            os.path.exists(
                os.path.join(self.output_path, "simple_trackline.geojson")))
        self.draw_trackline(
            os.path.join(self.output_path, "simple_trackline.geojson"))
        # Not a caching controller, no cache path should exist
        self.assertFalse(os.path.exists(self.cache_path))
Example 8
    def __init__(self, config_filename=None, profile=None, seed=None, sshkey=None, packer='json'):
        """Initialize a IPClusterEngine

        Do IPython.parallel operations to set up cluster and generate mapper.

        """
        super(IPClusterEngine, self).__init__(seed=seed)
        rc = Client(config_filename, profile=profile, sshkey=sshkey, packer=packer)
        # FIXME: add a warning if the environment in the direct view is not
        #        'empty'? Otherwise, results might become dependent on an
        #        object created in the environment by a prior run
        dview = rc.direct_view()
        lview = rc.load_balanced_view()
        with dview.sync_imports(local=True):
            import crosscat
        mapper = lambda f, tuples: self.lview.map(f, *tuples)
        # if you're trying to debug issues, consider clearing to start fresh
        # rc.clear(block=True)
        #
        self.rc = rc
        self.dview = dview
        self.lview = lview
        self.mapper = mapper
        self.do_initialize = None
        self.do_analyze = None
        return
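The mapper closure fans tuples of argument sequences out over the load-balanced view. Hypothetical usage, given an initialized engine instance:

add = lambda x, y: x + y
sums = engine.mapper(add, ([1, 2, 3], [10, 20, 30])).get()   # [11, 22, 33]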
Example 9
def map(r,func, args=None, modules=None):
	"""
	Before you run parallel.map, start your cluster (e.g. ipcluster start -n 4)
	
	map(r,func, args=None, modules=None):
	args=dict(arg0=arg0,...)
	modules='numpy, scipy'    
	
	examples:
	func= lambda x: numpy.random.rand()**2.
	z=parallel.map(r_[0:1000], func, modules='numpy, numpy.random')
	plot(z)
	
	A=ones((1000,1000));
	l=range(0,1000)
	func=lambda x : A[x,l]**2.
	z=parallel.map(r_[0:1000], func, dict(A=A, l=l))
	z=array(z)
	
	"""
	mec = Client()
	mec.clear()
	lview=mec.load_balanced_view()
	for k in mec.ids:
		mec[k].activate()
		if args is not None:
			mec[k].push(args)
		if modules is not None:
			mec[k].execute('import '+modules)
	z=lview.map(func, r)
	out=z.get()
	return out
Example 10
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):

    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    print "\t# nodes in use: {}".format(len(lview.targets or rc.ids))
    lview.block = False

    print "\t# of tasks: {}".format(len(task_list))
    print "\tsubmitting...",
    job = lview.map_async(worker, task_list)
    print "done."
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print "Aborted, all submitted jobs are cancelled."
        else:
            print "Aborted, but your jobs are still running on the cluster."
        return

    if len(job.result) != len(task_list):
        print "WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result), len(task_list))
    print "\ttotal time: {}".format(timesofar(t0))

    if shutdown_ipengines_after_done:
        print "\tshuting down all ipengine nodes...",
        lview.shutdown()
        print 'Done.'
    return job.result
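Example 10 relies on an ask helper for the Ctrl-C prompt that is not shown; a minimal Python 2 sketch of what it might look like:

def ask(prompt, options='YN'):
    # hypothetical: prompt until the user types one of the given letters
    s = ''
    while s not in list(options):
        s = raw_input('{} ({}): '.format(prompt, '/'.join(options))).strip().upper()
    return s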
Example 11
def _init_cluster_and_database(profile=None):
    rc = Client(profile=profile)
    _dview = rc[:]
    _lview = rc.load_balanced_view()

    with _dview.sync_imports():
        import os
        from os.path import join
        import gzip
        import pickle
        import numpy
        from pymongo import MongoClient
        from rpy2.robjects.conversion import ri2py
        from sklearn.base import BaseEstimator
        from survival.base import ExternalREstimatorMixin
        from survival.cross_validation import _fit_and_score
        from survival.metrics import concordance_index_censored
        from survival.meta.ensemble_selection import EnsembleAverage

    _dview.push({
        "mongodb_host": mongodb_host,
        "models_dir": models_dir
    },
                block=True)

    return _dview, _lview
Example 12
def subsample(cache_dir, image_sets, ipython_profile):
    parameters = [(cache_dir, images) for images in image_sets]

    if ipython_profile:
        from IPython.parallel import Client, LoadBalancedView
        client = Client(profile=ipython_profile)
        lview = client.load_balanced_view()
        generator = lview.imap(_compute_group_subsample, parameters)
    elif ipython_profile == False:
        generator = (_compute_group_subsample(p) for p in parameters)
    else:
        from multiprocessing import Pool
        lview = Pool()
        generator = lview.imap(_compute_group_subsample, parameters)
    progress = progressbar.ProgressBar(widgets=['Subsampling:',
                                                progressbar.Percentage(), ' ',
                                                progressbar.Bar(), ' ', 
                                                progressbar.Counter(), '/', 
                                                str(len(parameters)), ' ',
                                                progressbar.ETA()],
                                       maxval=len(parameters))
    results = list(progress(generator))

    subsample = []
    for i, (p, r) in enumerate(zip(parameters, results)):
        if r is None:
            print >>sys.stderr, '#### There was an error, recomputing locally: %s' % parameters[i][1]
            results[i] = _compute_group_subsample(p)  # recompute locally so the exception surfaces
        subsample.extend(r)

    print "the subsampling set contains %d items" % len(subsample)
    return subsample
Example 13
def analyze_log_file_in_phases(file_id, nstates, trials, iter):
    print_n_flush("Starting phase by phase analysis...")
    # id_to_log = lambda x: "logs/%s.exp.log" % x
    filename_log = id_to_log(file_id)
    responses, tests, responses_t, tests_t, images = toCSV(filename_log)
    from IPython.parallel import Client
    #     from functools import partial
    from rpy2 import rinterface
    from rpy2.rinterface import initr

    rinterface.set_initoptions(("--max-ppsize=100000",))
    initr()
    client = Client(profile="default")
    # client[:].push(dict(initr=initr))
    # client[:].apply_sync(lambda: initr())
    lview = client.load_balanced_view()  # default load-balanced view
    lview.block = True
    # func = lambda args: train_hmm_n_times(file_id=args[0], nstates=args[1], trials=args[2], iter=args[3])
    # trials = 4
    client[:].push(dict(train_hmm_once=train_hmm_once))
    # args = [(file_id, nstates, trials, 1000) for nstates in range(5,10)]
    # results = lview.map(func, args)# hmm, d, results = train_hmm_n_times(file_id, nstates, trials=20, iter=1000)
    # pool.join()
    results = {}
    for i in range(3):
        results[i] = train_hmm_n_times(file_id, nstates=nstates, trials=trials, iter=iter, phase=i)
    return results
Example 14
    def main_loop(self,
                  time_budget=None,
                  parallel=False,
                  client_kwargs=None,
                  view_flags=None):
        """
        Run main_loop of each trainer.

        Note: if you get PickleErrors when running in parallel, make sure
        you have `dill` installed.

        Parameters
        ----------
        time_budget : int, optional
            The maximum number of seconds before interrupting
            training. Default is `None`, no time limit.
        parallel : bool, optional
            Whether to train subtrainers in parallel using
            IPython.parallel (default False).
        client_kwargs : dict, optional
            Keyword arguments for IPython.parallel Client.
        view_flags : dict, optional
            Flags for IPython.parallel LoadBalancedView.
        """
        self.setup()
        if parallel:
            from IPython.parallel import Client

            def _train(trainer, time_budget=None):
                """
                Run main_loop of this trainer.

                Parameters
                ----------
                trainer : Train object
                    Train object.
                time_budget : int, optional
                    The maximum number of seconds before interrupting
                    training. Default is `None`, no time limit.
                """
                trainer.main_loop(time_budget)
                return trainer

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(_train,
                            self.trainers[self.skip_folds:], [time_budget] *
                            len(self.trainers[self.skip_folds:]),
                            block=False)
            self.trainers = call.get()
        else:
            for trainer in self.trainers[self.skip_folds:]:
                trainer.main_loop(time_budget)
        self.save()
Example 15
def _test_wrapper_remote(func):
    """Execute a function on a remote ipengine"""
    from IPython.parallel import Client
    from qiita_core.configuration_manager import ConfigurationManager
    config = ConfigurationManager()
    c = Client(profile=config.ipython_default)
    bv = c.load_balanced_view()
    return _ipy_wait(bv.apply_async(func))
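_ipy_wait comes from elsewhere in the package; a minimal version might simply block on the AsyncResult (hypothetical sketch):

def _ipy_wait(async_result):
    # hypothetical: block until the remote call finishes, then return its value
    async_result.wait()
    return async_result.get()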
Example 17
    def featurize(self,
                  mols,
                  parallel=False,
                  client_kwargs=None,
                  view_flags=None,
                  verbosity=None,
                  log_every_n=1000):
        """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
        if self.conformers and isinstance(mols, types.GeneratorType):
            mols = list(mols)
        assert verbosity in [None, "low", "high"]

        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(self._featurize, mols, block=False)
            features = call.get()

            # get output from engines
            call.display_outputs()

        else:
            features = []
            for i, mol in enumerate(mols):
                if verbosity is not None and i % log_every_n == 0:
                    log("Featurizing %d / %d" % (i, len(mols)))
                if mol is not None:
                    features.append(self._featurize(mol))
                else:
                    features.append(np.array([]))

        if self.conformers:
            features = self.conformer_container(mols, features)
        else:
            features = np.asarray(features)
        return features
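Hypothetical usage of featurize, assuming featurizer is an instance of a concrete subclass and RDKit is installed:

from rdkit import Chem

mols = [Chem.MolFromSmiles(smiles) for smiles in ('CCO', 'c1ccccc1')]
features = featurizer.featurize(mols, parallel=True,
                                client_kwargs={'profile': 'default'})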
Example 18
    def compute(cls,
                keys,
                variables,
                function,
                parameters,
                ipython_profile=None,
                group_name=None):
        """
        Compute profiles by applying the parameters to the function in parallel.

        """
        assert len(keys) == len(parameters)
        njobs = len(parameters)
        if isinstance(ipython_profile, LSFView):
            view = ipython_profile
            logger.debug('Running %d jobs on LSF' % view.njobs)
            generator = view.imap(function, parameters)
        elif ipython_profile:
            from IPython.parallel import Client, LoadBalancedView
            client = Client(profile=ipython_profile)
            view = client.load_balanced_view()
            logger.debug('Running %d jobs' % njobs)
            generator = view.imap(function, parameters)
        elif ipython_profile == False:
            generator = (function(p) for p in parameters)
        else:
            from multiprocessing import Pool, cpu_count
            import threading
            view = Pool()
            logger.debug('Running %d jobs on %d local CPU%s' %
                         (njobs, cpu_count(), ' s'[cpu_count() > 1]))
            generator = view.imap(function, parameters)
        try:
            import progressbar
            progress = progressbar.ProgressBar(widgets=[
                progressbar.Percentage(), ' ',
                progressbar.Bar(), ' ',
                progressbar.Counter(), '/',
                str(njobs), ' ',
                progressbar.ETA()
            ],
                                               maxval=njobs)
            data = list(progress(generator))
        except ImportError:
            data = list(generator)

        for i, (p, r) in enumerate(zip(parameters, data)):
            if r is None:
                logger.info('Retrying failed computation locally')
                data[i] = function(p)

        rowmask = [(l != None) and all(~np.isnan(l)) for l in data]
        import itertools
        data = list(itertools.compress(data, rowmask))
        keys = list(itertools.compress(keys, rowmask))

        return cls(keys, data, variables, group_name=group_name)
Example 19
def main():
    parser = OptionParser()
    parser.set_defaults(n=100)
    parser.set_defaults(tmin=1e-3)
    parser.set_defaults(tmax=1)
    parser.set_defaults(profile='default')

    parser.add_option("-n",
                      type='int',
                      dest='n',
                      help='the number of tasks to run')
    parser.add_option("-t",
                      type='float',
                      dest='tmin',
                      help='the minimum task length in seconds')
    parser.add_option("-T",
                      type='float',
                      dest='tmax',
                      help='the maximum task length in seconds')
    parser.add_option("-p",
                      '--profile',
                      type='str',
                      dest='profile',
                      help="the cluster profile [default: 'default']")

    (opts, args) = parser.parse_args()
    assert opts.tmax >= opts.tmin, "tmax must not be smaller than tmin"

    rc = Client()
    view = rc.load_balanced_view()
    print view
    rc.block = True
    nengines = len(rc.ids)
    with rc[:].sync_imports():
        from IPython.utils.timing import time

    # the jobs should take a random time within a range
    times = [
        random.random() * (opts.tmax - opts.tmin) + opts.tmin
        for i in range(opts.n)
    ]
    stime = sum(times)

    print "executing %i tasks, totalling %.1f secs on %i engines" % (
        opts.n, stime, nengines)
    time.sleep(1)
    start = time.time()
    amr = view.map(time.sleep, times)
    amr.get()
    stop = time.time()

    ptime = stop - start
    scale = stime / ptime

    print "executed %.1f secs in %.1f secs" % (stime, ptime)
    print "%.3fx parallel performance on %i engines" % (scale, nengines)
    print "%.1f%% of theoretical max" % (100 * scale / nengines)
Example 20
def calibrate_multiple():
    num_runs = 12
    rc = Client()
    lview = rc.load_balanced_view()
    lview.block = True
    print(datetime.datetime.now())
    res = lview.map(lambda q: calibrate_mh(), range(num_runs))
    print(datetime.datetime.now())
    1/0  # deliberate ZeroDivisionError, presumably to halt here for debugging
Example 21
    def main_loop(self, time_budget=None, parallel=False, client_kwargs=None,
                  view_flags=None):
        """
        Run main_loop of each trainer.

        Note: if you get PickleErrors when running in parallel, make sure
        you have `dill` installed.

        Parameters
        ----------
        time_budget : int, optional
            The maximum number of seconds before interrupting
            training. Default is `None`, no time limit.
        parallel : bool, optional
            Whether to train subtrainers in parallel using
            IPython.parallel (default False).
        client_kwargs : dict, optional
            Keyword arguments for IPython.parallel Client.
        view_flags : dict, optional
            Flags for IPython.parallel LoadBalancedView.
        """
        self.setup()
        if parallel:
            from IPython.parallel import Client

            def _train(trainer, time_budget=None):
                """
                Run main_loop of this trainer.

                Parameters
                ----------
                trainer : Train object
                    Train object.
                time_budget : int, optional
                    The maximum number of seconds before interrupting
                    training. Default is `None`, no time limit.
                """
                trainer.main_loop(time_budget)
                return trainer

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(_train,
                            self.trainers[self.skip_folds:],
                            [time_budget] * len(self.trainers[self.skip_folds:]),
                            block=False)
            self.trainers = call.get()
        else:
            for trainer in self.trainers[self.skip_folds:]:
                trainer.main_loop(time_budget)
        self.save()
Example 22
def main():
    parser = OptionParser()
    parser.add_option('-d','--dataset',dest='dataset',help='path to dataset')
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--l1_min',dest='l1_min',type='float',help='min l1 constant to try (expected to be a power of 10)')
    parser.add_option('--l1_max',dest='l1_max',type='float',help='max l1 constant to try (expected to be a power of 10)')
    parser.add_option('--l2_min',dest='l2_min',type='float',help='min l2 constant to try (expected to be a power of 10)')
    parser.add_option('--l2_max',dest='l2_max',type='float',help='max l2 constant to try (expected to be a power of 10)')
    parser.add_option('--max_sims',dest='max_sims',type='int',default=2000,help='max desired number of positive item similarity weights (default: %default)')
    parser.add_option('--min_sims',dest='min_sims',type='int',default=15,help='min desired number of positive item similarity weights (default: %default)')
    parser.add_option('--max_sparse',dest='max_sparse',type='float',default=0.01,help='max allowable proportion of items with less than min_sims positive similarity weights (default: %default)')
    parser.add_option('--num_samples',dest='num_samples',type='int',default=100,help='number of sample items to evaluate for each regularization setting')
    parser.add_option('--packer',dest='packer',default='json',help='packer for IPython.parallel (default: %default)')
    parser.add_option('--add_module_paths',dest='add_module_paths',help='comma-separated list of paths to append to pythonpath to enable import of uninstalled modules')

    (opts,args) = parser.parse_args()
    if not opts.dataset or not opts.input_format or not opts.l1_min or not opts.l1_max or not opts.l2_min or not opts.l2_max:
        parser.print_help()
        raise SystemExit

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    dataset = load_fast_sparse_matrix(opts.input_format,opts.dataset)

    params = {'l1_reg':pow_range(opts.l1_min,opts.l1_max),
              'l2_reg':pow_range(opts.l2_min,opts.l2_max)}
    num_items = dataset.shape[1]
    sample_items = random.sample(xrange(num_items),opts.num_samples)

    logging.info('preparing tasks for a grid search of these values:')
    logging.info(params)
    tasks = [(args,dataset,opts.min_sims,sample_items) for args in ParameterGrid(params)]

    c = Client(packer=opts.packer)
    view = c.load_balanced_view()

    if opts.add_module_paths:
        c[:].execute('import sys')
        for path in opts.add_module_paths.split(','):
            logging.info('adding {0} to pythonpath on all engines'.format(path))
            c[:].execute("sys.path.append('{0}')".format(path))

    logging.info('running {0} tasks in parallel...'.format(len(tasks)))
    results = view.map(estimate_sparsity,tasks,ordered=False)

    candidates = [(args,nsims,nsparse,nneg) for args,nsims,nsparse,nneg in results if nsims <= opts.max_sims and nsparse <= opts.max_sparse]

    if candidates:
        best = min(candidates,key=itemgetter(1))

        print 'best parameter setting: {0}'.format(best[0])
        print 'mean # positive similarity weights per item = {0:.3}'.format(best[1])
        print 'proportion of items with fewer than {0} positive similarity weights = {1:.3}'.format(opts.min_sims,best[2])
        print 'mean # negative similarity weights per item = {0:.3}'.format(best[3])
    else:
        print 'no parameter settings satisfied the conditions, try increasing --min_sims, --max_sims or --max_sparse'
Example 23
    def test_run_from_multiple_files_without_cache_on_ipy_cluster(self):
        try:
            from IPython.parallel import Client
            client = Client()

            pool = client.load_balanced_view()
        except:
            raise unittest.SkipTest("Cluster connection failed")

        self.test_run_from_multiple_files_without_cache(pool=pool)
Example 24
    def _init_cluster(self):
        rc = Client(profile=self.profile)
        dview = rc[:]
        lview = rc.load_balanced_view()

        with dview.sync_imports():
            from IPython.config import Application
            from survival.cross_validation import _fit_and_score
            from sklearn.base import clone

        return dview, lview
Example 25
def main():
    partial_results=[]
    c=Client(profile='default')
    print c.ids
    view=c.load_balanced_view()
    ar = view.map_async(func, range(10))
    #print ar.get_dict(timeout=0)
    print ar.msg_ids
    for i, r in enumerate(ar):
        print r[1]
    print ar.get()
Example 26
def run_parallel_jobs(jobs, job_fn, ipython_profile=None):
    # IPython will error out if jobs is empty.
    if jobs:
        if ipython_profile is None:
            c = Client()
        else:
            c = Client(profile=ipython_profile)

        lview = c.load_balanced_view()
        lview.block = True
        lview.map(job_fn, jobs)
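Hypothetical usage of run_parallel_jobs, mapping a simple worker over a list of inputs (note that the results of lview.map are discarded, so job_fn is assumed to act by side effect):

def count_lines(path):
    with open(path) as f:
        return sum(1 for _ in f)

run_parallel_jobs(['a.txt', 'b.txt'], count_lines)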
Example 27
def compute_parallel(cache,
                     func,
                     keys,
                     save_every=4,
                     func_args=None,
                     func_kwargs=None,
                     parallel=True,
                     client=None):
    """Do a parallel computation of a function"""
    keys = [key for key in keys]
    results = dict(cache.items())
    print(50 * '=')
    print("Starting parallel run of {0} results".format(len(keys)))
    print(" - parallel={0}".format(parallel))

    if results:
        print(" - found {0} previous results in {1}"
              "".format(len(results), cache.filename))
    keys_to_compute = [key for key in keys if key not in results]

    # default arguments
    def iter_function(key,
                      func=func,
                      func_args=func_args,
                      func_kwargs=func_kwargs):
        func_args = func_args or ()
        func_kwargs = func_kwargs or {}
        return func(key, *func_args, **func_kwargs)

    print(" - computing {0} results".format(len(keys_to_compute)))

    # Set up the iterator over results
    if parallel:
        # Use interactive to prevent namespace issues
        from IPython.parallel.util import interactive
        iter_function = interactive(iter_function)
        if client is None:
            from IPython.parallel import Client
            client = Client()
        lbv = client.load_balanced_view()
        results_iter = lbv.imap(iter_function, keys_to_compute, ordered=False)
    else:
        results_iter = imap(iter_function, keys_to_compute)

    # Do the iteration, saving the results occasionally
    print(datetime.now())
    for i, (key, result) in enumerate(results_iter):
        print('{0}/{1}: {2}'.format(i + 1, len(keys_to_compute), result))
        print(' {0}'.format(datetime.now()))
        cache.add_row(key, result, save=((i + 1) % save_every == 0))
    cache.save()

    return np.array([cache.get_row(key) for key in keys])
Example 28
  def featurize(self, mols, parallel=False, client_kwargs=None,
                view_flags=None, verbosity=None, log_every_n=1000):
    """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
    if self.conformers and isinstance(mols, types.GeneratorType):
      mols = list(mols)
    assert verbosity in [None, "low", "high"]

    if parallel:
      from IPython.parallel import Client

      if client_kwargs is None:
          client_kwargs = {}
      if view_flags is None:
          view_flags = {}
      client = Client(**client_kwargs)
      client.direct_view().use_dill()  # use dill
      view = client.load_balanced_view()
      view.set_flags(**view_flags)
      call = view.map(self._featurize, mols, block=False)
      features = call.get()

      # get output from engines
      call.display_outputs()

    else:
      features = []
      for i, mol in enumerate(mols):
        if verbosity is not None and i % log_every_n == 0:
          log("Featurizing %d / %d" % (i, len(mols)))
        if mol is not None:
          features.append(self._featurize(mol))
        else:
          features.append(np.array([]))

    if self.conformers:
      features = self.conformer_container(mols, features)
    else:
      features = np.asarray(features)
    return features
Example 29
def run_nb(beliefs, meta, params, num_samples):
    if params['do_parallel']:
        rc = Client()
        lview = rc.load_balanced_view()
        lview.block = True
    else:
        lview = None
    print(datetime.datetime.now().time())
    results = aggregation.importance_multiple(beliefs, meta, params,
                                              num_samples, lview)
    print(datetime.datetime.now().time())
    return results
Example 30
def calibrate():
    num_runs = 12
    rc = Client()
    lview = rc.load_balanced_view()
    lview.block = True
    print(datetime.datetime.now())
    res = lview.map(lambda q: all_together_q(), range(num_runs))
    print(datetime.datetime.now())
    act = np.array([r[0] for r in res])
    mh =  np.array([r[1] for r in res])
    nb =  np.array([r[2] for r in res])
    gp =  np.array([r[3] for r in res])
    1/0  # deliberate ZeroDivisionError, presumably to halt here for debugging
Example 31
class DistributedSpider(object):

    # Time to wait between polling for task results.
    pollingDelay = 0.5

    def __init__(self, site):
        self.client = Client()
        self.view = self.client.load_balanced_view()
        self.mux = self.client[:]

        self.allLinks = []
        self.linksWorking = {}
        self.linksDone = {}

        self.site = site

    def visitLink(self, url):
        if url not in self.allLinks:
            self.allLinks.append(url)
            if url.startswith(self.site):
                print '    ', url
                self.linksWorking[url] = self.view.apply(fetchAndParse, url)

    def onVisitDone(self, links, url):
        print url, ':'
        self.linksDone[url] = None
        del self.linksWorking[url]
        for link in links:
            self.visitLink(link)

    def run(self):
        self.visitLink(self.site)
        while self.linksWorking:
            print len(self.linksWorking), 'pending...'
            self.synchronize()
            time.sleep(self.pollingDelay)

    def synchronize(self):
        for url, ar in self.linksWorking.items():
            # Calling ar.get(0) raises a TimeoutError immediately if the
            # task is not done yet.  This provides a simple way of polling.
            try:
                links = ar.get(0)
            except error.TimeoutError:
                continue
            except Exception as e:
                self.linksDone[url] = None
                del self.linksWorking[url]
                print url, ':', e.traceback
            else:
                self.onVisitDone(links, url)
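fetchAndParse executes on the engines and is not shown. A rough Python 2 sketch of its contract (fetch a page, return the absolute links found on it), purely hypothetical:

def fetchAndParse(url):
    import urllib2
    from HTMLParser import HTMLParser
    from urlparse import urljoin

    class LinkParser(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.links = []

        def handle_starttag(self, tag, attrs):
            # collect href targets from anchor tags, resolved against url
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href':
                        self.links.append(urljoin(url, value))

    parser = LinkParser()
    parser.feed(urllib2.urlopen(url).read())
    return parser.links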
Example 32
class IPythonParallelizationBackend(ParallelizationBackend):
    """A parallelization backend which uses an IPython cluster to compute
    results.
    """

    def __init__(self, *args, **kwargs):
        """Initializes a new instance of the IPythonParallelizationBackend.

        Args: The same as the IPython.parallel.Client class
        """
        # Create the client
        self._client = Client(*args, **kwargs)

        # Create the cluster view
        self._cluster = self._client.load_balanced_view()

    def start(self, cache, job_specs, callback):
        """Run jobs on the backend, blocking until their completion.

        Args:
            cache: The persistent cache which should be set on the backend
            job_specs: The job specification (see
                owls_parallel.backends.ParallelizationBackend)
            callback: The job notification callback, not used by this backend
        """
        return [self._cluster.apply_async(_run, cache, j)
                for j
                in itervalues(job_specs)]

    def prune(self, jobs):
        """Prunes a collection of jobs by pruning those which are complete.

        The input collection should not be modified.

        Args:
            jobs: A collection of jobs to prune

        Returns:
            A new collection of jobs which are still incomplete.
        """
        # Extract unfinished jobs, and re-raise any remote exceptions
        result = []
        for j in jobs:
            if j.ready():
                # This will re-raise remotely-raised exceptions locally
                j.get()
            else:
                result.append(j)

        # All done
        return result
Example 34
def getView():
    from IPython.parallel import TimeoutError, Client
    try:
        cluster = Client(profile="sge")
        print "running on SGE"
    except (TimeoutError, IOError) as e:
        try:
            cluster = Client(profile="localcluster")
            print "running on the localcluster with %d threads." % len(cluster)
        except (TimeoutError, IOError) as f:
            print "Need to have at least one cluster running."
            print e
            raise f
    return cluster.load_balanced_view()
Example 36
class IPythonParallelMap(object):
    """ Class to handle the creation and management of cluster
    resources, typically on IRP, through IPython's parallel
    implementation. """
    def __init__(self, nodes, irp=True, debug=False):
        """ if SSH, Open a connection to IRP and start the IPCluster
        daemon """
        self.nodes = nodes
        self.irp = irp
        if self.irp:
            self.child = pexpect.spawn('ssh [email protected]')
            if debug: self.child.logfile = sys.stdout
            time.sleep(0.2)
            self.child.sendline('cd /home/psj/Documents/IPClusterLogs')
            self.child.sendline('ipcluster start --profile=pbs -n ' +
                                str(nodes) + ' --daemonize')
        else:
            self.child = pexpect.spawn('ipcluster start -n ' + str(nodes) +
                                       ' --daemonize')

    def close(self):
        """ Close the IPCluster, delete jobs, and logout of SSH """
        if self.irp: self.child.sendline('ipcluster stop --profile=pbs')
        else: self.child.sendline('ipcluster stop')
        time.sleep(0.5)
        if self.irp: self.child.sendline('qdel all')
        time.sleep(0.1)
        self.child.sendline('logout')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.close()

    def connect_client(self):
        """ Connect the current client to the running engine """
        from IPython.parallel import Client

        if self.irp: self.client = Client(profile='pbs')
        else: self.client = Client()

        assert len(self.client.ids) == self.nodes
        self.lview = self.client.load_balanced_view()
        self.dview = self.client.direct_view()

    def __call__(self, *args, **kwargs):
        """ Map function call to parallel view """
        results = self.lview.map(*args, balanced=True, **kwargs)
        return results.get()
Example 37
class TaskClient:
    def __init__(self):
        self.rc = Client()
        self.dview = self.rc[:]
        self.lbview = self.rc.load_balanced_view()

    def run(self, maptask):
        return self.lbview.apply(maptask.func, *maptask.args)

    def get_task_result(self, task, block=True):
        return task.get()

    def clear(self):
        pass
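The maptask objects passed to run are assumed to expose .func and .args; for instance (hypothetical):

from collections import namedtuple

MapTask = namedtuple('MapTask', ['func', 'args'])

tc = TaskClient()
task = tc.run(MapTask(func=pow, args=(2, 10)))
print(tc.get_task_result(task))   # 1024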
Example 38
    def init_now(self, **kwargs):

        self.storage = kwargs.pop(
            'storage'
        ) if 'storage' in kwargs else 'cluster_null_score_strict_smart_storage.dat'

        self.sperm = kwargs.pop('sperm') if 'sperm' in kwargs else 1000
        self.M = self.sperm
        self.alpha = kwargs.pop('alpha')
        self.tests = kwargs.pop('tests')
        self.mperm = math.ceil(1.0 / (self.alpha / self.tests))

        hits = kwargs.pop('hits')
        hitA = kwargs.pop('hitA')
        hitB = kwargs.pop('hitB')

        self.nproc = kwargs.pop('nproc') if 'nproc' in kwargs else 8
        self.parthresh = kwargs.pop(
            'parthresh') if 'parthresh' in kwargs else 25000

        self.psuedocount = 0 if ('psuedocount' in kwargs
                                 and kwargs['psuedocount'] == False) else 1

        self.scores = np.array([h[12] for h in hits], dtype=float)
        self.hitAs = np.array([hits[v][12] for v in hitA.values()],
                              dtype=float)
        self.hitBs = np.array([hits[v][12] for v in hitB.values()],
                              dtype=float)

        self.clt_H_mu = np.mean(self.scores)
        self.clt_H_s2 = np.var(self.scores)
        self.clt_U1_mu = np.mean(self.hitAs)
        self.clt_U1_s2 = np.var(self.hitAs)
        self.clt_U2_mu = np.mean(self.hitBs)
        self.clt_U2_s2 = np.var(self.hitBs)

        p = util.run_cmd("ipcluster start -n %d" % (self.nproc), bg=True)

        i = 0
        self.perm_func = self.permute
        for i in xrange(120):
            try:
                rc = Client()
                self.parc = rc.load_balanced_view()
                self.perm_func = self.permute_par
                print 'Parallel option'
                break
            except IOError:
                time.sleep(1)
                pass
Example 39
def main():

    client = Client()
    lbview = client.load_balanced_view()
    lbview.track = True
    lbview.retries = 10
    
    print("create 10 tasks:")
    msg_ids = []
    i = 0
    while i < 10:
        ar = lbview.apply(run_task)
        print("Task " + ar.msg_ids[0])
        ar.wait_for_send()
        msg_ids.append(ar.msg_ids[0])
        i += 1

    time.sleep(10) 

    print("\nsearch for tasks in DB:")
    tasks = client.db_query({"msg_id" : {"$in" : msg_ids}})

    # stop all tasks which are not yet done
    print("\nstop all tasks which are not yet done:")
    tasks_to_stop = []
    for task in tasks:
        print("Task " + task["msg_id"] + ": ")
        if ("result_header" in task) and (task["result_header"] != None) and (task["result_header"]["status"] == "ok"):
            print("  finished at " + str(task["completed"]))
        else:
            print("  not finished yet. will abort.")
            tasks_to_stop.append(task["msg_id"])
    client.abort(tasks_to_stop)

    time.sleep(10)

    print("\nsearch for tasks in DB:")
    tasks = client.db_query({"msg_id" : {"$in" : msg_ids}})

    print("\nresubmit all tasks which are not yet done:")
    tasks_to_resubmit = []
    for task in tasks:
        print("Task " + task["msg_id"] + ": ")
        if ("result_header" in task) and (task["result_header"] != None) and (task["result_header"]["status"] == "ok"):
            print("  finished at " + str(task["completed"])) 
        else:
            print("  not finished yet. will resubmit.")
            tasks_to_resubmit.append(task["msg_id"])
    client.resubmit(tasks_to_resubmit)
Example 40
    def compute(cls, keys, variables, function, parameters, ipython_profile=None,
                group_name=None):
        """
        Compute profiles by applying the parameters to the function in parallel.

        """
        assert len(keys) == len(parameters)
        njobs = len(parameters)
        if isinstance(ipython_profile, LSFView):
            view = ipython_profile
            logger.debug('Running %d jobs on LSF' % view.njobs)
            generator = view.imap(function, parameters)
        elif ipython_profile:
            from IPython.parallel import Client, LoadBalancedView
            client = Client(profile=ipython_profile)
            view = client.load_balanced_view()
            logger.debug('Running %d jobs' % njobs)
            generator = view.imap(function, parameters)
        elif ipython_profile == False:
            generator = (function(p) for p in parameters)
        else:
            from multiprocessing import Pool, cpu_count
            import threading
            view = Pool()
            logger.debug('Running %d jobs on %d local CPU%s' % (njobs, cpu_count(), ' s'[cpu_count() > 1]))
            generator = view.imap(function, parameters)
        try:
            import progressbar
            progress = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ',
                                                        progressbar.Bar(), ' ', 
                                                        progressbar.Counter(), '/', 
                                                        str(njobs), ' ',
                                                        progressbar.ETA()],
                                               maxval=njobs)
            data = list(progress(generator))
        except ImportError:
            data = list(generator)

        for i, (p, r) in enumerate(zip(parameters, data)):
            if r is None:
                logger.info('Retrying failed computation locally')
                data[i] = function(p)

        rowmask = [(l != None) and all(~np.isnan(l)) for l in data]
        import itertools
        data = list(itertools.compress(data, rowmask))
        keys = list(itertools.compress(keys, rowmask))

        return cls(keys, data, variables, group_name=group_name)
Example 41
def main():
    client = Client(profile='ssh')

    print 'ids:', client.ids

    view = client.load_balanced_view()

    #pdb.set_trace()

    tic = time.time()
    results = view.map(sleeper, [1] * 40)
    #results = map(sleeper, [1] * 4)

    print 'results:', results.get()
    print 'elapsed:', time.time() - tic
Example 42
    def featurize(self,
                  mols,
                  parallel=False,
                  client_kwargs=None,
                  view_flags=None):
        """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
        if self.conformers and isinstance(mols, types.GeneratorType):
            mols = list(mols)

        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(self._featurize, mols, block=False)
            features = call.get()

            # get output from engines
            call.display_outputs()

        else:
            features = [self._featurize(mol) for mol in mols]

        if self.conformers:
            features = self.conformer_container(mols, features)
        else:
            features = np.asarray(features)
        return features
Example 43
def get_map(cluster_id=None):
    """
    Get the proper mapping function.

    Parameters
    ----------
    cluster_id : str, optional
        IPython.parallel cluster ID.
    """
    if cluster_id is not None:
        client = Client(cluster_id=cluster_id)
        client.direct_view().use_dill()
        view = client.load_balanced_view()
        return view.map_sync
    else:
        return map
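Hypothetical usage of get_map; with no cluster ID it degrades gracefully to the builtin map:

mapper = get_map()                       # builtin map, no cluster required
print(list(mapper(abs, [-1, -2, 3])))    # [1, 2, 3]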
Example 44
def compute_parallel(cache, func, keys, save_every=4,
                     func_args=None, func_kwargs=None,
                     parallel=True, client=None):
    """Do a parallel computation of a function"""
    keys = [key for key in keys]
    results = dict(cache.items())
    print(50 * '=')
    print("Starting parallel run of {0} results".format(len(keys)))
    print(" - parallel={0}".format(parallel))

    if results:
        print(" - found {0} previous results in {1}"
              "".format(len(results), cache.filename))
    keys_to_compute = [key for key in keys if key not in results]

    # default arguments
    def iter_function(key, func=func, func_args=func_args,
                      func_kwargs=func_kwargs):
        func_args = func_args or ()
        func_kwargs = func_kwargs or {}
        return func(key, *func_args, **func_kwargs)

    print(" - computing {0} results".format(len(keys_to_compute)))

    # Set up the iterator over results
    if parallel:
        # Use interactive to prevent namespace issues
        from IPython.parallel.util import interactive
        iter_function = interactive(iter_function)
        if client is None:
            from IPython.parallel import Client
            client = Client()
        lbv = client.load_balanced_view()
        results_iter = lbv.imap(iter_function, keys_to_compute,
                                ordered=False)
    else:
        results_iter = imap(iter_function, keys_to_compute)

    # Do the iteration, saving the results occasionally
    print(datetime.now())
    for i, (key, result) in enumerate(results_iter):
        print('{0}/{1}: {2}'.format(i + 1, len(keys_to_compute), result))
        print(' {0}'.format(datetime.now()))
        cache.add_row(key, result, save=((i + 1) % save_every == 0))
    cache.save()

    return np.array([cache.get_row(key) for key in keys])
Example 45
def cluster_view(parallel, config):
    """Provide a view on an ipython cluster for processing.

    parallel is a dictionary with:
      - profile: The name of the ipython profile to use
      - cores: The number of cores to start for processing.
      - queue_type: Optionally, the type of parallel queue
        to start. Defaults to a standard parallel queue, can
        also specify 'multicore' for a multiple core machine
        and 'io' for an I/O intensive queue.
    """
    delay = 5
    max_delay = 300
    max_tries = 10
    profile = parallel["profile"]
    if parallel.get("queue_type", None):
        profile = "%s_%s" % (profile, parallel["queue_type"])
    cluster_id = str(uuid.uuid1())
    num_tries = 0
    while 1:
        try:
            _start(parallel["cores"], profile, cluster_id, delay)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
    try:
        slept = 0
        target_cores = 1 if parallel.get("queue_type", None) == "multicore" \
                       else parallel["cores"]
        while not _is_up(profile, cluster_id, target_cores):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        #client = Client(profile=profile, cluster_id=cluster_id)
        client = Client(profile=profile)
        # push config to all engines and force them to set up logging
        client[:]['config'] = config
        client[:].execute('from bcbio.log import setup_logging')
        client[:].execute('setup_logging(config)')
        client[:].execute('from bcbio.log import logger')
        yield client.load_balanced_view()
    finally:
        _stop(profile, cluster_id)
Example 46
def main():
    parser = OptionParser()
    parser.set_defaults(n=100)
    parser.set_defaults(tmin=1e-3)
    parser.set_defaults(tmax=1)
    parser.set_defaults(profile='default')

    parser.add_option("-n", type='int', dest='n',
                      help='the number of tasks to run')
    parser.add_option("-t", type='float', dest='tmin',
                      help='the minimum task length in seconds')
    parser.add_option("-T", type='float', dest='tmax',
                      help='the maximum task length in seconds')
    parser.add_option("-p", '--profile', type='str', dest='profile',
                      help="the cluster profile [default: 'default']")

    (opts, args) = parser.parse_args()
    assert opts.tmax >= opts.tmin, "tmax must not be smaller than tmin"

    rc = Client()
    view = rc.load_balanced_view()
    print(view)
    rc.block = True
    nengines = len(rc.ids)
    with rc[:].sync_imports():
        from IPython.utils.timing import time

    # the jobs should take a random time within a range
    times = [
        random.random() * (opts.tmax - opts.tmin) + opts.tmin for i in range(opts.n)]
    stime = sum(times)

    print("executing %i tasks, totalling %.1f secs on %i engines" %
          (opts.n, stime, nengines))
    time.sleep(1)
    start = time.time()
    amr = view.map(time.sleep, times)
    amr.get()
    stop = time.time()

    ptime = stop - start
    scale = stime / ptime

    print("executed %.1f secs in %.1f secs" % (stime, ptime))
    print("%.3fx parallel performance on %i engines" % (scale, nengines))
    print("%.1f%% of theoretical max" % (100 * scale / nengines))
Example 47
def plotTrajectories(
        folder='/run/media/peter/Elements/peter/data/tmp-20130506/'):
    """
    Example how trajectory plotting works using the clusters of IPython notebook
    """

    vE = videoExplorer()
    fileList = []
    posList = []

    t = time()
    for root, dirs, files in os.walk(folder):
        for f in files:
            if f.endswith('npy'):
                path = os.path.join(root, f)
                fileList.append(path)
                posList.append(np.load(path))

    # sort both lists based on the file name
    posList = [x for y, x in sorted(zip(fileList, posList))]
    fileList = sorted(fileList)
    print(time() - t)

    from IPython.parallel import Client
    rc = Client()
    print(rc.ids)
    dview = rc[:]

    dview.block = True
    dview['accDist'] = accDist
    dview['fileList'] = fileList
    dview['posList'] = posList
    dview['saveScatters'] = saveScatters
    dview['plotTrajectorySummery'] = plotTrajectorySummery
    dview['vE'] = vE

    t = time()
    lview = rc.load_balanced_view()
    lview.block = True
    res = lview.map(lambda x: saveScatters(x),
                    range(0,
                          len(posList) - 2),
                    chunksize=10)
    print(time() - t)
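
Two details worth noting: assigning into the DirectView (e.g.
dview['saveScatters'] = saveScatters) pushes that name into each engine's
namespace, which is what lets the mapped lambda resolve saveScatters
remotely, and chunksize=10 ships the indices in batches of ten to reduce
scheduling overhead. The assignments are equivalent to explicit pushes:

# equivalent to the dview['saveScatters'] = saveScatters assignment above
dview.push({'saveScatters': saveScatters}, block=True)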
Esempio n. 48
0
    def featurize(self, mols, parallel=False, client_kwargs=None,
                  view_flags=None):
        """
        Calculate features for molecules.

        Parameters
        ----------
        mols : iterable
            RDKit Mol objects.
        parallel : bool, optional
            Whether to train subtrainers in parallel using
            IPython.parallel (default False).
        client_kwargs : dict, optional
            Keyword arguments for IPython.parallel Client.
        view_flags : dict, optional
            Flags for IPython.parallel LoadBalancedView.
        """
        if self.conformers and isinstance(mols, types.GeneratorType):
            mols = list(mols)

        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(self._featurize, mols, block=False)
            features = call.get()

            # get output from engines
            call.display_outputs()

        else:
            features = [self._featurize(mol) for mol in mols]

        if self.conformers:
            features = self.conformer_container(mols, features)
        else:
            features = np.asarray(features)
        return features
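
A hedged usage sketch: MyFeaturizer and mols stand in for a concrete
subclass and an iterable of RDKit molecules, and an engine cluster must
already be running (e.g. via "ipcluster start -n 4"); retries is one of the
flags LoadBalancedView.set_flags accepts:

# Hypothetical names; requires a running IPython cluster.
featurizer = MyFeaturizer()
features = featurizer.featurize(mols, parallel=True,
                                client_kwargs={'profile': 'default'},
                                view_flags={'retries': 2})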
Esempio n. 49
0
    def featurize(self,
                  mols,
                  parallel=False,
                  client_kwargs=None,
                  view_flags=None):
        """
        Calculate features for molecules.

        Parameters
        ----------
        mols : iterable
            RDKit Mol objects.
        parallel : bool, optional (default False)
            Train subtrainers in parallel using IPython.parallel.
        client_kwargs : dict, optional
            Keyword arguments for IPython.parallel Client.
        view_flags : dict, optional
            Flags for IPython.parallel LoadBalancedView.
        """
        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(self._featurize,
                            np.array_split(mols, len(client.direct_view())),
                            block=False)
            features = call.get()
            features = np.concatenate(features)

            # get output from engines
            call.display_outputs()

        else:
            features = self._featurize(mols)

        return np.asarray(features)
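
Unlike the previous example, which dispatches one task per molecule, this
variant splits mols into one chunk per engine (len(client.direct_view())
gives the engine count), trading scheduler overhead for coarser load
balancing. np.array_split handles uneven divisions, as this small
demonstration shows:

import numpy as np

# 10 items over 4 engines -> chunk sizes 3, 3, 2, 2
chunks = np.array_split(np.arange(10), 4)
print([len(c) for c in chunks])  # [3, 3, 2, 2]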
Esempio n. 50
0
def cluster_view(parallel):
    """Provide a view on an ipython cluster for processing.

    parallel is a dictionary with:
      - profile: The name of the ipython profile to use
      - cores: The number of cores to start for processing.
      - queue_type: Optionally, the type of parallel queue
        to start. Defaults to a standard parallel queue, can
        also specify 'multicore' for a multiple core machine
        and 'io' for an I/O intensive queue.
    """
    delay = 10
    max_delay = 300
    max_tries = 5
    profile = parallel["profile"]
    if parallel.get("queue_type", None):
        profile = "%s_%s" % (profile, parallel["queue_type"])
    cluster_id = str(uuid.uuid1())
    num_tries = 0
    while 1:
        try:
            _start(parallel["cores"], profile, cluster_id, delay)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
    try:
        slept = 0
        target_cores = 1 if parallel.get("queue_type", None) == "multicore" \
                       else parallel["cores"]
        while not _is_up(profile, cluster_id, target_cores):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        #client = Client(profile=profile, cluster_id=cluster_id)
        client = Client(profile=profile)
        yield client.load_balanced_view()
    finally:
        _stop(profile, cluster_id)
Esempio n. 51
0
def parallel_run():
    """
    Connect to already-running parallel engines and distribute the work.
    """
    from IPython.parallel import Client

    c = Client()  # here is where the client establishes the connection
    lv = c.load_balanced_view()  # this object represents the engines (workers)

    rays = []
    maxs = 25
    bounding = AABA(
        xmin=0,
        ymin=0,
        zmin=0,
        xmax=maxs,
        ymax=maxs,
        zmax=maxs,
    )
    gridd = np.zeros((maxs, maxs, maxs))
    # spectrum for red to nir leaves
    red_nir_leaves = spectrum(np.array([0.5, 0.85]), np.array([0.1, 0.6]),
                              np.array([0.5, 0.1]))
    # spectrum for soil
    red_nir_soil = spectrum(np.array([0.5, 0.85]), np.array([0.3, 0.4]),
                            np.array([0.0, 0.0]))

    # scattering setup
    scatt = BRDSF(red_nir_leaves, 0.0)
    lf = leaf(55.0, 0.8)  # leaf angle distribution and leaf area density

    tasks = []
    for x in xrange(maxs):
        for y in xrange(maxs):
            tasks.append(
                lv.apply(prun, x, y, maxs, gridd, scatt, red_nir_soil,
                         bounding, lf))

    results = [task.get()
               for task in tasks]  # blocks until all results are back

    return results
Esempio n. 52
0
def _init_cluster_and_database(profile=None):
    rc = Client(profile=profile)
    _dview = rc[:]
    _lview = rc.load_balanced_view()

    with _dview.sync_imports():
        import os
        from os.path import join, exists
        import gzip
        import pickle
        import numpy
        from pymongo import MongoClient
        from sklearn.base import BaseEstimator
        from sklearn.metrics import mean_squared_error
        from survival.cross_validation import _fit_and_score
        from survival.meta.ensemble_selection import EnsembleAverage

    _dview.push({"mongodb_host": mongodb_host, "models_dir": models_dir}, block=True)

    return _dview, _lview
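
A hedged sketch of how the returned views might be used; mongodb_host and
models_dir are assumed to be module-level globals, and param_list is a
placeholder for whatever inputs _fit_and_score expects:

# Hypothetical usage; param_list is a placeholder.
dview, lview = _init_cluster_and_database(profile='default')
async_results = lview.map_async(_fit_and_score, param_list)
async_results.wait()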
Esempio n. 53
0
def runScript(job_script, fit_name, job_range):

    # TODO: introduce more ways in which this can be run

    from IPython.parallel import Client, interactive
    import iparallel
    rc = Client(profile='wgs3')
    lview = rc.load_balanced_view()

    def runMCFit(job_script, file_name, task_nr):
        import os
        file_name += str(int(task_nr))
        # despite the name, this runs the script directly rather than via qsub
        qsub_string = ' '.join(['python', job_script, file_name])
        print(qsub_string)
        os.system(qsub_string)


    result = lview.map_async(runMCFit, [job_script] * len(job_range),
                             [fit_name] * len(job_range), job_range)
    iparallel.waitOn(result)
Esempio n. 54
0
class ClusterPool(object):
    def __init__(self, *args, **kwargs):
        self.client = Client(*args, **kwargs)
        self.dview = self.client.direct_view()  # used by map_with_shared_data
        self.lbview = self.client.load_balanced_view()
        self.chunksize = 1

    def map_with_shared_data(self, func, shared_data, args, chunksize=None):
        """Map a function over a set of arguments, also passing a constant
        shared variable to each invocation.

        func_int is assumed to be a module-level helper (defined elsewhere)
        that reads FUNC and SHARED_DATA from the engine globals pushed here.
        """
        # no imap with shared data, since we couldn't guarantee the
        # integrity of FUNC and SHARED_DATA
        self.dview.push(dict(FUNC=func, SHARED_DATA=shared_data), block=True)
        return self.lbview.map_sync(func_int,
                                    args,
                                    chunksize=chunksize or self.chunksize,
                                    ordered=True)

    def map(self, func, args, chunksize=None):
        map = self.lbview.map
        return iter(
            map(func,
                args,
                chunksize=chunksize or self.chunksize,
                block=True,
                ordered=True))

    def imap(self, func, args, chunksize=None):
        map = self.lbview.map
        return iter(
            map(func,
                args,
                chunksize=chunksize or self.chunksize,
                block=False,
                ordered=True))

    def imap_unordered(self, func, args, chunksize=None):
        map = self.lbview.map
        return iter(
            map(func,
                args,
                chunksize=chunksize or self.chunksize,
                block=False,
                ordered=False))
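
A hedged usage sketch of the pool, assuming a running cluster and the
module-level func_int helper described above; process and items are
placeholders. The interface deliberately mirrors multiprocessing.Pool:

# Hypothetical usage; process and items are placeholders.
pool = ClusterPool(profile='default')
for result in pool.imap_unordered(process, items, chunksize=20):
    print(result)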
Esempio n. 55
0
class MulticoreJob(object):
    def __init__(self):
        self.tasks = {}
        self.client = Client()
        self.lb_view = self.client.load_balanced_view()

    def apply(self, f, named_tasks):
        """named_tasks: dict mapping each task name to a dict of keyword
        arguments for f.
        """
        self.tasks = {
            tname: self.lb_view.apply(f, **param)
            for (tname, param) in named_tasks.items()
        }
        return self

    def isready(self):
        return all([t.ready() for t in self.tasks.values()])

    def progress(self):
        return np.mean([t.ready() for t in self.tasks.values()])

    def partial_result(self):
        return {
            tname: tresult.get()
            for (tname, tresult) in self.tasks.items() if tresult.ready()
        }

    def wait(self):
        for (tname, tresult) in self.tasks.items():
            tresult.wait()
        return self

    def abort(self):
        for (tname, tresult) in self.tasks.items():
            if not tresult.ready():
                try:
                    tresult.abort()
                except Exception:
                    # abort may fail if the task has already started; ignore
                    pass
        return self
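
A hedged usage sketch; train and its keyword dictionaries are placeholders
for a real task function and its parameters:

import time

# Hypothetical usage; train is a placeholder.
job = MulticoreJob().apply(train, {'run_a': {'seed': 1}, 'run_b': {'seed': 2}})
while not job.isready():
    print('progress: %.0f%%' % (100 * job.progress()))
    time.sleep(5)
results = job.partial_result()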
Esempio n. 56
0
def cluster_view(parallel, config):
    """Provide a view on an ipython cluster for processing.

    parallel is a dictionary with:
      - scheduler: The type of cluster to start (lsf, sge).
      - num_jobs: Number of jobs to start.
      - cores_per_job: The number of cores to use for each job.
    """
    delay = 5
    max_delay = 300
    max_tries = 10
    profile = "bcbio_nextgen"
    cluster_id = str(uuid.uuid1())
    num_tries = 0
    while 1:
        try:
            _start(parallel, profile, cluster_id)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
    try:
        slept = 0
        while not _is_up(profile, cluster_id, parallel["num_jobs"]):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        #client = Client(profile=profile, cluster_id=cluster_id)
        client = Client(profile=profile)
        # push config to all engines and force them to set up logging
        client[:]['config'] = config
        client[:].execute('from bcbio.log import setup_logging')
        client[:].execute('setup_logging(config)')
        client[:].execute('from bcbio.log import logger')
        yield client.load_balanced_view()
    finally:
        _stop(profile, cluster_id)
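
The parallel dictionary here describes batch-scheduler jobs rather than
local cores; a hypothetical configuration might look like:

parallel = {"scheduler": "sge", "num_jobs": 4, "cores_per_job": 8}

Usage otherwise mirrors the earlier cluster_view sketch, assuming the same
contextlib.contextmanager decoration.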