Exemple #1
0
def add_engines(n=1, profile='default', total=False):
    """Start test engines for a profile and wait until they have connected.

    Parameters
    ----------
    n : int
        Number of engines to start, or the desired total number of engines
        when ``total`` is true.
    profile : str
        Profile name used for the client and passed to each engine's
        command line.
    total : bool
        If True, already running engines are counted, and only the
        additional engines necessary (if any) are started.

    Returns
    -------
    list
        The launchers started by this call. Each one is also appended to
        the external ``launchers`` list.

    Raises
    ------
    RuntimeError
        If a launched engine process exits before all engines connected, or
        if the engines have not all connected after 15 seconds.
    """
    rc = parallel.Client(profile=profile)
    try:
        base = len(rc)

        if total:
            n = max(n - base, 0)

        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + [
                '--profile=%s' % profile,
                '--log-level=50',
                '--InteractiveShell.colors=nocolor',
            ]
            ep.start()
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            # A non-None poll() result means the engine process has exited.
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 15:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            rc.spin()
    finally:
        # Close the temporary client on every path, including the two
        # RuntimeError paths above, so its connections are not leaked.
        rc.close()
    return eps
Exemple #2
0
def get_dview():
    """Return a non-blocking DirectView over all engines of a new Client."""
    rc = parallel.Client()
    view = rc[:]
    view.block = False
    # Switch the view to dill so that messy objects can be serialized.
    view.use_dill()
    return view
Exemple #3
0
    def __init__(self,
                 n_clusters,
                 feat_patches,
                 client,
                 cache_dir='/tmp',
                 algo_name='KMeans',
                 sparse_result=True,
                 random_state=0):
        """Store the clustering configuration and seed the ``random`` module.

        n_clusters = number of clusters used by the clustering algorithm
        feat_patches = patches of feature indices to build clusters on, e.g.,
            [[feat_idx_i1, ..., feat_idx_j1], [feat_idx_i2, .., feat_idx_j2]]
            the patches can be extracted by sequence generators such as
            strided_seqs or bootstrap_seqs in the package.
        client = parallel client to use; a falsy value makes the constructor
            create a new ``parallel.Client()``
        cache_dir = the cache dir for shared memory objects in parallel
            computing
        algo_name = the clustering algorithm; must be 'KMeans' or
            'MiniBatchKMeans' (enforced with an assert)
        sparse_result = whether the transformed result should be a sparse
            coo_matrix or a normal matrix
        random_state = stored on the instance and passed to ``random.seed``
        """
        self.n_clusters = n_clusters
        self.feat_patches = feat_patches
        if client:
            self.client = client
        else:
            self.client = parallel.Client()
        self.cache_dir = cache_dir
        assert algo_name in ('KMeans', 'MiniBatchKMeans')
        self.algo_name = algo_name
        # Nothing fitted yet.
        self.feat_to_kmeans_ = None
        self.sparse_result = sparse_result
        self.random_state = random_state
        random.seed(random_state)
Exemple #4
0
def get_lview():
    """Return a non-blocking LoadBalancedView from a newly created Client."""
    rc = parallel.Client()
    # Enable dill through a direct view first
    # (testing, see stackoverflow.com/a/24316222).
    direct = rc.direct_view()
    direct.use_dill()
    balanced = rc.load_balanced_view()
    balanced.block = False
    return balanced
Exemple #5
0
def start_parallel(imports_str=None):
    """
    This function starts a parallel computing environment 

    Parameters
    ----------
    imports_str: a string with the imports that are required to run on your
    engines, so that they can do their job. For example: 
     'import osmosis.model as ozm \n import osmosis.utils as ozu'
    
    """
    try:
        # Get parallel computing stuff from IPython:
        from IPython import parallel
        rc = parallel.Client()
    except ImportError:
        warnings.warn("Could not import IPython.parallel")
        return None
    except AssertionError:
        # If you get here, that probably means that you didn't turn on your
        # cluster...
        e_s = "Could not get an IPython connection file."
        e_s += "Did you remember to start your cluster?"
        warnings.warn(e_s)
        return None

    if imports_str is not None:
            rc[:].execute(imports_str)

    print("Parallelizing on %s engines"%len(rc))    
    dview = rc[:]

    # Now you can do:
    # out = dview.apply_async(para_func, args)            
    return dview
Exemple #6
0
def main():
    """Sketch of a K-fold grid search distributed over an IPython cluster.

    ``X``, ``y`` and ``estimator`` are ``...`` placeholders that have to be
    filled in. One task is created per (grid cell, fold) pair and all tasks
    are mapped onto a load-balanced view.
    """
    X = ...
    y = ...
    estimator = ...  # concrete BaseGradientBoosting object
    K = 5

    param_grid = dict(
        n_estimators=[10000],
        min_samples_leaf=[7, 9, 13],
        max_depth=[4, 5, 6, 7],
        max_features=[100, 150, 250],
        learn_rate=[0.05, 0.02, 0.01],
    )

    grid = IterGrid(param_grid)
    # Count the grid cells by iterating the grid once.
    grid_size = sum(1 for _ in grid)
    print("_" * 80)
    print("GridSearch")
    print("grid size: %d" % grid_size)
    print("num tasks: %d" % (K * grid_size))

    cv = KFold(X.shape[0], K, shuffle=True, random_state=0)

    # instantiate the tasks - K times the number of grid cells
    # FIXME use generator to limit memory consumption or do fancy
    # indexing in _parallel_grid_search.
    tasks = []
    for i, params in enumerate(grid):
        for k, (train, test) in enumerate(cv):
            tasks.append((i, k, estimator, params,
                          X[train], y[train], X[test], y[test]))

    # distribute tasks on ipcluster
    lview = parallel.Client().load_balanced_view()
    results = lview.map(_parallel_grid_search, tasks)
Exemple #7
0
def reseed_project():
    """Run the reseeding pipeline driven by the parsed command line.

    Steps, in order: extract the project, featurize it, optionally apply
    tICA, cluster, and pull new seed frames. The extract, featurize and
    pull steps receive a blocking load-balanced view on the cluster.
    """
    args = parse_commandline()
    # Use full paths to avoid annoying issues with relative paths.
    args.d = os.path.abspath(args.d)
    args.t = (os.path.join(args.d, "topologies/") if args.t is None
              else os.path.abspath(args.t))

    rc = parallel.Client(profile=args.p)
    # Make the reseeder functions available on every engine.
    rc[:].execute("from fah_reseeder import *")
    print("Running on:", len(rc.ids))
    view = rc.load_balanced_view()
    view.block = True

    # extract
    extract_project_wrapper(args.d, args.t, view)

    # featurize
    feature_dict = featurize_project(args.d, args.t, args.f, args.s, view)

    # ticafy -- the explicit comparison is kept: only a value equal to True
    # enables this step.
    if args.i == True:
        feature_dict = tica_wrapper(args.d, feature_dict, args.l)

    # assignment
    cluster_mdl, assignments = cluster_project_wrapper(
        args.d, feature_dict, args.n)

    # cluster and pull frames
    pull_new_seeds(args.d, args.t, cluster_mdl, assignments,
                   args.r, args.c, args.s, view)
Exemple #8
0
    def run(self, loop, mapPlugin):
        """Map ``loop`` over the plugin's workload on an IPython cluster.

        ``loop`` is wrapped in ``LoopWrapper`` and applied synchronously to
        every item of ``mapPlugin.getWorkload()`` through a load-balanced
        view. Returns whatever ``map_sync`` returns.
        """
        from IPython import parallel

        client = parallel.Client()
        try:
            view = client.load_balanced_view()
            return view.map_sync(LoopWrapper(loop), mapPlugin.getWorkload())
        finally:
            # The synchronous map has finished (or failed) at this point, so
            # the client created for this call can be released.
            client.close()
Exemple #9
0
def async_avail():
    """Return True when a client for PARALLEL_PROFILE sees at least one engine.

    Any exception raised while creating the client or counting its engines
    results in False. The import itself is outside the ``try`` block, so a
    missing ``IPython.parallel`` still raises ImportError.
    """
    from IPython import parallel
    try:
        # The profile has to be passed by keyword: the first positional
        # parameter of Client is the connection file, not the profile name.
        client = parallel.Client(profile=PARALLEL_PROFILE)
        return len(client) > 0
    except Exception:
        # Also covers IOError, which the original handled separately with
        # the same result.
        return False
Exemple #10
0
def get_client():
    """Return a Client for PARALLEL_PROFILE, or None if it is not usable.

    None is returned when the client has no engines or when creating or
    querying it raises any exception.
    """
    from IPython import parallel
    try:
        rc = parallel.Client(profile=PARALLEL_PROFILE)
        if len(rc) > 0:
            return rc
        return None
    except Exception:
        # IOError is an Exception subclass, so one handler covers both.
        return None
Exemple #11
0
    def setUpClass(cls):
        """Install a mocked logger, then create the client and log watcher."""
        # Replace the module-level logger with a handler-less mock that has
        # the same spec as the real logger.
        fake_logger = mock.Mock(spec=ema_logging.get_logger())
        fake_logger.handlers = []
        ema_logging._logger = fake_logger

        cls.client = parallel.Client(profile='default')
        cls.url = 'tcp://{}:20202'.format(localhost())
        cls.watcher = ema.start_logwatcher(cls.url)
Exemple #12
0
def setup_client():
    """Return the shared Client, creating and initializing it on first use.

    On creation every engine changes into the directory of this module,
    which assumes that all nodes see a shared filesystem.
    """
    global _client
    if _client is not None:
        return _client

    _client = parallel.Client()
    here = os.path.dirname(os.path.abspath(__file__))

    def cd(path):
        # Imported inside the function because it runs on the engines.
        import os
        os.chdir(path)

    _client[:].apply_sync(cd, here)
    return _client
Exemple #13
0
def time_throughput(nmessages, t=0, f=wait):
    """Time how long it takes to submit and to complete ``nmessages`` tasks.

    Each task is ``f(t)`` submitted through a load-balanced view. When ``f``
    is ``echo``, ``t`` is replaced by a random array of ``t / 8`` elements
    (presumably so that ``t`` is the payload size in bytes -- confirm).

    Returns a tuple ``(submit_time, total_time)`` in seconds, both measured
    from the first submission.
    """
    rc = parallel.Client()
    lbview = rc.load_balanced_view()
    if f is echo:
        t = np.random.random(t / 8)
    # do one ping before starting timing
    lbview.apply_sync(echo, '')
    rc.spin()
    start = time.time()
    for _ in xrange(nmessages):
        lbview.apply(f, t)
    submitted = time.time()
    rc.wait()
    finished = time.time()
    return submitted - start, finished - start
Exemple #14
0
 def _wait_for_cluster(self, timeout):
     """Wait until the controller is reachable and all engines are online.

     ``timeout`` is the total budget in seconds for both phases. Returns
     True once the number of engine ids equals ``self.nengines``, and False
     if the controller could not be reached or the engines did not all
     register in time.
     """
     tic = time.time()
     rc = None
     # Wait to connect to the controller
     while time.time() - tic < timeout:
         try:
             rc = parallel.Client(profile=self.profile_name)
             break
         except IOError:
             time.sleep(2)
     if rc is None:
         # The controller never became reachable within the timeout.
         return False
     try:
         # wait for all engines to come online
         while time.time() - tic < timeout:
             if len(rc.ids) == self.nengines:
                 return True
             time.sleep(2)
         return False
     finally:
         # The client is only needed for polling; do not leak it.
         rc.close()
Exemple #15
0
 def wait_for_controller(self):
     """Loop until the controller is ready, for at most 30 seconds.

     Returns True as soon as a client for ``self.profile`` can be created,
     and False when the 30 seconds have elapsed. ValueError and IOError
     raised while creating the client are logged at debug level and the
     attempt is repeated after 2 seconds.
     """
     tic = time.time()
     while time.time() - tic <= 30:
         self.logger.debug('waiting for controller ' + str(time.time() - tic))
         try:
             rc = parallel.Client(profile=self.profile)
         except (ValueError, IOError) as e:
             self.logger.debug(e)
             time.sleep(2)
         else:
             # The client was only a probe; release it before reporting.
             rc.close()
             return True
     return False
Exemple #16
0
    def __init__(self, Y, in_parallel=False):
        """
        An illustration of quadrature for use with var_EP.

        Y is flattened and stored; when ``in_parallel`` is true a parallel
        client and a direct view on it are created as well.
        """
        Tilted.__init__(self, Y)
        # we're only doing 1D at the moment
        self.Y = Y.flatten()
        self.num_data = self.Y.size
        # hard coded right now. Incorporate into GPy when the code is ready.
        self.lik = student_t()
        self._has_params = True
        self.num_params = 2

        self.parallel = in_parallel
        if not self.parallel:
            return
        self.client = parallel.Client()
        self.dv = self.client.direct_view()
Exemple #17
0
    def __init__(self, dsetname='dataset'):
        """Connect to the cluster and set up a dataset and an engine remotely.

        dsetname : str
            Stored as ``self.dsetname`` and passed as ``_prefix`` to the
            ``dataset(...)`` call executed remotely. When ``pforest`` cannot
            be found locally it is also used as the name of the module from
            which ``dataset`` is imported on the engines.

        The statements executed remotely build on each other (imports, then
        ``dset``, then ``eng``), so their order matters.
        """
        n_proposal = 100
        self.dsetname = dsetname
        print("master>>init() dsetname: {}".format(dsetname))
        # create dview
        print("master>> create dview")
        # init cluster client
        self.clients = parallel.Client(packer='pickle')
        self.clients.block = True
        # Slice start 0: use master as engine (all engine ids are kept).
        # Slice start 1: do not use master as engine.
        self.dview = self.clients.direct_view(self.clients.ids[0:])
        self.dview.block = True
        # engine init: change the engines into this process's working directory
        self.dview.execute("""import os; os.chdir(r'%s')""" % os.getcwd())
        #self.eng=engine()
        print("master>> init engine")
        try:
            # Probe locally for the pforest package; find_module raises
            # ImportError when it is missing, which selects the local modules.
            __import__('imp').find_module('pforest')
            print "Found pforest"
            self.dview.execute('from pforest.dataset import dataset')
            self.dview.execute('from pforest.engine import engine')
        except ImportError:
            print "Not found pforest. Importing local modules"
            self.dview.execute('from %s import dataset' % (dsetname))
            self.dview.execute('from engine import engine')
        #self.dview.execute("reload(dataset)")
        # Create ``dset`` remotely once per item yielded by iterating the
        # client, passing the item's index and an equal integer share of
        # n_proposal.
        # NOTE(review): presumably each item is a view on one engine -- confirm.
        for i, dv in enumerate(self.clients):
            dv.execute('dset=dataset(%d,%d,_prefix="%s")'\
            %(i,n_proposal//len(self.clients.ids), dsetname ))

        self.dview.execute('eng=engine(dset)')
        # Collect ``dset.path`` from the engines.
        self.engines_path = self.dview.gather('dset.path')
        print "debug:master:__init__: %s" % self.engines_path

        # don't need to gather
        #        print("master>> gather engines")
        #        self.engs=self.dview.gather('eng')
        #        print("master>>engs:\n{}".format(self.engs))
        # init local variables
        print("master>> init local variables")
        self.minbagsize = 2
        self.maxdepth = 20
        #self.maxdepth=10
        self.queue = None
        self.root = None
        self.node = None
Exemple #18
0
 def test_get_result(self):
     """A task submitted by one client can be fetched from the Hub by another."""
     other = pmod.Client(profile='iptest')
     # self.add_engines(1)
     target = other.ids[-1]
     submitting_view = other[target]
     fetching_view = self.client[target]
     ar = submitting_view.apply_async(wait, 1)
     # give the monitor time to notice the message
     time.sleep(.25)
     msg_id = ar.msg_ids[0]
     # First fetch: the test expects an AsyncHubResult with the same value.
     ahr = fetching_view.get_result(msg_id)
     self.assertTrue(isinstance(ahr, AsyncHubResult))
     self.assertEqual(ahr.get(), ar.get())
     # Second fetch: the test expects it not to be an AsyncHubResult.
     ar2 = fetching_view.get_result(msg_id)
     self.assertFalse(isinstance(ar2, AsyncHubResult))
     other.spin()
     other.close()
Exemple #19
0
def main(nodes, edges):
    """Generate a random graph, submit jobs, then validate that the
    dependency order was enforced.
    Finally, plot the graph, with time on the x-axis, and
    in-degree on the y (just for spread).  All arrows must
    point at least slightly to the right if the graph is valid.

    Parameters
    ----------
    nodes, edges :
        Passed to ``random_dag`` to build the graph and reported in the
        "submitting" message.

    Returns
    -------
    tuple
        ``(G, results)``: the generated graph and the per-node results
        returned by ``submit_jobs``.
    """
    from matplotlib import pyplot as plt
    from matplotlib.dates import date2num
    from matplotlib.cm import gist_rainbow
    print("building DAG")
    G = random_dag(nodes, edges)
    jobs = {}
    pos = {}
    colors = {}
    for node in G:
        jobs[node] = randomwait

    client = parallel.Client()
    view = client.load_balanced_view()
    print("submitting %i tasks with %i dependencies" % (nodes, edges))
    results = submit_jobs(view, G, jobs)
    print("waiting for results")
    view.wait()
    print("done")
    for node in G:
        md = results[node].metadata
        start = date2num(md.started)
        runtime = date2num(md.completed) - start
        pos[node] = (start, runtime)
        colors[node] = md.engine_id
    validate_tree(G, results)
    # networkx names this keyword ``nodelist``. Nodes and colors are passed
    # as lists taken from the same dict so that they stay paired.
    nx.draw(G,
            pos,
            nodelist=list(colors.keys()),
            node_color=list(colors.values()),
            cmap=gist_rainbow,
            with_labels=False)
    x, y = zip(*pos.values())
    xmin, ymin = map(min, (x, y))
    xmax, ymax = map(max, (x, y))
    xscale = xmax - xmin
    yscale = ymax - ymin
    # Pad the axes by 10% of the data range on each side.
    plt.xlim(xmin - xscale * .1, xmax + xscale * .1)
    plt.ylim(ymin - yscale * .1, ymax + yscale * .1)
    return G, results
Exemple #20
0
 def wait_for_engines(self):
     """Loop until the engines have started, for at most 120 seconds.

     Every 2 seconds a client for ``self.profile`` is created and its number
     of engine ids is compared with ``len(self.engines)``. Returns True when
     they match and False when the 120 seconds have elapsed. ValueError and
     IOError raised while connecting are logged at debug level and the
     attempt is repeated.
     """
     tic = time.time()
     while time.time() - tic < 120:
         rc = None
         try:
             rc = parallel.Client(profile=self.profile)
             n_ready = len(rc.ids)
         except (ValueError, IOError) as e:
             self.logger.debug(e)
             time.sleep(2)
             continue
         finally:
             # A new client is created on every poll; close it so that the
             # loop does not accumulate open connections.
             if rc is not None:
                 rc.close()
         if n_ready == len(self.engines):
             self.logger.debug('Engines started ' + str(n_ready))
             return True
         self.logger.debug('waiting for engines ' + str(time.time() - tic) + ' ' + str(n_ready))
         time.sleep(2)
     return False
Exemple #21
0
    def run_commands(self, commands):
        """Map the commands to the execute_command function, in parallel.

        Progress is written to the debug log as each result is iterated.
        """
        log = self.logger
        log.debug('running')
        lview = parallel.Client(profile=self.profile).load_balanced_view()
        lview.retries = 10

        total = len(commands)
        log.debug(total)

        started = time.time()
        pending = lview.map(execute_command, commands)

        # Iterating the map result yields the task results in order.
        for done, result in enumerate(pending, 1):
            percent = 100 * (done / float(total))
            log.debug("task: %i finished on %s, %.3f percent finished at time %.3f " % (
                done - 1, result['host'], percent, time.time() - started))

        log.debug('done')
Exemple #22
0
    def __init__(self,
                 ensemble_path,
                 scorefn,
                 votefn,
                 random_seed=0,
                 client=None):
        """Record the ensemble configuration and start with an empty ensemble.

        ensemble_path = stored as ``self.ensemble_path``
        scorefn = function used to score a model (in greedy search)
            sig = scorefn(y, yhat) RETURNS score
        votefn = function used to combine different model outputs
            sig = votefn(yhats) RETURNS combined_yhat
        random_seed = stored as ``self.random_seed``
        client = an IPython.parallel.Client; if falsy (e.g. None), a new one
            is created
        """
        self.ensemble_path = ensemble_path
        self.scorefn = scorefn
        self.votefn = votefn
        self.random_seed = random_seed
        if client:
            self.client = client
        else:
            self.client = parallel.Client()
        self.ensemble_ = []
Exemple #23
0
    def add_images(self, image_urls, image_ids=None):
        """
        Add all images in a list of URLs.
        If ipcluster is running, load images in parallel; otherwise a
        warning is issued and the images are processed one by one.

        Parameters
        ----------
        image_urls : list
        image_ids : list, optional
            If given, images are stored with the given ids.
            If None, the index of the image in the dataset is its id.
        """
        collection.ensure_index('id')

        # Construct the arguments list due to IPython.parallel's pickling
        if image_ids is None:
            jobs = [(url, None, self.palette) for url in image_urls]
        else:
            jobs = [(url, _id, self.palette)
                    for url, _id in zip(image_urls, image_ids)]

        print("Loading images...")
        tt = TicToc()
        lview = None
        try:
            rc = parallel.Client()
            lview = rc.load_balanced_view()
        except Exception:
            # Best effort: any failure to reach a cluster falls back to the
            # serial path. ``Exception`` (instead of a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            warn(
                Warning("Launch an IPython cluster to parallelize \
                           ImageCollection loading."))

        if lview is not None:
            results = lview.map(process_image, jobs)
            results.wait_interactive()
        else:
            # Run the jobs eagerly: a bare ``map`` is lazy on Python 3 and
            # would never call process_image.
            for job in jobs:
                process_image(job)

        collection.ensure_index('id')
        print("Finished inserting {} images in {:.3f} s".format(
            len(image_urls), tt.qtoc()))
Exemple #24
0
def start_validation(setup_code):
    """
    Perform the validation with IPython parallel processing.

    The setup file is run on all engines. It is expected to define the
    variables ``jobs`` and ``save_path``, which are read back from the first
    engine. If either is missing a message is printed and no job is run.

    Parameters
    ----------
    setup_code : string
        Path to .py file containing the setup for the validation.
    """
    c = parallel.Client()
    dv = c[:]
    lview = c.load_balanced_view()

    dv.run(setup_code, block=True)

    jobs = None
    try:
        jobs = dv['jobs'][0]
    except parallel.CompositeError:
        print("Variable 'jobs' is not defined!")

    save_path = None
    try:
        save_path = dv['save_path'][0]
    except parallel.CompositeError:
        print("Variable 'save_path' is not defined!")

    if (jobs is not None) and (save_path is not None):
        # Count the jobs only after the None check: ``len(None)`` would
        # raise TypeError when the setup did not define ``jobs``.
        to_write = len(jobs)
        with lview.temp_flags(retries=2):
            amr = lview.map_async(func, jobs)
            results = zip(amr, jobs)
            for result, job in results:
                netcdf_results_manager(result, save_path)
                to_write -= 1
                print('job = ' + str(job), 'remaining jobs = ' + str(to_write))

    c[:].clear()
Exemple #25
0
from IPython import parallel
from datetime import datetime
from DataMining.code.com import log,parallels
import os

# Create the cluster client with default arguments.
rc= parallel.Client()

# Load-balanced view on that client.
lview = rc.load_balanced_view()

# Calls made through this view block by default.
lview.block = True

# Project helper, imported here in the original script rather than at the top.
from DataMining.code.com.BigData import BigData
# Input files, obtained from the project helper for ./DataMining/data/.
input_files = BigData.GetInputFiles('./DataMining/data/')


@lview.parallel()
def processFile(filep):
        from DataMining.code.com import log, parallels
        import os
        from ujson import loads, dumps
        import gzip
        
        outfilep = './DataMining/uncompressed/sel_cities/'+ os.path.basename(filep) + '.json'
        f = gzip.open(filep)
        logger = log.logger('Parallel/'+os.path.basename(filep))
        logger.log( 'finding all records with location for: ' + f.name)
        locs = {}
        tot_lines =0
        loc_lines =0
        line = f.readline()
        while line:
Exemple #26
0
    def optimize(self,
                 method,
                 quantiles=(.1, .3, .5, .7, .9),
                 n_runs=3,
                 n_bootstraps=0,
                 parallel_profile=None):
        """
        Optimize model using ML, chi^2 or G^2.

        :Input:
            method : str
                Optimization method ('ML', 'chisquare' or 'gsquare').

            quantiles : tuple
                A sequence of quantiles to be used for chi^2 and G^2.
                Default values are the ones used by Ratcliff (.1, .3, .5, .7, .9).

            n_runs : int <default=3>
                Number of attempts to optimize.

            n_bootstraps : int <default=0>
                Number of bootstrap iterations.

            parallel_profile : str <default=None>
                IPython profile for parallelization. When None, the
                bootstrap iterations run serially in this process.

        :Output:
            results <dict> - a results dictionary of the parameters values.

        :Note:
            The values of the nodes in single subject model is updated according to the results.
            The nodes of group models are not updated.
            When n_bootstraps > 0, summary statistics of the bootstrap runs
            (``describe()`` plus the 2.5% and 97.5% quantiles) are stored in
            ``self.bootstrap_stats``.
        """

        results = self._run_optimization(method=method,
                                         quantiles=quantiles,
                                         n_runs=n_runs)

        # bootstrap if requested
        if n_bootstraps == 0:
            return results

        # init DataFrame to save results: one row per bootstrap iteration,
        # one column per entry of self.values
        res = pd.DataFrame(np.zeros((n_bootstraps, len(self.values))),
                           columns=list(self.values.keys()))

        # prepare view for parallelization
        if parallel_profile is not None:  # create view
            client = parallel.Client(profile=parallel_profile)
            view = client.load_balanced_view()
            runs_list = [None] * n_bootstraps
        else:
            view = None

        # define single iteration bootstrap function; everything it needs is
        # bound through default arguments rather than through ``self``
        def single_bootstrap(data,
                             accumulator_class=self.__class__,
                             class_kwargs=self._kwargs,
                             method=method,
                             quantiles=quantiles,
                             n_runs=n_runs):

            # resample data with replacement and renumber the rows
            new_data = data.iloc[np.random.randint(0, len(data), len(data))]
            new_data = new_data.set_index(pd.Index(list(range(len(data)))))
            h = accumulator_class(new_data, **class_kwargs)

            # run optimization
            h._run_optimization(method=method,
                                quantiles=quantiles,
                                n_runs=n_runs)

            # ``np.float`` was an alias of the builtin ``float`` and has been
            # removed from NumPy; the builtin gives the same dtype.
            return pd.Series(h.values, dtype=float)

        # bootstrap iterations
        for i_strap in range(n_bootstraps):
            if view is None:
                res.iloc[i_strap] = single_bootstrap(self.data)
            else:
                # append to job queue
                runs_list[i_strap] = view.apply_async(single_bootstrap,
                                                      self.data)

        # get parallel results
        if view is not None:
            view.wait(runs_list)
            for i_strap in range(n_bootstraps):
                res.iloc[i_strap] = runs_list[i_strap].get()

        # get statistics
        stats = res.describe()
        # Build each quantile row from a {label: Series} dict so the label is
        # applied regardless of the Series' own name, and combine with
        # ``pd.concat`` because ``DataFrame.append`` has been removed.
        quantile_rows = [
            pd.DataFrame({repr(q) + '%': res.quantile(q / 100.)}).T
            for q in [2.5, 97.5]
        ]
        stats = pd.concat([stats] + quantile_rows)

        self.bootstrap_stats = stats.sort_index()
        return results
Exemple #27
0
from IPython import parallel
with drctview.sync_imports():
   import numpy
clients = parallel.Client(profile=’testprofile’)
drctview = clients[:]
drctview.activate()
drctview.block=True
%px dummymatrix = numpy.random.rand(4,4)
%px eigenvalue = numpy.linalg.eigvals(dummymatrix)
drctview['eigenvalue']

%pxconfig --noblock
%autopx
maximum_egnvals = []
for idx in range(50):
    arr = numpy.random.rand(10,10)
    egnvals = numpy.linalg.eigvals(arr)
    maximum_egnvals.append(egnvals[0].real)
%autopx
%pxconfig --block 
%px answer= "The average maximum eigenvalue is: %f"%(sum(maximum_egnvals)/len(maximum_egnvals))
dv['answer']

%%px --block --group-outputs=engine
import numpy as np
arr = np.random.random (4,4)
egnvals = numpy.linalg.eigvals(arr)
print egnvals
egnvals.max()
egnvals.min()
Exemple #28
0
 def _get_engines(self):
   """Return the view ``[:]`` over all engines of a newly created Client."""
   return parallel.Client()[:]
Exemple #29
0
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import netCDF4
import numpy.ma as ma
from pylab import *
sys.path.append('/noc/users/hb1g13/Python/python_functions/')
import SG as SG
import layers_calc_numba
sys.path.append('/noc/users/hb1g13/Python/python_functions/MITgcmUtils/')
import utils

# Set up processors
# The imports at the top of this script do not bring ``parallel`` into scope,
# so import it here before it is used.
from IPython import parallel

# Connect through an explicit connection file instead of a profile name.
rc = parallel.Client(
    '/noc/users/hb1g13/.ipython/profile_maelstrom/security/ipcontroller-client.json'
)
dv = rc[:]
rc.ids

# Now each processor needs to know where my modules are:
dv.execute('import sys')
dv.execute('sys.path.append("/noc/users/hb1g13/Python/python_functions/")')
dv.execute('import layers_calc_numba')
dv.execute(
    'sys.path.append("/noc/users/hb1g13/Python/python_functions/MITgcmUtils/")'
)
dv.execute('import utils')

# Some parameters to ensure the right files are picked up:
Full = 'N'  # 9 panels isn't ideal for presentations; the N option gives 4 plots
Exemple #30
0
# coding: utf-8

# In[87]:

from IPython import parallel

# Create a client for the 'sge' profile, passing an SSH server to connect through.
c = parallel.Client(profile='sge', sshserver='[email protected]')
# View over all engines.
view = c[:]
# Bare expression left over from the notebook cell.
c.ids

# In[88]:

# Run a print statement through the `px` magic.
get_ipython().magic(u"px print('Hello World!')")

# In[89]:

# `px` cell magic: the cell body prints the process id and the host name.
get_ipython().run_cell_magic(
    u'px', u'',
    u'import os\nimport socket\nprint os.getpid()\nprint socket.gethostname()')

# In[90]:

# A is assigned here in the local namespace; the `px` line below assigns a
# different value to the name A through the magic.
A = 'Shared var '
get_ipython().magic(u"px A = 'My var'")

# In[91]:


def myfunc(x):
    import os
    import socket