Example 1
def cloud_pooler(func, in_q, chunk_size=1000, _env='python-web', _type='c2', _max_runtime=60, get_results=True, **kwargs):
	import cloud
	if chunk_size > 1:
		if isinstance(in_q, collections.Iterable):
			in_q = WebQueue(in_q)
		chunks = []
		chunk = []
		while not in_q.empty():
			chunk.append(in_q.get())
			if len(chunk) == chunk_size:
				chunks.append(chunk)
				chunk = []
		if len(chunk):
			chunks.append(chunk)
	else:
		chunks = in_q

	partial_func = functools.partial(func, **kwargs)
	jids = cloud.map(partial_func, chunks, _env=_env, _type=_type, _max_runtime=_max_runtime)

	if get_results:
		print jids
		for result in cloud.iresult(jids, ignore_errors=True):
			if result:
				yield result
	else:
		for jid in jids:
			yield jid
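cloud_pooler is a generator, so nothing is submitted until it is iterated. A minimal usage sketch, assuming the surrounding module's WebQueue helper and cloud credentials are available; the square_chunk worker and the input range are hypothetical, not from the original snippet:

# Hypothetical worker: with chunk_size > 1 each mapped argument is a list of items.
def square_chunk(chunk):
    return [x * x for x in chunk]

for chunk_result in cloud_pooler(square_chunk, range(10000), chunk_size=500):
    print chunk_result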
Example 2
def picloud():
    t1 = time.time()
    jids = cloud.map(testfunc, np.arange(K), _type="f2", _vol="my-vol", _env="base/precise")
    # get the results
    cloud.result(jids)
    t2 = time.time()
    return t2 - t1
Example 3
def start_inference(infilename, outfilename):

    ITERS = SAMPLER_ITERS

    indata = pickle.load(open(infilename, 'r'))
    filenames = indata['filenames']
    data_filename = filenames['data']
    latent_filenames = []
    config_filenames = []
    for chain, v in filenames['chains'].iteritems():
        latent_filenames.append(v['latent'])
        config_filenames.append(v['config'])

    jids = cloud.map(inference_run_ld,
                     latent_filenames, [data_filename] * CHAINS_TO_RUN,
                     config_filenames, [ITERS] * CHAINS_TO_RUN,
                     range(CHAINS_TO_RUN),
                     _env='connectivitymotif',
                     _type='f2')

    # fixme save all inputs
    pickle.dump(
        {
            'infile': indata['infile'],
            'filenames': filenames,
            'jids': jids
        }, open(outfilename, 'w'))
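The snippet only launches the chains and pickles their job ids; pulling the samples back is a separate step. A hypothetical companion function (not part of the original) that reloads the saved ids and blocks until every chain finishes:

import pickle
import cloud

def collect_inference(waitfilename, outfilename):
    waitdata = pickle.load(open(waitfilename, 'r'))
    jids = waitdata['jids']
    cloud.join(jids)                 # block until all chains are done
    results = cloud.result(jids)     # one result per inference_run_ld call
    pickle.dump({'waitfile': waitfilename, 'results': results},
                open(outfilename, 'w'))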
Example 4
def timing_triple_map_cloud():
    execfile('picloud_venture_credentials.py')
    exp_params = experiment.exp_param_defaults({})
    exp_params['intermediate_iter'] = 1
    exp_params['max_initial_run_time'] = 30
    exp_params['max_burn_time'] = 30
    exp_params['max_sample_time'] = 30
    exp_params['n_samples'] = 25
    print experiment.exp_params_to_str(exp_params)
    
    data = scipy.io.loadmat("../data/irm_synth/irm_synth_20.mat", squeeze_me=True)
    observed = list(zip(data['train_i'].flat, data['train_j'].flat, data['train_v'].flat))
    missing  = list(zip(data['test_i'].flat,  data['test_j'].flat,  data['test_v'].flat))
    data = {'observations' : observed, 'missing' : missing}
    
    model = models.product_IRM
    model_params = {'D' : 1, 'alpha' : 1, 'symmetric' : True}
    
    # Timing run
    print 'Timing'
    print 'Mapping'
    job_id = cloud.call(experiment.network_cv_timing_run, data, model, exp_params, model_params, _max_runtime=5, _env=cloud_environment)
    print 'Waiting'
    time_per_mh_iter = cloud.result(job_id)['time_per_mh_iter']
    
    # Live run
    print 'Live'
    print 'Mapping'
    exp_params['intermediate_iter'] = max(1, int(round(0.9 * exp_params['max_sample_time'] / (exp_params['n_samples'] * time_per_mh_iter))))
    job_ids = cloud.map(experiment.network_cv_single_run, itertools.repeat(data, 5), itertools.repeat(model, 5), itertools.repeat(exp_params, 5), itertools.repeat(model_params, 5), _max_runtime=5, _env=cloud_environment)
    print 'Waiting'
    cloud.join(job_ids)
    print cloud.result(job_ids)
Example 5
def multimap(fun, args, naccounts=None):
    if naccounts is None:
        naccounts = len(api_keys)

    max_parallel = sum(array(parallelism[:naccounts]))
    if len(args) <= max_parallel:
        naccounts = nonzero(cumsum(parallelism) - len(args) >= 0)[0][0] + 1
        size = parallelism[:naccounts]
    else:
        size = [len(args) / naccounts for _ in xrange(naccounts)]
    if naccounts > 1:
        size[-1] = len(args) - sum(array(size[:-1]))
    else:
        size[0] = len(args)

    # jids[i] contains the job indices for account i
    jids = [None for _ in xrange(naccounts)]

    # Launches the jobs
    k = 0
    for i in xrange(naccounts):
        api_key = api_keys[i]
        api_secretkey = api_secretkeys[i]
        n = size[i]

        args_tmp = args[k:k + n]
        if len(args_tmp) > 0:
            print "Launching %d jobs with account %d..." % (len(args_tmp),
                                                            i + 1)
            cloud.setkey(api_key=api_key, api_secretkey=api_secretkey)
            jids[i] = cloud.map(fun, args_tmp, _high_cpu=True)
            print "    Jobs:", jids[i]
            k += n
    return jids
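multimap only submits the jobs; fetching their results has to switch API keys per account again. A sketch of a matching collector (not part of the original) that relies on the same module-level api_keys and api_secretkeys lists:

def multiresult(jids):
    results = []
    for i in xrange(len(jids)):
        if jids[i] is None:
            continue  # no jobs were launched with this account
        cloud.setkey(api_key=api_keys[i], api_secretkey=api_secretkeys[i])
        results.extend(cloud.result(jids[i]))
    return results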
Example 6
def multimap(fun, args, naccounts=None):
    if naccounts is None:
        naccounts = len(api_keys)

    max_parallel = sum(array(parallelism[:naccounts]))
    if len(args) <= max_parallel:
        naccounts = nonzero(cumsum(parallelism) - len(args) >= 0)[0][0] + 1
        size = parallelism[:naccounts]
    else:
        size = [len(args) / naccounts for _ in xrange(naccounts)]
    if naccounts > 1:
        size[-1] = len(args) - sum(array(size[:-1]))
    else:
        size[0] = len(args)

    # jids[i] contains the job indices for account i
    jids = [None for _ in xrange(naccounts)]

    # Launches the jobs
    k = 0
    for i in xrange(naccounts):
        api_key = api_keys[i]
        api_secretkey = api_secretkeys[i]
        n = size[i]

        args_tmp = args[k:k + n]
        if len(args_tmp) > 0:
            print "Launching %d jobs with account %d..." % (len(args_tmp), i + 1)
            cloud.setkey(api_key=api_key, api_secretkey=api_secretkey)
            jids[i] = cloud.map(fun, args_tmp, _high_cpu=True)
            print "    Jobs:", jids[i]
            k += n
    return jids
Example 7
def cloudHandEVByCount(players=6, TOTAL_SHOES=100000000, CHUNKING=100000):
    '''
    Accumulate the calcHandEVByCount of __many__ games using pycloud.
    '''
    CHUNKS = TOTAL_SHOES / CHUNKING
    totEV = {}
    totEVName = "HandEVByCount-%d-p%d.p" % (TOTAL_SHOES, players)
    
    # benchmarking
    start = time.clock()
    
    cloudIds = cloud.map(workerHandEVByCount, (players,)*CHUNKS, (CHUNKING,)*CHUNKS)
    
    for ev in cloud.iresult(cloudIds, ignore_errors=True):
        if not isinstance(ev, dict):
            print "error"
            print ev
        else:
            for (key, (gResult, gTotHands)) in ev.iteritems():
                result, totHands = totEV.get(key, (0,0))
                result += gResult
                totHands += gTotHands
                totEV[key] = (result, totHands)

    # benchmarking
    end = time.clock()
    print end-start
    
    totEVFile = open(totEVName, 'w')
    pickle.dump(totEV, totEVFile)
    totEVFile.close()
    
    return totEV
Example 8
def start_inference(infilename, outfilename):

    ITERS = SAMPLER_ITERS

    indata = pickle.load(open(infilename, 'r'))
    filenames = indata['filenames']
    data_filename = filenames['data']
    latent_filenames = []
    config_filenames = []
    for chain, v in filenames['chains'].iteritems():
        latent_filenames.append(v['latent'])
        config_filenames.append(v['config'])
    
    jids = cloud.map(inference_run_ld, latent_filenames,
                     [data_filename]*CHAINS_TO_RUN, 
                     config_filenames,
                     [ITERS] * CHAINS_TO_RUN, 
                     range(CHAINS_TO_RUN), 
                     _env='connectivitymotif', 
                     _type='f2')

    # fixme save all inputs
    pickle.dump({'infile' : indata['infile'],
                 'filenames' : filenames,
                 'jids' : jids}, 
                open(outfilename, 'w'))
Example 9
def main(n_rbms=5,
         save_folder='../data/mnist/many-rbm-samples/default',
         cloud_simulation=True):
    execfile('picloud_misc_credentials.py')
    if cloud_simulation:
        cloud.start_simulator()

    if not os.path.isdir(save_folder):
        os.makedirs(save_folder)

    seeds = [np.random.randint(2**31) for dummy in range(n_rbms)]
    print 'Sending jobs'
    job_ids = cloud.map(train_and_sample, seeds, _type='f2', _cores=1)
    print 'Jobs sent'
    images = np.zeros((0, 28 * 28))
    labels = np.zeros((0, 1))
    count = 1
    for (some_images, some_labels) in cloud.iresult(job_ids):
        print 'Job %d of %d complete' % (count, n_rbms)
        count += 1
        images = np.vstack((images, some_images))
        labels = np.vstack((labels, some_labels))
        np.savetxt(os.path.join(save_folder, 'images.csv'),
                   images,
                   delimiter=',')
        np.savetxt(os.path.join(save_folder, 'labels.csv'),
                   labels,
                   delimiter=',')
    return (images, labels)
Example 10
def main(n_rbms=5, save_folder='../data/mnist/many-rbm-samples/default', cloud_simulation=True):
    execfile('picloud_misc_credentials.py')
    if cloud_simulation:
        cloud.start_simulator()

    #n_rbms = 4
    #save_folder = 'picloud_test'
    if not os.path.isdir(save_folder):
        os.makedirs(save_folder)

    seeds = [np.random.randint(2**31) for dummy in range(n_rbms)]
    print 'Sending jobs'
    job_ids = cloud.map(train_and_sample, seeds, _type='f2', _cores=1)
    print 'Jobs sent'
    images = np.zeros((0,28*28))
    labels = np.zeros((0,1))
    count = 1
    for (some_images, some_labels) in cloud.iresult(job_ids):
        print 'Job %d of %d complete' % (count, n_rbms)
        count += 1
        images = np.vstack((images, some_images))
        labels = np.vstack((labels, some_labels))
        np.savetxt(os.path.join(save_folder, 'images.csv'), images, delimiter=',')
        np.savetxt(os.path.join(save_folder, 'labels.csv'), labels, delimiter=',')
    return (images, labels)
Example 11
    def run_jobs(self, f, jobs):
        if self.usecloud:
            jids = cloud.map(f, jobs, _env=self.cloud_env, _profile=True, _depends_on=self.preprocess_job)
            ires = cloud.iresult(jids)
        else:
            pool = ThreadPool(processes=cv2.getNumberOfCPUs())
            ires = pool.imap_unordered(f, jobs)
        return ires
Example 12
def outer_map(y):
    jids=cloud.map(inner_map,range(y))
    
    cloud.join(jids)
    
    results = cloud.result(jids)
    
    return list(results)
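inner_map is not shown; a trivial hypothetical stand-in makes the nesting concrete. outer_map can itself be submitted with cloud.call, so the inner cloud.map is issued from inside a PiCloud job:

def inner_map(x):
    return x * x

outer_jid = cloud.call(outer_map, 10)
print cloud.result(outer_jid)   # [0, 1, 4, ..., 81]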
Example 13
def run_experiment(experiment, reset_database=False):
    """Create and run all jobs for experiment."""
    url = sql.get_database_url()
    if reset_database:
        print "Resetting database..."
        sql.reset_database(experiment.SQLBase, url)
    print "Creating jobs..."
    jobs = experiment.create_jobs(num_jobs=500)
    session = sql.get_session(url)
    session.add_all(jobs)
    session.commit()
    job_ids = [job.id for job in jobs]
    session.close()
    print "Running jobs..."
    run = lambda job_id: run_job(experiment, job_id, url)
    cloud.map(run, job_ids, _type="f2")
    print "Done!"
Example 14
def traintst(Gs,ls,L):
    m = np.shape(Gs)[0]
    traintest = []
    for i in range(10):
        sl = slice(i*m//10,(i+1)*m//10)
        traintest.append( (np.delete(Gs,sl,0),np.delete(ls,sl,0),Gs[sl,:],ls[sl,:],L) )
    jids = cloud.map(crossvalidate,traintest,_type='c1') 
    return cloud.result(jids)    
Example 15
def outer_map(y):
    jids = cloud.map(inner_map, range(y))

    cloud.join(jids)

    results = cloud.result(jids)

    return list(results)
Example 16
def run_experiments(experiment_args):
    if args.picloud:
        import cloud
        jids = cloud.map(run_single_experiment_wrapper, experiment_args, _env="test", _type="c2")
        print "Now waiting for results..."
        results = cloud.result(jids)
        return zip(experiment_args, results)
    else:
        return zip(experiment_args, [run_single_experiment(**a) for a in experiment_args])
Example 17
def traintst(Gs, ls, L):
    m = np.shape(Gs)[0]
    traintest = []
    for i in range(10):
        sl = slice(i * m // 10, (i + 1) * m // 10)
        traintest.append((np.delete(Gs, sl,
                                    0), np.delete(ls, sl,
                                                  0), Gs[sl, :], ls[sl, :], L))
    jids = cloud.map(crossvalidate, traintest, _type='c1')
    return cloud.result(jids)
Example 18
def picloud():
    t1 = time.time()
    jids = cloud.map(testfunc,
                     np.arange(K),
                     _type='f2',
                     _vol="my-vol",
                     _env="base/precise")
    # get the results
    cloud.result(jids)
    t2 = time.time()
    return t2 - t1
Example 19
def main():
    jobs = []
    for seed in [0, 1, 2, 3, 4]:
        for learner_class_index in [0, 1]:
            jobs.append((learner_class_index, seed))
    print "Scheduling jobs..."
    jids = cloud.map(run, jobs, _type="f2")
    print "Waiting for results..."
    results = cloud.result(jids)
    for job, result in zip(jobs, results):
        print job, result
Example 20
def simulate_all_worlds(scenario, K):
    '''
    Simulate a given parameter scenario K times.
    '''
    print 'Simulating %d worlds.' % K
    # create K copies of the scenario
    scenarios = []
    for k in xrange(K):
        scenarios.append(copy.copy(scenario))
    jobs = cloud.map(get_stats,scenarios,_label='%d worlds' % K,_type='c2')
    return jobs
Example 21
def hessian_on_the_cloud(name, args, chunk_size=500):
    print("submitting")
    dims = [ A.size for A in args ] + [len(args), len(args)]
    H = np.empty(dims, 'double')
    ws_product_chunked = grouper(itertools.product(*args), chunk_size)
    jids = cloud.map(lambda ws: _hessian_on_the_cloud(name, ws), ws_product_chunked)
    print("waiting")
    chunked_results = cloud.result(jids)
    print("assembling")
    H.flat = list(itertools.chain.from_iterable(chunked_results))
    return H
Example 22
def simulate_all_worlds(scenario, K):
    '''
    Simulate a given parameter scenario K times.
    '''
    print 'Simulating %d worlds.' % K
    # create K copies of the scenario
    scenarios = []
    for k in xrange(K):
        scenarios.append(copy.copy(scenario))
    jobs = cloud.map(get_stats, scenarios, _label='%d worlds' % K, _type='c2')
    return jobs
Example 23
    def run_jobs(self, f, jobs):
        if self.usecloud:
            jids = cloud.map(f,
                             jobs,
                             _env=self.cloud_env,
                             _profile=True,
                             _depends_on=self.preprocess_job)
            ires = cloud.iresult(jids)
        else:
            pool = ThreadPool(processes=cv2.getNumberOfCPUs())
            ires = pool.imap_unordered(f, jobs)
        return ires
Example 24
def process_s3_files(input_bucket_name,
                     key_glob='*',
                     output_bucket_name_1ms=None,
                     output_bucket_name_100ms=None,
                     overwrite=False,
                     use_cloud=True):

    if output_bucket_name_1ms is None:
        output_bucket_name_1ms = input_bucket_name + "-hdf-1ms"

    if output_bucket_name_100ms is None:
        output_bucket_name_100ms = input_bucket_name + "-hdf"

    s3_cxn = get_s3_cxn()
    in_bucket = s3_cxn.get_bucket(input_bucket_name)

    # create output buckets if they don't already exist
    # it's better to do this before launching remote computations
    s3_cxn.create_bucket(output_bucket_name_1ms)
    s3_cxn.create_bucket(output_bucket_name_100ms)

    matching_keys = []
    for k in in_bucket:
        if fnmatch.fnmatch(k.name, key_glob):
            matching_keys.append(k.name)

    if use_cloud:
        print "Launching %d jobs" % len(matching_keys)

        def do_work(key_name):
            return process_s3_file(input_bucket_name, key_name,
                                   output_bucket_name_1ms,
                                   output_bucket_name_100ms, overwrite)

        jids = cloud.map(do_work,
                         matching_keys,
                         _type='f2',
                         _label='generate HDF')

        progress = progressbar.ProgressBar(len(jids)).start()
        n_finished = 0
        for _ in cloud.iresult(jids):
            n_finished += 1
            progress.update(n_finished)
        progress.finish()
    else:
        print "Running locally..."
        print "%d keys match the pattern \'%s\'" % (len(matching_keys),
                                                    key_glob)
        for key in matching_keys:
            process_s3_file(input_bucket_name, key, output_bucket_name_1ms,
                            output_bucket_name_100ms)
    print "Done!"
Example 25
    def create_alternative_worlds(self):
        ''' 
        Simulate each of the economies. 
        '''
        T = self.T
        I = self.I

        print 'Simulating a world with %d countries.' % I

        # these are the individual simulation jobs
        jobs = cloud.map(simulate_an_economy,self.economies,[T]*I,_label='A world with %d countries.' % I,_type='c2')
        # this is a harvester job, collecting the results of each individual simulation
        return cloud.call(GDP_collector, jobs, T, T1, T2,_depends_on=jobs,_label='Collecting GDP results.')
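GDP_collector is not shown. The pattern is a fan-in job: _depends_on=jobs delays it until every simulate_an_economy job has finished, after which it can pull their results with cloud.result from inside the cloud. A hypothetical stand-in with the same signature, purely to illustrate the shape of such a collector:

def GDP_collector(jobs, T, T1, T2):
    # runs on PiCloud only after all simulation jobs are done
    gdp_series = cloud.result(jobs)   # one series per simulated economy
    return {'T': T, 'window': (T1, T2), 'gdp': gdp_series}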
Example 26
def distrubutedly_train_terms(train_terms, terms, chunk_size=1000, use_cloud=False):
    logging.info(u'Starting distributed training <{0}>'.format(['locally','remotely'][int(use_cloud)]))
    if use_cloud:
        import cloud
        job_ids = cloud.map(train_terms, terms)
        term_iters = cloud.result(job_ids)
    else:
        import collections
        chunks = collections.defaultdict(list)
        chunk_count = len(terms) / chunk_size + 1
        for term in terms: chunks[hash(term) % chunk_count].append(term)
        term_iters = map(train_terms, chunks.values())
    return itertools.chain.from_iterable(term_iters)
Example 27
def Jgrad_picloud(params, encoding_size, decoded_size, training, wd, num_cores):
    W_e, W_d, b_e, b_d = unroll_params(params, encoding_size, decoded_size)

    gradW_e = np.zeros(W_e.shape)
    gradW_d = np.zeros(W_d.shape)
    gradb_e = np.zeros(b_e.shape)
    gradb_d = np.zeros(b_d.shape)

    # split the training set into batches, send out to picloud cores for backprop
    # offset = num_cores - len(training)%num_cores
    # for index in range(offset):
    # training.
    split = len(training) / num_cores

    final_training = []

    for i in range(num_cores):
        final_training.append(training[i * split : (i + 1) * split])

    offset = len(training) % num_cores

    if offset > 0:
        final_training.append(training[len(training) - offset :])

    jids = cloud.map(
        Jgrad_picloud_sub,
        [params] * num_cores,
        [encoding_size] * num_cores,
        [decoded_size] * num_cores,
        final_training,
        _type="c2",
    )

    # call for results
    results = cloud.result(jids)
    for result in results:
        gradW_e += result[0]
        gradW_d += result[1]
        gradb_e += result[2]
        gradb_d += result[3]

    # add weight decay factor and normalization coefficient
    a = 1.0 / len(training)
    grad_J_W_e = a * gradW_e + wd * W_e
    grad_J_W_d = a * gradW_d + wd * W_d
    grad_J_b_e = a * gradb_e
    grad_J_b_d = a * gradb_d

    # roll up and return as 1-d array
    return np.concatenate((grad_J_W_e.flatten(), grad_J_W_d.flatten(), grad_J_b_e.flatten(), grad_J_b_d.flatten()))
Example 28
def main():
    names = list(uai_net.names(500))
    print "Submitting {} jobs...".format(len(names))
    jids = cloud.map(run_job, names, _type="f2")
    print "Waiting for results..."
    for name, marginals_string, is_exact in cloud.result(jids):
        print name
        exact = "true" if is_exact else "approx"
        f = open(
            os.path.join(os.path.dirname(__file__),
                         "../data/marginals/uai/{}.{}.mar".format(name,
                                                                  exact)), "w")
        f.write(marginals_string)
        f.close()
Example 29
def calc_pi():
    num_jobs = 8
    tests_per_call = total_tests/num_jobs

    # argument list has 8 duplicate elements
    jids = cloud.map(monte_carlo, [tests_per_call]*num_jobs, _type='c2')

    # get list of all counts
    num_in_circle_list = cloud.result(jids)

    # sum all counts
    num_in_circle = sum(num_in_circle_list)

    pi = (4 * num_in_circle) / float(total_tests)
    return pi
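monte_carlo is not part of the snippet; a plausible stand-in (hypothetical) counts how many uniform random points in the unit square land inside the quarter circle, which is exactly what the 4 * num_in_circle / total_tests estimate assumes:

import random

def monte_carlo(num_tests):
    in_circle = 0
    for _ in xrange(num_tests):
        x, y = random.random(), random.random()
        if x * x + y * y <= 1.0:
            in_circle += 1
    return in_circle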
Example 30
def calc_pi():
    num_jobs = 8
    tests_per_call = total_tests / num_jobs

    # argument list has 8 duplicate elements
    jids = cloud.map(monte_carlo, [tests_per_call] * num_jobs, _type='c2')

    # get list of all counts
    num_in_circle_list = cloud.result(jids)

    # sum all counts
    num_in_circle = sum(num_in_circle_list)

    pi = (4 * num_in_circle) / float(total_tests)
    return pi
Example 31
def test(batch, remote, debug, dependency = []):
    params = cache.get("batch/%s/params" % batch, remote)
    numEpisodes = params['episodes']['num']
    
    i_ = range(numEpisodes)
    f = lambda i : tester.test(batch, params, i, remote, debug)
    
    logging.info("running %s test instances" % len(i_))
    if (remote):
        k_ = cloud.map(f, i_, _label = "%s/test" % batch, _depends_on = dependency, _type = 'c1', _max_runtime = 30)
        logging.info("k_ %s" % k_)
        return k_
    else:
        results = map(f, i_)
        return results
Example 32
def process_s3_files(input_bucket_name, key_glob = '*',
      output_bucket_name_1ms = None, 
      output_bucket_name_100ms = None, 
      overwrite = False, 
      use_cloud = True):
      
  if output_bucket_name_1ms is None:
    output_bucket_name_1ms = input_bucket_name + "-hdf-1ms"
  
  if output_bucket_name_100ms is None:
    output_bucket_name_100ms = input_bucket_name + "-hdf" 
  
  s3_cxn = get_s3_cxn()    
  in_bucket = s3_cxn.get_bucket(input_bucket_name)
  
  # create output buckets if they don't already exist
  # it's better to do this before launching remote computations 
  s3_cxn.create_bucket(output_bucket_name_1ms)
  s3_cxn.create_bucket(output_bucket_name_100ms)
  
  matching_keys = []
  for k in in_bucket:
    if fnmatch.fnmatch(k.name, key_glob):
      matching_keys.append(k.name)
     
  if use_cloud:
    print "Launching %d jobs" % len(matching_keys)
    def do_work(key_name):
      return process_s3_file(
        input_bucket_name, 
        key_name, 
        output_bucket_name_1ms, 
        output_bucket_name_100ms, 
        overwrite)
    jids = cloud.map(do_work, matching_keys, _type = 'f2', _label='generate HDF')
    
    progress = progressbar.ProgressBar(len(jids)).start()
    n_finished = 0
    for _ in cloud.iresult(jids):
      n_finished += 1
      progress.update(n_finished)
    progress.finish()
  else:
    print "Running locally..."
    print "%d keys match the pattern \'%s\'" % (len(matching_keys), key_glob)
    for key in matching_keys:
      process_s3_file(input_bucket_name, key, output_bucket_name_1ms, output_bucket_name_100ms)
  print "Done!"
Example 33
def process_s3_files(
    input_bucket_name,
    key_glob="*",
    output_bucket_name_1ms=None,
    output_bucket_name_100ms=None,
    overwrite=False,
    use_cloud=True,
):

    if output_bucket_name_1ms is None:
        output_bucket_name_1ms = input_bucket_name + "-hdf-1ms"

    if output_bucket_name_100ms is None:
        output_bucket_name_100ms = input_bucket_name + "-hdf"

    matching_keys = cloud_helpers.get_matching_key_names(input_bucket_name, key_glob)

    s3_cxn = cloud_helpers.get_s3_cxn()
    # create output buckets if they don't already exist
    # it's better to do this before launching remote computations
    s3_cxn.create_bucket(output_bucket_name_1ms)
    s3_cxn.create_bucket(output_bucket_name_100ms)

    if use_cloud:
        print "Launching %d jobs" % len(matching_keys)

        def do_work(key_name):
            return process_s3_file(
                input_bucket_name, key_name, output_bucket_name_1ms, output_bucket_name_100ms, overwrite
            )

        label = "Generate HDF files for %s/%s" % (input_bucket_name, key_glob)
        jids = cloud.map(do_work, matching_keys, _type="f2", _label=label)

        progress = progressbar.ProgressBar(len(jids)).start()
        n_finished = 0
        for _ in cloud.iresult(jids):
            n_finished += 1
            progress.update(n_finished)
        progress.finish()
    else:
        print "Running locally..."
        print "%d keys match the pattern '%s'" % (len(matching_keys), key_glob)
        for key in matching_keys:
            process_s3_file(input_bucket_name, key, output_bucket_name_1ms, output_bucket_name_100ms)
    print "Done!"
Example 34
def launch_jobs(bucket, key_names, work_fn, combine, acc, label, _type, 
  accept_none_as_result, retry_timeouts = True):
  jids = cloud.map(\
    lambda name: work_fn(bucket, name), 
    key_names, 
    _type = _type, 
    _label= label, 
    _env = 'compute')
  
  timed_out = []
  try:
    progress = progressbar.ProgressBar(len(jids)).start()

    for (i, result) in enumerate(cloud.iresult(jids, num_in_parallel = 25)):
      if result is None and not accept_none_as_result:
        print "Job #", jids[i], key_names[i], "returned None"
      elif combine:
        # client-side reduction! Be careful about not doing too much
        # work here
        new_acc = combine(acc, result)
        if new_acc is not None: acc = new_acc
      progress.update(i+1)
  except KeyboardInterrupt:
    print "Caught keyboard interrupt, killing active workers..."
    cloud.kill(jids)  
    return acc  
  except cloud.CloudException as e:
    if isinstance(e.parameter, ssl.SSLError) and retry_timeouts:
      print "Job #", jids[i], "timed out"
      timed_out.append(key_names[i])
    else:
      print "Killing workers..."
      cloud.kill(jids)
      raise
  except:
    print "Killing workers..."
    cloud.kill(jids)
    raise
  finally:
    progress.finish()
  
  if len(timed_out) > 0:
    return launch_jobs(bucket, key_names, work_fn, combine, acc, 
      label, _type, accept_none_as_result, retry_timeouts = None)
  return acc
Example 35
def train(batch, remote, debug, dependency = []):
    params = cache.get("batch/%s/params" % batch, remote)
    numEpisodes = params['episodes']['num']
    
    trainParams = params['train']
    numIters = trainParams['iters']
    
    ij_ = [(i, j) for i, j in it.product(range(numEpisodes), range(numIters))]
    f = lambda (i, j) : trainer.train(batch, params, i, j, remote, debug)
    
    logging.info("running %s train instances" % len(ij_))
    if (remote):
        k_ = cloud.map(f, ij_, _label = "%s/train" % batch, _depends_on = dependency, _type = 'c1', _max_runtime = 30)
        logging.info("k_ %s" % k_)
        return k_
    else:
        results = map(f, ij_)
        return results
Example 36
def get_shares_bulk(urls,limit=-1,use_cloud=False):
    if use_cloud:
        # using picloud. parallelizing on chunks
        chunks = list(parallel.partitions(urls,100))
        def f(url):
            return map(lambda url: (url,get_shares(url)),url)
        jids = cloud.map(f,chunks)
        ret = list(itertools.chain(*cloud.result(jids)))
    else:
        # local. parallelizing using pool
        ret =  list(parallel.imap(get_shares,urls,threads=10))

    ret.sort(key= lambda (u,r) : -r['shares_count'])

    if limit == -1:
        return ret
    else:
        return ret[:limit]
Example 37
def call_on_cloud(cmd_params, core_type, num_batches, start_batch_num, end_batch_num):
    ntests = len(cmd_params)
    batch_size = int(math.ceil(ntests/(num_batches+0.0)))

    batch_edges = batch_size*np.array(xrange(num_batches))[start_batch_num : end_batch_num]
    print batch_edges
    for i in xrange(len(batch_edges)):
        if i==len(batch_edges)-1:
            cmds = cmd_params[batch_edges[i]:]
        else:
            cmds = cmd_params[batch_edges[i]:min(batch_edges[i+1], len(cmd_params))]
        print colorize("calling on cloud..", "yellow", True)
        try:
            jids = cloud.map(run_sim_test, cmds, _vol='rss_dat', _env='RSS3', _type=core_type)
            res  = cloud.result(jids)
            print colorize("got results for batch %d/%d "%(i, len(batch_edges)), "green", True)
            save_results(res)
        except Exception as e:
            print "Found exception %s. Not saving data for this demo."%e
Example 38
def picloud(func, *args, **kwargs):
    """
    Runs the given function in parallel over the PiCloud cluster.

    Parameters
    ----------
    func : function
        Function to run in parallel.

    In addition to the function 'func' to be run in parallel, the picloud
    function accepts a series of arguments that are passed to the function
    as variables. In general, the function can have multiple input variables,
    and these arguments must be passed in the same order as they are defined in
    the function definition.

    Furthermore, several keyword arguments may be given that set the settings
    for the PiCloud cluster:

    _type - Type of core used in picloud: 'c1', 'c2', 'f2' (default), 'm1',
            's1'
    _cores - Number of cores used: 1 (default)
    _env - Custom environment for computation. Set to current version of qutip.
    _label - Provide a label for the current computation.

    For more information see the PiCloud website: http://www.picloud.com/

    """
    kw = _default_cloud_settings()
    for keys in kwargs.keys():
        if keys not in kw.keys():
            raise Exception(str(keys) + ' is not a valid kwarg.')
        else:
            kw[keys] = kwargs[keys]
    job_ids = cloud.map(func, *args, **kw)
    results = cloud.result(job_ids)
    if isinstance(results[0], tuple):
        par_return = [elem for elem in results]
        num_elems = len(results[0])
        return [
            np.array([elem[ii] for elem in results]) for ii in range(num_elems)
        ]
    else:
        return list(results)
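A usage sketch for the wrapper (the worker, its argument lists and the keyword settings are hypothetical, not from qutip): each positional sequence supplies one argument per job, exactly as cloud.map expects.

import numpy as np

def damped_value(t, rate):
    return np.exp(-rate * t)

times = np.linspace(0, 10, 100)
rates = [0.5] * len(times)
values = picloud(damped_value, times, rates, _type='c1', _label='damping sweep')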
Example 39
def picloud(func, *args, **kwargs):
    """
    Runs the given function in parallel over the PiCloud cluster.

    Parameters
    ----------
    func : function
        Function to run in parallel.

    In addition to the function 'func' to be run in parallel, the picloud
    function accepts a series of arguments that are passed to the function
    as variables. In general, the function can have multiple input variables,
    and these arguments must be passed in the same order as they are defined in
    the function definition.

    Furthermore, several keyword arguments may be given that set the settings
    for the PiCloud cluster:

    _type - Type of core used in picloud: 'c1', 'c2', 'f2' (default), 'm1',
            's1'
    _cores - Number of cores used: 1 (default)
    _env - Custom environment for computation. Set to current version of qutip.
    _label - Provide a label for the current computation.

    For more information see the PiCloud website: http://www.picloud.com/

    """
    kw = _default_cloud_settings()
    for keys in kwargs.keys():
        if keys not in kw.keys():
            raise Exception(str(keys) + ' is not a valid kwarg.')
        else:
            kw[keys] = kwargs[keys]
    job_ids = cloud.map(func, *args, **kw)
    results = cloud.result(job_ids)
    if isinstance(results[0], tuple):
        par_return = [elem for elem in results]
        num_elems = len(results[0])
        return [np.array([elem[ii] for elem in results])
                for ii in range(num_elems)]
    else:
        return list(results)
Example 40
def cloud_map(func, args, jobs=None, return_jobs=False,
              **cloud_opts):
    """
    Call cloud.map, with some standard logging info

    Parameters
    ----------
    func : function to map
    args : list of mapping arguments
    jobs : list of pre-existing job ids, or None
        If present, will fetch the results from these jobs
    return_jobs : boolean (optional, default false)
        If True, return the job IDs instead of
        the job results
    cloud_opts : dict (optional)
        Extra keyword arguments to pass to cloud.map

    Returns
    -------
    Result of cloud.map if return_jobs=False, else the job ids
    """
    import cloud

    cloud_opts.setdefault('_env', 'mwp')
    cloud_opts.setdefault('_type', 'c2')
    cloud_opts.setdefault('_label', func.__name__)

    if jobs is None:
        log = logging.getLogger(func.__module__)

        log.debug(
            "Starting %i jobs on PiCloud for %s" % (len(args), func.__name__))
        jobs = cloud.map(func, args, **cloud_opts)
        log.debug("To re-fetch results, use \n"
                  "%s(jobs=range(%i, %i))" %
                  (func.__name__, min(jobs), max(jobs) + 1))

    if return_jobs:
        return jobs

    return cloud.result(jobs)
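A usage sketch (worker and inputs are hypothetical): the first call submits the jobs and returns the ids; a later call with jobs= skips submission and only fetches the stored results, which is what the logged re-fetch hint refers to.

def normalize(x):
    return (x - 100.0) / 15.0

scores = range(80, 121)
job_ids = cloud_map(normalize, scores, return_jobs=True)
# later, possibly from a fresh session:
results = cloud_map(normalize, scores, jobs=job_ids)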
Example 41
def url_chunker(url, chunksize=1024):
    """Returns an iterator over contents of a file
        *Params*
        #file - an open FILE object
        #chunksize - how many lines to read at once?
    """
    #url=book[0]
    #bookname=book[1]

    user_agent = {'User-agent': 'Mozilla/5.0'}
    result = requests.get(url, headers=user_agent)

    try:
        doc = result.content
    except:
        raise Exception("URL " + url + " not responding")

    text_in = StringIO(doc)
    chunks = []
    stop = False
    while not stop:
        text = ""
        for x in range(chunksize):
            try:
                text += text_in.next()
            except StopIteration:
                chunks.append(text)
                stop = True
                break

        chunks.append(text)

    jobids = cloud.map(wordcount, [(url, c) for c in chunks])
    cloud.join(jobids, deadlock_check=False)
    results = cloud.result(jobids)

    index = reduce_results(results)

    mongo_insert(index)

    return "OK"
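wordcount is not defined in the snippet; a minimal hypothetical stand-in, consistent with each job receiving one (url, chunk) tuple and with the reduction step expecting Counter-like objects:

from collections import Counter

def wordcount(job):
    url, text = job
    counts = Counter()
    for word in text.split():
        counts[word.lower().strip('.,;:!?"()')] += 1
    return counts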
Example 42
def url_chunker(url, chunksize=1024):
    """Returns an iterator over contents of a file
        *Params*
        #file - an open FILE object
        #chunksize - how many lines to read at once?
    """
    #url=book[0]
    #bookname=book[1]
    
    user_agent = {'User-agent': 'Mozilla/5.0'}
    result=requests.get(url,headers=user_agent)
    
    try:
        doc = result.content
    except:
        raise Exception("URL " + url + " not responding")
    
    text_in=StringIO(doc)
    chunks = []
    stop = False
    while not stop:
        text=""
        for x in range(chunksize):
            try:
                text+=text_in.next()
            except StopIteration:
                chunks.append(text)
                stop=True
                break
                
        chunks.append(text)
        
    jobids = cloud.map(wordcount, [(url,c) for c in chunks])
    cloud.join(jobids,deadlock_check=False)
    results = cloud.result(jobids)
    
    index=reduce_results(results)
    
    mongo_insert(index)
    
    return "OK"
Example 43
def run_ip():
    
    #Figure out how many jobs I want to create and how many requests per job
    job_count = int(sys.argv[1])
    
    job_rows = range(0, job_count)
    
    #Now actually map them to run in the cloud
    #The "s1" type gives unique IP addresses. Eek
    print "Creating job map for {0} jobs.".format(len(job_rows))
    jids = cloud.map(download_ip, job_rows, _type="s1")
    
    print "Waiting for jobs to complete."
    
    #The possible statuses and the statuses we are waiting for
    possible_job_statuses = ["waiting", "queued", "processing", "done", "error", "killed", "stalled"]
    pending_job_statuses = Set(["waiting", "queued", "processing"])
    
    #Keep looping until no job statuses are in the pending_job_statuses
    statuses = []
    while True:
        statuses = cloud.status(jids)
        tally = Counter()
        for status in statuses:
            tally[status] += 1
        print "Status of jobs: " + str(tally)
        
        #If none of the statuses are in pending_job_statuses, we are done!
        if len(pending_job_statuses.intersection(Set(statuses))) == 0:
            break
        
        #Wait for 5 seconds between checks
        sleep(5)
    
    #Now loop through the jobs and retrieve the results
    ip_counter = Counter()
    results = cloud.result(jids)
    for result in results:
        ip_counter[result] += 1
    
    print "IP Addresses: " + str(ip_counter)
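The polling loop can be factored into a reusable helper built on the same cloud.status call (a sketch that reuses the Set, Counter and sleep imports the script above already assumes). cloud.join(jids) would block just as well, but polling keeps the periodic status printout.

def wait_for_jobs(jids, poll_seconds=5):
    pending = Set(["waiting", "queued", "processing"])
    while True:
        tally = Counter(cloud.status(jids))
        print "Status of jobs: " + str(tally)
        if not pending.intersection(Set(tally.keys())):
            return
        sleep(poll_seconds)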
Example 44
def detect_trials(pos_path, neg_path, threshold, test_frac, cmpr_window, cmpr_step,
           w_smooth, gamma, p_sample, detection_step, min_dist_step,
           detection_window_hrs, req_consec_detections):
  
  trials = 5
  pos_path_ = [pos_path] * trials
  neg_path_ = [neg_path] * trials
  threshold_ = [threshold] * trials
  test_frac_ = [test_frac] * trials
  cmpr_window_ = [cmpr_window] * trials
  cmpr_step_ = [cmpr_step] * trials
  w_smooth_ = [w_smooth] * trials
  gamma_ = [gamma] * trials
  p_sample_ = [p_sample] * trials
  detection_step_ = [detection_step] * trials
  min_dist_step_ = [min_dist_step] * trials
  detection_window_hrs_ = [detection_window_hrs] * trials
  req_consec_detections_ = [req_consec_detections] * trials
  
  jids = cloud.map(detect_trial,
                   pos_path_,
                   neg_path_,
                   threshold_,
                   test_frac_,
                   cmpr_window_,
                   cmpr_step_,
                   w_smooth_,
                   gamma_,
                   p_sample_,
                   detection_step_,
                   min_dist_step_,
                   detection_window_hrs_,
                   req_consec_detections_,
                   _type = 'f2')

  params = Params(pos_path, neg_path, threshold, test_frac, cmpr_window,
                  cmpr_step, w_smooth, gamma, p_sample, detection_step,
                  min_dist_step, detection_window_hrs, req_consec_detections)  

  return params, jids
Example 45
def detect_trials(pos_path, neg_path, threshold, test_frac, cmpr_window,
                  cmpr_step, w_smooth, gamma, p_sample, detection_step,
                  min_dist_step, detection_window_hrs, req_consec_detections):

    trials = 5
    pos_path_ = [pos_path] * trials
    neg_path_ = [neg_path] * trials
    threshold_ = [threshold] * trials
    test_frac_ = [test_frac] * trials
    cmpr_window_ = [cmpr_window] * trials
    cmpr_step_ = [cmpr_step] * trials
    w_smooth_ = [w_smooth] * trials
    gamma_ = [gamma] * trials
    p_sample_ = [p_sample] * trials
    detection_step_ = [detection_step] * trials
    min_dist_step_ = [min_dist_step] * trials
    detection_window_hrs_ = [detection_window_hrs] * trials
    req_consec_detections_ = [req_consec_detections] * trials

    jids = cloud.map(detect_trial,
                     pos_path_,
                     neg_path_,
                     threshold_,
                     test_frac_,
                     cmpr_window_,
                     cmpr_step_,
                     w_smooth_,
                     gamma_,
                     p_sample_,
                     detection_step_,
                     min_dist_step_,
                     detection_window_hrs_,
                     req_consec_detections_,
                     _type='f2')

    params = Params(pos_path, neg_path, threshold, test_frac, cmpr_window,
                    cmpr_step, w_smooth, gamma, p_sample, detection_step,
                    min_dist_step, detection_window_hrs, req_consec_detections)

    return params, jids
Example 46
    def create_alternative_worlds(self):
        ''' 
        Simulate each of the economies. 
        '''
        T = self.T
        I = self.I

        print 'Simulating a world with %d countries.' % I

        # these are the individual simulation jobs
        jobs = cloud.map(simulate_an_economy,
                         self.economies, [T] * I,
                         _label='A world with %d countries.' % I,
                         _type='c2')
        # this is a harvester job, collecting the results of each individual simulation
        return cloud.call(GDP_collector,
                          jobs,
                          T,
                          T1,
                          T2,
                          _depends_on=jobs,
                          _label='Collecting GDP results.')
Example 47
def cloud_pooler(func,
                 in_q,
                 chunk_size=1000,
                 _env='python-web',
                 _type='c2',
                 _max_runtime=60,
                 get_results=True,
                 **kwargs):
    import cloud
    if chunk_size > 1:
        if isinstance(in_q, collections.Iterable):
            in_q = WebQueue(in_q)
        chunks = []
        chunk = []
        while not in_q.empty():
            chunk.append(in_q.get())
            if len(chunk) == chunk_size:
                chunks.append(chunk)
                chunk = []
        if len(chunk):
            chunks.append(chunk)
    else:
        chunks = in_q

    partial_func = functools.partial(func, **kwargs)
    jids = cloud.map(partial_func,
                     chunks,
                     _env=_env,
                     _type=_type,
                     _max_runtime=_max_runtime)

    if get_results:
        print jids
        for result in cloud.iresult(jids, ignore_errors=True):
            if result:
                yield result
    else:
        for jid in jids:
            yield jid
Example 48
def extract_snapshots_on_cloud(demo_type, core_type):
    """
    runs snapshot extraction on the cloud and saves the result on local machine. 
    """
    demo_testing_dir = osp.join(testing_results_dir, demo_type)
    env_state_files  = find_recursive(demo_testing_dir, '*.cp')
    
    state_infos = []
    for env_state_file in env_state_files[0:2]:
        with open(env_state_file,"r") as fh:
            seg_info = cp.load(fh)['seg_info']
    
        if seg_info == None:
            continue

        save_dir = osp.join(osp.dirname(env_state_file),  'snapshots', osp.splitext(osp.basename(env_state_file))[0])        
        state_infos.append((seg_info, save_dir))

    print colorize("calling on cloud..", "yellow", True)
    jids = cloud.map(get_state_snapshots, state_infos, _env='RSS3', _type=core_type)
    res  = cloud.result(jids)
    print colorize("got snapshots from cloud for : %s. Saving..."%demo_type, "green", True)
    save_snapshots_from_cloud(res)
Example 49

TRIALS = int(sys.argv[1])
NUM_PARTICLES = int(sys.argv[2])
DELTA = int(sys.argv[3])
INTEGRAL_PATHS = int(sys.argv[4])


def run_on_instance(trial_id):
  global number_of_clusters
  global if_zero_shortlearning
  global experiment_name
  import subprocess
  import os
  os.environ['DISPLAY'] = ":1"
  print "Starting"
  ls_output = subprocess.Popen(["/home/picloud/julia/julia", "runner.jl", str(NUM_PARTICLES), str(DELTA), str(INTEGRAL_PATHS)], \
                               cwd = "/home/picloud/DPMixtureModel/DPMM_SMC/",  \
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)

  out, err = ls_output.communicate()
  return out
 
#result = run_on_instance([1])  

jids = cloud.map(run_on_instance, range(TRIALS), _env=cloud_environment, _type='c2', _cores=1)
print jids
result = cloud.result(jids)
pickle.dump(result, open("result_"+str(NUM_PARTICLES)+"particles_"+str(DELTA)+"delta_"+str(INTEGRAL_PATHS)+"path.pkl","wb"))
print "RESULT:", result
Example 50
def calc_pure_python(show_output):
    # make a list of x and y values which will represent q
    # xx and yy are the co-ordinates, for the default configuration they'll look like:
    # if we have a 1000x1000 plot
    # xx = [-2.13, -2.1242, -2.1184000000000003, ..., 0.7526000000000064, 0.7584000000000064, 0.7642000000000064]
    # yy = [1.3, 1.2948, 1.2895999999999999, ..., -1.2844000000000058, -1.2896000000000059, -1.294800000000006]
    x_step = (float(x2 - x1) / float(w)) * 2
    y_step = (float(y1 - y2) / float(h)) * 2
    x = []
    y = []
    ycoord = y2
    while ycoord > y1:
        y.append(ycoord)
        ycoord += y_step
    xcoord = x1
    while xcoord < x2:
        x.append(xcoord)
        xcoord += x_step
    q = []
    for ycoord in y:
        for xcoord in x:
            q.append(complex(xcoord, ycoord))

    print "Total elements:", len(q)

    # split work list into continguous chunks, one per CPU
    # build this into chunks which we'll apply to map_async
    nbr_chunks = 128 # experiment with different nbrs of chunks
    #nbr_chunks = multiprocessing.cpu_count()
    chunk_size = len(q) / nbr_chunks

    # split our long work list into smaller chunks
    # make sure we handle the edge case where nbr_chunks doesn't evenly fit into len(q)
    if len(q) % nbr_chunks != 0:
        # make sure we get the last few items of data when we have
        # an odd size to chunks (e.g. len(q) == 100 and nbr_chunks == 3
        nbr_chunks += 1
    chunks = [(q[x*chunk_size:(x+1)*chunk_size], maxiter) \
              for x in xrange(nbr_chunks)]
    print chunk_size, len(chunks), len(chunks[0][0])
    print "Size of complex nbr:", sys.getsizeof(0+0j)
    print "We have %d complex numbers in the q array" % (len(q))
    print "In total we're sending %d bytes" % (len(q) * sys.getsizeof(0+0j))
    print "And receiving %d bytes" % (len(q) * sys.getsizeof(1))

    # create a Pool which will create Python processes
    start_time = datetime.datetime.now()
    print "Running cloud.map on %d chunks" % (len(chunks))
    jids = cloud.map(calculate_z, chunks)
    # we get a list of lists back, one per chunk, so we have to
    # flatten them back together
    # return a list of lists of results
    print "Jobs submitted, waiting on results"
    results = cloud.result(jids) # [[ints...], [ints...], []]
    print "cloud.result completed"
    output = []
    for res in results:
        output += res
    end_time = datetime.datetime.now()

    secs = end_time - start_time
    print "Main took", secs

    validation_sum = sum(output)
    print "Total sum of elements (for validation):", validation_sum

    if show_output:
        show(output)

    return validation_sum
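calculate_z is not shown; a plausible escape-time stand-in (hypothetical, only a guess at the real worker) that matches the (points, maxiter) chunk format and the list-of-ints return value the flattening loop expects:

def calculate_z(chunk):
    points, maxiter = chunk
    output = []
    for q in points:
        z = 0 + 0j
        iterations = 0
        while abs(z) < 2.0 and iterations < maxiter:
            z = z * z + q
            iterations += 1
        output.append(iterations)
    return output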
Example 51
def run_exp((data_filename, inits), wait_file, kernel_config_name):
    # put the filenames in the data
    irm.experiments.to_bucket(data_filename, BUCKET_BASE)
    [irm.experiments.to_bucket(init_f, BUCKET_BASE) for init_f in inits]

    kc = KERNEL_CONFIGS[kernel_config_name]
    CHAINS_TO_RUN = len(inits)
    ITERS = kc['ITERS']
    kernel_config = kc['kernels']
    fixed_k = kc.get('fixed_k', False)

    jids = cloud.map(irm.experiments.inference_run,
                     inits, [data_filename] * CHAINS_TO_RUN,
                     [kernel_config] * CHAINS_TO_RUN, [ITERS] * CHAINS_TO_RUN,
                     range(CHAINS_TO_RUN), [BUCKET_BASE] * CHAINS_TO_RUN,
                     [None] * CHAINS_TO_RUN, [fixed_k] * CHAINS_TO_RUN,
                     _label="%s-%s-%s" %
                     (data_filename, inits[0], kernel_config_name),
                     _env='connectivitymotif',
                     _type='f2')

    pickle.dump(
        {
            'jids': jids,
            'data_filename': data_filename,
            'inits': inits,
            'kernel_config_name': kernel_config_name
        }, open(wait_file, 'w'))


@transform(run_exp, suffix('.wait'), '.samples')
Example 52
def launch_jobs(hdf_bucket, training_keys, testing_keys, 
    raw_features = None, start_hour = 3, end_hour = 7, 
    num_features = 1, future_offset = 600, profile = True):
  all_possible_params = gen_feature_params(raw_features)

  chosen_params = []
  def worker_wrapper(new_param):
    return download_and_eval(hdf_bucket, training_keys, testing_keys,
      chosen_params, 
      new_param,  
      start_hour = start_hour, 
      end_hour = end_hour, 
      future_offset = future_offset)
  
  for feature_num in xrange(num_features):

    print "=== Searching for feature #%d ===" % (feature_num+1)
    print "Launching %d jobs over %d training files and %d testing files" %  \
      (len(all_possible_params), len(training_keys), len(testing_keys))
    
    label = 'Evaluating %d parameter sets for feature #%d' % \
      (len(all_possible_params), feature_num+1)

    jids =\
      cloud.map(worker_wrapper, all_possible_params, 
        _env = 'compute', 
        _label=label, 
        _type = 'f2', 
        _profile = profile)
    results = {}
    best_result = None
    best_model = None
    for (i, (curr_best_score, curr_best_model, results)) in enumerate(cloud.iresult(jids)):
      if results is None:
        results = {}
      else:
        assert isinstance(results, dict)
      print "Got %d results with best score = %s" % (len(results), curr_best_score)
      for (param, r) in results.items():
        key = tuple(chosen_params + [param])
        results[key] = r
        #if r is None:
          # print param, "<skipped>"
        if r is not None:
          train_result, test_result = r
          score = train_result.score
          if best_result is None or not np.isfinite(best_result['train'].score) or \
             best_result['train'].score <  score:
            print "New best:", key
            print "result for training data:", train_result
            if test_result: 
              print "result for testing data:", test_result
            print
            best_result  = { 
              'params':key, 
              'train': train_result,  
              'test': test_result
            }
            best_model = curr_best_model 
    print 
    print "Current best for %d features: %s" % (feature_num+1, best_result)
    print
    curr_best_params = best_result['params']
    if len(curr_best_params) < feature_num+1:
      print "Got no improvement from adding feature #%d, stopping..." % (feature_num + 1)
      break
    else:
      chosen_params.append(curr_best_params[-1])
  return best_result, best_model, results
Example 53
    
    ==PARAMS==
    results: A list of Counter() objects, as produced by cloud.result() method
    
    ==RETURNS==
    a Counter() object with total word-counts for the whole body of text
    """
    total_wordcount = Counter()
    for r in results:
        total_wordcount.update(r)
    return total_wordcount


##job_ids=cloud.map(wordcount,chunker(f))

## where are the files we care about?
path = '../www.gutenberg.lib.md.us/etext00'

## start cloud jobs over chunks of text
job_ids = cloud.map(wordcount, filechunker(path))

while True:
    c = cloud_status(job_ids)
    print c
    if c['processing'] == 0:
        break
    else:
        sleep(10)

res = cloud.result(job_ids)
Example 54
@files(experiment_generator)
def run_exp((data_filename, inits), wait_file, kernel_config_name):
    # put the filenames in the data
    irm.experiments.to_bucket(data_filename, BUCKET_BASE)
    [irm.experiments.to_bucket(init_f, BUCKET_BASE) for init_f in inits]

    kc = KERNEL_CONFIGS[kernel_config_name]
    CHAINS_TO_RUN = len(inits)
    ITERS = kc['ITERS']
    kernel_config = kc['kernels']
    init_type = kc.get('init', None)

    jids = cloud.map(irm.experiments.inference_run,
                     inits, [data_filename] * CHAINS_TO_RUN,
                     [kernel_config] * CHAINS_TO_RUN, [ITERS] * CHAINS_TO_RUN,
                     range(CHAINS_TO_RUN), [BUCKET_BASE] * CHAINS_TO_RUN,
                     [init_type] * CHAINS_TO_RUN,
                     _env='connectivitymotif',
                     _type='f2')

    pickle.dump(
        {
            'jids': jids,
            'data_filename': data_filename,
            'inits': inits,
            'kernel_config_name': kernel_config_name
        }, open(wait_file, 'w'))


@transform(run_exp, suffix('.wait'), '.samples')
def get_results(exp_wait, exp_results):
Example 55
def param_search(
        features,
        train_files,
        test_files,
        debug=False,
        regression=False,
        signal=signals.bid_offer_cross,
        ensemble=ClassifierEnsemble,
        base_models=[ClusteredClassifier(20)],
        num_models=[25],
        bagging_percents=[0.75],
        dict_types=[None],  # , 'kmeans'
        dict_sizes=[None],  # , 50
        pca_types=[None, 'whiten'],
        compute_pairwise_products=[False],
        binning=[False],
        stacking_models=[None, LogisticRegression()],
        start_hour=None,
        end_hour=None):
    print "Features:", features
    print "Training files:", train_files
    print "Testing files:", test_files

    def do_work(p):
        return worker(p, features, train_files, test_files, start_hour,
                      end_hour)

    oversampling_factors = [0]

    possible_encoder_params = {
        'dictionary_type': dict_types,
        'dictionary_size': dict_sizes,
        'pca_type': pca_types,
        'compute_pairwise_products': compute_pairwise_products,
        'binning': binning,
    }

    all_encoders = [
        FeatureEncoder(**p) for p in cartesian_product(possible_encoder_params)
        if (p['dictionary_size'] is not None or p['dictionary_type'] is None)
    ]

    possible_ensemble_params = {
        'base_model': base_models,
        'num_models': num_models,
        'stacking_model': stacking_models,
        'verbose': [True],
        'feature_subset_percent': [0.75],
        'bagging_percent': bagging_percents,
    }
    # classification ensembles get weighted by F-score
    if not regression:
        possible_ensemble_params['weighting'] = [0.25]

    all_ensembles = [
        ensemble(**params)
        for params in cartesian_product(possible_ensemble_params)
    ]

    if regression:
        train_params = {}
    else:
        train_params = {'class_weight': {0: 1, 1: 5, -1: 10}}
    worklist = []
    for smote_factor in oversampling_factors:
        general_params = {
            'oversampling_factor': smote_factor,
            'signal': signal,
            'regression': regression,
        }
        for encoder in all_encoders:
            for model in all_ensembles:
                params = {
                    'general': general_params,
                    'encoder': encoder,
                    'model': model,
                    'training': train_params
                }
                worklist.append(params)
    if debug:
        print "[Debug mode]"
        result_list = map(do_work, worklist[:1])
        for params, features, e, svm, result in result_list:
            print params, "=>", result
    else:
        init_cloud()
        label = ", ".join(train_files)
        jobids = cloud.map(do_work,
                           worklist,
                           _fast_serialization=2,
                           _type='m1',
                           _label=label,
                           _env='param_search')
        results = []
        print "Launched", len(worklist), "jobs, waiting for results..."
        for x in cloud.iresult(jobids):
            if x is not None:
                results.append(x)
                print x['params']
                print x['model']
                print x['result']
                print "---"

        def cmp(x, y):
            return int(np.sign(x['result']['cmp'] - y['result']['cmp']))

        results.sort(cmp=cmp)

        print "Best:"
        for item in results[-3:]:
            print item['params']
            r = item['result']
            print [(k, r[k]) for k in sorted(r.keys())]
Example 56
    # now the input args
    chunk_size = 80000
    chunks = int(np.ceil(len(sv) / float(chunk_size)))

    args = []
    for i in range(chunks):
        args += [(i * chunk_size, (i + 1) * chunk_size)]

    CN = chunks
    results = []
    if USE_CLOUD:
        print "MAPPING TO THE CLOUD"
        jids = cloud.map(picloud_score_frame, [dataset_name] * CN,
                         [x_range] * CN, [y_range] * CN, [phi_range] * CN,
                         [theta_range] * CN,
                         args, [frame] * CN, [EO] * CN, [likelihood_i] * CN,
                         _type='f2',
                         _vol="my-vol",
                         _env="base/precise")
    else:
        jids = map(picloud_score_frame, [dataset_name] * CN, [x_range] * CN,
                   [y_range] * CN, [phi_range] * CN, [theta_range] * CN, args,
                   [frame] * CN, [EO] * CN, [likelihood_i] * CN)

    np.savez_compressed(outfile_npz,
                        x_range=x_range,
                        y_range=y_range,
                        phi_range=phi_range,
                        theta_range=theta_range)
    pickle.dump(
        {
Example 57
                                      cmpr_step,
                                      w_smooth,
                                      gamma,
                                      p_sample,
                                      detection_step,
                                      min_dist_step,
                                      detection_window_hrs,
                                      req_consec_detections)

param_product_old = set(param_product_old)
param_product_new = set(param_product_new)

param_product = param_product_new.difference(param_product_old)
"""

jids = cloud.map(detect_trials, *zip(*param_product), _type='f2')

params_sub_jids = cloud.result(jids)
params = [elt[0] for elt in params_sub_jids]
sub_jids = [elt[1] for elt in params_sub_jids]
stats = cloud.result(sub_jids)

dt = datetime.now()

# Write out as plain text just in case.
out_path_txt = 'data/param_explore_%d%d%d%d%d%d.txt' % \
  (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
open(out_path_txt, 'w').write(str((params, stats)))

params, stats = fix_results_nesting((params, stats))
out_path_pkl = 'data/param_explore_%d%d%d%d%d%d.pkl' % \