def test_ordered_aggregation(self):
    imps = Table.from_tag(IMPS)
    res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')
    resx = [c for c, _ in result_iterator(res)]
    sum_millis = {}
    for ad_id, millis in resx:
        if ad_id not in sum_millis:
            sum_millis[ad_id] = [0, 0]
        sum_millis[ad_id][0] += millis
        sum_millis[ad_id][1] += 1

    res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                 where=imps.date == '2014-01-27',
                 order_by=2, limit=3)
    results = [c for c, _ in result_iterator(res)]
    lowest = 0
    for ad_id, millis, count in results:
        self.assertLessEqual(lowest, count)
        lowest = count
        ad_tup = sum_millis[ad_id]
        self.assertEqual(millis, ad_tup[0])
        self.assertEqual(count, ad_tup[1])
    self.assertEqual(len(results), min(len(sum_millis), 3))

def estimate(master, input, center, k, iterations, map_reader=chain_reader):
    """
    Optimize k-clustering for `iterations` iterations with cluster center
    definitions as given in `center`.
    """
    job = master.new_job(name='k-clustering_init',
                         input=input,
                         map_reader=map_reader,
                         map_init=map_init,
                         map=random_init_map,
                         combiner=estimate_combiner,
                         reduce=estimate_reduce,
                         params=Params(k=k, seed=None, **center),
                         nr_reduces=k)
    centers = [(i, c) for i, c in result_iterator(job.wait())]
    job.purge()

    for j in range(iterations):
        job = master.new_job(name='k-clustering_iteration_%s' % (j,),
                             input=input,
                             map_reader=map_reader,
                             map=estimate_map,
                             combiner=estimate_combiner,
                             reduce=estimate_reduce,
                             params=Params(centers=centers, **center),
                             nr_reduces=k)
        centers = [(i, c) for i, c in result_iterator(job.wait())]
        job.purge()
    return centers

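A minimal driver for the estimator above might look like the following sketch; the master address, the input tag, and the callbacks carried in `center` are illustrative placeholders, not part of the original code:

# Hypothetical invocation of estimate(); the master URL, input tag and the
# callbacks placed in `center` are assumptions for illustration only.
from disco.core import Disco

master = Disco("disco://localhost")
center = {'create': create_center,    # assumed user-supplied callbacks for
          'update': update_center}    # building and refining cluster centers
centers = estimate(master, ['tag://data:points'], center, k=8, iterations=10)
for center_id, center_def in centers:
    print center_id, center_def
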
def test_log_reg(self):
    # python tests_classification.py Tests_Classification.test_log_reg
    from discomll.classification import logistic_regression

    train_data1, test_data1 = datasets.breastcancer_cont_orange()
    train_data2, test_data2 = datasets.breastcancer_cont_discomll()

    learner = Orange.classification.logreg.LogRegLearner(
        fitter=Orange.classification.logreg.LogRegFitter_Cholesky)
    classifier = learner(train_data1)
    thetas1 = classifier.beta
    predictions1 = []
    probabilities1 = []
    for inst in test_data1:
        target, probs = classifier(inst, Orange.classification.Classifier.GetBoth)
        predictions1.append(target.value)
        probabilities1.append(probs.values())

    thetas_url = logistic_regression.fit(train_data2, alpha=1e-8, max_iterations=10)
    thetas2 = [v for k, v in result_iterator(thetas_url["logreg_fitmodel"])
               if k == "thetas"]
    results_url = logistic_regression.predict(test_data2, thetas_url)
    predictions2 = []
    probabilities2 = []
    for k, v in result_iterator(results_url):
        predictions2.append(v[0])
        probabilities2.append(v[1])

    self.assertTrue(np.allclose(thetas1, thetas2))
    self.assertTrue(np.allclose(probabilities1, probabilities2, atol=1e-5))
    self.assertListEqual(predictions1, predictions2)

def test_simple_join(self):
    imps = Table.from_tag(IMPS)
    pix = Table.from_tag(PIXELS)
    imp_sites = [(s, a) for (s, a), _ in
                 result_iterator(select(imps.site_id, imps.ad_id,
                                        where=imps.date < '2014-01-13'))]
    pix_sites = [(s, a) for (s, a), _ in
                 result_iterator(select(pix.site_id, pix.amount,
                                        where=pix.date < '2014-01-13'))]
    join = []
    for imp_site, imp_ad_id in imp_sites:
        for pix_site, pix_amount in pix_sites:
            if imp_site == pix_site:
                join.append((imp_ad_id, pix_amount))

    res = select(imps.ad_id, pix.amount,
                 where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                 join=(imps.site_id, pix.site_id),
                 order_by='amount')
    results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)]
    self.assertEqual(len(results), len(join))
    for jtup in join:
        self.assertIn(jtup, results)
    lowest = 0
    for ad_id, amount in results:
        self.assertLessEqual(lowest, amount)
        lowest = amount

def runTest(self):
    ducks = ['huey', 'dewey', 'louie']
    self.job = MapResultsJob().run(input=['raw://%s' % d for d in ducks])
    self.assertAllEqual(sorted(result_iterator(self.job.wait())),
                        sorted(('%s!?' % d, '') for d in ducks))
    self.assertAllEqual(sorted(result_iterator(self.job.mapresults())),
                        sorted(('%s!' % d, '') for d in ducks))

def test_nested_join(self):
    imps = Table.from_tag(IMPS)
    pix = Table.from_tag(PIXELS)
    imp_sites = [(s, a) for (s, a), _ in
                 result_iterator(select(imps.site_id, imps.ad_id,
                                        where=imps.date < '2014-01-13'))]
    pix_sites = [(s, a) for (s, a), _ in
                 result_iterator(select(pix.site_id, pix.amount,
                                        where=((pix.date < '2014-01-13') &
                                               (pix.isActive > 0))))]
    join = []
    for imp_site, imp_ad_id in imp_sites:
        for pix_site, pix_amount in pix_sites:
            if imp_site == pix_site:
                join.append((imp_ad_id, pix_amount))

    sub_pix = select(pix.site_id, pix.amount, pix.date,
                     where=((pix.date < '2014-01-15') & (pix.isActive > 0)),
                     nest=True)
    res = select(imps.ad_id, sub_pix.amount,
                 where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'),
                 join=(imps.site_id, sub_pix.site_id))
    results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)]
    self.assertEqual(len(results), len(join))
    for jtup in join:
        self.assertIn(jtup, results)

def test_kmeans_breastcancer(self):
    # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_breastcancer
    from discomll.clustering import kmeans
    from sklearn.cluster import KMeans

    max_iter = 10
    clusters = 2
    random_seed = 2

    x_train, _, x_test, _ = datasets.breastcancer_disc()
    train_data, test_data = datasets.breastcancer_disc_discomll()

    kmeans2 = KMeans(n_clusters=clusters, max_iter=max_iter, n_init=1,
                     random_state=random_seed).fit(x_train)
    centroids1 = kmeans2.cluster_centers_
    predictions1 = kmeans2.predict(x_test)

    centroids_url = kmeans.fit(train_data, n_clusters=clusters,
                               max_iterations=max_iter,
                               random_state=random_seed)
    predictions_url = kmeans.predict(test_data, centroids_url)
    predictions2 = [v[0] for k, v in result_iterator(predictions_url)]
    centroids2 = [v["x"] for k, v in
                  result_iterator(centroids_url["kmeans_fitmodel"])]
    centroids2[0], centroids2[1] = centroids2[1], centroids2[0]

    self.assertTrue(np.allclose(centroids1, centroids2))

def results_to_file(request, input_dict, output_dict, widget):
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    tag = input_dict["string"]
    folder = 'discomll_results'
    add = "add" if input_dict["add_params"] == "true" else ""
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + add + '.txt'
    ensure_dir(destination)

    if not os.path.isfile(destination):  # file does not exist yet
        f = open(destination, 'w')
        if input_dict["add_params"] == "true":
            for k, v in result_iterator(tag):
                f.writelines(str(k) + " " + str(v) + "\n")
        else:
            for k, v in result_iterator(tag):
                f.writelines(str(k) + " " + str(v[0]) + "\n")
        f.close()

    filename = folder + "/" + tag[0][6:] + add + '.txt'
    output_dict['filename'] = filename
    return render(request, 'visualizations/string_to_file.html',
                  {'widget': widget, 'input_dict': input_dict,
                   'output_dict': output_dict})

def test_aggregate_join(self):
    imps = Table.from_tag(IMPS)
    pix = Table.from_tag(PIXELS)
    imp_sites = [(s, a) for (s, a), _ in
                 result_iterator(select(imps.site_id, imps.ad_id,
                                        where=imps.date < '2014-01-13'))]
    pix_sites = [(s, a) for (s, a), _ in
                 result_iterator(select(pix.site_id, pix.amount,
                                        where=pix.date < '2014-01-13'))]
    join = {}
    for imp_site, imp_ad_id in imp_sites:
        for pix_site, pix_amount in pix_sites:
            if imp_site == pix_site:
                if imp_ad_id not in join:
                    join[imp_ad_id] = [0, 0]
                join[imp_ad_id][0] += pix_amount
                join[imp_ad_id][1] += 1

    res = select(imps.ad_id, h_sum(pix.amount), h_count(),
                 where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                 join=(imps.site_id, pix.site_id))
    results = [(ad_id, amount, count) for (ad_id, amount, count), _ in
               result_iterator(res)]
    self.assertEqual(len(results), len(join))
    for (ad_id, amount, count) in results:
        ramount, rcount = join[ad_id]
        self.assertEqual(ramount, amount)
        self.assertEqual(rcount, count)

def runTest(self):
    ducks = ['huey', 'dewey', 'louie']
    self.job = MapResultsJob().run(input=['raw://{0}'.format(d) for d in ducks])
    self.assertAllEqual(sorted(result_iterator(self.job.wait())),
                        sorted(('{0}!?'.format(d), '') for d in ducks))
    self.assertAllEqual(sorted(result_iterator(self.job.stageresults("map"))),
                        sorted(('{0}!'.format(d), '') for d in ducks))

def test_chunk(self):
    from disco.core import result_iterator
    url = 'http://discoproject.org/media/text/chekhov.txt'
    self.ddfs.chunk('disco:test:chunk', [url], chunk_size=100 * 1024)
    self.assert_(0 < len(list(self.ddfs.blobs('disco:test:chunk'))) <= 4)
    # assert_(a, b) treated b as a failure message; the intended check
    # is equality between the chunked tag and the original url contents
    self.assertEqual(list(result_iterator(['tag://disco:test:chunk'])),
                     list(result_iterator([url], reader=None)))
    self.ddfs.delete('disco:test:chunk')

def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict_voting if voting else map_predict_dist))]
    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: there are no decision trees in the forest"
        return []

    job.run(name="distributed_random_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])
    return job.wait(show=show)

def test_lwlr(self):
    # python -m unittest tests_regression.Tests_Regression.test_lwlr
    import locally_weighted_linear_regression as lwlr1
    from discomll.regression import locally_weighted_linear_regression as lwlr2

    x_train, y_train, x_test, y_test = datasets.regression_data()
    train_data, test_data = datasets.regression_data_discomll()

    lwlr1 = lwlr1.Locally_Weighted_Linear_Regression()
    taus = [1, 10, 25]
    sorted_indices = np.argsort([str(el) for el in x_test[:, 1].tolist()])

    for tau in taus:
        thetas1, estimation1 = lwlr1.fit(x_train, y_train, x_test, tau=tau)
        thetas1, estimation1 = (np.array(thetas1)[sorted_indices],
                                np.array(estimation1)[sorted_indices])

        results = lwlr2.fit_predict(train_data, test_data, tau=tau)
        thetas2, estimation2 = [], []
        for x_id, (est, thetas) in result_iterator(results):
            estimation2.append(est)
            thetas2.append(thetas)

        self.assertTrue(np.allclose(thetas1, thetas2, atol=1e-8))
        self.assertTrue(np.allclose(estimation1, estimation2, atol=1e-3))

def run(self):
    if self.config['print_to_stdout']:
        self.job.run(input=do_split(self.config),
                     map=self.map,
                     reduce=self.reduce,
                     params=self.params,
                     map_input_stream=mongodb_input_stream,
                     required_modules=['mongodb_io', 'mongodb_input',
                                       'config_util', 'mongo_util',
                                       'mongodb_output'])
        for key, value in result_iterator(self.job.wait(show=True)):
            print key, value
    else:
        self.job.run(input=do_split(self.config),
                     map=self.map,
                     reduce=self.reduce,
                     params=self.params,
                     map_input_stream=mongodb_input_stream,
                     reduce_output_stream=mongodb_output_stream,
                     required_modules=['mongodb_io', 'mongodb_input',
                                       'config_util', 'mongo_util',
                                       'mongodb_output'])
        if self.config.get("job_wait", False):
            self.job.wait(show=True)

def test_combo_where_on_mixed_partition(self):
    imps = Table.from_tag(IMPS)
    res = select(imps.ad_id, imps.date, imps.cpm_millis,
                 where=((imps.date >= '2014-01-21') &
                        (imps.date <= '2014-01-23') &
                        (imps.time > 170000)))
    results = [c for c, _ in result_iterator(res)]
    self.assertEqual(len(results), 2)
    self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and
                         a == 30003)
                        for a, d, c in results))

def test_combo_where_on_or_partition(self):
    imps = Table.from_tag(IMPS)
    res = select(imps.ad_id, imps.date, imps.cpm_millis,
                 where=((imps.date == '2014-01-21') |
                        (imps.date == '2014-01-25') |
                        (imps.ad_id == 30010)))
    results = [c for c, _ in result_iterator(res)]
    self.assertEqual(len(results), 27)
    self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010
                        for a, d, _ in results))

def load_one_dim(master, input, config_path, nr_maps=1, nr_reduces=1,
                 load_method=offdimetlmr, dimnames=repr([]),
                 go_live=1, profile=False):
    dim_job = master.new_job(
        name='dim',
        input=input,
        map_init=load_method.dim_map_init,
        map_reader=load_method.map_reader,
        map=load_method.dim_map_func,
        partition=load_method.dim_partition_func,
        combiner=load_method.dim_combiner_func,
        reduce=load_method.dim_reduce_func,
        scheduler={'max_cores': nr_maps},
        nr_reduces=nr_reduces,
        required_modules=[('config', config_path)],
        profile=profile,
        status_interval=1000000,
        params=Params(count=0, dimnames=dimnames,
                      nr_maps=nr_maps, nr_reduces=nr_reduces))
    results = dim_job.wait()
    shelvedb_paths = []
    if results is not None:
        for key, value in result_iterator(results):
            shelvedb_paths.append(key)
        if go_live == 1:
            load_method.golive(config, shelvedb_paths)

def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params["centers"] = [(i, c) for i, c in
                             result_iterator(fitmodel_url["kmeans_fitmodel"])]
    job.pipeline = [("split",
                     Stage("kmeans_predict",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=predict_map))]
    job.run(input=dataset.params["data_tag"], name="kmeans_predict")
    return job.wait(show=show)

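The fit/predict pair is exercised end to end in the k-means tests elsewhere in this section; a condensed sketch of that flow, with `train_data` and `test_data` standing in for discomll dataset objects:

# Sketch of the k-means round trip, mirroring the test code in this section;
# train_data and test_data are placeholder dataset objects.
from discomll.clustering import kmeans
from disco.core import result_iterator

fitmodel_url = kmeans.fit(train_data, n_clusters=3, max_iterations=10)
predictions_url = kmeans.predict(test_data, fitmodel_url)
for key, value in result_iterator(predictions_url):
    print key, value   # row id and its predicted cluster
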
def auth(clazz, province, input, output, date):
    dirList = os.listdir(input)
    ptime = datetime.strptime(date, "%Y%m%d")
    file_filter = ptime.strftime('%Y-%m-%d')
    input = ["file:///" + input + "/" + file
             for file in dirList
             if re.search(date, file) or re.search(file_filter, file)]
    if input:
        if clazz == 'c+w':
            mapfun = cw_map_funs.get(province, cw_map)
        else:
            mapfun = fixed_map_funs.get(province, fixed_map)
        job = Job().run(input=input, map=mapfun)
        file = open(output + "/" + clazz + "-" + date + ".ctl", "w")
        sqldr_header(file)
        for user, line in result_iterator(job.wait(show=True)):
            print >>file, line
        file.close()
    else:
        print 'resolve.py: Cannot find any auth files.'

def unwrapMatrix(self, m, n, dtype=float64):
    """
    Instantiate matrix from wrapper.
    """
    rows = []
    cols = []
    vals = []
    for url in self.urls:
        if type(url) == list:
            # dfs protocol is a nested list
            url = url[0]
        protocol, path = url.split("://")
        if protocol == MatrixWrapper.RAW:
            elems = path.split(";")
            for elem in elems:
                i, j, val = elem.split(",")
                rows.append(int(i))
                cols.append(int(j))
                vals.append(dtype(val))
        elif protocol == MatrixWrapper.DIR:
            total = 0
            for key, _ in result_iterator([url]):
                elems = key.split(";")
                for elem in elems:
                    i, j, val = elem.split(",")
                    rows.append(int(i))
                    cols.append(int(j))
                    vals.append(dtype(val))
                    total += 1
            assert total <= m * n, \
                "cardinality of result set exceeds %dx%d=%d entries" % (m, n, m * n)
        elif protocol == MatrixWrapper.DFS:
            raise Exception('dfs protocol not supported yet')
        else:
            raise Exception('invalid protocol')
    return coo_matrix((vals, (rows, cols)), dtype=dtype, dims=(m, n))

def test_kmeans_iris(self):
    # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_iris
    from discomll.clustering import kmeans
    from sklearn.cluster import KMeans

    max_iter = 10
    clusters = 3
    random_seed = 0

    x_train, y_train, x_test, y_test = datasets.iris()
    train_data, test_data = datasets.iris_discomll()

    sk_kmeans = KMeans(n_clusters=clusters, max_iter=max_iter, n_init=1,
                       random_state=random_seed).fit(x_train)
    centroids1 = sk_kmeans.cluster_centers_
    # predictions1 = sk_kmeans.predict(x_test)

    centroids_url = kmeans.fit(train_data, n_clusters=clusters,
                               max_iterations=max_iter,
                               random_state=random_seed)
    predictions_url = kmeans.predict(test_data, centroids_url)
    # predictions2 = [v[1] for k, v in result_iterator(predictions_url)]
    centroids2 = [v["x"] for k, v in
                  result_iterator(centroids_url["kmeans_fitmodel"])]
    centroids2[0], centroids2[2] = centroids2[2], centroids2[0]

    self.assertTrue(np.allclose(centroids1, centroids2))

def dump(result_urls, width=80):
    """
    Dump the results of a non-nested query.

    :type result_urls: sequence of strings
    :param result_urls: result of an (unnested) query

    :type width: int
    :param width: the number of columns to constrain output to
    """
    from disco.core import result_iterator

    alignments = None
    for columns, _ in result_iterator(result_urls):
        if not alignments:
            # right-align any column whose first value parses as a number
            alignments = []
            for column in columns:
                try:
                    float(column)
                    alignments.append(_ALG_RIGHT)
                except:
                    alignments.append(_ALG_LEFT)
        _print_line(columns, width=width, cols=len(alignments),
                    alignments=alignments)

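Since `dump` consumes the result urls of an unnested query, it composes directly with `select`; a sketch using the IMPS table from the surrounding tests:

# Sketch of dumping a Hustle query result to stdout; IMPS and select are
# the same names used by the test code in this section.
imps = Table.from_tag(IMPS)
dump(select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27'),
     width=100)
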
def run(self):
    if self.config['print_to_stdout']:
        self.job.run(input=do_split(self.config),
                     map=self.map,
                     reduce=self.reduce,
                     params=self.params,
                     map_input_stream=mongodb_input_stream,
                     required_modules=['mongodisco.mongodb_io',
                                       'mongodisco.mongodb_input',
                                       'mongodisco.mongo_util',
                                       'mongodisco.mongodb_output'])
        for key, value in result_iterator(self.job.wait(show=True)):
            print key, value
    else:
        self.job.run(input=do_split(self.config),
                     map=self.map,
                     reduce=self.reduce,
                     params=self.params,
                     map_input_stream=mongodb_input_stream,
                     reduce_output_stream=mongodb_output_stream,
                     required_modules=['mongodisco.mongodb_io',
                                       'mongodisco.mongodb_input',
                                       'mongodisco.mongo_util',
                                       'mongodisco.mongodb_output'])
        if self.config.get("job_wait", False):
            self.job.wait(show=True)

def test_single_int_order(self):
    imps = Table.from_tag(IMPS)
    res = select(imps.ad_id, imps.date, imps.cpm_millis,
                 where=imps.date == '2014-01-27',
                 order_by=imps.cpm_millis)
    lowest = 0
    for (a, d, c), _ in result_iterator(res):
        self.assertLessEqual(lowest, c)
        lowest = c

def test_overall(self):
    imps = Table.from_tag(IMPS)
    res = select(imps.ad_id, imps.date,
                 where=imps.date == '2014-01-27',
                 distinct=True, limit=4, order_by='ad_id', desc=True)
    results = [a for (a, d), _ in result_iterator(res)]
    self.assertEqual(len(results), 4)
    self.assertListEqual(results, [30019, 30018, 30017, 30015])

def _linreg_model(fitmodel):
    output = "Linear regression model\n\n"
    for k, v in result_iterator(fitmodel):
        if k == "thetas":
            output += "Thetas\n"
            output += ", ".join(map(str, v)) + "\n\n"
    return output

def measure(test_data, predictions, measure="ca", save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")
    if predictions == []:
        return "No predictions", None

    # define a job and save its results to DDFS
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=test_data.params["input_chain"],
                           init=simple_init,
                           process=map_test_data))]
    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_process = reduce_ca if measure == "ca" else reduce_mse
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           init=simple_init,
                           input_chain=[task_input_stream, chain_reader],
                           process=map_predictions)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_process,
                           sort=True,
                           combine=True))]
    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)
    measure, acc = [(m, a) for m, a in result_iterator(job.wait(show=show))][0]
    return measure, acc

def _linsvm_model(fitmodel):
    output = "Linear SVM model\n\n"
    for k, v in result_iterator(fitmodel):
        if k == "params":
            output += "Parameters\n"
            output += ", ".join(map(str, v)) + "\n\n"
    return output

def bigdata_mse(request, input_dict, output_dict, widget):
    from discomll.utils import accuracy
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    folder = 'discomll_measures'
    tag = input_dict["predictions"]
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)

    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif not os.path.isfile(destination):  # file does not exist yet
        results = accuracy.measure(test_data=input_dict["dataset"],
                                   predictions=input_dict["predictions"],
                                   measure="mse")
        string = "Mean squared error\n"
        for k, v in result_iterator(results):
            string += str(v) + "\n"
        input_dict["string"] = string
        # cache only the bare value; the read branch re-adds the header
        f = open(destination, 'w')
        f.write(str(v))
        f.close()
    else:
        string = "Mean squared error\n"
        f = open(destination, 'r')
        input_dict["string"] = string + str(f.readlines()[0])
        f.close()
    return render(request, 'visualizations/display_string.html',
                  {'widget': widget, 'input_dict': input_dict,
                   'output_dict': output_dict})

def nnz(disco, m, n, A):
    """
    Return the number of nonzero matrix elements.

    @param m Number of rows of matrix op(A).
    @param n Number of columns of matrix op(A).
    @param A MatrixWrapper object encapsulating matrix A.
    @return An integer.
    """
    def _mapNnz(e, params):
        nnz = 0
        if type(e) == tuple:
            e = e[0]
        elems = e.split(";")
        for elem in elems:
            i, j, val = elem.split(",")
            assert int(i) < params.m, "row index %d exceeds matrix dimensions" % int(i)
            assert int(j) < params.n, "col index %d exceeds matrix dimensions" % int(j)
            nnz += 1
        return [("nnz", nnz)]

    from disco.core import Params, result_iterator
    from disco.func import chain_reader
    from matrixWrap import MatrixWrapper

    jobA = disco.new_job(input=A.urls,
                         name="nnz",
                         map_reader=A.mapReader,
                         params=Params(m=m, n=n),
                         map=_mapNnz)
    res = jobA.wait(clean=False)
    nnz = 0
    for k, v in result_iterator(res):
        nnz += int(v)
    # clean up
    jobA.purge()
    return nnz

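A hypothetical call to nnz(); the MatrixWrapper constructor shown here is an assumption, but the `i,j,val;...` element format matches the parser in unwrapMatrix above:

# Hypothetical use of nnz(); the master URL and the MatrixWrapper
# constructor signature are placeholders, not confirmed API.
from disco.core import Disco
from matrixWrap import MatrixWrapper

disco = Disco("disco://localhost")
A = MatrixWrapper(['raw://0,0,1.0;1,2,3.5'])  # assumed constructor signature
print nnz(disco, 2, 3, A)                     # two nonzero entries -> 2
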
def sync_dims_across_servers(results):
    if "DISCO_CONFIG" not in os.environ:
        print >> sys.stderr, "Specify DISCO_CONFIG (data lives at $DISCO_ROOT/data)"
        sys.exit(1)
    conf = os.environ["DISCO_CONFIG"]
    f = open(conf, 'r')
    line = f.readline()
    serverlist = eval(line.rstrip('\n'))
    servers = set()
    for addr, nodes in serverlist:
        if int(nodes):
            if addr in ['127.0.0.1', 'localhost']:
                addr = socket.getfqdn()
            servers.add(addr)

    prefilldim_addr = pre_fill_dimensions()
    for path, addr in prefilldim_addr.iteritems():
        targetservers = servers - set([addr])
        for target in targetservers:
            scp_file(path, target)

    for path, addr in result_iterator(results):
        if addr in ['127.0.0.1', 'localhost']:
            addr = socket.getfqdn()
        targetservers = servers - set([addr])
        for target in targetservers:
            scp_file(path, target)

def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Starts a job that makes predictions on input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in the fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linsvm_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # the job parallelizes execution of the mappers
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = dataset.params
    job.params["fit_params"] = [v for _, v in
                                result_iterator(fitmodel_url["linsvm_fitmodel"])][0]
    job.run(name="linsvm_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)

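A sketch of the fit/predict round trip for the linear SVM; the `linear_svm` module path and its `fit` entry point are assumptions inferred from the `linsvm_fitmodel` key above, and `train_data`/`test_data` are placeholder dataset objects:

# Assumed module path and fit() signature; only the fitmodel key is
# confirmed by the predict() code above.
from discomll.classification import linear_svm
from disco.core import result_iterator

fitmodel_url = linear_svm.fit(train_data)
results_url = linear_svm.predict(test_data, fitmodel_url)
for key, value in result_iterator(results_url):
    print key, value   # row id and its predicted label
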
def test_combo_where_on_or_partition_ex2(self):
    imps = Table.from_tag(IMPS)
    res = select(imps.ad_id, imps.date, imps.cpm_millis,
                 where=((imps.date << ['2014-01-21', '2014-01-25']) &
                        (imps.ad_id << [30003, 30010])))
    results = [c for c, _ in result_iterator(res)]
    self.assertEqual(len(results), 1)
    self.assertTrue(all(d == '2014-01-21' and a == 30010 for a, d, _ in results))

def test_equality_on_partition(self):
    imps = Table.from_tag(IMPS)
    res = select(imps.ad_id, imps.date, imps.cpm_millis,
                 where=imps.date == '2014-01-27')
    results = [c for c, _ in result_iterator(res)]
    self.assertEqual(len(results), 10)
    # default of None avoids StopIteration so the assertion below can fail cleanly
    found = next(((a, d, c) for a, d, c in results
                  if a == 30018 and d == '2014-01-27' and c == 4506), None)
    self.assertIsNotNone(found)
    self.assertTrue(all(d == '2014-01-27' for _, d, _ in results))

def test_simple_aggregation(self):
    imps = Table.from_tag(IMPS)
    res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')
    results = [c for c, _ in result_iterator(res)]
    sum_millis = {}
    for ad_id, millis in results:
        if ad_id not in sum_millis:
            sum_millis[ad_id] = [0, 0]
        sum_millis[ad_id][0] += millis
        sum_millis[ad_id][1] += 1

    res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                 where=imps.date == '2014-01-27')
    results = [c for c, _ in result_iterator(res)]
    for ad_id, millis, count in results:
        ad_tup = sum_millis[ad_id]
        self.assertEqual(millis, ad_tup[0])
        self.assertEqual(count, ad_tup[1])

def _logreg_model(fitmodel):
    output = "Logistic regression model\n\n"
    for k, v in result_iterator(fitmodel):
        if k == "thetas":
            output += "Thetas\n"
            output += ", ".join(map(str, v)) + "\n\n"
        elif k == "J":
            output += "J cost function\n"
            output += str(v) + "\n\n"
    return output

def test_multiple_group_bys(self):
    imps = Table.from_tag(IMPS)
    res = select(imps.ad_id, imps.date, imps.cpm_millis,
                 where=imps.date > '2014-01-22')
    results = [c for c, _ in result_iterator(res)]
    sum_millis = {}
    for ad_id, dt, millis in results:
        key = str(ad_id) + dt
        if key not in sum_millis:
            sum_millis[key] = [0, 0]
        sum_millis[key][0] += millis
        sum_millis[key][1] += 1

    res = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(),
                 where=imps.date > '2014-01-22')
    results = [c for c, _ in result_iterator(res)]
    for ad_id, dt, millis, count in results:
        ad_tup = sum_millis[str(ad_id) + dt]
        self.assertEqual(millis, ad_tup[0])
        self.assertEqual(count, ad_tup[1])

def test_corrupt(self):
    def corrupt_reader(fd, size, url, params):
        yield 'hello'
        if 'corrupt' in url:
            raise DataError("Corrupt!", url)
        yield 'there'
    self.assertAllEqual(
        result_iterator([['raw://corrupt'] * 9 + ['raw://decent']],
                        reader=corrupt_reader),
        ['hello', 'there'])

def stat(where, limit=16, **kwargs):
    """
    Fetch statistical information for a collection of selected
    :class:`Tables <hustle.Table>`.

    :type where: sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

    :type limit: int
    :param limit: the maximum number of blobs from the where clause; defaults to 16

    Returns a dict of column-key cardinalities (0..100) for the indexed
    columns of the table.
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']

    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # first we need the total, so that we can calculate the weighted average
    total = float(sum(v['_'] for _, v in result_iterator(res)))
    final = defaultdict(int)
    for _, cols in result_iterator(res):
        weight = cols.pop('_') / total
        for col, card in cols.iteritems():
            final[col] += card * weight

    # scale everything to an integer between 0 and 100
    really_final = {}
    for key in final:
        card = int(final[key] * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)
    return really_final

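A sketch of calling stat() on the IMPS table used by the tests in this section; the where-clause expression is illustrative:

# IMPS and Table.from_tag come from the surrounding test code; the exact
# keys in the returned dict depend on which columns are indexed.
from hustle import Table

imps = Table.from_tag(IMPS)
cardinalities = stat(imps.date == '2014-01-27')
# dict mapping each indexed column to a 0..100 cardinality score,
# plus '_' holding the total row count
print cardinalities
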