Example #1
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')
        resx = [c for c, _ in result_iterator(res)]

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                     where=imps.date == '2014-01-27',
                     order_by=2,
                     limit=3)
        results = [c for c, _ in result_iterator(res)]
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertTrue(len(results) == min(len(sum_millis), 3))
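Note: the manual sum_millis bookkeeping used in this test can be written more compactly with collections.defaultdict. A minimal local sketch (pure Python, no Hustle cluster required; the sample rows are made up):

from collections import defaultdict

# hypothetical (ad_id, cpm_millis) rows standing in for the select() output
rows = [(30001, 1200), (30002, 800), (30001, 450)]

sum_millis = defaultdict(lambda: [0, 0])   # ad_id -> [total cpm_millis, row count]
for ad_id, millis in rows:
    sum_millis[ad_id][0] += millis
    sum_millis[ad_id][1] += 1

print(dict(sum_millis))   # 30001 -> [1650, 2], 30002 -> [800, 1]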
def estimate(master, input, center, k, iterations, map_reader=chain_reader):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.
    """
    job = master.new_job(name='k-clustering_init',
                         input=input,
                         map_reader=map_reader,
                         map_init=map_init,
                         map=random_init_map,
                         combiner=estimate_combiner,
                         reduce=estimate_reduce,
                         params=Params(k=k, seed=None, **center),
                         nr_reduces=k)

    centers = [(i, c) for i, c in result_iterator(job.wait())]
    job.purge()

    for j in range(iterations):
        job = master.new_job(name='k-clustering_iteration_%s' % (j, ),
                             input=input,
                             map_reader=map_reader,
                             map=estimate_map,
                             combiner=estimate_combiner,
                             reduce=estimate_reduce,
                             params=Params(centers=centers, **center),
                             nr_reduces=k)

        centers = [(i, c) for i, c in result_iterator(job.wait())]
        job.purge()

    return centers
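A hedged usage sketch for estimate; everything below is an assumption made for illustration (a running Disco master, a DDFS tag holding the input points, and a center dict supplying the center-definition callbacks that Params(**center) expects):

from disco.core import Disco

master = Disco("disco://localhost")      # assumed master URL
inputs = ["tag://kclustering:points"]    # assumed DDFS tag with the input records
center = {}                              # placeholder: fill in the real center-definition callbacks

centers = estimate(master, inputs, center, k=8, iterations=5)
for cluster_id, cluster_center in centers:
    print("%s %s" % (cluster_id, cluster_center))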
    def test_log_reg(self):
        # python tests_classification.py Tests_Classification.test_log_reg
        from discomll.classification import logistic_regression

        train_data1, test_data1 = datasets.breastcancer_cont_orange()
        train_data2, test_data2 = datasets.breastcancer_cont_discomll()

        learner = Orange.classification.logreg.LogRegLearner(fitter=Orange.classification.logreg.LogRegFitter_Cholesky)
        classifier = learner(train_data1)
        thetas1 = classifier.beta

        predictions1 = []
        probabilities1 = []
        for inst in test_data1:
            target, probs = classifier(inst, Orange.classification.Classifier.GetBoth)
            predictions1.append(target.value)
            probabilities1.append(probs.values())

        thetas_url = logistic_regression.fit(train_data2, alpha=1e-8, max_iterations=10)
        thetas2 = [v for k, v in result_iterator(thetas_url["logreg_fitmodel"]) if k == "thetas"]
        results_url = logistic_regression.predict(test_data2, thetas_url)

        predictions2 = []
        probabilities2 = []
        for k, v in result_iterator(results_url):
            predictions2.append(v[0])
            probabilities2.append(v[1])
        self.assertTrue(np.allclose(thetas1, thetas2))
        self.assertTrue(np.allclose(probabilities1, probabilities2, atol=1e-5))
        self.assertListEqual(predictions1, predictions2)
Example #4
    def test_simple_join(self):
        imps = Table.from_tag(IMPS)
        pix  = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(select(imps.site_id, imps.ad_id,
                                                                    where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(select(pix.site_id, pix.amount,
                                                                    where=pix.date < '2014-01-13'))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        res = select(imps.ad_id, pix.amount,
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id),
                     order_by='amount')
        results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)

        lowest = 0
        for ad_id, amount in results:
            self.assertLessEqual(lowest, amount)
            lowest = amount
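The nested loops above build the reference join in O(len(imp_sites) * len(pix_sites)). The same pairs (in a different order) can be produced with a hash join keyed on site_id; a small local sketch, independent of Hustle:

from collections import defaultdict

def hash_join(imp_sites, pix_sites):
    # bucket pixel amounts by site_id, then probe with each impression
    amounts_by_site = defaultdict(list)
    for pix_site, pix_amount in pix_sites:
        amounts_by_site[pix_site].append(pix_amount)
    return [(imp_ad_id, amount)
            for imp_site, imp_ad_id in imp_sites
            for amount in amounts_by_site.get(imp_site, [])]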
Example #6
 def runTest(self):
     ducks = ['huey', 'dewey', 'louie']
     self.job = MapResultsJob().run(input=['raw://%s' % d for d in ducks])
     self.assertAllEqual(sorted(result_iterator(self.job.wait())),
                         sorted(('%s!?' % d, '') for d in ducks))
     self.assertAllEqual(sorted(result_iterator(self.job.mapresults())),
                         sorted(('%s!' % d, '') for d in ducks))
Example #7
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix  = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(select(imps.site_id, imps.ad_id,
                                                                    where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(select(pix.site_id, pix.amount,
                                                                    where=((pix.date < '2014-01-13') &
                                                                           (pix.isActive > 0))))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id, pix.amount, pix.date,
                         where=((pix.date < '2014-01-15') & (pix.isActive > 0)),
                         nest=True)

        res = select(imps.ad_id, sub_pix.amount,
                     where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
    def test_kmeans_breastcancer(self):
        # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_breastcancer
        from discomll.clustering import kmeans
        from sklearn.cluster import KMeans

        max_iter = 10
        clusters = 2
        random_seed = 2

        x_train, _, x_test, _ = datasets.breastcancer_disc()
        train_data, test_data = datasets.breastcancer_disc_discomll()

        kmeans2 = KMeans(n_clusters=clusters,
                         max_iter=max_iter,
                         n_init=1,
                         random_state=random_seed).fit(x_train)
        centroids1 = kmeans2.cluster_centers_
        predictions1 = kmeans2.predict(x_test)

        centroids_url = kmeans.fit(train_data,
                                   n_clusters=clusters,
                                   max_iterations=max_iter,
                                   random_state=random_seed)

        predictions_url = kmeans.predict(test_data, centroids_url)
        predictions2 = [v[0] for k, v in result_iterator(predictions_url)]
        centroids2 = [
            v["x"]
            for k, v in result_iterator(centroids_url["kmeans_fitmodel"])
        ]

        centroids2[0], centroids2[1] = centroids2[1], centroids2[0]

        self.assertTrue(np.allclose(centroids1, centroids2))
Example #9
def stat(where, limit=16, **kwargs):
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    # print job_blobs
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # first we need the total, so that we can calculate weighted average
    total = float(sum(v['_'] for _, v in result_iterator(res)))
    final = defaultdict(int)
    for _, cols in result_iterator(res):
        weight = cols.pop('_') / total
        for col, card in cols.iteritems():
            final[col] += card * weight

    # round everything up to a number between 0 .. 100
    really_final = {}
    for key in final:
        card = int(final[key] * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)

    return really_final
def results_to_file(request, input_dict, output_dict, widget):
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    tag = input_dict["string"]
    folder = 'discomll_results'
    add = "add" if input_dict["add_params"] == "true" else ""

    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + add + '.txt'

    ensure_dir(destination)

    if not os.path.isfile(destination):  # file doesn't exist yet

        f = open(destination, 'w')
        if input_dict["add_params"] == "true":
            for k, v in result_iterator(tag):
                f.writelines(str(k) + " " + str(v) + "\n")
        else:
            for k, v in result_iterator(tag):
                f.writelines(str(k) + " " + str(v[0]) + "\n")
        f.close()
    filename = folder + "/" + tag[0][6:] + add + '.txt'

    output_dict['filename'] = filename
    return render(request, 'visualizations/string_to_file.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
Example #12
    def test_aggregate_join(self):
        imps = Table.from_tag(IMPS)
        pix  = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(select(imps.site_id, imps.ad_id,
                                                                    where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(select(pix.site_id, pix.amount,
                                                                    where=pix.date < '2014-01-13'))]

        join = {}
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    if imp_ad_id not in join:
                        join[imp_ad_id] = [0, 0]
                    join[imp_ad_id][0] += pix_amount
                    join[imp_ad_id][1] += 1

        res = select(imps.ad_id, h_sum(pix.amount), h_count(),
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id))
        results = [(ad_id, amount, count) for (ad_id, amount, count), _ in result_iterator(res)]
        self.assertEqual(len(results), len(join))

        for (ad_id, amount, count) in results:
            ramount, rcount = join[ad_id]
            self.assertEqual(ramount, amount)
            self.assertEqual(rcount, count)
Example #13
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id,
                   pix.amount,
                   where=((pix.date < '2014-01-13') & (pix.isActive > 0))))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id,
                         pix.amount,
                         pix.date,
                         where=((pix.date < '2014-01-15') &
                                (pix.isActive > 0)),
                         nest=True)

        res = select(imps.ad_id,
                     sub_pix.amount,
                     where=(imps.date < '2014-01-13',
                            sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [(ad_id, amount)
                   for (ad_id, amount), _ in result_iterator(res)]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Example #14
def estimate(master, input, center, k, iterations, map_reader = reader):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.
    """
    job = master.new_job(name = 'k-clustering_init',
                         input = input,
                         map_reader = map_reader,
                         map_init = map_init,
                         map = random_init_map,
                         combiner = estimate_combiner,
                         reduce = estimate_reduce,
                         params = Params(k = k, seed = None,
                                         **center),
                         nr_reduces = k)

    centers = [(i,c) for i,c in result_iterator(job.wait())]
    job.purge()

    for  j in range(iterations):
        job = master.new_job(name = 'k-clustering_iteration_%s' %(j,),
                             input = input,
                             map_reader = map_reader,
                             map = estimate_map,
                             combiner = estimate_combiner,
                             reduce = estimate_reduce,
                             params = Params(centers = centers,
                                             **center),
                             nr_reduces = k)

        centers = [(i,c) for i,c in result_iterator(job.wait())]
        job.purge()

    return centers
Example #15
 def runTest(self):
     ducks = ['huey', 'dewey', 'louie']
     self.job = MapResultsJob().run(input=['raw://{0}'.format(d) for d in ducks])
     self.assertAllEqual(sorted(result_iterator(self.job.wait())),
                         sorted(('{0}!?'.format(d), '') for d in ducks))
     self.assertAllEqual(sorted(result_iterator(self.job.stageresults("map"))),
                         sorted(('{0}!'.format(d), '') for d in ducks))
    def test_kmeans_breastcancer(self):
        # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_breastcancer
        from discomll.clustering import kmeans
        from sklearn.cluster import KMeans

        max_iter = 10
        clusters = 2
        random_seed = 2

        x_train, _, x_test, _ = datasets.breastcancer_disc()
        train_data, test_data = datasets.breastcancer_disc_discomll()

        kmeans2 = KMeans(n_clusters=clusters, max_iter=max_iter, n_init=1, random_state=random_seed).fit(x_train)
        centroids1 = kmeans2.cluster_centers_
        predictions1 = kmeans2.predict(x_test)

        centroids_url = kmeans.fit(train_data,
                                   n_clusters=clusters,
                                   max_iterations=max_iter,
                                   random_state=random_seed)

        predictions_url = kmeans.predict(test_data, centroids_url)
        predictions2 = [v[0] for k, v in result_iterator(predictions_url)]
        centroids2 = [v["x"] for k, v in result_iterator(centroids_url["kmeans_fitmodel"])]

        centroids2[0], centroids2[1] = centroids2[1], centroids2[0]

        self.assertTrue(np.allclose(centroids1, centroids2))
Example #17
    def test_aggregate_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))]

        join = {}
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    if imp_ad_id not in join:
                        join[imp_ad_id] = [0, 0]
                    join[imp_ad_id][0] += pix_amount
                    join[imp_ad_id][1] += 1

        res = select(imps.ad_id,
                     h_sum(pix.amount),
                     h_count(),
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id))
        results = [(ad_id, amount, count)
                   for (ad_id, amount, count), _ in result_iterator(res)]
        self.assertEqual(len(results), len(join))

        for (ad_id, amount, count) in results:
            ramount, rcount = join[ad_id]
            self.assertEqual(ramount, amount)
            self.assertEqual(rcount, count)
def results_to_file(request, input_dict, output_dict, widget):
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    tag = input_dict["string"]
    folder = 'discomll_results'
    add = "add" if input_dict["add_params"] == "true" else ""

    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + add + '.txt'

    ensure_dir(destination)

    if not os.path.isfile(destination):  # file doesn't exist yet
        f = open(destination, 'w')
        if input_dict["add_params"] == "true":
            for k, v in result_iterator(tag):
                f.writelines(str(k) + " " + str(v) + "\n")
        else:
            for k, v in result_iterator(tag):
                f.writelines(str(k) + " " + str(v[0]) + "\n")
        f.close()
    filename = folder + "/" + tag[0][6:] + add + '.txt'

    output_dict['filename'] = filename
    return render(request, 'visualizations/string_to_file.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
Example #19
    def test_simple_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        res = select(imps.ad_id,
                     pix.amount,
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id),
                     order_by='amount')
        results = [(ad_id, amount)
                   for (ad_id, amount), _ in result_iterator(res)]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)

        lowest = 0
        for ad_id, amount in results:
            self.assertLessEqual(lowest, amount)
            lowest = amount
Example #20
 def test_chunk(self):
     from disco.core import result_iterator
     url = 'http://discoproject.org/media/text/chekhov.txt'
     self.ddfs.chunk('disco:test:chunk', [url], chunk_size=100*1024)
     self.assert_(0 < len(list(self.ddfs.blobs('disco:test:chunk'))) <= 4)
     self.assertEqual(list(result_iterator(['tag://disco:test:chunk'])),
                      list(result_iterator([url], reader=None)))
     self.ddfs.delete('disco:test:chunk')
Example #21
 def test_chunk(self):
     from disco.core import result_iterator
     url = 'http://discoproject.org/media/text/chekhov.txt'
     self.ddfs.chunk('disco:test:chunk', [url], chunk_size=100 * 1024)
     self.assert_(0 < len(list(self.ddfs.blobs('disco:test:chunk'))) <= 4)
     self.assertEqual(list(result_iterator(['tag://disco:test:chunk'])),
                      list(result_iterator([url], reader=None)))
     self.ddfs.delete('disco:test:chunk')
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init,
                                    process=map_predict_voting if voting else map_predict_dist))]

    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_random_forest_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
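A hedged call sketch for the forest predict above; dataset and fitmodel_url are assumptions standing in for a discomll dataset object and the dict returned by the matching fit call (it must contain the "drf_fitmodel" key):

from disco.core import result_iterator

predictions_url = predict(dataset, fitmodel_url, voting=True)
for row_id, prediction in result_iterator(predictions_url):
    print("%s -> %s" % (row_id, prediction))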
    def test_lwlr(self):
        # python -m unittest tests_regression.Tests_Regression.test_lwlr
        import locally_weighted_linear_regression as lwlr1
        from discomll.regression import locally_weighted_linear_regression as lwlr2

        x_train, y_train, x_test, y_test = datasets.regression_data()
        train_data, test_data = datasets.regression_data_discomll()

        lwlr1 = lwlr1.Locally_Weighted_Linear_Regression()
        taus = [1, 10, 25]
        sorted_indices = np.argsort([str(el) for el in x_test[:, 1].tolist()])

        for tau in taus:
            thetas1, estimation1 = lwlr1.fit(x_train, y_train, x_test, tau=tau)
            thetas1, estimation1 = np.array(thetas1)[sorted_indices], np.array(
                estimation1)[sorted_indices]

            results = lwlr2.fit_predict(train_data, test_data, tau=tau)
            thetas2, estimation2 = [], []

            for x_id, (est, thetas) in result_iterator(results):
                estimation2.append(est)
                thetas2.append(thetas)

            self.assertTrue(np.allclose(thetas1, thetas2, atol=1e-8))
            self.assertTrue(np.allclose(estimation1, estimation2, atol=1e-3))
Example #24
    def run(self):

        if self.config['print_to_stdout']:

            self.job.run(input = do_split(self.config),
                     map = self.map,
                     reduce = self.reduce,
                     params = self.params,
                     map_input_stream = mongodb_input_stream,
                     required_modules= ['mongodb_io',
                                        'mongodb_input',
                                        'config_util',
                                        'mongo_util',
                                        'mongodb_output'])
            for key, value in result_iterator(self.job.wait(show=True)):
                print key, value

        else:
            self.job.run(input = do_split(self.config),
                     map = self.map,
                     reduce = self.reduce,
                     params = self.params,
                     map_input_stream = mongodb_input_stream,
                     reduce_output_stream = mongodb_output_stream,
                     required_modules= ['mongodb_io',
                                        'mongodb_input',
                                        'config_util',
                                        'mongo_util',
                                        'mongodb_output'])

            if self.config.get("job_wait",False):
                self.job.wait(show=True)
Example #25
 def test_combo_where_on_mixed_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000))))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 2)
     self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results))
Example #26
 def test_combo_where_on_or_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010)))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 27)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results))
Example #27
def load_one_dim(master, input, config_path, nr_maps=1, nr_reduces=1,\
                 load_method=offdimetlmr, dimnames= repr([]), \
                 go_live=1, profile=False):
	dim_job = master.new_job(
		name = 'dim',
		input = input,
		map_init = load_method.dim_map_init,
		map_reader = load_method.map_reader,
		map = load_method.dim_map_func,
	        partition = load_method.dim_partition_func,
		combiner = load_method.dim_combiner_func,
		reduce = load_method.dim_reduce_func,
		scheduler = {'max_cores': nr_maps},
		nr_reduces = nr_reduces,
		required_modules=[('config', config_path)],
		profile = profile,
		status_interval = 1000000,
		params = Params(count=0, dimnames=dimnames, \
	                        nr_maps=nr_maps, nr_reduces=nr_reduces)
	)
	results = dim_job.wait()
	shelvedb_paths = []
	if results!=None:
		for key,value in result_iterator(results):
			shelvedb_paths.append(key)
		if go_live==1:
			load_method.golive(config, shelvedb_paths)
Example #28
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.
    """

    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params["centers"] = [
        (i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])
    ]

    job.pipeline = [("split",
                     Stage("kmeans_predict",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=predict_map))]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")

    return job.wait(show=show)
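Putting fit and predict together, mirroring the calls made in the k-means tests elsewhere on this page; train_data and test_data are assumed discomll dataset objects, and both jobs hand back DDFS result urls that result_iterator can read:

from disco.core import result_iterator
from discomll.clustering import kmeans

fitmodel_url = kmeans.fit(train_data, n_clusters=2, max_iterations=10, random_state=0)
centroids = [v["x"] for _, v in result_iterator(fitmodel_url["kmeans_fitmodel"])]

predictions_url = kmeans.predict(test_data, fitmodel_url)
predictions = [v[0] for _, v in result_iterator(predictions_url)]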
Example #29
def auth(clazz, province, input, output, date):
    dirList      = os.listdir(input)
    ptime        = datetime.strptime(date, "%Y%m%d")
    file_filter  = ptime.strftime('%Y-%m-%d')

    input = ["file:///" + input + "/" + file for file in dirList 
            if ( re.search(date, file) or re.search(file_filter, file) )]
    if input:
        if clazz == 'c+w':
            if cw_map_funs.has_key(province):
                mapfun = cw_map_funs[province]
            else:
                mapfun = cw_map
        else:
            if fixed_map_funs.has_key(province):
                mapfun = fixed_map_funs[province]
            else:
                mapfun = fixed_map

        job = Job().run(input=input, map=mapfun)
        file = open(output + "/" + clazz + "-" + date + ".ctl", "w")
        sqldr_header(file)
        for user, line in result_iterator(job.wait(show=True)):
            print >>file, line
        file.close()
    else:
        print 'resolve.py: Cannot find any auth files.'
Example #30
	def unwrapMatrix(self, m, n, dtype=float64):
		"""
		Instantiate matrix from wrapper.
		"""
		rows = []
		cols = []
		vals = []
		for url in self.urls:
			if type(url) == list:
				# dfs protocol is a nested list
				url = url[0]
			protocol,path = url.split("://")
			if protocol == MatrixWrapper.RAW:
				elems = path.split(";")
				for elem in elems:
					i,j,val = elem.split(",")
					rows.append(int(i))
					cols.append(int(j))
					vals.append(dtype(val))
			elif protocol == MatrixWrapper.DIR:
				total = 0
				for key, val in result_iterator([url]):
					elems = key.split(";")
					for elem in elems:
						i,j,val = elem.split(",")
						rows.append(int(i))
						cols.append(int(j))
						vals.append(dtype(val))
						total += 1
						assert total <= m*n, "cardinality of result set exceeds %dx%d=%d entries" % (m,n,m*n)
			elif protocol == MatrixWrapper.DFS:
				raise Exception('dfs protocol not supported yet')
			else:
				raise Exception('invalid protocol')
		return coo_matrix((vals,(rows,cols)), dtype=dtype, dims=(m,n))
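The RAW protocol parsed above packs i,j,val triples into the url body, separated by semicolons. A small local round-trip sketch of that encoding (plain Python; MatrixWrapper itself is not needed):

def encode_raw(triples):
    # build a raw:// url from (i, j, val) triples, the format unwrapMatrix parses
    return "raw://" + ";".join("%d,%d,%s" % (i, j, v) for i, j, v in triples)

def decode_raw(url):
    protocol, path = url.split("://")
    assert protocol == "raw"
    return [(int(i), int(j), float(v))
            for i, j, v in (elem.split(",") for elem in path.split(";"))]

print(decode_raw(encode_raw([(0, 0, 1.5), (1, 2, -3.0)])))   # [(0, 0, 1.5), (1, 2, -3.0)]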
    def test_lwlr(self):
        # python -m unittest tests_regression.Tests_Regression.test_lwlr
        import locally_weighted_linear_regression as lwlr1
        from discomll.regression import locally_weighted_linear_regression as lwlr2

        x_train, y_train, x_test, y_test = datasets.regression_data()
        train_data, test_data = datasets.regression_data_discomll()

        lwlr1 = lwlr1.Locally_Weighted_Linear_Regression()
        taus = [1, 10, 25]
        sorted_indices = np.argsort([str(el) for el in x_test[:, 1].tolist()])

        for tau in taus:
            thetas1, estimation1 = lwlr1.fit(x_train, y_train, x_test, tau=tau)
            thetas1, estimation1 = np.array(thetas1)[sorted_indices], np.array(estimation1)[sorted_indices]

            results = lwlr2.fit_predict(train_data, test_data, tau=tau)
            thetas2, estimation2 = [], []

            for x_id, (est, thetas) in result_iterator(results):
                estimation2.append(est)
                thetas2.append(thetas)

            self.assertTrue(np.allclose(thetas1, thetas2, atol=1e-8))
            self.assertTrue(np.allclose(estimation1, estimation2, atol=1e-3))
    def test_kmeans_iris(self):
        # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_iris
        from discomll.clustering import kmeans
        from sklearn.cluster import KMeans

        max_iter = 10
        clusters = 3
        random_seed = 0

        x_train, y_train, x_test, y_test = datasets.iris()
        train_data, test_data = datasets.iris_discomll()

        sk_kmeans = KMeans(n_clusters=clusters, max_iter=max_iter, n_init=1, random_state=random_seed).fit(x_train)
        centroids1 = sk_kmeans.cluster_centers_
        # predictions1 = sk_kmeans.predict(x_test)

        centroids_url = kmeans.fit(train_data,
                                   n_clusters=clusters,
                                   max_iterations=max_iter,
                                   random_state=random_seed)

        predictions_url = kmeans.predict(test_data, centroids_url)
        # predictions2 = [v[1] for k,v in result_iterator(predictions_url)]

        centroids2 = [v["x"] for k, v in result_iterator(centroids_url["kmeans_fitmodel"])]
        centroids2[0], centroids2[2] = centroids2[2], centroids2[0]
        self.assertTrue(np.allclose(centroids1, centroids2))
Example #33
def dump(result_urls, width=80):
    """
    Dump the results of a non-nested query.

    :type result_urls: sequence of strings
    :param result_urls: result of an (unnested) query

    :type width: int
    :param width: the number of columns to constrain output to
    """
    from disco.core import result_iterator
    alignments = None
    for columns, _ in result_iterator(result_urls):
        if not alignments:
            alignments = []
            for column in columns:
                try:
                    float(column)
                    alignments.append(_ALG_RIGHT)
                except:
                    alignments.append(_ALG_LEFT)

        _print_line(columns,
                    width=width,
                    cols=len(alignments),
                    alignments=alignments)
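A hedged usage sketch: dump pairs naturally with an unnested Hustle select, as in the query examples elsewhere on this page (the imports, table and columns below are illustrative assumptions):

from hustle import Table, select

imps = Table.from_tag(IMPS)
dump(select(imps.ad_id, imps.date, imps.cpm_millis,
            where=imps.date == '2014-01-27'),
     width=100)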
Example #34
    def run(self):

        if self.config['print_to_stdout']:

            self.job.run(input=do_split(self.config),
                         map=self.map,
                         reduce=self.reduce,
                         params=self.params,
                         map_input_stream=mongodb_input_stream,
                         required_modules=[
                             'mongodisco.mongodb_io',
                             'mongodisco.mongodb_input',
                             'mongodisco.mongo_util',
                             'mongodisco.mongodb_output'
                         ])
            for key, value in result_iterator(self.job.wait(show=True)):
                print key, value

        else:
            self.job.run(input=do_split(self.config),
                         map=self.map,
                         reduce=self.reduce,
                         params=self.params,
                         map_input_stream=mongodb_input_stream,
                         reduce_output_stream=mongodb_output_stream,
                         required_modules=[
                             'mongodisco.mongodb_io',
                             'mongodisco.mongodb_input',
                             'mongodisco.mongo_util',
                             'mongodisco.mongodb_output'
                         ])

            if self.config.get("job_wait", False):
                self.job.wait(show=True)
Example #35
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.
    """

    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params["centers"] = [(i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])]

    job.pipeline = [
        (
            "split",
            Stage("kmeans_predict", input_chain=dataset.params["input_chain"], init=simple_init, process=predict_map),
        )
    ]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")

    return job.wait(show=show)
Example #36
 def test_single_int_order(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', order_by=imps.cpm_millis)
     lowest = 0
     for (a, d, c), _ in result_iterator(res):
         self.assertLessEqual(lowest, c)
         lowest = c
Example #37
 def test_overall(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True, limit=4,
                  order_by='ad_id', desc=True)
     results = [a for (a, d), _ in result_iterator(res)]
     self.assertEqual(len(results), 4)
     self.assertListEqual(results, [30019, 30018, 30017, 30015])
    def test_kmeans_iris(self):
        # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_iris
        from discomll.clustering import kmeans
        from sklearn.cluster import KMeans

        max_iter = 10
        clusters = 3
        random_seed = 0

        x_train, y_train, x_test, y_test = datasets.iris()
        train_data, test_data = datasets.iris_discomll()

        sk_kmeans = KMeans(n_clusters=clusters,
                           max_iter=max_iter,
                           n_init=1,
                           random_state=random_seed).fit(x_train)
        centroids1 = sk_kmeans.cluster_centers_
        # predictions1 = sk_kmeans.predict(x_test)

        centroids_url = kmeans.fit(train_data,
                                   n_clusters=clusters,
                                   max_iterations=max_iter,
                                   random_state=random_seed)

        predictions_url = kmeans.predict(test_data, centroids_url)
        # predictions2 = [v[1] for k,v in result_iterator(predictions_url)]

        centroids2 = [
            v["x"]
            for k, v in result_iterator(centroids_url["kmeans_fitmodel"])
        ]
        centroids2[0], centroids2[2] = centroids2[2], centroids2[0]
        self.assertTrue(np.allclose(centroids1, centroids2))
Example #39
def _linreg_model(fitmodel):
    output = "Linear regression model\n\n"
    for k, v in result_iterator(fitmodel):
        if k == "thetas":
            output += "Thetas\n"
            output += ", ".join(map(str, v)) + "\n\n"
    return output
Example #40
def measure(test_data, predictions, measure="ca", save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=test_data.params["input_chain"], init=simple_init, process=map_test_data))]

    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_process = reduce_ca if measure == "ca" else reduce_mse

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", init=simple_init, input_chain=[task_input_stream, chain_reader],
                                    process=map_predictions)),
                    ('group_all', Stage("reduce", init=simple_init, process=reduce_process, sort=True, combine=True))]

    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    measure, acc = [(measure, acc) for measure, acc in result_iterator(job.wait(show=show))][0]
    return measure, acc
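A hedged call sketch for the function above; test_data is an assumed discomll dataset object (with id_index set) and predictions the result urls of a completed predict job:

measure_name, acc = measure(test_data, predictions, measure="ca", show=True)
print("%s = %s" % (measure_name, acc))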
Example #41
def _linsvm_model(fitmodel):
    output = "Linear SVM model\n\n"
    for k, v in result_iterator(fitmodel):
        if k == "params":
            output += "Parameters\n"
            output += ", ".join(map(str, v)) + "\n\n"
    return output
def bigdata_mse(request, input_dict, output_dict, widget):
    from discomll.utils import accuracy
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    folder = 'discomll_measures'
    tag = input_dict["predictions"]
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)

    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif not os.path.isfile(destination):  # file doesn't exist yet
        results = accuracy.measure(test_data=input_dict["dataset"],
                                   predictions=input_dict["predictions"],
                                   measure="mse")
        string = "Mean squared error\n"
        for k, v in result_iterator(results):
            string += str(v) + "\n"
        input_dict["string"] = string

        f = open(destination, 'w')
        f.write(str(v))
        f.close()
    else:
        string = "Mean squared error\n"
        f = open(destination, 'r')
        input_dict["string"] = string + str(f.readlines()[0])
        f.close()

    return render(request, 'visualizations/display_string.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
Example #43
def nnz(disco, m, n, A):
	"""
	Return the number of nonzero matrix elements.
	@param m Number of rows of matrix op(A).
	@param n Number of columns of matrix op(A).
	@param A MatrixWrapper object encapsulating matrix A.
	@return An integer.
	"""
	def _mapNnz(e, params):
		nnz = 0
		if type(e) == tuple:
			e = e[0]
		elems = e.split(";")
		for elem in elems:
			i, j, val = elem.split(",")
			assert int(i) < params.m, "row index %d exceeds matrix dimensions" % int(i)
			assert int(j) < params.n, "col index %d exceeds matrix dimensions" % int(j)
			nnz += 1
		return [("nnz", nnz)]

	from disco.core import Params, result_iterator
	from disco.func import chain_reader
	from matrixWrap import MatrixWrapper
	jobA = disco.new_job(input=A.urls, name="nnz", map_reader=A.mapReader, params=Params(m=m, n=n), map=_mapNnz)
	res = jobA.wait(clean=False)
	nnz = 0
	for k, v in result_iterator(res):
		nnz += int(v)
	# clean up
	jobA.purge()
	return nnz 
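The mapper above assumes every input record is a semicolon-separated run of i,j,val triples and simply counts the stored elements. A local sketch of that per-record count (the sample record is made up):

def count_stored_elements(record):
    # record format assumed by _mapNnz: "i,j,val;i,j,val;..."
    return len([elem for elem in record.split(";") if elem])

print(count_stored_elements("0,0,1.5;0,2,-3.0;1,1,2.25"))   # 3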
Example #44
def sync_dims_across_servers(results):
    if "DISCO_CONFIG" not in os.environ:
        print >> sys.stderr, "Specify DISCO_CONFIG (data lives at $DISCO_ROOT/data)"
        sys.exit(1)
    conf = os.environ["DISCO_CONFIG"]
    f = open(conf, 'r')
    line = f.readline()
    serverlist = eval(line.rstrip('\n'))
    servers = set()
    for addr, nodes in serverlist:
        if int(nodes):
            if addr in ['127.0.0.1', 'localhost']:
                addr = socket.getfqdn()
            servers.add(addr)
    prefilldim_addr = pre_fill_dimensions()
    for path, addr in prefilldim_addr.iteritems():
        targetservers = servers - set([addr])
        for target in targetservers:
            scp_file(path, target)

    for path, addr in result_iterator(results):
        if addr in ['127.0.0.1', 'localhost']:
            addr = socket.getfqdn()
        targetservers = servers - set([addr])
        for target in targetservers:
            scp_file(path, target)
Example #46
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linsvm_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params
    job.params["fit_params"] = [v for _, v in result_iterator(fitmodel_url["linsvm_fitmodel"])][0]
    job.run(name="linsvm_predict", input=dataset.params["data_tag"])

    return job.wait(show=show)
Example #47
 def test_combo_where_on_or_partition_ex2(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) & (imps.ad_id << [30003, 30010])))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 1)
     self.assertTrue(all(d == '2014-01-21' and a == 30010 for a, d, _ in results))
Example #49
 def test_equality_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 10)
     found = next(((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506), None)
     self.assertIsNotNone(found)
     self.assertTrue(all(d == '2014-01-27' for _, d, _ in results))
Example #50
    def test_simple_aggregation(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')
        results = [c for c, _ in result_iterator(res)]

        sum_millis = {}
        for ad_id, millis in results:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')
        results = [c for c, _ in result_iterator(res)]
        for ad_id, millis, count in results:
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
Example #51
def _logreg_model(fitmodel):
    output = "Logistic regression model\n\n"
    for k, v in result_iterator(fitmodel):
        if k == "thetas":
            output += "Thetas\n"
            output += ", ".join(map(str, v)) + "\n\n"
        elif k == "J":
            output += "J cost function\n"
            output += str(v) + "\n\n"
    return output
Example #52
    def test_multiple_group_bys(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22')
        results = [c for c, _ in result_iterator(res)]

        sum_millis = {}
        for ad_id, dt, millis in results:
            key = str(ad_id) + dt
            if key not in sum_millis:
                sum_millis[key] = [0, 0]
            sum_millis[key][0] += millis
            sum_millis[key][1] += 1

        res = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), where=imps.date > '2014-01-22')
        results = [c for c, _ in result_iterator(res)]
        for ad_id, dt, millis, count in results:
            ad_tup = sum_millis[str(ad_id) + dt]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
    def test_corrupt(self):
        def corrupt_reader(fd, size, url, params):
            yield 'hello'
            if 'corrupt' in url:
                raise DataError("Corrupt!", url)
            yield 'there'

        self.assertAllEqual(
            result_iterator([['raw://corrupt'] * 9 + ['raw://decent']],
                            reader=corrupt_reader), ['hello', 'there'])
Example #54
def stat(where, limit=16, **kwargs):
    """
    Fetch statistical information of a collection of selected `Table <hustle.Table>`.

    :type where: sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

    :type limit: int
    :param limit: the maximum number of blobs from the where clause, default value is 16

    Return a dict of column key cardinalities [0-100] for indexed columns in a table
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    # print job_blobs
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # first we need the total, so that we can calculate weighted average
    total = float(sum(v['_'] for _, v in result_iterator(res)))
    final = defaultdict(int)
    for _, cols in result_iterator(res):
        weight = cols.pop('_') / total
        for col, card in cols.iteritems():
            final[col] += card * weight

    # round everything up to a number between 0 .. 100
    really_final = {}
    for key in final:
        card = int(final[key] * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)

    return really_final
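The weighting step can be checked locally: every partition reports its row count under the '_' key plus per-column cardinality fractions, and the final figure is the row-count-weighted mean scaled to 0..100. A small sketch with made-up partition stats:

from collections import defaultdict

# hypothetical per-partition stats, as they would come out of result_iterator(res);
# '_' is the partition row count, the other keys are cardinality fractions in [0, 1]
partitions = [
    {'_': 600.0, 'ad_id': 0.20, 'site_id': 0.90},
    {'_': 400.0, 'ad_id': 0.10, 'site_id': 0.95},
]

total = float(sum(p['_'] for p in partitions))
final = defaultdict(int)
for cols in partitions:
    cols = dict(cols)                  # copy so the mock rows are not mutated
    weight = cols.pop('_') / total
    for col, card in cols.items():
        final[col] += card * weight

really_final = dict((col, int(card * 100)) for col, card in final.items() if int(card * 100) > 0)
really_final['_'] = int(total)
print(really_final)                    # ad_id: 16, site_id: 92, '_': 1000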