Ejemplo n.º 1
0
def measure(test_data,
            predictions,
            measure="ca",
            save_results=True,
            show=False):
    """
    Function starts two jobs that compare predictions with the test data

    The first job parses the test data with map_test_data. The second job
    reads the parsed test data together with the predictions, maps them with
    map_predictions and reduces them with reduce_ca or reduce_mse.

    Parameters
    ----------
    test_data - dataset object; its params must provide "id_index",
        "input_chain" and "data_tag"
    predictions - list of job inputs with predictions (it is concatenated
        with the results of the first job)
    measure - "ca" selects reduce_ca, "mse" selects reduce_mse
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    The first (key, value) pair produced by the reduce stage, or
    ("No predictions", None) if predictions is empty

    Raises
    ------
    Exception - if measure is not "ca" or "mse", or if id_index is -1
    """
    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if not predictions:
        return "No predictions", None

    # disco is imported only after the arguments were validated, so invalid
    # input is rejected without needing the cluster libraries.
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    # first job: parse the test data (results are saved to ddfs on request)
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=test_data.params["input_chain"],
                           init=simple_init,
                           process=map_test_data))]

    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_process = reduce_ca if measure == "ca" else reduce_mse

    # second job: join parsed test data with predictions in one reducer
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           init=simple_init,
                           input_chain=[task_input_stream, chain_reader],
                           process=map_predictions)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_process,
                           sort=True,
                           combine=True))]

    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    # only the first emitted pair is reported; local names are distinct from
    # the ``measure`` parameter so it is not overwritten
    results = list(result_iterator(job.wait(show=show)))
    measure_name, acc = results[0]
    return measure_name, acc
Ejemplo n.º 2
0
def get(program, key, jobname):
    """Usage: key jobname

    Print the oob value for the given key and jobname.
    """
    from disco.core import Job
    # Look the job up by name on program.disco and fetch the value stored
    # under ``key``.
    value = Job(program.disco, jobname).oob_get(key)
    # Call form of print: same output on Python 2, valid syntax on Python 3.
    print(value)
Ejemplo n.º 3
0
    def __init__(self, config, map, reduce):
        """Store the job configuration and the map/reduce callables.

        ``config`` is applied on top of a copy of DiscoJob.DEFAULT_CONFIG and
        the merged mapping is also expanded into keyword arguments for Params.
        """
        merged = DiscoJob.DEFAULT_CONFIG.copy()
        merged.update(config)
        self.config = merged

        self.map, self.reduce = map, reduce
        self.job = Job()
        self.params = Params(**merged)
Ejemplo n.º 4
0
def oob(program, jobname):
    """Usage: jobname

    Print the oob keys for the named job.
    """
    from disco.core import Job
    job = Job(program.disco, jobname)
    for key in job.oob_list():
        # Call form of print: same output on Python 2, valid on Python 3.
        print(key)
Ejemplo n.º 5
0
def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    """Run the 'naive_bayes_predict' job and return what job.wait() returns.

    ``ys`` is turned into a dict that maps every element to 1 and is passed,
    together with ``loglikelihoods`` and ``splitter``, to predict_map through
    the job Params. The job is run with clean=False.
    """
    class_ids = dict.fromkeys(ys, 1)

    job = Job(name='naive_bayes_predict')
    job_params = Params(loglikelihoods=loglikelihoods,
                        ys=class_ids,
                        splitter=splitter)
    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=job_params,
            clean=False)
    return job.wait()
Ejemplo n.º 6
0
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier; must convert to a float > 0
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs, as {"linsvm_fitmodel": urls}

    Raises
    ------
    Exception - if the dataset has an empty y_map, or if nu is not numerical
        or not greater than 0
    """
    if dataset.params["y_map"] == []:
        raise Exception(
            "Linear proximal SVM requires a target label mapping parameter.")
    try:
        nu = float(nu)
    except (TypeError, ValueError):
        # float(None) and similar raise TypeError rather than ValueError;
        # both mean the caller did not pass a number.
        raise Exception("Parameter should be numerical.")
    if nu <= 0:
        raise Exception("Parameter nu should be greater than 0")

    # disco is imported only after the arguments were validated, so invalid
    # input is rejected without needing the cluster libraries.
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers and joins them with one reducer
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           combine=True))]

    # NOTE: job.params is the dataset's own params object, so "nu" is stored
    # on the dataset as well.
    job.params = dataset.params
    job.params["nu"] = nu
    job.run(name="linearsvm_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"linsvm_fitmodel": fitmodel_url}  # return results url
Ejemplo n.º 7
0
def process_featurization_with_disco(input_list, params, partitions=4):
    """Run the featurization job and return an iterator over its results.

    Called from within featurize_in_parallel.

    Arguments:
        input_list: path to file listing filename,class_name for each
            individual time series data file.
        params: dictionary of parameters to be passed to each map & reduce
            function.
        partitions: Number of nodes/partitions in system.

    Returns:
        disco.core.result_iterator over the finished job's results.
    """
    from disco.core import Job, result_iterator

    running_job = Job().run(input=input_list,
                            map=map,
                            partitions=partitions,
                            reduce=featurize_reduce,
                            params=params)

    # wait(show=True) prints job progress while blocking until completion
    return result_iterator(running_job.wait(show=True))
Ejemplo n.º 8
0
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that maps the dataset with map_predict

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - mapping that must contain the key "linreg_fitmodel"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait() for the "linreg_predict" job
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    predict_stage = Stage("map",
                          input_chain=dataset.params["input_chain"],
                          init=simple_init,
                          process=map_predict)
    job.pipeline = [("split", predict_stage)]

    job.params = dataset.params
    # the first value stored in the fit model is used as thetas
    model_values = [
        value
        for _, value in result_iterator(fitmodel_url["linreg_fitmodel"])
    ]
    job.params["thetas"] = model_values[0]

    job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
Ejemplo n.º 9
0
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase; must contain "logreg_fitmodel"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")
    if "logreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # a single map stage: the mappers are executed in parallel
    predict_stage = Stage("map",
                          input_chain=dataset.params["input_chain"],
                          init=simple_init,
                          process=map_predict)
    job.pipeline = [("split", predict_stage)]

    job.params = dataset.params  # job parameters (dataset object)
    # thetas are loaded from ddfs: keep only values stored under "thetas"
    thetas = [
        value
        for key, value in result_iterator(fitmodel_url["logreg_fitmodel"])
        if key == "thetas"
    ]
    job.params["thetas"] = thetas[0]

    job.run(name="logreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
Ejemplo n.º 10
0
def fit(dataset, save_results=True, show=False):
    """
    Function starts the "linreg_fit" job: map_fit mappers and one reduce_fit reducer

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary {"linreg_fitmodel": result of job.wait()}
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce",
                         init=simple_init,
                         process=reduce_fit,
                         combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = dataset.params
    job.run(name="linreg_fit", input=dataset.params["data_tag"])

    return {"linreg_fitmodel": job.wait(show=show)}  # return results url
Ejemplo n.º 11
0
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    """
    Function starts a job that maps the dataset with map_predict using a fitted forest

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - mapping that must contain the key "dwf_fitmodel"
    coeff - numerical value, stored as job parameter "coeff"; negative
        values are rejected
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait(), or [] (after printing a warning) when the loaded
    model contains an empty "forest"

    Raises
    ------
    Exception - if the fit model is incorrect or coeff is invalid
    """
    if "dwf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
    except (TypeError, ValueError):
        # float(None) and similar raise TypeError rather than ValueError
        raise Exception("Parameter coeff should be numerical.")
    # NOTE(review): the check accepts 0 although the message says
    # "greater than 0" - confirm which of the two is intended.
    if coeff < 0:
        raise Exception("Parameter coeff should be greater than 0.")

    # disco/discomll are imported only after the arguments were validated
    import os
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # <discomll package dir>/ensemble/core/ with a trailing separator, so
    # file names can be appended directly
    path = os.path.join(
        os.path.dirname(discomll.__file__), "ensemble", "core", "")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    job.params = dataset.params
    job.params["coeff"] = coeff
    # every key/value pair of the fit model becomes a job parameter
    for k, v in result_iterator(fitmodel_url["dwf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_weighted_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Ejemplo n.º 12
0
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions and one reduce function which aggregates intermediate results and returns a model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # the worker decides whether the results are saved to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # mappers run in parallel; their sorted output is joined by one reducer
    map_stage = Stage("map",
                      input_chain=dataset.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce",
                         init=simple_init,
                         process=reduce_fit,
                         sort=True,
                         combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = dataset.params  # job parameters (dataset object)
    # job name and the urls of the input data
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    return {"naivebayes_fitmodel": job.wait(show=show)}  # return results url
Ejemplo n.º 13
0
def _fit_predict(fit_data, samples, tau, save_results, show):
    """
    Function starts the "lwlr_fit_predict" job and returns the result of job.wait()

    Parameters
    ----------
    fit_data - dataset object with input urls and other parameters
    samples - stored as job parameter "samples"
    tau - stored as job parameter "tau"
    save_results - save results to ddfs
    show - show info about job execution
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map",
                      input_chain=fit_data.params["input_chain"],
                      init=simple_init,
                      process=map_fit)
    reduce_stage = Stage("reduce",
                         init=simple_init,
                         process=reduce_fit,
                         sort=True,
                         combine=True)
    job.pipeline = [("split", map_stage), ('group_all', reduce_stage)]

    job.params = fit_data.params
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    return job.wait(show=show)
def predict(dataset,
            fitmodel_url,
            voting=False,
            save_results=True,
            show=False):
    """
    Function starts a job that maps the dataset using a fitted forest

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - mapping that must contain the key "drf_fitmodel"
    voting - if true the map stage uses map_predict_voting, otherwise
        map_predict_dist
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Result of job.wait(), or [] (after printing a warning) when the loaded
    model contains an empty "forest"

    Raises
    ------
    Exception - if fitmodel_url does not contain "drf_fitmodel"
    """
    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    # disco/discomll are imported only after the arguments were validated
    import os
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # <discomll package dir>/ensemble/core/ with a trailing separator, so
    # file names can be appended directly
    path = os.path.join(
        os.path.dirname(discomll.__file__), "ensemble", "core", "")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=simple_init,
               process=map_predict_voting if voting else map_predict_dist))
    ]

    job.params = dataset.params
    # every key/value pair of the fit model becomes a job parameter
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_random_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Ejemplo n.º 15
0
        yield strippedWord, 1


def reduce(iter, params):
    """Yield (word, total) for every distinct word in the (word, count) pairs.

    The pairs are sorted first so that kvgroup sees equal words side by side.
    """
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for word, counts in kvgroup(ordered_pairs):
        total = sum(counts)
        yield word, total


if __name__ == '__main__':
    job = Job().run(input="There are known knowns.\
                           These are things we know that we know.\
                           There are known unknowns. \
                           That is to say,\
                           there are things that \
                           we know we do not know.\
                           But there are also unknown unknowns.\
                           There are things \
                           we do not know we don't know",
                    map=map,
                    reduce=reduce)


    sort_in_numerical_order =\
                            open('SortNumerical.txt', 'w')
    sort_in_alpbabetically_order = \
                                 open('SortAlphabetical.txt', 'w')

    wordCount = []
    for word, count in \
        result_iterator(job.wait(show=True)):
    tweeter, tweet = count_tweet_words.get_username_tweet(line)
    # return each word in the tweet (to count frequency of each term)
    for word in tweet.split():
        yield word, 1


def reduce(iter, params):
    """Yield (word, total) for every distinct word in the (word, count) pairs.

    The pairs are sorted first so that kvgroup sees equal words side by side.
    """
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for word, counts in kvgroup(ordered_pairs):
        total = sum(counts)
        yield word, total


if __name__ == '__main__':
    input_filename = "./tweet_data/tweets_357.json"
    #input_filename = "./tweet_data/tweets_859157.json"
    #input_filename = "/media/3TBStorage/tweets_all.json"

    # we need a fully qualified file name for the server
    fully_qualified_path = os.path.realpath(input_filename)
    input = [fully_qualified_path]

    # import this module so pickle knows what to send to workers
    import count_tweet_words

    job = Job().run(input=input, map=map, reduce=reduce)

    # One JSON array [word, count] per line. The context manager closes (and
    # flushes) the file even if reading the job results fails part-way.
    with open(OUTPUT_FILENAME, 'w') as out:
        for word, count in result_iterator(job.wait(show=True)):
            out.write(json.dumps([word, count]) + '\n')
Ejemplo n.º 17
0

if __name__ == "__main__":
    # Directory containing this script, with a trailing slash so relative
    # path fragments can be appended by plain concatenation.
    script_directory = os.path.dirname(os.path.realpath(__file__))
    current_directory = os.path.realpath(script_directory) + "/"
    input_directory = os.path.normpath(current_directory +
                                       "../../generate/target/")

    # Inputs are the regular files in input_directory named "logs-...".
    pattern = re.compile("^logs-")
    input_files = []
    for entry in os.listdir(input_directory):
        if pattern.match(entry) and os.path.isfile(
                os.path.join(input_directory, entry)):
            input_files.append(input_directory + "/" + entry)

    # log.py is shipped to the workers together with the job
    model_file = os.path.normpath(current_directory + "../model/log.py")
    job = Job().run(required_files=[model_file],
                    input=input_files,
                    map=map,
                    reduce=reduce)

    data = []
    timestamp = int(time.time())
    # NOTE(review): in the visible code ``datapoint`` is built but never
    # appended to ``data`` - confirm against the full script.
    for word, count in result_iterator(job.wait(show=True)):
        tags = {"provider": word[0], "channel": word[1]}
        datapoint = {
            "metric": "provider.channel.bps",
            "timestamp": timestamp,
            "value": count,
            "tags": tags,
        }
Ejemplo n.º 18
0
def fit(dataset,
        trees_per_chunk=1,
        bootstrap=True,
        max_tree_nodes=50,
        min_samples_leaf=10,
        min_samples_split=5,
        class_majority=1,
        separate_max=True,
        measure="info_gain",
        accuracy=1,
        random_state=None,
        save_results=True,
        show=False):
    """
    Function starts a job that fits decision trees on the chunks of the dataset

    The map stage uses map_fit_bootstrap when bootstrap is True and map_fit
    otherwise; one reducer (reduce_fit) joins the results. All tuning
    arguments are passed to the job through job.params.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    trees_per_chunk - integer > 0; must be 1 when bootstrap is False
    bootstrap - boolean, selects the map function
    max_tree_nodes - integer or None
    min_samples_leaf - integer > 0
    min_samples_split - integer > 0
    class_majority - number > 0
    separate_max - passed through unchanged as job parameter "separate_max"
    measure - "info_gain" or "mdl"
    accuracy - integer >= 0
    random_state - stored as job parameter "seed"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary {"fddt_fitmodel": result of job.wait()}

    Raises
    ------
    Exception - if any argument fails the validation described above
    """
    try:
        trees_per_chunk = int(trees_per_chunk)
        if max_tree_nodes is not None:
            max_tree_nodes = int(max_tree_nodes)
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
    except (TypeError, ValueError):
        # int(None) and similar raise TypeError rather than ValueError
        raise Exception("Parameters should be numerical.")

    if trees_per_chunk > 1 and bootstrap == False:
        raise Exception(
            "Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap."
        )
    # Every limit is checked on its own: ``a or b and c`` would only reject
    # min_samples_split together with accuracy, because ``and`` binds tighter.
    if (trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0
            or min_samples_split <= 0 or accuracy < 0):
        raise Exception("Parameters should be greater than 0.")
    if not isinstance(bootstrap, bool):
        raise Exception("Parameter bootstrap should be a boolean.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    # disco/discomll are imported only after the arguments were validated
    import os
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll

    # <discomll package dir>/ensemble/core/ with a trailing separator, so
    # file names can be appended directly
    path = os.path.join(
        os.path.dirname(discomll.__file__), "ensemble", "core", "")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=map_init,
               process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all',
         Stage("reduce", init=simple_init, process=reduce_fit, combine=True))
    ]

    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url
 def runTest(self):
     """Run a job against the server started by startServer and check its output."""
     server_thread = threading.Thread(target=startServer)
     server_thread.start()
     # same host as the disco master, but on the test server's PORT
     master_host = self.disco.master.split(':')[1]
     server_url = 'http:' + master_host + ":" + str(PORT)
     self.job = Job().run(input=[server_url], map=map, reduce=reduce)
     expected = [(b'Hello', 1), (b'World', 1)]
     self.assertEqual(sorted(self.results(self.job)), expected)
Ejemplo n.º 20
0
                neighbors = v
        score = 1 - d + d * sum_v
        yield node_id, str(node_id) + " " + str(score) + " " + neighbors


if __name__ == '__main__':
    parser = OptionParser(usage='%prog [options] inputs')
    parser.add_option('--iterations', default=10, help='Numbers of iteration')
    parser.add_option(
        '--damping-factor',
        default=0.85,
        help='probability a web surfer will continue clicking on links')

    (options, input) = parser.parse_args()

    # The positional arguments are the inputs of the first iteration; every
    # later iteration reads the results of the previous job.
    results = input

    params = Params(damping_factor=float(options.damping_factor))

    for j in range(int(options.iterations)):
        job = Job().run(input=results,
                        map=send_score,
                        map_reader=chain_reader,
                        reduce=receive_score,
                        params=params)
        results = job.wait()

    # Print the first two whitespace-separated fields of every node as
    # "<field0> : <field1>".
    for _, node in result_iterator(results):
        fields = node.split()
        # Single formatted argument: same text on Python 2, valid syntax on
        # Python 3.
        print("%s : %s" % (fields[0], fields[1]))
Ejemplo n.º 21
0
from disco.core import Job, result_iterator


def map(line, params):
    """Yield (word, 1) for every whitespace-separated word of ``line``."""
    tokens = line.split()
    for token in tokens:
        yield (token, 1)


def reduce(iter, params):
    """Yield (word, total) for every distinct word in the (word, count) pairs.

    The pairs are sorted first so that kvgroup sees equal words side by side.
    """
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for word, counts in kvgroup(ordered_pairs):
        total = sum(counts)
        yield word, total


if __name__ == '__main__':
    # Run the word count over the single erl:// input and print one
    # "<word> <count>" line per result.
    job = Job().run(input=["erl://erl_inputs:test/dummy"],
                    map=map,
                    reduce=reduce)
    for word, count in result_iterator(job.wait(show=True)):
        # Single formatted argument: same text on Python 2, valid syntax on
        # Python 3.
        print("%s %s" % (word, count))
Ejemplo n.º 22
0
def map(line, params):
    """Yield (char, 1) for every character of ``line`` that lower-cases to a-z."""
    lowered = line.lower()
    for char in lowered:
        if 'a' <= char <= 'z':
            yield (char, 1)

def reduce(iter, params):
    """Yield (char, total) for every distinct char in the (char, count) pairs.

    The pairs are sorted first so that kvgroup sees equal chars side by side.
    """
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for char, counts in kvgroup(ordered_pairs):
        total = sum(counts)
        yield char, total

# run the disco job
from disco.core import Job, result_iterator
job = Job().run(input=["http://en.wikipedia.org/wiki/MapReduce"], map=map, reduce=reduce)

# plot the results with matplotlib
#%matplotlib inline
# xs are the keys (characters), ys the matching values (counts)
xs, ys = zip(*result_iterator(job.wait()))
# arange/array come from numpy directly: the aliases of these functions in
# the top-level scipy namespace are deprecated
import numpy
from matplotlib import pylab
x = numpy.arange(len(xs))
y = numpy.array(ys)
f = pylab.figure()
ax = f.add_axes([0, 0, 3, 1])
# one bar per key, labelled with the key itself
ax.bar(x, y, align='center')
ax.set_xticks(x)
ax.set_xticklabels(xs)
f.show()
Ejemplo n.º 23
0
# This program estimates the value of pi (3.14...)
# Usage:
# python estimate_pi.py

from disco.core import Job, result_iterator


def map(line, params):
    """Draw one random point in the unit square and yield (0, hit).

    ``hit`` is 1 when x*x + y*y < 1 and 0 otherwise; ``line`` and ``params``
    are not used.
    """
    from random import random
    x = random()
    y = random()
    inside = x * x + y * y < 1
    yield 0, (1 if inside else 0)


if __name__ == '__main__':
    # One map call per input: COUNT random points are sampled in total.
    COUNT = 5000
    job = Job().run(input=["raw://0"] * COUNT, map=map)
    # Every result value is 1 for a hit and 0 for a miss.
    tot = sum(v for _, v in result_iterator(job.wait()))
    # The division belongs inside the call: ``print(...) / COUNT`` divides
    # the return value of print() on Python 3.
    print(4.0 * tot / COUNT)
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


if __name__ == '__main__':
    #input_filename = "./tweet_data/tweets_357.json"
    #input_filename = "./tweet_data/tweets_859157.json"
    #input_filename = "/media/3TBStorage/tweets_all.json"

    # we need a fully qualified file name for the server
    #fully_qualified_path = os.path.realpath(input_filename)
    #input = [fully_qualified_path]

    #input = ["tag://data:tweets357"]
    input = ["tag://data:tweets859157xa"]

    # import this module so pickle knows what to send to workers
    import count_tweet_words

    job = Job().run(input=input,
                    map=map,
                    map_reader=chain_reader,
                    reduce=reduce,
                    combiner=sum_combiner)

    # One JSON array [word, count] per line. The context manager closes (and
    # flushes) the file even if reading the job results fails part-way.
    with open(OUTPUT_FILENAME, 'w') as out:
        for word, count in result_iterator(job.wait(show=True)):
            out.write(json.dumps([word, count]) + '\n')
Ejemplo n.º 25
0
        item = q.get()
        if item == 0:
            return
        yield item
        q.task_done()


def map(line, params):
    """Yield (word, 1) for every word of ``line`` after normalisation.

    Each character listed in ``unwanted`` is replaced by a space, the text is
    lower-cased and then split on whitespace. ``line`` must support
    ``translate`` with a code-point mapping (a unicode string).
    """
    # The builtins module is called __builtin__ on Python 2 and builtins on
    # Python 3; dict is still looked up on that module explicitly.
    try:
        import __builtin__ as builtins_module
    except ImportError:
        import builtins as builtins_module
    unwanted = u",!.#()][{}-><=|/\"'*:?"
    # translation table: code point of every unwanted character -> space
    table = builtins_module.dict.fromkeys([ord(x) for x in unwanted], u" ")
    words = line.translate(table).lower()
    for word in words.split():
        yield word, 1


def reduce(iter, params):
    """Yield (word, total) for every distinct word in the (word, count) pairs.

    The pairs are sorted first so that kvgroup sees equal words side by side.
    """
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for word, counts in kvgroup(ordered_pairs):
        total = sum(counts)
        yield word, total


if __name__ == '__main__':
    # Count the words of everything stored under the DDFS tag and print
    # each (key, count) result.
    tag_url = "tag://" + DDFS_TAG
    job = Job().run(input=[tag_url],
                    map=map,
                    reduce=reduce,
                    map_reader=chain_reader)

    finished = job.wait(show=True)
    for line, count in result_iterator(finished):
        print(line, count)
Ejemplo n.º 26
0
from disco.core import Job, result_iterator

def map(line, params):
    """Yield (word, 1) for every whitespace-separated word of ``line``."""
    tokens = line.split()
    for token in tokens:
        yield (token, 1)

def reduce(iter, params):
    """Yield (word, total) for every distinct word in the (word, count) pairs.

    The pairs are sorted first so that kvgroup sees equal words side by side.
    """
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for word, counts in kvgroup(ordered_pairs):
        total = sum(counts)
        yield word, total

'''
def mongodb_output(stream,partition,url,params):
    return mongoDisco_output.MongoDBoutput(stream,params)
'''

if __name__ == '__main__':

    # The job reads through mongodb_input_stream and writes the reduce
    # output through mongodb_output_stream.
    # NOTE(review): neither stream name is defined in the visible part of
    # the file - confirm where they are imported from.
    job = Job().run(input=["r"],
                    map=map,
                    reduce=reduce,
                    map_input_stream=mongodb_input_stream,
                    reduce_output_stream=mongodb_output_stream)

    job.wait(show=True)
Ejemplo n.º 27
0
from mongoDisco_output import MongoDBoutput
from disco.worker.classic.func import task_output_stream
import logging


def map(record, params):
    """Log the record's '_id' and yield (name, 1).

    The name is record['name'], or "NoName" when the record has no such key.
    """
    record_id = record.get('_id')
    logging.info("%s" % record_id)
    name = record.get('name', "NoName")
    yield name, 1


def reduce(iter, params):
    """Yield (word, total) for every distinct word in the (word, count) pairs.

    The pairs are sorted first so that kvgroup sees equal words side by side.
    """
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for word, counts in kvgroup(ordered_pairs):
        total = sum(counts)
        yield word, total


def mongodb_output(stream, partition, url, params):
    """Output-stream function: wrap ``stream`` in a MongoDBoutput.

    ``partition`` and ``url`` are accepted but not used.
    """
    # Only the class is imported (``from mongoDisco_output import
    # MongoDBoutput``), so it is referenced directly; the module name itself
    # is not bound in this file.
    return MongoDBoutput(stream, params)

if __name__ == '__main__':
    # the reduce output stream is a one-element tuple holding mongodb_output
    mongodb_stream = (mongodb_output,)
    source = "mongodb://localhost/test.modforty"
    job = Job().run(input=[source],
                    map=map,
                    reduce=reduce,
                    reduce_output_stream=mongodb_stream)

    # block until the job has finished, showing its progress
    job.wait(show=True)
#    for word, count in result_iterator(job.wait(show=True)):
#       print word, count
Ejemplo n.º 28
0
from disco.core import Job, result_iterator


def map(line, params):
    """Yield (word, 1) for every whitespace-separated word of ``line``."""
    tokens = line.split()
    for token in tokens:
        yield (token, 1)


def reduce(iter, params):
    """Yield (word, total) for every distinct word in the (word, count) pairs.

    The pairs are sorted first so that kvgroup sees equal words side by side.
    """
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for word, counts in kvgroup(ordered_pairs):
        total = sum(counts)
        yield word, total


if __name__ == '__main__':
    # Call form of print: same output on Python 2, valid syntax on Python 3
    # (the loop below already uses the call form).
    print("runnning job")
    # Word count over the text downloaded from the given URL.
    job = Job().run(input=["http://discoproject.org/media/text/chekhov.txt"],
                    map=map,
                    reduce=reduce)
    for word, count in result_iterator(job.wait(show=True)):
        print(word, count)
Ejemplo n.º 29
0
def map(line, params):
    """Crawl the GitHub users with ``int(line)`` followers and yield analysis items.

    For every user returned by github_crawler.get_users_with_n_followers,
    each (owner, repo, branch) from get_user_parent_repos is cloned and every
    item produced by analyze_repo for the clone is yielded. Progress is
    printed along the way.
    """
    import github_crawler
    n = int(line)
    users = github_crawler.get_users_with_n_followers(n)
    # Call form of print with a single argument: same output on Python 2,
    # valid syntax on Python 3.
    print(str(len(users)) + " users")
    for user in users:
        repos = github_crawler.get_user_parent_repos(user)
        print(str(len(repos)) + " repos")
        for owner, repo, branch in repos:
            print(owner + "/" + repo + "#" + branch)
            directory = github_crawler.clone_repo(owner, repo, branch)
            for item in github_crawler.analyze_repo(directory):
                yield item


def reduce(iter, params):
    """Yield (extension, sum / count) of the ratios of every extension.

    The pairs are sorted first so that kvgroup sees equal extensions side by
    side.
    """
    from disco.util import kvgroup
    ordered_pairs = sorted(iter)
    for extension, ratios in kvgroup(ordered_pairs):
        # materialise the group: both its sum and its length are needed
        values = list(ratios)
        yield extension, sum(values) / len(values)


if __name__ == "__main__":
    # One raw input per follower count 0, 10, 20, ..., 990.
    input = ["raw://" + str(i) for i in range(0, 1000, 10)]
    job = Job().run(input=input,
                    map=map,
                    reduce=reduce,
                    required_files=["github_crawler.py"])
    for extension, avg in result_iterator(job.wait(show=True)):
        # Single formatted argument reproducing "<extension> :  <avg>":
        # same text on Python 2, valid syntax on Python 3.
        print("%s :  %s" % (extension, avg))
Ejemplo n.º 30
0
def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    """Run the 'naive_bayes_estimate' job and derive log-likelihoods from it.

    The job results are (key, value) pairs whose value converts to int. A key
    is split on ``splitter``:

    * a key with several parts is a (class, item) pair count;
    * the empty key holds the total count;
    * a one-part key found in ``ys`` is a class count;
    * any other one-part key is an item count.

    For every class y in ``ys`` and every item i the 2x2 table
    [[y, i], [not y, i], [y, not i], [not y, not i]] is built and 1 is added
    to each cell as a pseudocount.

    Returns a dict with, for every key "y<splitter>i",
    log(cell0) - log(cell1), and for every class y the sum over its items of
    log(cell0 + cell2) - log(cell1 + cell3).
    """
    ys = dict.fromkeys(ys, 1)

    job = Job(name='naive_bayes_estimate')

    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # will include the items for which we'll be classifying,
    # for example if the dataset includes males and females,
    # this dict will include the keys male and female and the
    # number of times these have been observed in the train set
    items = {}

    # the number of times the classes have been observed.  For
    # example,  if the feature is something like tall or short, then the dict
    # will contain the total number of times we have seen tall and short.
    classes = {}

    # the number of times we have seen a class with a feature.
    pairs = {}

    for key, value in result_iterator(results):
        parts = key.split(splitter)
        value = int(value)
        if len(parts) > 1:
            pairs[key] = value
        elif parts[0] == '':
            total = value
        elif parts[0] in ys:
            classes[parts[0]] = value
        else:
            items[parts[0]] = value

    # counts[key] = [[c,i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            both = pairs.get(key, 0)
            item_only = items[i] - both
            class_only = classes[y] - both if y in classes else 0
            neither = total - (both + item_only + class_only)

            # add pseudocounts (a real list, so the cells can be indexed)
            counts[key] = [both + 1, item_only + 1, class_only + 1,
                           neither + 1]

    import math
    loglikelihoods = {}
    for key, value in counts.items():
        label = key.split(splitter)[0]
        loglikelihoods[label] = (loglikelihoods.get(label, 0.0) +
                                 math.log(value[0] + value[2]) -
                                 math.log(value[1] + value[3]))
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods