Code Example #1
File: resolve.py  Project: CrazyWisdom/auth
def auth(clazz, province, input, output, date):
    dirList      = os.listdir(input)
    ptime        = datetime.strptime(date, "%Y%m%d")
    file_filter  = ptime.strftime('%Y-%m-%d')

    input = ["file:///" + input + "/" + file for file in dirList 
            if ( re.search(date, file) or re.search(file_filter, file) )]
    if input:
        if clazz == 'c+w':
            if cw_map_funs.has_key(province):
                mapfun = cw_map_funs[province]
            else:
                mapfun = cw_map
        else:
            if fixed_map_funs.has_key(province):
                mapfun = fixed_map_funs[province]
            else:
                mapfun = fixed_map

        job = Job().run(input=input, map=mapfun)
        file = open(output + "/" + clazz + "-" + date + ".ctl", "w")
        sqldr_header(file)
        for user, line in result_iterator(job.wait(show=True)):
            print >>file, line
        file.close()
    else:
        print 'resolve.py: Can not find any auth files.'
Code Example #2
File: job.py  Project: sajal/MongoDisco
    def __init__(self, config, map, reduce):
        self.config = DiscoJob.DEFAULT_CONFIG.copy()
        self.config.update(config)

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params(**self.config)
Code Example #3
File: map_hashtag.py  Project: fangjin/Hate
def	main():
	args = parse_args()
	news_file = args.news_file
	job = Job().run(
                    input=news_file,
                    map_reader=disco.worker.classic.func.chain_reader,
                    map=read_twitter,
                    reduce=reduce)
	with open("output_result",'w') as out:
		for word, count in result_iterator(job.wait(show=False)):
			out.write(word + "\t" + str(count))
Code Example #4
File: naive_bayes.py  Project: nicolasramy/disco
def predict(input, loglikelihoods, ys, splitter=" ", map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name="naive_bayes_predict")
    job.run(
        input=input,
        map_reader=map_reader,
        map=predict_map,
        params=Params(loglikelihoods=loglikelihoods, ys=ys, splitter=splitter),
        clean=False,
    )
    return job.wait()
Code Example #5
File: naive_bayes.py  Project: sajal/disco
def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name='naive_bayes_predict')
    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=Params(loglikelihoods=loglikelihoods,
                          ys=ys,
                          splitter=splitter),
            clean=False)
    return job.wait()
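The two naive_bayes predict() variants above consume a loglikelihoods model built by the estimate() function shown later in Code Example #50. Below is a minimal usage sketch, not taken from either project, chaining the two; the DDFS tags and class labels are hypothetical placeholders.

from disco.core import result_iterator

train_input = ["tag://data:train"]   # hypothetical training data tag
test_input = ["tag://data:test"]     # hypothetical test data tag
ys = ["male", "female"]              # hypothetical class labels

loglikelihoods = estimate(train_input, ys)         # build the model (see Code Example #50)
results = predict(test_input, loglikelihoods, ys)  # classify the test inputs
for key, value in result_iterator(results):
    print key, value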
Code Example #6
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    if "dwfr_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")
    try:
        coeff = float(coeff)
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    job.params = dataset.params
    job.params["coeff"] = coeff
    for k, v in result_iterator(fitmodel_url["dwfr_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_weighted_forest_rand_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Code Example #7
def get(program, key, jobname):
    """Usage: key jobname

    Print the oob value for the given key and jobname.
    """
    from disco.core import Job
    print Job(program.disco, jobname).oob_get(key)
Code Example #8
File: job.py  Project: 10genNYUITP/MongoDisco
    def __init__(self,config,map,reduce):
        self.config = DiscoJob.DEFAULT_CONFIG.copy()
        self.config.update(config)

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params(**self.config)
Code Example #9
def oob(program, jobname):
    """Usage: jobname

    Print the oob keys for the named job.
    """
    from disco.core import Job
    for key in Job(program.disco, jobname).oob_list():
        print key
Code Example #10
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init,
                                    process=map_predict_voting if voting else map_predict_dist))]

    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_random_forest_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Code Example #11
File: linear_svm.py  Project: romanorac/discomll
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linsvm_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params
    job.params["fit_params"] = [v for _, v in result_iterator(fitmodel_url["linsvm_fitmodel"])][0]
    job.run(name="linsvm_predict", input=dataset.params["data_tag"])

    return job.wait(show=show)
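Going by the docstring above, a hedged usage sketch of how this predict() might be combined with the linear SVM fit() shown in Code Example #35; train_data and test_data are assumed to be discomll dataset objects prepared elsewhere, and their construction is omitted here.

from disco.core import result_iterator

fitmodel_url = fit(train_data, nu=0.1)          # returns {"linsvm_fitmodel": ...}
predictions = predict(test_data, fitmodel_url)  # urls with predictions on ddfs
for sample_id, prediction in result_iterator(predictions):
    print sample_id, prediction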
Code Example #12
File: parallel_processing.py  Project: stefanv/cesium
def process_prediction_data_featurization_with_disco(input_list,params,partitions=4):
    '''
    Called from within featurize_prediction_data_in_parallel
    Returns disco.core.result_iterator
    Arguments:
        input_list: path to file listing filename,unused_string for each individual time series data file.
        params: dictionary of parameters to be passed to each map & reduce function.
        partitions: Number of nodes/partitions in system.
    '''
    from disco.core import Job, result_iterator
    job = Job().run(input=input_list,
                    map=pred_map,
                    partitions=partitions,
                    reduce=pred_featurize_reduce,
                    params=params)
    
    result = result_iterator(job.wait(show=True))
    return result
Code Example #13
def process_featurization_with_disco(input_list, params, partitions=4):
    '''
	Called from within featurize_in_parallel.
	Returns disco.core.result_iterator
	Arguments:
		input_list: path to file listing filename,class_name for each individual time series data file.
		params: dictionary of parameters to be passed to each map & reduce function.
		partitions: Number of nodes/partitions in system.
	'''
    from disco.core import Job, result_iterator
    job = Job().run(input=input_list,
                    map=map,
                    partitions=partitions,
                    reduce=featurize_reduce,
                    params=params)

    result = result_iterator(job.wait(show=True))
    return result
Code Example #14
def main():
    job = Job().run(input=[TRAIN_IN], map=mapper, reduce=reducer, sort=True)
    category_options = defaultdict(dict)
    category_values = defaultdict(int)
    for cat_id, counter in result_iterator(job.wait(show=True)):
        if len(counter) > MAX_CATEGORICAL_OPTIONS:
            continue

        for cat_value in counter:
            if cat_value not in category_options[cat_id]:
                category_options[cat_id][cat_value] = category_values[cat_id]
                category_values[cat_id] += 1

    # save possible categorical data
    with open(CATEGORY_MAPPING_OUT, 'w') as f:
        f.write(dumps(category_options))

    with open(CATEGORY_STATUS_OUT, 'w') as f:
        f.write(dumps(category_values))
Code Example #15
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function starts a job for calculation of theta parameters

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - define maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception("Logistic regression requires a target label mapping parameter.")
    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
        if max_iterations < 1:
            raise Exception("Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    # initialize thetas to 0 and add intercept term
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    J = [0]  # J cost function values for every iteration
    for i in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        # job parallelizes mappers and joins them with one reducer
        job.pipeline = [
            ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
            ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

        job.params = dataset.params  # job parameters (dataset object)
        job.params["thetas"] = thetas  # every iteration set new thetas
        job.run(name="logreg_fit_iter_%d" % (i + 1), input=dataset.params["data_tag"])

        fitmodel_url = job.wait(show=show)
        for k, v in result_iterator(fitmodel_url):
            if k == "J":  #
                J.append(v)  # save value of J cost function
            else:
                thetas = v  # save new thetas
        if np.abs(J[-2] - J[-1]) < alpha:  # check for convergence
            if show:
                print("Converged at iteration %d" % (i + 1))
            break

    return {"logreg_fitmodel": fitmodel_url}  # return results url
Code Example #16
File: naivebayes.py  Project: romanorac/discomll
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" in fitmodel_url:
        # fit model is loaded from ddfs
        fit_model = dict((k, v) for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
        if len(fit_model["y_labels"]) < 2:
            print "There is only one class in training data."
            return []
    else:
        raise Exception("Incorrect fit model.")

    if dataset.params["X_meta"].count("d") > 0:  # if there are discrete features in the model
        # code calculates logarithms to optimize predict phase as opposed to calculation by every mapped.
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
            fit_model[iv] = np.nan_to_num(
                np.log(np.true_divide(np.array(dist) + m * fit_model["prior"], np.sum(dist) + m))) - fit_model[
                                "prior_log"]
        del (fit_model["iv"])

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model
    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
Code Example #17
File: build_model.py  Project: gitter-badger/mltsp
def fit_model_disco(data_dict, featureset_key, model_type):
    """
    """
    from disco.core import Job, result_iterator
    params = {"data_dict": data_dict,
              "featureset_key": featureset_key,
              "model_type": model_type}
    input_list = [("placeholder")]
    job = Job('with_modules').run(
        input=input_list,
        reduce=reduce,
        params=params,
        required_modules=[("mltsp",
                           os.path.dirname(os.path.dirname(__file__))),
                          "sklearn"])
    result_iter = result_iterator(job.wait(show=True))
    rf_fit = None
    for rf_obj, dummy_str in result_iter:
        rf_fit = rf_obj
    return rf_fit
Code Example #18
    def start(self):
        """Starts the entire process of querying twitter and classification.

        This method is responsible for running MapReduce for feature
        extraction.
        """
        def range_reader(stream, size, url):
           page_num = stream.getvalue()
           # Map readers should return a list of values, so page_num is
           # explicitly converted to an integer and then wrapped into a
           # list. By doing this each mapper instance will get exactly
           # one page number
           # If we don't do this, the mapper API just reads the numbers
           # character by character and we end up fetching the same 10
           # pages: digits 0, 9 all through since each character of a number
           # should be one of these 10 digits.
           return [int(page_num)]

        job = Job()

        inputs = [('raw://%d' % (i)) for i in range(1, self.num_pages)]

        job.run(input=inputs, map=mapper, reduce=reducer,
                map_reader=range_reader, params=Params(
                    query=self.query,
                    trained_vectorizer=self.vectorizer
                    ),
                required_modules=[
                    ('vectorizer', os.path.join(datasettings.PROJECT_ROOT,
                        'analyzer',
                        'vectorizer.py'),),
                    ('models', os.path.join(datasettings.PROJECT_ROOT,
                        'webui', 'fatninja',
                        'models.py'),),
                        ])

        self.feature_vector, self.row_num_to_tweet_id_map = \
            self.vectorizer.build_feature_matrix(job)

        self.classify()
Code Example #19
File: naivebayes.py  Project: romanorac/discomll
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions and one reduce function which aggregates intermediate results and returns a model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers, sorts intermediate pairs and joins them with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True))]

    job.params = dataset.params  # job parameters (dataset object)
    # define name of a job and input data urls
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"naivebayes_fitmodel": fitmodel_url}  # return results url
Code Example #20
def process_featurization_with_disco(input_list, params, partitions=4):
    """Featurize time-series data in parallel as a Disco job.

    Called from within the `featurize_in_parallel` function.

    Parameters
    ----------
    input_list : str
        Path to file listing the file name and class name
        (comma-separated) for each individual time series data file,
        one per line.
    params : dict
        Dictionary of parameters to be passed to each map & reduce
        function.
    partitions : int, optional
        Number of nodes/partitions in system. Defaults to 4.

    Returns
    -------
    iterator
        disco.core.result_iterator(), an interator of two-element
        tuples, each containing the file name of the original time
        series data file, and a dictionary of the associated features
        generated.

    """
    from disco.core import Job, result_iterator
    job = Job('with_modules').run(
        input=input_list,
        map_reader=custom_reader,
        map=map,
        partitions=partitions,
        reduce=featurize_reduce,
        params=params,
        required_modules=[("mltsp",
                           os.path.dirname(os.path.dirname(__file__)))])

    result = result_iterator(job.wait(show=True))
    return result
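A possible call, going by the docstring above; the file path and the keys in the params dict are illustrative placeholders, not taken from the project.

params = {"featset_id": "demo"}  # placeholder parameter dict passed to map & reduce
for fname, features in process_featurization_with_disco("objects_list.txt", params, partitions=4):
    print fname, features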
Code Example #21
File: job.py  Project: dcrosta/mongo-disco
class DiscoJob():


    def __init__(self,config,map,reduce):
        import config_util

        self.config = config_util.config
        #if the user doesn't specify output, print to stdout
        if not config.get('output_uri') and not config.get('print_to_stdout'):
            config['print_to_stdout'] = True

        for item in config:
            self.config[item] = config[item]

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params()
        for key in self.config:
            self.params.__dict__[key] = self.config[key]

    def run(self):

        if self.config['print_to_stdout']:

            self.job.run(input = do_split(self.config),
                     map = self.map,
                     reduce = self.reduce,
                     params = self.params,
                     map_input_stream = mongodb_input_stream,
                     required_modules= ['mongodb_io',
                                        'mongodb_input',
                                        'config_util',
                                        'mongo_util',
                                        'mongodb_output'])
            for key, value in result_iterator(self.job.wait(show=True)):
                print key, value

        else:
            self.job.run(input = do_split(self.config),
                     map = self.map,
                     reduce = self.reduce,
                     params = self.params,
                     map_input_stream = mongodb_input_stream,
                     reduce_output_stream = mongodb_output_stream,
                     required_modules= ['mongodb_io',
                                        'mongodb_input',
                                        'config_util',
                                        'mongo_util',
                                        'mongodb_output'])

            if self.config.get("job_wait",False):
                self.job.wait(show=True)
Code Example #22
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    """
    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off with distance of its x(i) from the query point x.
    samples_per_job - define a number of samples that will be processed in single mapreduce job. If 0, algorithm will calculate number of samples per job.
    """

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be >= 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=fitting_data.params["input_chain"], init=simple_init, process=map_predict))
    ]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # calculate tau once
    counter = 0

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate number of samples per job
            if len(x) <= 100:  # if there is less than 100 attributes
                samples_per_job = 100  # 100 samples is max per on job
            else:
                # there is more than 100 attributes
                samples_per_job = len(x) * -25 / 900.0 + 53  # linear function

        samples[test_id] = x
        if counter == samples_per_job:
            results.append(_fit_predict(training_data, samples, tau, save_results, show))
            counter = 0
            samples = {}
        counter += 1

    if len(samples) > 0:  # if there is some samples left in the the dictionary
        results.append(_fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
Code Example #23
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    if dataset.params["y_map"] == []:
        raise Exception(
            "Linear proximal SVM requires a target label mapping parameter.")
    try:
        nu = float(nu)
        if nu <= 0:
            raise Exception("Parameter nu should be greater than 0")
    except ValueError:
        raise Exception("Parameter should be numerical.")

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers and joins them with one reducer
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           combine=True))]

    job.params = dataset.params
    job.params["nu"] = nu
    job.run(name="linearsvm_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"linsvm_fitmodel": fitmodel_url}  # return results url
Code Example #24
def _fit_predict(fit_data, samples, tau, save_results, show):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [
        ("split", Stage("map", input_chain=fit_data.params["input_chain"], init=simple_init, process=map_fit)),
        ("group_all", Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)),
    ]

    job.params = fit_data.params
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    return job.wait(show=show)
Code Example #25
File: job.py  Project: dcrosta/mongo-disco
    def __init__(self,config,map,reduce):
        import config_util

        self.config = config_util.config
        #if the user doesn't specify output, print to stdout
        if not config.get('output_uri') and not config.get('print_to_stdout'):
            config['print_to_stdout'] = True

        for item in config:
            self.config[item] = config[item]

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params()
        for key in self.config:
            self.params.__dict__[key] = self.config[key]
Code Example #26
def predict(dataset, fitmodel_url, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]
    job.params = dataset.params
    job.params["thetas"] = [v for _, v in result_iterator(fitmodel_url["linreg_fitmodel"])][0]

    job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
Code Example #27
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    if "dwf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    job.params = dataset.params
    job.params["coeff"] = coeff
    for k, v in result_iterator(fitmodel_url["dwf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_weighted_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Code Example #28
def _fit_predict(fit_data, samples, tau, save_results, show):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fit_data.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           sort=True,
                           combine=True))]

    job.params = fit_data.params
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    return job.wait(show=show)
Code Example #29
def predict(dataset, fitmodel_url, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = dataset.params
    job.params["thetas"] = [
        v for _, v in result_iterator(fitmodel_url["linreg_fitmodel"])
    ][0]

    job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
Code Example #30
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")
    if "logreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # job parallelizes execution of mappers
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["thetas"] = [
        v for k, v in result_iterator(fitmodel_url["logreg_fitmodel"])
        if k == "thetas"
    ][0]  # thetas are loaded from ddfs

    job.run(name="logreg_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
Code Example #31
def fit(dataset, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    job.params = dataset.params
    job.run(name="linreg_fit", input=dataset.params["data_tag"])

    fitmodel_url = job.wait(show=show)
    return {"linreg_fitmodel": fitmodel_url}  # return results url
Code Example #32
File: distribution.py  Project: romanorac/discomll
def measure(input, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [
        ("split", Stage("map", input_chain=input.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    job.params = input.params  # job parameters (dataset object)

    job.run(name="Distribution", input=input.params["data_tag"])
    return job.wait(show=show)  # return results url
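A minimal usage sketch, assuming data is a discomll dataset object built elsewhere; measure() returns result urls that can be read with result_iterator.

from disco.core import result_iterator

results_url = measure(data)
for k, v in result_iterator(results_url):
    print k, v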
Code Example #33
def predict(dataset,
            fitmodel_url,
            voting=False,
            save_results=True,
            show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=simple_init,
               process=map_predict_voting if voting else map_predict_dist))
    ]

    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_random_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Code Example #34
def fit(dataset, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           combine=True))]

    job.params = dataset.params
    job.run(name="linreg_fit", input=dataset.params["data_tag"])

    fitmodel_url = job.wait(show=show)
    return {"linreg_fitmodel": fitmodel_url}  # return results url
Code Example #35
File: linear_svm.py  Project: romanorac/discomll
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    if dataset.params["y_map"] == []:
        raise Exception("Linear proximal SVM requires a target label mapping parameter.")
    try:
        nu = float(nu)
        if nu <= 0:
            raise Exception("Parameter nu should be greater than 0")
    except ValueError:
        raise Exception("Parameter should be numerical.")

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers and joins them with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    job.params = dataset.params
    job.params["nu"] = nu
    job.run(name="linearsvm_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"linsvm_fitmodel": fitmodel_url}  # return results url
Code Example #36
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions and one reduce function which aggregates intermediate results and returns a model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers, sorts intermediate pairs and joins them with one reducer
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           sort=True,
                           combine=True))]

    job.params = dataset.params  # job parameters (dataset object)
    # define name of a job and input data urls
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"naivebayes_fitmodel": fitmodel_url}  # return results url
Code Example #37
# This program estimates the value of pi (3.14...)
# Usage:
# python estimate_pi.py

from disco.core import Job, result_iterator


def map(line, params):
    from random import random
    x, y = random(), random()
    yield 0, 1 if x * x + y * y < 1 else 0


if __name__ == '__main__':
    COUNT = 5000
    job = Job().run(input=["raw://0"] * COUNT, map=map)
    tot = 0
    for k, v in result_iterator(job.wait()):
        tot += v
    print(4.0 * tot / COUNT)
Code Example #38
from disco.core import Job, result_iterator

def map(line, params):
    for word in line.split():
        yield word, 1

def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)

if __name__ == '__main__':
    job = Job().run(input=["erl://erl_inputs:test/dummy"],
                    map=map,
                    reduce=reduce)
    for word, count in result_iterator(job.wait(show=True)):
        print word, count
Code Example #39
        yield strippedWord, 1


def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


if __name__ == '__main__':
    job = Job().run(input="There are known knowns.\
                           These are things we know that we know.\
                           There are known unknowns. \
                           That is to say,\
                           there are things that \
                           we know we do not know.\
                           But there are also unknown unknowns.\
                           There are things \
                           we do not know we don't know",
                    map=map,
                    reduce=reduce)


    sort_in_numerical_order =\
                            open('SortNumerical.txt', 'w')
    sort_in_alpbabetically_order = \
                                 open('SortAlphabetical.txt', 'w')

    wordCount = []
    for word, count in \
        result_iterator(job.wait(show=True)):
Code Example #40
def reduce(iter, params):
    from disco.util import kvgroup
    for key, counts in kvgroup(sorted(iter)):
        Day = ''
        Num = 0
        DayList = list(counts)
        Days = set(DayList)
        for j in Days:
            if DayList.count(j) > Num:
                Num = DayList.count(j)
                Day = j
        
        if Num > 1:
            yield key, Day

if __name__ == '__main__':
    job = Job().run(input=["data:vcobssplit"],
                    map=map,
                    reduce=reduce)
    
    output_filename = "output.csv"

    if len(sys.argv) > 1:
        output_filename = sys.argv[1]

    with open(output_filename, 'w') as fp:
        writer = csv.writer(fp)
        for key, date in result_iterator(job.wait(show=True)):
            writer.writerow([key] + [date])

Code Example #41
File: word_count.py  Project: pombredanne/leisure
from disco.core import Job, result_iterator


def map(line, params):
    for word in line.split():
        yield word, 1


def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


if __name__ == '__main__':
    print "runnning job"
    job = Job().run(input=["http://discoproject.org/media/text/chekhov.txt"],
                    map=map,
                    reduce=reduce)
    for word, count in result_iterator(job.wait(show=True)):
        print(word, count)
Code Example #42
        item = q.get()
        if item == 0:
            return
        yield item
        q.task_done()


def map(line, params):
    import __builtin__
    unwanted = u",!.#()][{}-><=|/\"'*:?"
    words = line.translate(
        __builtin__.dict.fromkeys([ord(x) for x in unwanted], u" ")).lower()
    for word in words.split():
        yield word, 1


def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


if __name__ == '__main__':
    job = Job().run(input=["tag://" + DDFS_TAG],
                    map=map,
                    reduce=reduce,
                    map_reader=chain_reader)

    for line, count in result_iterator(job.wait(show=True)):
        print(line, count)
Code Example #43
                neighbors = v
        score = 1 - d + d * sum_v
        yield node_id, str(node_id) + " " + str(score) + " " + neighbors


if __name__ == '__main__':
    parser = OptionParser(usage='%prog [options] inputs')
    parser.add_option('--iterations', default=10, help='Numbers of iteration')
    parser.add_option(
        '--damping-factor',
        default=0.85,
        help='probability a web surfer will continue clicking on links')

    (options, input) = parser.parse_args()

    results = input

    params = Params(damping_factor=float(options.damping_factor))

    for j in range(int(options.iterations)):
        job = Job().run(input=results,
                        map=send_score,
                        map_reader=chain_reader,
                        reduce=receive_score,
                        params=params)
        results = job.wait()

    for _, node in result_iterator(results):
        fields = node.split()
        print fields[0], ":", fields[1]
Code Example #44

if __name__ == "__main__":
    pattern = re.compile("^logs-")
    current_directory = os.path.realpath(
        os.path.dirname(os.path.realpath(__file__))) + "/"
    input_directory = os.path.normpath(current_directory +
                                       "../../generate/target/")
    input_files = [
        input_directory + "/" + file for file in os.listdir(input_directory)
        if pattern.match(file)
        and os.path.isfile(os.path.join(input_directory, file))
    ]
    job = Job().run(required_files=[
        os.path.normpath(current_directory + "../model/log.py")
    ],
                    input=input_files,
                    map=map,
                    reduce=reduce)

    data = []
    timestamp = int(time.time())
    for word, count in result_iterator(job.wait(show=True)):
        datapoint = {
            "metric": "provider.channel.bps",
            "timestamp": timestamp,
            "value": count,
            "tags": {
                "provider": word[0],
                "channel": word[1]
            }
        }
Code Example #45
File: run_disco_tst.py  Project: kod3r/mltsp
def disco_word_count():
    job = Job().run(input=["http://discoproject.org/media/text/chekhov.txt"],
                    map=map,
                    reduce=reduce)
    for word, count in result_iterator(job.wait(show=True)):
        print(word, count)
Code Example #46
def fit(dataset,
        trees_per_chunk=1,
        bootstrap=True,
        max_tree_nodes=50,
        min_samples_leaf=10,
        min_samples_split=5,
        class_majority=1,
        separate_max=True,
        measure="info_gain",
        accuracy=1,
        random_state=None,
        save_results=True,
        show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    try:
        trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(
            max_tree_nodes) if max_tree_nodes != None else max_tree_nodes
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
        separate_max = separate_max
        if trees_per_chunk > 1 and bootstrap == False:
            raise Exception(
                "Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap."
            )
        if trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0 or min_samples_split <= 0 and accuracy < 0 or type(
                bootstrap) != bool:
            raise Exception("Parameters should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=map_init,
               process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all',
         Stage("reduce", init=simple_init, process=reduce_fit, combine=True))
    ]

    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url
Code Example #47
 def runTest(self):
     threading.Thread(target=startServer).start()
     input = 'http:' + self.disco.master.split(':')[1] + ":" + str(PORT)
     self.job = Job().run(input=[input], map=map, reduce=reduce)
     self.assertEqual(sorted(self.results(self.job)), [(b'Hello', 1), (b'World', 1)])
Code Example #48
File: job.py  Project: sajal/MongoDisco
class DiscoJob():

    DEFAULT_CONFIG = {
        "job_output_key": "_id",
        "job_output_value": "value",
        "input_uri": "mongodb://localhost/test.in",
        "output_uri": "mongodb://localhost/test.out",
        "print_to_stdout": False,
        "job_wait": True,
        "split_size": 8,
        "split_key": {
            "_id": 1
        },
        "create_input_splits": True,
        "use_shards": False,
        "use_chunks": True,
        "slave_ok": False,
        "limit": 0,
        "skip": 0,
        "input_key": None,
        "sort": None,
        "timeout": False,
        "fields": None,
        "query": {}
    }

    def __init__(self, config, map, reduce):
        self.config = DiscoJob.DEFAULT_CONFIG.copy()
        self.config.update(config)

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params(**self.config)

    def run(self):

        if self.config['print_to_stdout']:

            self.job.run(input=do_split(self.config),
                         map=self.map,
                         reduce=self.reduce,
                         params=self.params,
                         map_input_stream=mongodb_input_stream,
                         required_modules=[
                             'mongodisco.mongodb_io',
                             'mongodisco.mongodb_input',
                             'mongodisco.mongo_util',
                             'mongodisco.mongodb_output'
                         ])
            for key, value in result_iterator(self.job.wait(show=True)):
                print key, value

        else:
            self.job.run(input=do_split(self.config),
                         map=self.map,
                         reduce=self.reduce,
                         params=self.params,
                         map_input_stream=mongodb_input_stream,
                         reduce_output_stream=mongodb_output_stream,
                         required_modules=[
                             'mongodisco.mongodb_io',
                             'mongodisco.mongodb_input',
                             'mongodisco.mongo_util',
                             'mongodisco.mongodb_output'
                         ])

            if self.config.get("job_wait", False):
                self.job.wait(show=True)
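A hedged usage sketch of the DiscoJob class above; the config overrides come from the DEFAULT_CONFIG keys shown, while the map/reduce pair is a placeholder modeled on Code Example #55.

def map(record, params):
    # placeholder: count MongoDB documents by their "name" field
    yield record.get("name", "NoName"), 1

def reduce(iter, params):
    from disco.util import kvgroup
    for name, counts in kvgroup(sorted(iter)):
        yield name, sum(counts)

config = {
    "input_uri": "mongodb://localhost/test.in",  # overrides the DEFAULT_CONFIG entry
    "print_to_stdout": True,                     # print results instead of writing to output_uri
}
DiscoJob(config, map, reduce).run()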
Code Example #49
def map(line, params):
    for char in line.lower():
        if char >= 'a' and char <= 'z':
            yield char, 1

def reduce(iter, params):
    from disco.util import kvgroup
    for char, counts in kvgroup(sorted(iter)):
        yield char, sum(counts)

# run the disco job
from disco.core import Job, result_iterator
job = Job().run(input=["http://en.wikipedia.org/wiki/MapReduce"], map=map, reduce=reduce)

# plot the results with matplotlib
#%matplotlib inline
xs, ys = zip(*result_iterator(job.wait()))
import scipy
from matplotlib import pylab
x = scipy.arange(len(xs))
y = scipy.array(ys)
f = pylab.figure()
ax = f.add_axes([0, 0, 3, 1])
ax.bar(x, y, align='center')
ax.set_xticks(x)
ax.set_xticklabels(xs)
f.show()
Code Example #50
File: naive_bayes.py  Project: sajal/disco
def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])

    job = Job(name='naive_bayes_estimate')

    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # will include the items for which we'll be classifying,
    # for example if the dataset includes males and females,
    # this dict will include the keys male and female and the
    # number of times these have been observed in the train set
    items = {}

    # the number of times the classes have been observed.  For
    # example,  if the feature is something like tall or short, then the dict
    # will contain the total number of times we have seen tall and short.
    classes = {}

    # the number of times we have seen a class with a feature.
    pairs = {}

    for key, value in result_iterator(results):
        l = key.split(splitter)
        value = int(value)
        if len(l) == 1:
            if l[0] == '':
                total = value
            elif ys.has_key(l[0]):
                classes[l[0]] = value
            else:
                items[l[0]] = value
        else:
            pairs[key] = value


#counts[key] = [[c,i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            counts[key] = [0, 0, 0, 0]
            if pairs.has_key(key):
                counts[key][0] = pairs[key]
            counts[key][1] = items[i] - counts[key][0]
            if not classes.has_key(y):
                counts[key][2] = 0
            else:
                counts[key][2] = classes[y] - counts[key][0]
            counts[key][3] = total - sum(counts[key][:3])

            # add pseudocounts
            counts[key] = map(lambda x: x + 1, counts[key])
    total += 4

    import math
    loglikelihoods = {}
    for key, value in counts.iteritems():
        l = key.split(splitter)
        if not loglikelihoods.has_key(l[0]):
            loglikelihoods[l[0]] = 0.0
        loglikelihoods[l[0]] += math.log(value[0] +
                                         value[2]) - math.log(value[1] +
                                                              value[3])
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods
Code Example #51
    tweeter, tweet = count_tweet_words.get_username_tweet(line)
    # return each word in the tweet (to count frequency of each term)
    for word in tweet.split():
        yield word, 1


def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


if __name__ == '__main__':
    input_filename = "./tweet_data/tweets_357.json"
    #input_filename = "./tweet_data/tweets_859157.json"
    #input_filename = "/media/3TBStorage/tweets_all.json"

    # we need a fully qualified file name for the server
    fully_qualified_path = os.path.realpath(input_filename)
    input = [fully_qualified_path]

    # import this module so pickle knows what to send to workers
    import count_tweet_words

    job = Job().run(input=input, map=map, reduce=reduce)

    out = open(OUTPUT_FILENAME, 'w')
    for word, count in result_iterator(job.wait(show=True)):
        #print(word, count)
        out.write(json.dumps([word, count]) + '\n')
Code Example #52
from disco.core import Job, result_iterator

def map(line, params):
    for word in line.split():
        yield word,1

def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)

'''
def mongodb_output(stream,partition,url,params):
    return mongoDisco_output.MongoDBoutput(stream,params)
'''

if __name__ == '__main__':

    job = Job().run(input=["r"],
            map=map,
            reduce=reduce,
            map_input_stream = mongodb_input_stream,
            reduce_output_stream=mongodb_output_stream)

    job.wait(show=True)
Code Example #53
File: page_rank.py  Project: AlexArgus/disco
        for t, v in vals:
            if t == "s":
                sum_v += v
            else:
                neighbors = v
        score = 1 - d + d * sum_v
        yield node_id, str(node_id) + " " + str(score) + " " + neighbors

if __name__ == '__main__':
    parser = OptionParser(usage='%prog [options] inputs')
    parser.add_option('--iterations',
                      default=10,
                      help='Numbers of iteration')
    parser.add_option('--damping-factor',
                      default=0.85,
                      help='probability a web surfer will continue clicking on links')

    (options, input) = parser.parse_args()

    results = input

    params = Params(damping_factor=float(options.damping_factor))

    for j in range(int(options.iterations)):
        job = Job().run(input=results, map=send_score, map_reader = chain_reader, reduce=receive_score, params = params)
        results = job.wait()

    for _, node in result_iterator(results):
        fields = node.split()
        print fields[0], ":", fields[1]
Code Example #54
def fit(dataset, trees_per_chunk=3, max_tree_nodes=50, min_samples_leaf=10, min_samples_split=5, class_majority=1,
        measure="info_gain", accuracy=1, separate_max=True, random_state=None, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=map_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    try:
        trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(max_tree_nodes) if max_tree_nodes != None else max_tree_nodes
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)

        if trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0 or min_samples_split <= 0 and accuracy < 0:
            raise Exception("Parameters should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="distributed_random_forest_fit", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"drf_fitmodel": fitmodel_url}  # return fitmodel url
Code Example #55
File: test_job.py  Project: isabella232/mongo-disco
from mongoDisco_output import MongoDBoutput
from disco.worker.classic.func import task_output_stream
import logging


def map(record, params):
    logging.info("%s" % record.get('_id'))
    yield record.get('name', "NoName"), 1


def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


def mongodb_output(stream, partition, url, params):
    return mongoDisco_output.MongoDBoutput(stream, params)


if __name__ == '__main__':
    mongodb_stream = tuple([mongodb_output])
    job = Job().run(input=["mongodb://localhost/test.modforty"],
                    map=map,
                    reduce=reduce,
                    reduce_output_stream=mongodb_stream)

    job.wait(show=True)
#    for word, count in result_iterator(job.wait(show=True)):
#       print word, count
Code Example #56
from disco.core import Job, result_iterator


def map(line, params):
    for word in line.split():
        yield word, 1


def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


if __name__ == '__main__':
    job = Job().run(input=["erl://erl_inputs:test/dummy"],
                    map=map,
                    reduce=reduce)
    for word, count in result_iterator(job.wait(show=True)):
        print word, count
Code Example #57
File: disco_job.py  Project: pooya/github_crawler
def map(line, params):
    import github_crawler
    n = int(line)
    users = github_crawler.get_users_with_n_followers(n)
    print str(len(users)) + " users"
    for user in users:
        repos = github_crawler.get_user_parent_repos(user)
        print str(len(repos)) + " repos"
        for owner, repo, branch in repos:
            print owner + "/" + repo + "#" + branch
            directory = github_crawler.clone_repo(owner, repo, branch)
            for item in github_crawler.analyze_repo(directory):
                yield item


def reduce(iter, params):
    from disco.util import kvgroup
    for extension, ratios in kvgroup(sorted(iter)):
        l_ratios = [r for r in ratios]
        yield extension, sum(l_ratios) / len(l_ratios)


if __name__ == "__main__":
    input = ["raw://" + str(i) for i in range(0, 1000, 10)]
    job = Job().run(input=input,
                    map=map,
                    reduce=reduce,
                    required_files=["github_crawler.py"])
    for extension, avg in result_iterator(job.wait(show=True)):
        print extension, ": ", avg
Code Example #58
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


if __name__ == '__main__':
    #input_filename = "./tweet_data/tweets_357.json"
    #input_filename = "./tweet_data/tweets_859157.json"
    #input_filename = "/media/3TBStorage/tweets_all.json"

    # we need a fully qualified file name for the server
    #fully_qualified_path = os.path.realpath(input_filename)
    #input = [fully_qualified_path]

    #input = ["tag://data:tweets357"]
    input = ["tag://data:tweets859157xa"]

    # import this module so pickle knows what to send to workers
    import count_tweet_words

    job = Job().run(input=input,
                    map=map,
                    map_reader=chain_reader,
                    reduce=reduce,
                    combiner=sum_combiner)

    out = open(OUTPUT_FILENAME, 'w')
    for word, count in result_iterator(job.wait(show=True)):
        #print(word, count)
        out.write(json.dumps([word, count]) + '\n')