Example #1
def estimate(master, input, center, k, iterations, map_reader=reader):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.
    """
    job = master.new_job(name='k-clustering_init',
                         input=input,
                         map_reader=map_reader,
                         map_init=map_init,
                         map=random_init_map,
                         combiner=estimate_combiner,
                         reduce=estimate_reduce,
                         params=Params(k=k, seed=None, **center),
                         nr_reduces=k)

    centers = [(i, c) for i, c in result_iterator(job.wait())]
    job.purge()

    for j in range(iterations):
        job = master.new_job(name='k-clustering_iteration_%s' % (j, ),
                             input=input,
                             map_reader=map_reader,
                             map=estimate_map,
                             combiner=estimate_combiner,
                             reduce=estimate_reduce,
                             params=Params(centers=centers, **center),
                             nr_reduces=k)

        centers = [(i, c) for i, c in result_iterator(job.wait())]
        job.purge()

    return centers
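
A note on what these examples have in common: keyword arguments passed to Params(...) become attributes on the params object that disco hands to the map/combiner/reduce callbacks (Example #11 below reads them back as params.x and params.f1). A minimal, hypothetical sketch of a map callback in that style; example_map and the bucket logic are illustrative, not the actual estimate_map used by the job above:

def example_map(entry, params):
    # params.k was set via Params(k=k, ...) when the job was created;
    # entry is one record produced by map_reader
    bucket = hash(entry) % params.k
    yield bucket, entry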
Example #2
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name,
                           master=self.disco.master,
                           worker=worker)
        except Exception as e:
            log.warn("Error instantiating worker: %s %s - loading default worker"
                     % (settings.get('worker'), e))
            self.job = Job(name=rule.name,
                           master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)
Example #3
    def __init__(self, config, map, reduce):
        self.config = DiscoJob.DEFAULT_CONFIG.copy()
        self.config.update(config)

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params(**self.config)
Example #4
 def _assert_csv_reader(self, fields, values, expected):
     stream = StringIO.StringIO(values)
     params = Params()
     params.csv_fields = fields
     params.csv_dialect = csv.excel_tab
     actual = csv_reader(stream, None, None, params)
     ok_(isinstance(actual, types.GeneratorType))
     eq_(list(actual), expected)
Example #5
 def _assert_reduce(self, data, expected, **kwargs):
     # turn disco_debug on for more code coverage
     if kwargs is None:
         kwargs = dict()
     kwargs['disco_debug'] = True
     params = Params(**kwargs)
     actual = keyset_reduce(data, params)
     ok_(isinstance(actual, types.GeneratorType))
     eq_(list(actual), expected)
Example #6
class PartialJob(TestJob):
    map_init = partial(init, extra='d')
    map = partial(map, extra='a')
    combiner = partial(combiner, extra='b')
    reduce_init = partial(init, extra='e')
    reduce = partial(reduce, extra='c')
    map_reader = partial(reader, extra='f')
    reduce_reader = partial(reader, extra='h')
    params = Params(foo=partial(foo, extra='z'))
Example #7
def run(task, zinput, payload=None):
    import pagerank

    zclass = getattr(pagerank, task)
    job = zclass()
    job.params = Params(payload=payload)

    job.run(input=zinput)
    result = job.wait(show=False)

    return result
Example #8
def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name='naive_bayes_predict')
    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=Params(loglikelihoods=loglikelihoods,
                          ys=ys,
                          splitter=splitter),
            clean=False)
    return job.wait()
Example #9
def predict(master, input, center, centers, map_reader=reader):
    """
    Predict the closest clusters for the datapoints in input.
    """
    job = master.new_job(name='kcluster_predict',
                         input=input,
                         map_reader=map_reader,
                         map=predict_map,
                         params=Params(centers=centers, **center),
                         nr_reduces=0)

    return job.wait()
Example #10
 def setUp(self):
     sys.stdout = self.capture_stdout = cStringIO.StringIO()
     self.params = Params()
     self.params.keysets = {
         'last_name_keyset': dict(
             key_parts=['_keyset', 'last_name'],
             value_parts=['count'],
          ),
         'first_name_keyset': dict(
             key_parts=['_keyset', 'first_name'],
             value_parts=['count'],
          )}
Example #11
class ParamsJob(TestJob):
    params = Params(x=5, f1=fun1, f2=fun2, now=datetime.now())
    sort = False

    @staticmethod
    def map(e, params):
        yield e, params.f1(int(e), params.x)

    @staticmethod
    def reduce(iter, params):
        for k, v in iter:
            yield k, params.f2(int(v))
Example #12
    def test_http(self):
        url = 'http://google.com/'

        source = datasources.source_for(url)
        assert isinstance(source, HTTPSource)
        urls = source.segment_between(datetime(2011, 5, 31),
                                      datetime(2011, 6, 1))
        eq_(len(urls), 1)

        params = Params()
        input_stream = datasources.input_stream_for(None, None, urls[0],
                                                    params)
Example #13
 def test_keyset_multiplier(self):
     params = Params()
     params.keysets = {
         'last_name_keyset':
         dict(
             key_parts=['_keyset', 'last_name'],
             value_parts=['count'],
         ),
         'first_name_keyset':
         dict(
             key_parts=['_keyset', 'first_name'],
             value_parts=['count'],
         )
     }
     data = [{
         'first_name': 'Willow',
         'last_name': 'Harvey'
     }, {
         'first_name': 'Noam',
         'last_name': 'Clarke'
     }]
     expected = [{
         'first_name': 'Willow',
         'last_name': 'Harvey',
         '_keyset': 'first_name_keyset'
     }, {
         'first_name': 'Willow',
         'last_name': 'Harvey',
         '_keyset': 'last_name_keyset'
     }, {
         'first_name': 'Noam',
         'last_name': 'Clarke',
         '_keyset': 'first_name_keyset'
     }, {
         'first_name': 'Noam',
         'last_name': 'Clarke',
         '_keyset': 'last_name_keyset'
     }]
     actual = keyset_multiplier(data, None, None, params)
     ok_(isinstance(actual, types.GeneratorType))
     eq_(list(actual), expected)
Example #14
                neighbors = v
        score = 1 - d + d * sum_v
        yield node_id, str(node_id) + " " + str(score) + " " + neighbors


if __name__ == '__main__':
    parser = OptionParser(usage='%prog [options] inputs')
    parser.add_option('--iterations', default=10, help='Number of iterations')
    parser.add_option(
        '--damping-factor',
        default=0.85,
        help='probability a web surfer will continue clicking on links')

    (options, input) = parser.parse_args()

    results = input

    params = Params(damping_factor=float(options.damping_factor))

    for j in range(int(options.iterations)):
        job = Job().run(input=results,
                        map=send_score,
                        map_reader=chain_reader,
                        reduce=receive_score,
                        params=params)
        results = job.wait()

    for _, node in result_iterator(results):
        fields = node.split()
        print fields[0], ":", fields[1]
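
The score computed in the truncated reduce above appears to follow the classic unnormalized PageRank update, where d is the damping factor and sum_v stands for the accumulated contributions received from in-linking nodes; a sketch of the intended recurrence (an assumption drawn from the expression score = 1 - d + d * sum_v, not stated in the source):

    PR(u) = (1 - d) + d \sum_{v \in B_u} PR(v) / L(v)

with B_u the set of nodes linking to u and L(v) the out-degree of v.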
Example #15
                shortest_length = cost
                shortest_path = tour
        yield (None, (shortest_length, shortest_path))

    @staticmethod
    def reduce(iter, params):
        from disco.util import kvgroup
        for _, winners in kvgroup(sorted(iter)):
            yield min(winners)

if __name__ == '__main__':
    line = sys.stdin.readline()
    sales_trip = json.loads(line)
    m = numpy.matrix(sales_trip['graph'])
    num_nodes = m.shape[0]
    num_tours = factorial(num_nodes - 1)

    #Here we break down the full range of possible tours into smaller
    #pieces. Each piece is passed along as a key along with the trip
    #description.
    step_size = int(100 if num_tours < 100**2 else num_tours / 100)
    steps = range(0, num_tours, step_size) + [num_tours]
    ranges = zip(steps[0:-1], steps[1:])

    input = map(lambda x: 'raw://' + str(x[0]) + "-" + str(x[1]), ranges)

    from travelling_salesman import TSPJob
    job = TSPJob().run(input=input, params=Params(trip=sales_trip))
    for k, v in result_iterator(job.wait()):
        print k, v
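
As a quick worked check of the range-splitting logic above (Python 2 semantics, where range, zip and map return lists): with 5 cities there are 4! = 24 candidate tours, which is below the 100**2 threshold, so the whole search collapses into a single raw:// input. The variable names mirror the script; the concrete numbers are only an illustration:

num_tours = 24                                                    # factorial(5 - 1)
step_size = int(100 if num_tours < 100**2 else num_tours / 100)   # -> 100
steps = range(0, num_tours, step_size) + [num_tours]              # -> [0, 24]
ranges = zip(steps[0:-1], steps[1:])                              # -> [(0, 24)]
inputs = map(lambda x: 'raw://%d-%d' % x, ranges)                 # -> ['raw://0-24']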
Example #16
 def setUp(self):
     self.settings = InfernoSettings()
     self._make_temp_pid_dir()
     self.job = InfernoJob(InfernoRule(name='some_rule_name'), {}, Params())
     self.pid_dir = pid.pid_dir(self.settings)
Example #17
    def __init__(self,
                 # name, on/off
                 name='_unnamed_',
                 run=True,

                 # throttle
                 min_blobs=1,
                 max_blobs=sys.maxint,
                 partitions=200,
                 partition_function=crc_partition,
                 scheduler=None,
                 worker=None,
                 time_delta=None,
                 newest_first=True,

                 # archive
                 archive=False,
                 archive_tag_prefix='processed',
                 archive_lookback=0,

                 # nuke
                 nuke=False,

                 # map
                 map_init_function=lambda x, y: x,
                 map_function=keyset_map,
                 map_input_stream=chunk_csv_stream,
                 map_output_stream=(map_output_stream, disco_output_stream),

                 # combine
                 combiner_function=None,

                 # reduce
                 reduce_function=keyset_reduce,
                 reduce_output_stream=(reduce_output_stream, disco_output_stream),

                 # result
                 # result_iterator_override -->
                 #   see inferno.lib.disco_ext.sorted_iterator for signature
                 result_iterator_override=None,
                 result_processor=keyset_result,
                 result_tag=None,
                 result_tag_suffix=True,
                 save=False,
                 sort=True,
                 sort_buffer_size='10%',
                 sorted_results=True,

                 # keysets
                 keysets=None,
                 key_parts=None,
                 value_parts=None,
                 column_mappings=None,
                 table=None,
                 keyset_parts_preprocess=None,
                 parts_postprocess=None,

                 # input
                 day_range=0,
                 day_offset=0,
                 day_start=None,
                 source_tags=None,
                 source_urls=None,

                 # other
                 rule_init_function=None,
                 rule_cleanup=None,
                 parts_preprocess=None,
                 field_transforms=None,
                 required_files=None,
                 required_modules=None,
                 retry=False,
                 retry_limit=2,
                 retry_delay=1,

                 # notifications --> notify_addresses must be list of addresses
                 notify_on_fail=False,
                 notify_on_success=False,
                 notify_addresses=None,
                 notify_pagerduty=False,
                 notify_pagerduty_key=None,
                 **kwargs):

        self.qualified_name = name
        if kwargs:
            self.params = Params(**kwargs)
        else:
            self.params = Params()

        if not scheduler:
            scheduler = {'force_local': False, 'max_cores': 200}

        # name, on/off
        self.run = run
        self.name = name

        # throttle
        self.min_blobs = min_blobs
        self.max_blobs = max_blobs
        self.partitions = partitions
        self.partition_function = partition_function
        self.scheduler = scheduler
        self.time_delta = time_delta
        if self.time_delta is None:
            self.time_delta = {'minutes': 5}
        self.newest_first = newest_first
        self.worker = worker

        # archive
        self.archive = archive
        self.archive_tag_prefix = archive_tag_prefix

        # nuke
        self.nuke = nuke

        # map
        self.map_init_function = map_init_function
        self.map_function = map_function
        self.map_input_stream = map_input_stream
        self.map_output_stream = map_output_stream
        self.combiner_function = combiner_function

        # reduce
        self.reduce_function = reduce_function
        self.reduce_output_stream = reduce_output_stream

        # result
        self.result_processor = result_processor
        self.result_tag = result_tag
        self.result_tag_suffix = result_tag_suffix
        self.save = save
        self.sort = sort
        self.sort_buffer_size = sort_buffer_size
        if result_iterator_override:
            self.result_iterator = result_iterator_override
        elif self.sort and sorted_results:
            self.result_iterator = sorted_iterator
        else:
            self.result_iterator = result_iterator

        # input
        if isinstance(source_tags, basestring):
            source_tags = [source_tags]
        if archive_lookback:
            source_tags = get_date_lookback(source_tags, archive_lookback)

        self.day_range = day_range
        self.day_offset = day_offset
        self.day_start = day_start
        self.source_tags = source_tags or []

        # keysets
        keyset_dict = {}
        if keysets:
            for keyset_name, keyset_obj in keysets.items():
                keyset_dict[keyset_name] = keyset_obj.as_dict()
        else:
            keyset_dict['_default'] = Keyset(
                key_parts,
                value_parts,
                column_mappings,
                table,
                keyset_parts_preprocess,
                parts_postprocess).as_dict()
        self.params.keysets = keyset_dict

        self.params.parts_preprocess = parts_preprocess or []
        self.params.field_transforms = field_transforms or dict()

        # other
        self.rule_init_function = rule_init_function
        self.rule_cleanup = rule_cleanup
        self.retry = retry
        self.retry_limit = retry_limit
        self.retry_delay = retry_delay
        self.required_modules = required_modules or []
        self.required_files = required_files or []
        self.notify_on_fail = notify_on_fail
        self.notify_on_success = notify_on_success
        self.notify_addresses = notify_addresses or []
        self.notify_pagerduty = notify_pagerduty
        self.notify_pagerduty_key = notify_pagerduty_key
        self.source_urls = source_urls
Example #18
def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])

    job = Job(name='naive_bayes_estimate')

    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # the number of times each feature value has been observed. For
    # example, if a feature is something like tall or short, this dict
    # will contain the total number of times we have seen tall and short
    # in the train set
    items = {}

    # the number of times each class label has been observed. For example,
    # if the dataset includes males and females, this dict will include
    # the keys male and female and the number of times these have been
    # observed in the train set
    classes = {}

    # the number of times we have seen a class together with a feature.
    pairs = {}

    for key, value in result_iterator(results):
        l = key.split(splitter)
        value = int(value)
        if len(l) == 1:
            if l[0] == '':
                total = value
            elif ys.has_key(l[0]):
                classes[l[0]] = value
            else:
                items[l[0]] = value
        else:
            pairs[key] = value


    # counts[key] = [[c, i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            counts[key] = [0, 0, 0, 0]
            if pairs.has_key(key):
                counts[key][0] = pairs[key]
            counts[key][1] = items[i] - counts[key][0]
            if not classes.has_key(y):
                counts[key][2] = 0
            else:
                counts[key][2] = classes[y] - counts[key][0]
            counts[key][3] = total - sum(counts[key][:3])

            # add pseudocounts
            counts[key] = map(lambda x: x + 1, counts[key])
    total += 4

    import math
    loglikelihoods = {}
    for key, value in counts.iteritems():
        l = key.split(splitter)
        if not loglikelihoods.has_key(l[0]):
            loglikelihoods[l[0]] = 0.0
        loglikelihoods[l[0]] += (math.log(value[0] + value[2]) -
                                 math.log(value[1] + value[3]))
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods
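
Reading off the loop above: for each class y and feature i, counts[key] holds the add-one-smoothed contingency counts [n(y,i), n(\bar{y},i), n(y,\bar{i}), n(\bar{y},\bar{i})], and the returned dictionary ends up with two kinds of entries (a sketch in that notation, derived from the code rather than stated in the source):

    \log L_{y,i} = \log n(y,i) - \log n(\bar{y},i)
    \log L_{y}   = \sum_{i} ( \log n(y) - \log n(\bar{y}) )

where n(y) = n(y,i) + n(y,\bar{i}) is the smoothed count of class y, so the class entry accumulates the class log-odds once per feature.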
Example #19
 def __test_warc_mime_type(self):
     params = Params()
     input_stream = datasources.input_stream_for(None, None, segment[0],
                                                 params)