def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0,
                save_results=True, show=False):
    """
    training_data - training samples
    fitting_data - dataset to be fitted to the training data
    tau - controls how quickly the weight of a training sample falls off
        with the distance of its x(i) from the query point x
    samples_per_job - number of samples processed in a single mapreduce job.
        If 0, the algorithm calculates the number of samples per job.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be > 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=fitting_data.params["input_chain"],
               init=simple_init,
               process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # calculate the denominator 2*tau^2 once
    counter = 0

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate the number of samples per job
            if len(x) <= 100:  # 100 attributes or fewer
                samples_per_job = 100  # at most 100 samples per job
            else:  # more than 100 attributes
                samples_per_job = len(x) * -25 / 900.0 + 53  # linear function
        samples[test_id] = x
        if counter == samples_per_job:
            results.append(_fit_predict(training_data, samples, tau,
                                        save_results, show))
            counter = 0
            samples = {}
        counter += 1

    if len(samples) > 0:  # some samples are left in the dictionary
        results.append(_fit_predict(training_data, samples, tau,
                                    save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
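# A minimal usage sketch for fit_predict above, under stated assumptions:
# "load_data" is a hypothetical loader (not part of this module) returning
# dataset objects whose .params already carry "id_index", "input_chain" and
# "data_tag"; the tag names and tau value are made up for illustration.
from disco.core import result_iterator

training_data = load_data("tag://lwlr:train")            # hypothetical helper
fitting_data = load_data("tag://lwlr:fit", id_index=0)   # id_index must be set

# fit_predict returns a one-element list with the tag collecting all predictions
predictions = fit_predict(training_data, fitting_data, tau=10, samples_per_job=0)

# the key/value layout of each record depends on _fit_predict's output
for sample_id, prediction in result_iterator(predictions):
    print sample_id, prediction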
def submit(master, jobpack):
    from disco.settings import DiscoSettings
    from disco.core import Disco
    settings = DiscoSettings()
    dmaster = Disco(master)
    print "Submitting job to ", master
    status, response = json.loads(dmaster.request('/disco/job/new', jobpack))
    if status != 'ok':
        errmsg('Failed to start job. Server replied: %s' % response)
    print response
def __init__(self, *args, **kwargs):
    # load the defaults
    super(Settings, self).update(defaults)

    # override with the settings file
    path = kwargs.get('settings_file') or self['settings_file']
    if path and os.path.exists(path):
        try:
            import yaml
            self.update(yaml.load(open(path)))
        except:
            pass  # if ya can't ya can't

    # final overrides
    super(Settings, self).update(overrides)
    super(Settings, self).__init__(*args, **kwargs)

    # set up ddfs and disco
    if not self['server'].startswith('disco://'):
        self['server'] = 'disco://' + self['server']
    if 'ddfs' not in self:
        self['ddfs'] = DDFS(self['server'])
    self['server'] = Disco(self['server'])

    # set up worker
    if 'worker' not in self:
        worker_mod, _, worker_class = self['worker_class'].rpartition('.')
        mod = __import__(worker_mod, {}, {}, worker_mod)
        self['worker'] = getattr(mod, worker_class)()
def get_disco_handle(server):
    from disco.core import Disco
    from disco.ddfs import DDFS

    if server and not server.startswith('disco://'):
        server = 'disco://' + server
    return Disco(server), DDFS(server)
class IndexJob(object):
    def __init__(self, spec, discodex, disco_addr="disco://localhost", profile=False):
        # TODO(sqs): refactoring potential with PagerankJob
        self.spec = spec
        self.discodex = discodex
        self.docset = Docset(spec.docset_name)
        self.disco = Disco(DiscoSettings()["DISCO_MASTER"])
        self.nr_partitions = 8
        self.profile = profile

    def start(self):
        results = self.__run_job(self.__index_job())
        self.__run_discodex_index(results)

    def __run_job(self, job):
        results = job.wait()
        if self.profile:
            self.__profile_job(job)
        return results

    def __index_job(self):
        return self.disco.new_job(
            name="index_tfidf",
            input=["tag://" + self.docset.ddfs_tag],
            map_reader=docparse,
            map=TfIdf.map,
            reduce=TfIdf.reduce,
            sort=True,
            partitions=self.nr_partitions,
            partition=TfIdf.partition,
            merge_partitions=False,
            profile=self.profile,
            params=dict(doc_count=self.docset.doc_count),
        )

    def __run_discodex_index(self, results):
        opts = {
            "parser": "disco.func.chain_reader",
            "demuxer": "freequery.index.tf_idf.TfIdf_demux",
            "nr_ichunks": 1,  # TODO(sqs): after disco#181 fixed, increase this
        }
        ds = DataSet(input=results, options=opts)
        origname = self.discodex.index(ds)
        self.disco.wait(origname)  # origname is also the disco job name
        self.discodex.clone(origname, self.spec.invindex_name)
def __init__(self, spec, disco_addr="disco://localhost", alpha=0.15, niter=2,
             profile=False):
    self.spec = spec
    self.docset = Docset(spec.docset_name)
    self.disco = Disco("disco://localhost")
    self.alpha = alpha
    self.niter = niter
    self.nr_partitions = 16
    self.merge_partitions = False
    self.profile = profile
class LinkParseJob(object):
    def __init__(self, spec, verbose=False, **kwargs):
        self.spec = spec
        self.docset = Docset(self.spec.docset_name)
        self.disco = Disco("disco://localhost")
        self.verbose = verbose

    def start(self):
        from disco import func
        job = self.disco.new_job(
            name="linkparse",
            input=self.docset.dump_uris(),
            map_reader=docparse,
            map=linkparse_map,
            map_output_stream=(func.map_output_stream,
                               func.disco_output_stream,
                               LinkFileOutputStream.disco_output_stream),
            partitions=0,
            save=True,
        )
        results = job.wait()
        self.__tag_results(results)
        if self.verbose:
            self.__print_results(results)

    def __tag_results(self, results):
        from disco.ddfs import DDFS
        ddfs = DDFS()
        results_tag = results[0]
        ddfs.put(self.docset.ddfs_link_file_tag, list(ddfs.blobs(results_tag)))
        # remove old, temporary tag
        ddfs.delete(results_tag)

    def __print_results(self, results):
        for doc in result_iterator(results, tempdir=False, reader=doclinksparse):
            print "%s\n\t%s" % (doc.uri, "\n\t".join(doc.link_uris))
                              Results, Query)
from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import ddfs_name, flatten, parse_dir

from discodb import Q

discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']

disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'


class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
    input = inputs or [maybe_list(line.split())
                       for line in fileinput.input(inputs)]
    job = reify(jobclass)(program.disco, name)
    try:
        params = job.params
    except AttributeError:
        params = Params()
    params.__dict__.update(**dict(program.options.params))
    job.run(input=input, **program.option_parser.jobdict)
    print job.name

@Disco.command
def wait(program, jobname):
    """Usage: jobname

    Wait for the named job to complete.
    """
    program.disco.wait(jobname)

if __name__ == '__main__':
    Disco(option_parser=DiscoOptionParser()).main()

    # Workaround for "disco test" in Python2.5 which doesn't shutdown the
    # test_server thread properly.
    sys.exit(0)  # XXX still needed?
def data_gen(path):
    return "\n".join(ani)

def fun_map(e, params):
    if type(e) == tuple:
        return [(e[0] + params['suffix'], int(e[1]) + 1)]
    else:
        return [(e + params['suffix'], 0)]

def fun_reduce(iter, out, params):
    for k, v in iter:
        out.add(k + "-", v)

tserver.run_server(data_gen)

disco = Disco(sys.argv[1])
results = disco.new_job(name = "test_chain_0",
                        input = tserver.makeurl([""] * 100),
                        map = fun_map,
                        reduce = fun_reduce,
                        nr_reduces = 4,
                        sort = False,
                        params = {'suffix': '0'}).wait()

i = 1
while i < 10:
    nresults = disco.new_job(name = "test_chain_%d" % i,
                             input = results,
                             map = fun_map,
                             reduce = fun_reduce,
                             nr_reduces = 4,
                             map_reader = chain_reader,
                             sort = False,
                             params = {'suffix': str(i)}).wait()
    disco.purge(jobname(results[0]))
    results = nresults
    i += 1
def fun_reduce(iter, out, params):
    for k, v in iter:
        out.add("red:" + k, v)

def data_gen(path):
    return path[1:]

tserver.run_server(data_gen)

inputs = ["apple", "orange", "pear"]
job = Disco(sys.argv[1]).new_job(
    name="test_streams",
    input=tserver.makeurl(inputs),
    map=fun_map,
    reduce=fun_reduce,
    nr_reduces=1,
    map_reader = map_reader,
    map_input_stream = [map_input_stream, map_input1, map_input2, map_input3],
    reduce_output_stream = [reduce_output1, reduce_output2])

for k, v in result_iterator(job.wait(),
                            input_stream = [resultiter_input1, map_input_stream]):
    if not k.startswith("red:cba"):
        raise Exception("Invalid prefix in key. Got '%s' "
                        "expected prefix 'red:cba'" % k)
    if k[7:] not in inputs:
        raise Exception("Invalid result '%s'" % k)
    inputs.remove(k[7:])
    return [(w, 1) for w in re.sub("\W", " ", e).lower().split()]

def fun_reduce(iter, out, params):
    s = {}
    for k, v in iter:
        if k in s:
            s[k] += int(v)
        else:
            s[k] = int(v)
    for k, v in s.iteritems():
        out.add(k, v)

tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(name="test_50k",
                                 input=tserver.makeurl([""] * int(5e4)),
                                 map=fun_map,
                                 reduce=fun_reduce,
                                 nr_reduces=300,
                                 sort=False)

ANS = {"gutta": int(5e6), "cavat": int(1e7), "capidem": int(5e6)}
i = 0
for key, value in result_iterator(job.wait()):
    i += 1
    if ANS[key] == int(value):
        print "Correct: %s %s" % (key, value)
    else:
        raise Exception("Results don't match")
if i != 3:
    raise Exception("Wrong number of results: Got %d expected 3" % i)
job.purge()
        if v != results[k]:
            raise Exception("%s: Invalid result for key %s, got %s, "
                            "expected %s" % (job.name, k, v, results[k]))

tserver.run_server(data_gen)

N = 10
results = {}
inputs = []
for i in range(N):
    a = [i] * 10
    b = range(i, i + 10)
    inputs += ["%d:%d" % x for x in zip(a, b)]
    results[str(i)] = sum(b)

disco = Disco(sys.argv[1])

# map results in individual files, one per input file (default mode)
job1 = disco.new_job(
    name = "test_partfile1",
    input = tserver.makeurl(inputs),
    map = fun_map)

# map results in one big partition file per host
job2 = disco.new_job(
    name = "test_partfile2",
    input = tserver.makeurl(inputs),
    map = fun_map,
    nr_reduces = 1)

check_results(job1)
import sys
from disco.core import Disco, result_iterator

def fun_map(e, params):
    for i in range(3):
        msg("--special_test_string_%d--" % i)
    return [(e, "")]

inputs = ["raw://discoapi"]
job = Disco(sys.argv[1]).new_job(name = "test_discoapi",
                                 input = inputs,
                                 map = fun_map)

r = list(result_iterator(job.wait()))
if [("discoapi", "")] != r:
    raise Exception("Invalid result: <%s> " % r)

n = job.jobspec()["name"]
if not n.startswith("test_discoapi"):
    raise Exception("Invalid jobspec: Expected name prefix test_discoapi, "
                    "got %s" % n)

events = [ev[2] for offs, ev in job.events()]
for i in range(3):
    m = "--special_test_string_%d--" % i
    if not [x for x in events if m in x]:
        raise Exception("Message '%s' not found in events" % m)

job.purge()
import tserver, sys, time
from disco.core import Disco

def data_gen(path):
    return "1 2 3\n"

def fun_map(e, params):
    import time
    time.sleep(100)
    return []

disco = Disco(sys.argv[1])
num = sum(x['max_workers'] for x in disco.nodeinfo()['available'])
print >> sys.stderr, num, "slots available"

tserver.run_server(data_gen)
job = disco.new_job(name = "test_kill",
                    input = tserver.makeurl([""] * num * 2),
                    map = fun_map)

time.sleep(10)
print >> sys.stderr, "Killing", job.name
job.kill()
time.sleep(5)
if job.jobinfo()['active'] == "dead":
    print "ok"
    job.purge()
else:
    raise Exception("Killing failed")
fail = ["1", "2", "3"] def data_gen(path): lock.acquire() e = path[1:] if e in fail: fail.remove(e) lock.release() raise tserver.FailedReply() else: lock.release() return str(int(e) * 10) + "\n" def fun_map(e, params): return [(int(e) * 10, "")] tserver.run_server(data_gen) job = Disco(sys.argv[1]).new_job( name = "test_tempfail", input = tserver.makeurl(map(str, range(10))), map = fun_map) res = sum(int(x) for x, y in result_iterator(job.wait())) if res != 4500: raise Exception("Invalid result: Got %d, expected 4500" % res) job.purge() print "ok"
                      2. Online ODAT; 3. Offline dim')
parser.add_option('--post-fix', default=1,
                  help='Does post-fixing for ODAT? (default=1): 1. Yes; 2. No')
parser.add_option('--go-live', default=1,
                  help='Load offline dim data to DW DBMS? (default=1): 1. yes; 2. No')
parser.add_option('--profile', default=False,
                  help='Profile (default=False)')
parser.add_option('--config', default='conf/config.py',
                  help='The path to config.py (default=conf/config.py)')

(options, input_paths) = parser.parse_args()

master = Disco("disco://" + options.disco_master)
load_method = odotetlmr
seq_process = None
post_fixing = -1
load_step = int(options.load_step)

if options.load_method == '2':
    load_method = odatetlmr
    if load_step == 1:
        post_fixing = int(options.post_fix)
        seq_process = multiprocessing.Process(target=seq_server)
        seq_process.start()
elif options.load_method == '3':
    load_method = offdimetlmr

input_file_urls = []
import sys
import json
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings
from disco import func
import time

from mapper import map
from reducer import reduce

name = "gap-%s" % int(time.time())
disco = Disco(DiscoSettings()['DISCO_MASTER'])

print "Starting Disco job (%s).." % name
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name=name,
                        input=["tag://gap:1million"],
                        map_input_stream=(
                            func.map_input_stream,
                            func.chain_reader,
                        ),
                        map=map,
                        reduce=reduce,
                        save=True).wait()

print "Job done. Results:"
f = open('data.js', 'w')
for time_of_day, scores in result_iterator(results):
    except ValueError:
        print msg(line)  # bad hack :-(

    time = timestamp.replace("'", "")
    # timestamp has milliseconds, shave them off
    date_obj = datetime.fromtimestamp(float(time[:-3]))
    nearest_minute = date_obj - timedelta(minutes=date_obj.minute % 1,
                                          seconds=date_obj.second,
                                          microseconds=date_obj.microsecond)
    yield (nearest_minute, {'unique_id': uid,
                            'query': query,
                            'frequency': frequency})

def reduce(iter, params):
    # This doesn't work at all; it's from an old example.
    for unique_id, counts in kvgroup(sorted(iter)):
        yield unique_id, sum(counts)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

"""
:clicks (ad id, people who clicked the ads)
"""
results = disco.new_job(name="bartekc",
                        input=["tag://hackreduce:search:history"],
                        map_input_stream=(
                            func.map_input_stream,
                            func.chain_reader,
                        ),
                        map=map,
                        reduce=reduce,
                        save=True).wait()
import sys
from disco.core import Disco, result_iterator

def fun_map(e, params):
    return [("", e + ":map")]

inputs = ["raw://eeny", "raw://meeny", "raw://miny", "raw://moe"]
job = Disco(sys.argv[1]).new_job(name = "test_raw",
                                 input = inputs,
                                 map = fun_map)

res = dict((x[6:] + ":map", True) for x in inputs)
for x in result_iterator(job.wait()):
    if x[1] not in res:
        raise Exception("Invalid result: <%s> " % x[1])
    del res[x[1]]
if res:
    raise Exception("Invalid number of results %d" % (len(inputs) - len(res)))
job.purge()
print "ok"
import sys
from disco.core import Disco

OK_STATUS = ['job_ready', 'job_died']
disco = Disco(sys.argv[1])

def op_show(n, s):
    print n

def op_kill(n, s):
    if s == "job_active":
        print "Killing", n
        disco.kill(n)

def op_clean(n, s):
    print "Cleaning", n
    disco.clean(n)

def op_purge(n, s):
    print "Purging", n
    disco.purge(n)

for t, s, name in disco.joblist():
    if sys.argv[3] in name:
        globals()["op_" + sys.argv[2]](name, s)
checkl("testfun4-norecurse", modutil.find_modules([testfun4],\ recurse = False), [("mod1", abspath("extra/mod1.py"))]) print "local tests ok" local_tests() def data_gen(path): return path[1:] + "\n" def fun_map(e, params): x, y = map(float, e.split("|")) return [(mod1.plusceil(x, y) + math.ceil(1.5), "")] tserver.run_server(data_gen) disco = Disco(sys.argv[1]) inputs = ["0.5|1.2"] print "disco tests.." # default job = disco.new_job( name = "test_modutil1", input = tserver.makeurl(inputs), map = fun_map) checkl("test_modutil1", result_iterator(job.wait()), [("4.0", "")]) job.purge() print "test_modutil1 ok" job = disco.new_job(
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

def map(entry, params):
    for word in entry.split():
        yield word, 1

def reduce(iter, out, params):
    s = {}
    for word, freq in iter:
        s[word] = s.get(word, 0) + int(freq)
    for word, freq in s.iteritems():
        out.add(word, freq)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name="wordcount",
                        input=["http://discoproject.org/chekhov.txt"],
                        map=map,
                        reduce=reduce).wait()

print "Job done. Results:"
for word, freq in result_iterator(results):
    print word, freq
    s = 1
    for k, v in iter:
        if k != "=" + v:
            raise Exception("Corrupted key")
        s *= int(v)
    out.add("result", s)

tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
job = Disco(sys.argv[1]).new_job(
    name = "test_writers",
    input = tserver.makeurl(inputs),
    map = fun_map,
    map_writer = fun_map_writer,
    reduce = fun_reduce,
    reduce_reader = fun_reduce_reader,
    reduce_writer = fun_reduce_writer,
    nr_reduces = 1,
    sort = False)

res = list(result_iterator(job.wait(), reader = result_reader))
if res != [ANS]:
    raise Exception("Invalid answer: %s" % res)
job.purge()
print "ok"
def __init__(self, name=None, master=None, worker=None, settings=None):
    from disco.core import Disco
    self.name = name or type(self).__name__
    self.disco = master if isinstance(master, Disco) else Disco(master)
    self.worker = worker or self.Worker()
    self.settings = settings or DiscoSettings()
def fun_map(e, params):
    return [({"PI": math.pi}, time.strptime(e, "%d/%m/%Y"))]

def fun_reduce(iter, out, params):
    for k, v in iter:
        out.add({"PI2": k["PI"]}, datetime.datetime(*v[0:6]))

tserver.run_server(data_gen)

inputs = ["01/11/1965", "14/03/1983", "12/12/2002"]
job = Disco(sys.argv[1]).new_job(name = "test_objectrw",
                                 input = tserver.makeurl(inputs),
                                 map = fun_map,
                                 map_writer = func.object_writer,
                                 reduce = fun_reduce,
                                 reduce_reader = func.object_reader,
                                 reduce_writer = func.object_writer,
                                 required_modules = ["math", "datetime", "time"],
                                 nr_reduces = 1,
                                 sort = False)

i = 0
for k, v in result_iterator(job.wait(), reader = func.object_reader):
    if k["PI2"] != math.pi:
        raise Exception("Invalid key: %s" % k)
    if v.strftime("%d/%m/%Y") not in inputs:
        raise Exception("Invalid value: %s" % v)
    i += 1
if i != 30:
    raise Exception("Wrong number of results, got %d, expected 30" % i)
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

def map(line, params):
    for word in line.split():
        yield word, 1

def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(
    name="wordcount",
    input=["http://discoproject.org/media/text/chekhov.txt"],
    map=map,
    reduce=reduce,
    save=True).wait()

print "Job done. Results:"
for word, count in result_iterator(results):
    print word, count
from discodex import settings
from discodex.mapreduce import (Indexer, DiscoDBIterator)
from discodex.objects import (DataSet,
                              IChunks,
                              Indices,
                              Index,
                              Results,
                              Dict)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import flatten, parse_dir

discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']

disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'


class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
def disco(self):
    from disco.core import Disco
    return Disco(settings=self.settings)
def disco(self):
    return Disco(self.disco_master_url)
    time.sleep(2)
    return []

def fun_map2(e, params):
    time.sleep(3)
    return []

def fun_map3(e, params):
    fail  # intentionally undefined name: this job is expected to fail

def fun_map4(e, params):
    time.sleep(4)
    return []

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

jobs = []
for i, m in enumerate([fun_map1, fun_map2, fun_map3, fun_map4]):
    jobs.append(disco.new_job(
        name = "test_waitmany_%d" % (i + 1),
        input = tserver.makeurl([""] * 5),
        map = m))

res = []
while jobs:
    cont = False
    ready, jobs = disco.results(jobs, timeout = 2000)
    res += ready
    for n, r in res:
def fun_reduce(iter, out, params):
    s = {}
    for k, v in iter:
        if k in s:
            s[k] += int(v)
        else:
            s[k] = int(v)
    for k, v in s.iteritems():
        out.add(k, v)

tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(
    name = "test_profile",
    input = tserver.makeurl([""] * int(100)),
    map = really_unique_function_name,
    reduce = fun_reduce,
    nr_reduces = 30,
    sort = False,
    profile = True)

ANS = {"gutta": int(1e4), "cavat": int(2e4), "capidem": int(1e4)}
i = 0
for key, value in result_iterator(job.wait()):
    i += 1
    if ANS[key] == int(value):
        print "Correct: %s %s" % (key, value)
    else:
        raise Exception("Results don't match (%s): Got %d expected %d"
                        % (key, int(value), ANS[key]))
if i != 3:
    raise Exception("Too few results")
def disco(self):
    from disco.core import Disco
    return Disco(self.settings['DISCO_MASTER'])
def data_gen(path):
    return "\n".join([path[1:]] * 10)

def fun_map(e, params):
    return [("=" + e, e)]

def fun_reduce(iter, out, params):
    s = 1
    for k, v in iter:
        if k != "=" + v:
            raise Exception("Corrupted key")
        s *= int(v)
    out.add("result", s)

tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
job = Disco(sys.argv[1]).new_job(
    name="test_simple",
    input=tserver.makeurl(inputs),
    map=fun_map,
    reduce=fun_reduce,
    nr_reduces=1,
    sort=False
)

if list(result_iterator(job.wait())) != [("result", ANS)]:
    raise Exception("Invalid answer")
job.purge()
print "ok"
def data():
    return Disco(self.master).jobpack(self.jobname)
    distances = list(distances)
    newdistances = {}

    def minFrom(d, a):
        for k, v in a.items():
            d[k] = mymin(d.get(k, -1), v)

    for d in distances:
        if d.get("nodes"):
            nodes = d["nodes"]
        minFrom(newdistances, d["distances"])

    yield node, json.dumps([node, newdistances, nodes])

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name="shortestpath",
                        input=["file:///home/marko/tmp/disco/out.txt"],
                        map=map,
                        reduce=reduce,
                        save=True).wait()

print "Job done"
out = file("out.txt", "w")
for node, data in result_iterator(results):
    print >> out, data
out.close()
def disco(self):
    return Disco(settings=self.settings)
def data_gen(path):
    return "\n".join([path[1:]] * 10)

def fun_map(e, params):
    import time, random
    time.sleep(random.randint(1, 3))
    return [(e, 0)]

def fun_reduce(iter, out, params):
    for k, v in iter:
        out.add("[%s]" % k, v)

tserver.run_server(data_gen)

disco = Disco(sys.argv[1])
num = sum(x['max_workers'] for x in disco.nodeinfo()['available'])
print >> sys.stderr, num, "slots available"

inputs = tserver.makeurl(range(num * 10))
random.shuffle(inputs)

jobs = []
for i in range(5):
    jobs.append(disco.new_job(name = "test_async_%d" % i,
                              input = inputs[i * (num * 2):(i + 1) * (num * 2)],
                              map = fun_map,
                              reduce = fun_reduce,
                              nr_reduces = 11,
                              sort = False))
    time.sleep(1)

all = dict(("[%s]" % i, 0) for i in range(num * 10))
while jobs:
    import csv
    recordReader = csv.reader(output, delimiter=',', quotechar='"')
    for line in recordReader:
        title = line[-4]
        year = line[-1]
        yield year, title

def reduce(iter, params):
    from disco.util import kvgroup
    for year, titles in kvgroup(sorted(iter)):
        romantic_titles = [title for title in titles if "love" in title.lower()]
        yield year, len(romantic_titles)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name="song-titles",
                        input=["tag://hackreduce:millionsongs:subset"],
                        map=map,
                        reduce=reduce,
                        save=True).wait()

print "Job done. Results:"

chart_url = "http://chart.apis.google.com/chart?chxr=0,0,15&chxt=y&chbh=a,4,10&chs=738x220&cht=bvs&chco=4D89F9&chds=0,15&chd=t:"
res_list = []

# Print result to user
for year, titles in result_iterator(results):
    res_list.append(str(titles))
def fun_map(e, params):
    msg(e)
    return []

def fun_map2(e, params):
    return []

def fun_map3(e, params):
    for i in range(100000):
        print "foobar"
    return []

tserver.run_server(data_gen)
inputs = tserver.makeurl([1])

job = Disco(sys.argv[1]).new_job(name = "test_ratelimit",
                                 input = inputs,
                                 map = fun_map)
time.sleep(5)
check_dead(job)

job = Disco(sys.argv[1]).new_job(name = "test_ratelimit2",
                                 input = inputs,
                                 map = fun_map2,
                                 status_interval = 1)
time.sleep(5)
check_dead(job)

job = Disco(sys.argv[1]).new_job(name = "test_ratelimit3",
                                 input = inputs,
                                 map = fun_map3,
                                 status_interval = 1)
time.sleep(5)
check_dead(job)
from disco.core import Disco, result_iterator
from disco.util import external

ext_map_exec = "java_map.sh"
ext_reduce_exec = "java_reduce.sh"
map_class = "rmaus.disco.external.sample.WordCountMap"
reduce_class = "rmaus.disco.external.sample.WordCountReduce"

job = Disco("http://discomaster-dr-01:8989").new_job(
    name = "java_wordcount",
    input = ["raw://foo", "raw://bar", "raw://foo"],
    ext_params = {
        "mapFunction" : map_class,
        "reduceFunction" : reduce_class,
        "testKey" : "testValue"
    },
    map = external([ext_map_exec]),
    reduce = external([ext_reduce_exec]))

results = job.wait(show=True)
for result in sorted(result_iterator(results), key=lambda x: x[1]):
    print result
from disco.core import Disco, Params, result_iterator
from disco.util import kvgroup
from disco.settings import DiscoSettings

def fun_map((key, value), params):
    bucket_range = (params.upper - params.lower) // params.num_buckets
    bucket = value // bucket_range
    if bucket >= params.num_buckets:
        # clamp out-of-range values into the last bucket
        yield params.num_buckets - 1, value
    else:
        yield bucket, value

def fun_reduce(iter, params):
    for k, v in kvgroup(sorted(iter)):
        yield k, sorted(v)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.. "

results = disco.new_job(name = "Sorting job",
                        input = [(1, 1), (2, 2), (5, 5), (4, 4), (-1, -1)],
                        map = fun_map,
                        reduce = fun_reduce,
                        params = Params(lower = 0, upper = 10,
                                        num_buckets = 3)).wait()

print "Job done. Results:"
for k, v in result_iterator(results):
    print k, v
""" Predict the closest clusters for the datapoints in input. """ job = master.new_job(name='kcluster_predict', input=input, map_reader=map_reader, map=predict_map, params=Params(centers=centers, **center), nr_reduces=0) return job.wait() if __name__ == '__main__': parser = OptionParser(usage='%prog [options] inputs') parser.add_option('--disco-master', default=getenv('DISCO_MASTER'), help='Disco master') parser.add_option('--iterations', default=10, help='Numbers of iteration') parser.add_option('--clusters', default=10, help='Numbers of clusters') (options, input) = parser.parse_args() master = Disco(options.disco_master) centers = estimate(master, input, mean_point_center, int(options.clusters), int(options.iterations)) res = predict(master, input, mean_point_center, centers) print '\n'.join(res)