def _gff_process(self, gff_files, limit_info, target_lines=None):
    """Process GFF addition, using Disco to parallelize the process.
    """
    assert target_lines is None, "Cannot split parallelized jobs"
    # make these imports local; only need them when using disco
    import simplejson
    import disco
    # absolute path names unless they are special disco files
    full_files = []
    for f in gff_files:
        if f.split(":")[0] != "disco":
            full_files.append(os.path.abspath(f))
        else:
            full_files.append(f)
    results = disco.job(self._disco_host, name="gff_reader",
            input=full_files,
            params=disco.Params(limit_info=limit_info, jsonify=True,
                filter_info=self._examiner._filter_info),
            required_modules=["simplejson", "collections", "re"],
            map=self._map_fn, reduce=self._reduce_fn)
    processed = dict()
    for out_key, out_val in disco.result_iterator(results):
        processed[out_key] = simplejson.loads(out_val)
    yield processed
def run_disco(limit):
    results = disco.job(sys.argv[1], "test_sort",
            tserver.makeurl([""] * int(1e3)),
            fun_map,
            reduce = fun_reduce,
            nr_reduces = 50,
            sort = True,
            mem_sort_limit = limit)
    k = len(list(disco.result_iterator(results)))
    if k != int(1e5):
        raise Exception("not enough results: Got %d, expected %d" % (k, int(1e5)))
def estimate(input, ys, splitter=' ', host="disco://localhost",
             map_reader=disco.chain_reader):
    ys=dict([(id,1) for id in ys])

    results = disco.job(host,
                        name = 'naive_bayes_estimate',
                        input_files = input,
                        map_reader = map_reader,
                        fun_map = estimate_map,
                        combiner = estimate_combiner,
                        reduce = estimate_reduce,
                        params = disco.Params(ys=ys, splitter=splitter),
                        sort = False,
                        clean = False)

    total=0
    items={}
    classes={}
    pairs={}
    for key,value in disco.result_iterator(results):
        l=key.split(splitter)
        value=int(value)
        if len(l)==1:
            if l[0]=='':
                total=value
            elif ys.has_key(l[0]):
                classes[l[0]]=value
            else:
                items[l[0]]=value
        else:
            pairs[key]=value

    #counts[key]=[[c,i], [not c, i], [c, not i], [not c, not i]]
    counts={}
    for i in items:
        for y in ys:
            key=y+splitter+i
            counts[key]=[0,0,0,0]
            if pairs.has_key(key):
                counts[key][0]=pairs[key]
            counts[key][1]=items[i]-counts[key][0]
            if not classes.has_key(y):
                counts[key][2]=0
            else:
                counts[key][2]=classes[y]-counts[key][0]
            counts[key][3]=total-sum(counts[key][:3])
            # add pseudocounts
            counts[key]=map(lambda x: x+1, counts[key])
    total+=4

    import math
    loglikelihoods={}
    for key,value in counts.iteritems():
        log_c=math.log(value[0]+value[2])
        l=key.split(splitter)
        if not loglikelihoods.has_key(l[0]):
            loglikelihoods[l[0]]=0.0
        loglikelihoods[l[0]]+=math.log(value[2])-log_c
        loglikelihoods[key]=math.log(value[0])-math.log(value[2])

    return loglikelihoods
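# A minimal local sketch (not part of the distributed job) of how the
# returned loglikelihoods dictionary could be used to classify a new
# example. The scoring scheme is an assumption inferred from how the
# dictionary is built above: each class y carries a base score
# loglikelihoods[y] (the summed log-odds of every item being absent),
# and each item i present in the example shifts it by
# loglikelihoods[y + splitter + i].
def classify(items, loglikelihoods, ys, splitter=' '):
    best_y, best_score = None, None
    for y in ys:
        score = loglikelihoods.get(y, 0.0)
        for i in items:
            score += loglikelihoods.get(y + splitter + i, 0.0)
        if best_score is None or score > best_score:
            best_y, best_score = y, score
    return best_y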
def estimate(input, y_id, w=None, learning_rate=1.0, iterations=10,
             host="disco://localhost", map_reader=disco.chain_reader):
    for i in range(iterations):
        results = disco.job(host,
                            name = 'perceptron_estimate_' + str(i),
                            input_files = input,
                            map_reader = map_reader,
                            fun_map = estimate_map,
                            combiner = estimate_combiner,
                            reduce = estimate_reduce,
                            params = disco.Params(w = w,
                                                  learning_rate=learning_rate,
                                                  y_id=y_id),
                            sort = False,
                            clean = True)
        for key,value in disco.result_iterator(results):
            v=map(float,value.split(' '))
            if w==None:
                w=v
            else:
                w=[w[i]+v[i] for i in range(len(w))]
        print >>sys.stderr,w
    return w
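# For reference, a minimal single-machine sketch of the classic
# perceptron update; estimate_map and estimate_reduce are not shown in
# the snippet above, so treating this as the per-example rule the job
# sums across partitions is an assumption (labels in {-1, +1}).
def perceptron_update(w, x, y, learning_rate=1.0):
    # correct the weights only when the current prediction is wrong
    activation = sum(wi * xi for wi, xi in zip(w, x))
    if y * activation <= 0:
        w = [wi + learning_rate * y * xi for wi, xi in zip(w, x)]
    return w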
def _disco_process(self, gff_files, limit_info):
    """Process GFF addition, using Disco to parallelize the process.
    """
    # make these imports local; only need them when using disco
    import simplejson
    import disco
    # absolute path names unless they are special disco files
    full_files = [(os.path.abspath(f) if f.split(":")[0] != "disco" else f)
                  for f in gff_files]
    results = disco.job(self._disco_host, name="gff_reader",
            input=full_files,
            params=disco.Params(limit_info=limit_info, jsonify=True,
                filter_info=self._filter_info),
            required_modules=["simplejson", "collections", "re"],
            map=self._map_fn, reduce=self._reduce_fn)
    processed = dict()
    for out_key, out_val in disco.result_iterator(results):
        processed[out_key] = simplejson.loads(out_val)
    return processed
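# A hypothetical invocation sketch: GFFParser, its disco_host argument,
# and the limit_info keys below are all assumptions standing in for
# whatever class actually defines _disco_process; it further assumes a
# Disco master is reachable at the given address and that the GFF file
# exists locally.
parser = GFFParser(disco_host="disco://localhost")
limit_info = dict(gff_type=["gene", "mRNA"])
processed = parser._disco_process(["my_annotations.gff3"], limit_info)
for rec_id, features in processed.items():
    print rec_id, len(features)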
def estimate(input, y_ids, w=None, learning_rate=1.0, iterations=10,
             host="disco://localhost", map_reader=disco.chain_reader):
    y_ids=dict([(y,1) for y in y_ids])
    for i in range(iterations):
        results = disco.job(host,
                            name = 'widrow_hoff_estimate_' + str(i),
                            input_files = input,
                            map_reader = map_reader,
                            fun_map = estimate_map,
                            combiner = estimate_combiner,
                            reduce = estimate_reduce,
                            params = disco.Params(w = w,
                                                  learning_rate=learning_rate,
                                                  y_ids=y_ids),
                            sort = False,
                            clean = True)
        if w==None:
            w={}
        for k,v in disco.result_iterator(results):
            k=int(k)
            v=map(float, v.split(' '))
            if not w.has_key(k):
                w[k]=v
            else:
                w[k]=[w[k][i]+v[i] for i in range(len(v))]
        print >>sys.stderr, w
    return w
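# Likewise a minimal single-machine sketch of the Widrow-Hoff (LMS, or
# delta rule) update; the distributed estimate_map is not shown above,
# so this standard form of the rule is an assumption: move the weights
# along the input, scaled by the prediction error.
def widrow_hoff_update(w, x, y, learning_rate=1.0):
    error = y - sum(wi * xi for wi, xi in zip(w, x))
    return [wi + learning_rate * error * xi for wi, xi in zip(w, x)]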
def estimate(input, centers, k, iterations=10, host="disco://localhost",
             map_reader=disco.chain_reader, nr_reduces=None):
    if centers!=None:
        k=len(centers)
    if nr_reduces==None:
        nr_reduces=k

    results=None
    if centers==None:
        results = disco.job(host,
                            name = 'kmeans_init',
                            input_files = input,
                            map_reader = map_reader,
                            fun_map = init_map,
                            combiner = estimate_combiner,
                            reduce = estimate_reduce,
                            nr_reduces = nr_reduces,
                            params = disco.Params(k=k),
                            sort = False,
                            clean = True)

    for i in range(iterations):
        if results!=None:
            centers=[None]*k
            counts=[None]*k
            for key,value in disco.result_iterator(results):
                x=map(float,value.split(' '))
                centers[int(key)]=x[:-1]
                counts[int(key)]=x[-1]
        results = disco.job(host,
                            name = 'kmeans_iterate_'+str(i),
                            input_files = input,
                            map_reader = map_reader,
                            fun_map = estimate_map,
                            combiner = estimate_combiner,
                            reduce = estimate_reduce,
                            nr_reduces = nr_reduces,
                            params = disco.Params(centers=centers,dist=d2),
                            sort = False,
                            clean = True)
    return centers
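# A self-contained local sketch of the single k-means iteration that
# the map/combiner/reduce phases above distribute: assign each point to
# its nearest center under squared Euclidean distance (the role played,
# by assumption, by the d2 parameter), then recompute each center as
# the mean of its assigned points.
def kmeans_iteration(points, centers):
    dim = len(centers[0])
    sums = [[0.0] * dim for _ in centers]
    counts = [0] * len(centers)
    for p in points:
        # index of the nearest center by squared Euclidean distance
        dists = [sum((a - b) ** 2 for a, b in zip(p, c)) for c in centers]
        j = dists.index(min(dists))
        counts[j] += 1
        sums[j] = [s + a for s, a in zip(sums[j], p)]
    new_centers = []
    for j in range(len(centers)):
        if counts[j]:
            new_centers.append([s / counts[j] for s in sums[j]])
        else:
            new_centers.append(centers[j])  # keep empty clusters in place
    return new_centers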
def estimate(input, y_id, host="disco://localhost", map_reader=disco.chain_reader):
    results = disco.job(host,
                        name = 'naive_linear_regression_estimate',
                        input_files = input,
                        map_reader = map_reader,
                        fun_map = estimate_map,
                        combiner = estimate_combiner,
                        params=disco.Params(y_id=y_id),
                        sort = False,
                        clean = False)

    c=0
    y=0.0
    l=None
    x=None
    x2=None
    xy=None
    for key,value in disco.result_iterator(results):
        v=map(float,value.split(' '))
        if l==None:
            l=(len(v)-2)/3
            x=[0.0]*l
            x2=[0.0]*l
            xy=[0.0]*l
        c+=v[-1]
        y+=v[-2]
        for i in range(l):
            x[i]+=v[i]
            x2[i]+=v[l+i]
            xy[i]+=v[2*l+i]

    # least-squares slope per feature: (n*Sxy - Sx*Sy) / (n*Sx2 - Sx^2)
    b = [ (c*xy[i] - x[i]*y)/(c*x2[i] - x[i]*x[i]) for i in range(l) ]
    a = [ (y-b[i]*x[i])/c for i in range(l) ]
    return zip(*(a,b))
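# A quick local check of the closed-form fit the code above assembles
# from the reducers' accumulated sums (c = n, x[i] = sum of x, x2[i] =
# sum of x^2, xy[i] = sum of x*y, y = sum of y, per feature i). With
# exact data y = 2x + 1 the recovered intercept/slope pair should be
# (1.0, 2.0).
xs = [1.0, 2.0, 3.0, 4.0]
ys = [3.0, 5.0, 7.0, 9.0]
n = float(len(xs))
sx = sum(xs)
sy = sum(ys)
sx2 = sum(v * v for v in xs)
sxy = sum(u * v for u, v in zip(xs, ys))
b = (n * sxy - sx * sy) / (n * sx2 - sx * sx)
a = (sy - b * sx) / n
print a, b  # expected: 1.0 2.0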
    return 0

def data_gen(path):
    return "\n".join([path[1:]] * 10)

def fun_map(e, params):
    return [(e, params.f1(int(e), params.x))]

def fun_reduce(iter, out, params):
    for k, v in iter:
        out.add(k, params.f2(int(v)))

tserver.run_server(data_gen)
inputs = range(10)
job = Disco(sys.argv[1]).new_job(name = "test_params",
        input = tserver.makeurl(inputs),
        map = fun_map,
        params = Params(x = 5, f1 = fun1, f2 = fun2),
        reduce = fun_reduce,
        nr_reduces = 1,
        sort = False)

for x, y in result_iterator(job.wait()):
    if fun2(int(x) + 5) != int(y):
        raise Exception("Invalid result: %s and %s" % (x, y))
job.purge()
print "ok"
out.add("red_" + k, "red_" + v) tserver.run_server(data_gen) inputs = ["ape", "cat", "dog"] params = {"test1": "1,2,3",\ "one two three": "dim\ndam\n",\ "dummy": "value"} job = Disco(sys.argv[1]).new_job( name = "test_external", input = tserver.makeurl(inputs), map = external(["ext_test"]), reduce = fun_reduce, ext_params = params, nr_reduces = 1, sort = False) results = sorted([(v, k) for k, v in result_iterator(job.wait())]) for i, e in enumerate(results): v, k = e if k != "red_dkey" or v != "red_test_%s" % inputs[i / 3]: raise Exception("Invalid answer: %s, %s" % (k, v)) if len(results) != 9: raise Exception("Wrong number of results: %u vs. 9" % len(results)) job.purge() print "ok"
    if x > 10:
        return 1
    else:
        return 0

def data_gen(path):
    return "\n".join([path[1:]] * 10)

def fun_map(e, params):
    return [(e, params.f1(int(e), params.x))]

def fun_reduce(iter, out, params):
    for k, v in iter:
        out.add(k, params.f2(int(v)))

tserver.run_server(data_gen)
inputs = range(10)
results = disco.job(sys.argv[1], "test_params", tserver.makeurl(inputs),
        fun_map,
        params = disco.Params(x = 5, f1 = fun1, f2 = fun2),
        reduce = fun_reduce,
        nr_reduces = 1,
        sort = False)

for x, y in disco.result_iterator(results):
    if fun2(int(x) + 5) != int(y):
        raise Exception("Invalid result: %s and %s" % (x, y))
print "ok"
def fun_map(e, params):
    k = str(int(math.ceil(float(e))) ** 2)
    return [(md5.new(k).hexdigest(), "")]

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])
inputs = [1, 485, 3245]
job = disco.new_job(name = "test_reqmodules",
        nr_reduces = 1,
        input = tserver.makeurl(inputs),
        map = fun_map,
        required_modules = ["math", "md5"],
        sort = False)

res = list(result_iterator(job.wait()))
if len(res) != len(inputs):
    raise Exception("Too few results: Got: %d Should be %d" %
                    (len(res), len(inputs)))

cor = map(lambda x: md5.new(str(int(math.ceil(x)) ** 2)).hexdigest(), inputs)
for k, v in res:
    if k not in cor:
        raise Exception("Invalid answer: %s" % k)
    cor.remove(k)
job.purge()
print "ok"
ANS = "1028380578493512611198383005758052057919386757620401"\ "58350002406688858214958513887550465113168573010369619140625" def data_gen(path): return "\n".join([path[1:]] * 10) def fun_map(e, params): return [('=' + e, e)] def fun_reduce(iter, out, params): s = 1 for k, v in iter: if k != "=" + v: raise Exception("Corrupted key") s *= int(v) out.add("result", s) tserver.run_server(data_gen) inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31] results = disco.job(sys.argv[1], "test_simple", tserver.makeurl(inputs), fun_map, reduce = fun_reduce, nr_reduces = 1, sort = False) if list(disco.result_iterator(results)) != [("result", ANS)]: raise Exception("Invalid answer") print "ok"
def fun_reduce(iter, out, params):
    for k, v in iter:
        out.add("red_" + k, "red_" + v)

tserver.run_server(data_gen)
inputs = ["ape", "cat", "dog"]
params = {"test1": "1,2,3",
          "one two three": "dim\ndam\n",
          "dummy": "value"}
results = Disco(sys.argv[1]).new_job(
        name = "test_external",
        input = tserver.makeurl(inputs),
        map = external(["ext_test"]),
        reduce = fun_reduce,
        ext_params = params,
        nr_reduces = 1,
        sort = False).wait()

results = sorted([(v, k) for k, v in result_iterator(results)])
for i, e in enumerate(results):
    v, k = e
    if k != "red_dkey" or v != "red_test_%s" % inputs[i / 3]:
        raise Exception("Invalid answer: %s, %s" % (k, v))
if len(results) != 9:
    raise Exception("Wrong number of results: %u vs. 9" % len(results))
print "ok"
    for k, v in iter:
        if k in s:
            s[k] += int(v)
        else:
            s[k] = int(v)
    for k, v in s.iteritems():
        out.add(k, v)

tserver.run_server(data_gen)
results = disco.job(sys.argv[1], "test_50k",
        tserver.makeurl([""] * int(5e4)),
        fun_map,
        reduce = fun_reduce,
        nr_reduces = 300,
        sort = False)

ANS = {"gutta": int(5e6), "cavat": int(1e7), "capidem": int(5e6)}
i = 0
for key, value in disco.result_iterator(results):
    i += 1
    if ANS[key] == int(value):
        print "Correct: %s %s" % (key, value)
    else:
        raise Exception("Results don't match")
if i != 3:
    raise Exception("Too few results")
disco.Disco(sys.argv[1]).purge(disco.util.jobname(results[0]))
print "ok"
tserver.run_server(data_gen)
disco = Disco(sys.argv[1])
num = sum(x['max_workers'] for x in disco.nodeinfo()['available'])
print >> sys.stderr, num, "slots available"
inputs = tserver.makeurl(range(num * 10))
random.shuffle(inputs)

jobs = []
for i in range(5):
    jobs.append(disco.new_job(name = "test_async_%d" % i,
            input = inputs[i * (num * 2):(i + 1) * (num * 2)],
            map = fun_map,
            reduce = fun_reduce,
            nr_reduces = 11,
            sort = False))
    time.sleep(1)

all = dict(("[%s]" % i, 0) for i in range(num * 10))
for job in jobs:
    results = job.wait()
    print "Job", job, "done"
    for k, v in result_iterator(results):
        all[k] += 1
    job.clean()

for v in all.values():
    if v != 10:
        raise Exception("Invalid results: %s" % all)
print "ok"
if x != "value:" + e: raise "Invalid value for key %s: %s" % (e, x) return [("good", "")] tserver.run_server(data_gen) inputs = list(string.ascii_lowercase) disco = Disco(sys.argv[1]) job1 = disco.new_job(name = "test_oob1", input = tserver.makeurl(inputs), map = fun_map, reduce = fun_reduce, nr_reduces = 10) res = list(result_iterator(job1.wait())) if [("all", "ok")] * 10 != res: raise "Invalid result: %s" % res keys = ["reduce:%d" % i for i in range(10)] + inputs lst = job1.oob_list() if len(lst) != len(keys): raise "Invalid number of OOB keys: got %d, expected %d" %\ (len(lst), len(keys)) for key in job1.oob_list(): if key not in keys: raise "Invalid key: %s" % key x = job1.oob_get(key) if x != "value:" + key:
tserver.run_server(data_gen)
inputs = ["01/11/1965", "14/03/1983", "12/12/2002"]
job = Disco(sys.argv[1]).new_job(name = "test_objectrw",
        input = tserver.makeurl(inputs),
        map = fun_map,
        map_writer = func.object_writer,
        reduce = fun_reduce,
        reduce_reader = func.object_reader,
        reduce_writer = func.object_writer,
        required_modules = ["math", "datetime", "time"],
        nr_reduces = 1,
        sort = False)

i = 0
for k, v in result_iterator(job.wait(), reader = func.object_reader):
    if k["PI2"] != math.pi:
        raise Exception("Invalid key: %s" % k)
    if v.strftime("%d/%m/%Y") not in inputs:
        raise Exception("Invalid value: %s" % v)
    i += 1
if i != 30:
    raise Exception("Wrong number of results, got %d, expected 30" % i)
job.purge()
print "ok"
params={"suffix": "0"}, ).wait() i = 1 while i < 10: nresults = disco.new_job( name="test_chain_%d" % i, input=results, map=fun_map, reduce=fun_reduce, nr_reduces=4, map_reader=chain_reader, sort=False, clean=True, params={"suffix": str(i)}, ).wait() disco.purge(jobname(results[0])) results = nresults i += 1 for key, value in result_iterator(results): if key[:5] not in ani or key[5:] != "0-1-2-3-4-5-6-7-8-9-": raise "Corrupted key: %s" % key if value != "9": raise "Corrupted value: %s" % value disco.purge(jobname(results[0])) print "ok"
    s = 1
    for k, v in iter:
        if k != "=" + v:
            raise Exception("Corrupted key")
        s *= int(v)
    out.add("result", s)

tserver.run_server(data_gen)
inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
job = Disco(sys.argv[1]).new_job(
        name = "test_writers",
        input = tserver.makeurl(inputs),
        map = fun_map,
        map_writer = fun_map_writer,
        reduce = fun_reduce,
        reduce_reader = fun_reduce_reader,
        reduce_writer = fun_reduce_writer,
        nr_reduces = 1,
        sort = False)

res = list(result_iterator(job.wait(), reader = result_reader))
if res != [ANS]:
    raise Exception("Invalid answer: %s" % res)
job.purge()
print "ok"
inputs = []
for i in range(N):
    a = [i] * 10
    b = range(i, i + 10)
    inputs += ["%d:%d" % x for x in zip(a, b)]
    results[str(i)] = str(sum(b))

disco = Disco(sys.argv[1])
job = disco.new_job(
        name = "test_chunked",
        input = tserver.makeurl(inputs),
        map = fun_map,
        partition = fun_partition,
        reduce = fun_reduce,
        chunked = False,
        nr_reduces = N,
        sort = False)

for k, v in result_iterator(job.wait()):
    if results[k] != v:
        raise Exception("Invalid result, got %s, expected %s" %
                        (v, results[k]))
    del results[k]
if results:
    raise Exception("Not enough results")
job.purge()
print "ok"
        out.add(k, v)

tserver.run_server(data_gen)
job = Disco(sys.argv[1]).new_job(
        name = "test_profile",
        input = tserver.makeurl([""] * int(100)),
        map = really_unique_function_name,
        reduce = fun_reduce,
        nr_reduces = 30,
        sort = False,
        profile = True)

ANS = {"gutta": int(1e4), "cavat": int(2e4), "capidem": int(1e4)}
i = 0
for key, value in result_iterator(job.wait()):
    i += 1
    if ANS[key] == int(value):
        print "Correct: %s %s" % (key, value)
    else:
        raise Exception("Results don't match (%s): Got %d expected %d" %
                        (key, int(value), ANS[key]))
if i != 3:
    raise Exception("Too few results")

buf = cStringIO.StringIO()
sys.stdout = buf
job.profile_stats().print_stats()
sys.stdout = sys.__stdout__
#stats = job.profile_stats()