def _gff_process(self, gff_files, limit_info, target_lines=None):
    """Process GFF addition, using Disco to parallelize the process.
    """
    assert target_lines is None, "Cannot split parallelized jobs"
    # local imports: only needed when the Disco backend is in use
    import simplejson
    import disco
    # special disco inputs pass through untouched; everything else
    # becomes an absolute path
    full_files = [f if f.split(":")[0] == "disco" else os.path.abspath(f)
                  for f in gff_files]
    results = disco.job(
        self._disco_host, name="gff_reader", input=full_files,
        params=disco.Params(limit_info=limit_info, jsonify=True,
                            filter_info=self._examiner._filter_info),
        required_modules=["simplejson", "collections", "re"],
        map=self._map_fn, reduce=self._reduce_fn)
    # decode each JSON-encoded reduce output into the final dictionary
    processed = dict((out_key, simplejson.loads(out_val))
                     for out_key, out_val in disco.result_iterator(results))
    yield processed
def _gff_process(self, gff_files, limit_info, target_lines=None):
    """Process GFF addition, using Disco to parallelize the process.
    """
    assert target_lines is None, "Cannot split parallelized jobs"
    # imports kept local since Disco support is optional
    import simplejson
    import disco
    # absolute path names unless they are special disco files
    full_files = []
    for cur_file in gff_files:
        is_disco_input = (cur_file.split(":")[0] == "disco")
        full_files.append(cur_file if is_disco_input
                          else os.path.abspath(cur_file))
    results = disco.job(
        self._disco_host,
        name="gff_reader",
        input=full_files,
        params=disco.Params(limit_info=limit_info, jsonify=True,
                            filter_info=self._examiner._filter_info),
        required_modules=["simplejson", "collections", "re"],
        map=self._map_fn,
        reduce=self._reduce_fn,
    )
    processed = {}
    for out_key, out_val in disco.result_iterator(results):
        processed[out_key] = simplejson.loads(out_val)
    yield processed
def run_disco(limit):
    """Run the sort test job with the given in-memory sort limit.

    limit: value passed through as disco's mem_sort_limit.
    Raises Exception when fewer results than expected come back.
    """
    results = disco.job(sys.argv[1], "test_sort",
                        tserver.makeurl([""] * int(1e3)),
                        fun_map,
                        reduce = fun_reduce,
                        nr_reduces = 50,
                        sort = True,
                        mem_sort_limit = limit)
    k = len(list(disco.result_iterator(results)))
    if k != int(1e5):
        # was `raise "not enough results..."`: string exceptions are not
        # legal (TypeError since Python 2.6) -- raise a real exception
        raise Exception("not enough results: Got %d, expected %d" % (k, 1e5))
def predict(input, model, host="disco://localhost", map_reader=disco.chain_reader):
    """Score *input* with a fitted model via a map-only Disco job.

    Returns the raw disco job results handle.
    """
    return disco.job(host,
                     name = 'naive_linear_regression_predict',
                     input_files = input,
                     map_reader = map_reader,
                     fun_map = predict_map,
                     params = model,
                     sort = False,
                     clean = False)
def predict(input, y_id, w, host="disco://localhost", map_reader=disco.chain_reader):
    """Run the perceptron prediction map job over *input*.

    w: trained weight vector; y_id: index of the label field.
    Returns the raw disco job results handle.
    """
    job_params = disco.Params(w=w, y_id=y_id)
    return disco.job(host,
                     name = 'perceptron_predict',
                     input_files = input,
                     map_reader = map_reader,
                     fun_map = predict_map,
                     params = job_params,
                     sort = False,
                     clean = False)
def predict(input, loglikelihoods, ys, splitter, host="disco://localhost",
            map_reader=disco.chain_reader):
    """Run the naive Bayes prediction map job over *input*.

    ys is converted to a membership dict so the mapper can do fast lookups.
    Returns the raw disco job results handle.
    """
    ys = dict.fromkeys(ys, 1)
    return disco.job(host,
                     name = 'naive_bayes_predict',
                     input_files = input,
                     map_reader = map_reader,
                     fun_map = predict_map,
                     params = disco.Params(loglikelihoods=loglikelihoods,
                                           ys=ys,
                                           splitter=splitter),
                     sort = False,
                     clean = False)
def predict(input, y_ids, w, host="disco://localhost", map_reader=disco.chain_reader):
    """Run the Widrow-Hoff prediction map job over *input*.

    y_ids: iterable of label ids, converted to a membership dict for the
    mapper; w: trained weights. Returns the raw disco job results handle.

    Fix: removed a stray `dropped.sort()` statement that referenced an
    undefined name and would always raise NameError.
    """
    y_ids = dict([(y, 1) for y in y_ids])
    results = disco.job(host,
                        name = 'widrow_hoff_predict',
                        input_files = input,
                        map_reader = map_reader,
                        fun_map = predict_map,
                        params = disco.Params(w=w, y_ids=y_ids),
                        sort = False,
                        clean = False)
    return results
def predict(input, centers, host="disco://localhost",
            map_reader=disco.chain_reader, nr_reduces=None):
    """Assign each input point to its nearest center via a Disco job.

    Defaults to one reduce task per cluster center.
    Returns the raw disco job results handle.
    """
    if nr_reduces is None:
        nr_reduces = len(centers)
    return disco.job(host,
                     name = 'kmeans_output',
                     input_files = input,
                     map_reader = map_reader,
                     fun_map = predict_map,
                     nr_reduces = nr_reduces,
                     params = disco.Params(centers=centers, dist=d2),
                     sort = False,
                     clean = True)
def estimate(input, ys, splitter=' ', host="disco://localhost", map_reader=disco.chain_reader):
    """Estimate naive Bayes log-likelihood terms from Disco-gathered counts.

    input: Disco input files with the training records.
    ys: iterable of class labels (converted to a membership dict below).
    splitter: separator joining class label and item name in emitted keys.
    Returns a dict keyed both by class label and by "class<splitter>item"
    strings, holding log-likelihood terms derived from 2x2 contingency
    counts. (Python 2 code: has_key/iteritems/map-returns-list are intentional.)
    """
    ys=dict([(id,1) for id in ys])
    # counting job: emits a grand total, per-class, per-item and
    # per-(class,item)-pair counts
    results = disco.job(host, name = 'naive_bayes_estimate', input_files = input, map_reader = map_reader, fun_map = estimate_map, combiner = estimate_combiner, reduce = estimate_reduce, params = disco.Params(ys=ys,splitter=splitter), sort = False, clean = False)
    total=0
    items={}
    classes={}
    pairs={}
    # sort each emitted count into the right bucket based on the key shape:
    # '' -> grand total, known class -> classes, other single token -> items,
    # compound key -> class/item pair count
    for key,value in disco.result_iterator(results):
        l=key.split(splitter)
        value=int(value)
        if len(l)==1:
            if l[0]=='':
                total=value
            elif ys.has_key(l[0]):
                classes[l[0]]=value
            else:
                items[l[0]]=value
        else:
            pairs[key]=value
    #counts[key]=[[c,i], [not c, i], [c, not i], [not c, not i]]
    counts={}
    for i in items:
        for y in ys:
            key=y+splitter+i
            counts[key]=[0,0,0,0]
            if pairs.has_key(key):
                counts[key][0]=pairs[key]
            counts[key][1]=items[i]-counts[key][0]
            if not classes.has_key(y):
                counts[key][2]=0
            else:
                counts[key][2]=classes[y]-counts[key][0]
            counts[key][3]=total-sum(counts[key][:3])
            # add pseudocounts
            counts[key]=map(lambda x: x+1, counts[key])
            # NOTE(review): total is incremented inside the pair loop, so
            # later pairs compute counts[key][3] against a larger total than
            # earlier ones -- confirm this ordering effect is intended
            total+=4
    import math
    loglikelihoods={}
    # fold the contingency counts into log-likelihood terms: a per-class
    # accumulated baseline plus a per-(class,item) log-odds entry
    for key,value in counts.iteritems():
        log_c=math.log(value[0]+value[2])
        l=key.split(splitter)
        if not loglikelihoods.has_key(l[0]):
            loglikelihoods[l[0]]=0.0
        loglikelihoods[l[0]]+=math.log(value[2])-log_c
        loglikelihoods[key]=math.log(value[0])-math.log(value[2])
    return loglikelihoods
def estimate(input, centers, k, iterations=10, host="disco://localhost", map_reader=disco.chain_reader, nr_reduces=None):
    """Run distributed k-means for a fixed number of Disco iterations.

    input: Disco input files with the data points.
    centers: starting centers, or None to run an initialization job that
    picks k of them; when centers is given, k is taken from len(centers).
    Returns the centers fed into the final iteration's job.
    """
    if centers!=None:
        k=len(centers)
    if nr_reduces==None:
        # default: one reduce task per cluster
        nr_reduces=k
    results=None
    if centers==None:
        # no starting centers supplied: run the initialization job
        results = disco.job(host, name = 'kmeans_init', input_files = input, map_reader = map_reader, fun_map = init_map, combiner = estimate_combiner, reduce = estimate_reduce, nr_reduces = nr_reduces, params = disco.Params(k=k), sort = False, clean = True)
    for i in range(iterations):
        if results!=None:
            # parse the previous job's output: each value is the center's
            # coordinates followed by its member count, keyed by cluster id
            centers=[None]*k
            counts=[None]*k
            for key,value in disco.result_iterator(results):
                x=map(float,value.split(' '))
                centers[int(key)]=x[:-1]
                counts[int(key)]=x[-1]
        results = disco.job(host, name = 'kmeans_iterate_'+str(i), input_files = input, map_reader = map_reader, fun_map = estimate_map, combiner = estimate_combiner, reduce = estimate_reduce, nr_reduces = nr_reduces, params = disco.Params(centers=centers,dist=d2), sort = False, clean = True)
    # NOTE(review): the final job's results are never folded back into
    # centers, so the returned centers lag one iteration behind the last
    # job -- confirm this is intended
    return centers
def _disco_process(self, gff_files, limit_info):
    """Process GFF addition, using Disco to parallelize the process.
    """
    # local imports; only needed when the Disco code path is exercised
    import simplejson
    import disco
    # absolute path names unless they are special disco files
    full_files = []
    for fname in gff_files:
        if fname.split(":")[0] == "disco":
            full_files.append(fname)
        else:
            full_files.append(os.path.abspath(fname))
    results = disco.job(self._disco_host, name="gff_reader",
                        input=full_files,
                        params=disco.Params(limit_info=limit_info,
                                            jsonify=True,
                                            filter_info=self._filter_info),
                        required_modules=["simplejson", "collections", "re"],
                        map=self._map_fn, reduce=self._reduce_fn)
    processed = {}
    for out_key, out_val in disco.result_iterator(results):
        processed[out_key] = simplejson.loads(out_val)
    return processed
def estimate(input, y_id, w=None, learning_rate=1.0, iterations=10,
             host="disco://localhost", map_reader=disco.chain_reader):
    """Train perceptron weights over several Disco passes.

    Each pass runs one job, sums the weight deltas from its results into w,
    and logs the current weights to stderr. Returns the final weights.
    """
    for iteration in range(iterations):
        results = disco.job(host,
                            name = 'perceptron_estimate_' + str(iteration),
                            input_files = input,
                            map_reader = map_reader,
                            fun_map = estimate_map,
                            combiner = estimate_combiner,
                            reduce = estimate_reduce,
                            params = disco.Params(w = w,
                                                  learning_rate=learning_rate,
                                                  y_id=y_id),
                            sort = False,
                            clean = True)
        for key, value in disco.result_iterator(results):
            delta = map(float, value.split(' '))
            if w is None:
                w = delta
            else:
                w = [w[j] + delta[j] for j in range(len(w))]
        print >>sys.stderr, w
    return w
def estimate(input, y_ids, w=None, learning_rate=1.0, iterations=10,
             host="disco://localhost", map_reader=disco.chain_reader):
    """Train one Widrow-Hoff weight vector per class id over Disco passes.

    w maps class id -> weight list; each pass sums the job's per-class
    updates into it and logs the current weights to stderr. Returns w.
    """
    y_ids = dict.fromkeys(y_ids, 1)
    for iteration in range(iterations):
        results = disco.job(host,
                            name = 'widrow_hoff_estimate_' + str(iteration),
                            input_files = input,
                            map_reader = map_reader,
                            fun_map = estimate_map,
                            combiner = estimate_combiner,
                            reduce = estimate_reduce,
                            params = disco.Params(w = w,
                                                  learning_rate=learning_rate,
                                                  y_ids=y_ids),
                            sort = False,
                            clean = True)
        if w is None:
            w = {}
        for raw_key, raw_val in disco.result_iterator(results):
            class_id = int(raw_key)
            update = map(float, raw_val.split(' '))
            if class_id in w:
                w[class_id] = [w[class_id][j] + update[j]
                               for j in range(len(update))]
            else:
                w[class_id] = update
        print >>sys.stderr, w
    return w
def _disco_process(self, gff_files, limit_info):
    """Process GFF addition, using Disco to parallelize the process.
    """
    # only import Disco support when this code path is exercised
    import simplejson
    import disco

    def _normalize(path):
        # special disco inputs are passed through untouched
        return path if path.split(":")[0] == "disco" else os.path.abspath(path)

    full_files = [_normalize(f) for f in gff_files]
    results = disco.job(
        self._disco_host,
        name="gff_reader",
        input=full_files,
        params=disco.Params(limit_info=limit_info, jsonify=True,
                            filter_info=self._filter_info),
        required_modules=["simplejson", "collections", "re"],
        map=self._map_fn,
        reduce=self._reduce_fn)
    # each reduce output value is a JSON document; decode on the way out
    return dict((out_key, simplejson.loads(out_val))
                for out_key, out_val in disco.result_iterator(results))
def estimate(input, y_id, host="disco://localhost", map_reader=disco.chain_reader):
    """Fit an independent one-variable least-squares line per feature.

    input: Disco input files; y_id: index of the target field.
    Returns a list of (intercept, slope) pairs, one per feature.
    """
    results = disco.job(host, name = 'naive_linear_regression_estimate',
                        input_files = input, map_reader = map_reader,
                        fun_map = estimate_map, combiner = estimate_combiner,
                        params=disco.Params(y_id=y_id),
                        sort = False, clean = False)
    c = 0      # total observation count
    y = 0.0    # sum of target values
    l = None   # number of features, derived from the first record
    x = None   # per-feature sum of x
    x2 = None  # per-feature sum of x^2
    xy = None  # per-feature sum of x*y
    for key, value in disco.result_iterator(results):
        v = map(float, value.split(' '))
        if l is None:
            # record layout: l sums of x, l sums of x^2, l sums of x*y,
            # then sum(y) and the observation count
            l = (len(v) - 2) / 3
            x = [0.0] * l
            x2 = [0.0] * l
            xy = [0.0] * l
        c += v[-1]
        y += v[-2]
        for i in range(l):
            x[i] += v[i]
            x2[i] += v[l + i]
            xy[i] += v[2 * l + i]
    # Ordinary least squares per feature:
    #   b = (n*Sxy - Sx*Sy) / (n*Sxx - Sx^2)
    # Fix: the denominator previously used '+' (c*x2[i] + x[i]*x[i]),
    # which is not the OLS estimator; it must be '-'.
    b = [(c * xy[i] - x[i] * y) / (c * x2[i] - x[i] * x[i]) for i in range(l)]
    a = [(y - b[i] * x[i]) / c for i in range(l)]
    return zip(*(a, b))
def fun_map(e, params): return [(w, 1) for w in re.sub("\W", " ", e).lower().split()] def fun_reduce(iter, out, params): s = {} for k, v in iter: if k in s: s[k] += int(v) else: s[k] = int(v) for k, v in s.iteritems(): out.add(k, v) tserver.run_server(data_gen) results = disco.job(sys.argv[1], "test_50k", tserver.makeurl([""] * int(5e4)), fun_map, reduce = fun_reduce, nr_reduces = 300, sort = False) ANS = {"gutta": int(5e6), "cavat": int(1e7), "capidem": int(5e6)} i = 0 for key, value in disco.result_iterator(results): i += 1 if ANS[key] == int(value): print "Correct: %s %s" % (key, value) else: raise "Results don't match" if i != 3: raise "Too few results" disco.Disco(sys.argv[1]).purge(disco.util.jobname(results[0]))
if x > 10: return 1 else: return 0 def data_gen(path): return "\n".join([path[1:]] * 10) def fun_map(e, params): return [(e, params.f1(int(e), params.x))] def fun_reduce(iter, out, params): for k, v in iter: out.add(k, params.f2(int(v))) tserver.run_server(data_gen) inputs = range(10) results = disco.job(sys.argv[1], "test_params", tserver.makeurl(inputs), fun_map, params = disco.Params(x = 5, f1 = fun1, f2 = fun2), reduce = fun_reduce, nr_reduces = 1, sort = False) for x, y in disco.result_iterator(results): if fun2(int(x) + 5) != int(y): raise "Invalid result: %s and %s" % (x, y) print "ok"
ANS = "1028380578493512611198383005758052057919386757620401"\ "58350002406688858214958513887550465113168573010369619140625" def data_gen(path): return "\n".join([path[1:]] * 10) def fun_map(e, params): return [('=' + e, e)] def fun_reduce(iter, out, params): s = 1 for k, v in iter: if k != "=" + v: raise Exception("Corrupted key") s *= int(v) out.add("result", s) tserver.run_server(data_gen) inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31] results = disco.job(sys.argv[1], "test_simple", tserver.makeurl(inputs), fun_map, reduce = fun_reduce, nr_reduces = 1, sort = False) if list(disco.result_iterator(results)) != [("result", ANS)]: raise Exception("Invalid answer") print "ok"