コード例 #1
0
ファイル: GFFParser.py プロジェクト: Pfiver/RNA-Seqlyze
 def _gff_process(self, gff_files, limit_info, target_lines=None):
     """Run GFF parsing as a parallel Disco job and yield the merged results.

     target_lines is accepted for interface compatibility but must be None:
     a parallelized job cannot be split further.
     """
     assert target_lines is None, "Cannot split parallelized jobs"
     # lazy imports: these are only needed when Disco parallelization is used
     import simplejson
     import disco
     # expand to absolute paths, except for special "disco:" pseudo-files
     full_files = [f if f.split(":")[0] == "disco" else os.path.abspath(f)
                   for f in gff_files]
     job_results = disco.job(
         self._disco_host,
         name="gff_reader",
         input=full_files,
         params=disco.Params(limit_info=limit_info, jsonify=True,
                             filter_info=self._examiner._filter_info),
         required_modules=["simplejson", "collections", "re"],
         map=self._map_fn,
         reduce=self._reduce_fn)
     # workers ship values back JSON-encoded; decode before handing them out
     merged = {}
     for key, raw_value in disco.result_iterator(job_results):
         merged[key] = simplejson.loads(raw_value)
     yield merged
コード例 #2
0
ファイル: GFFParser.py プロジェクト: jhcepas/genbank-browser
    def _gff_process(self, gff_files, limit_info, target_lines=None):
        """Parse GFF input in parallel via Disco, yielding one merged dict.

        target_lines must be None, since a parallelized job cannot be
        subdivided into smaller line chunks.
        """
        assert target_lines is None, "Cannot split parallelized jobs"
        # local imports: Disco support is optional
        import simplejson
        import disco

        # resolve plain filenames to absolute paths; leave "disco:" URLs alone
        full_files = []
        for fname in gff_files:
            prefix = fname.split(":")[0]
            full_files.append(fname if prefix == "disco" else os.path.abspath(fname))
        results = disco.job(
            self._disco_host,
            name="gff_reader",
            input=full_files,
            params=disco.Params(limit_info=limit_info, jsonify=True, filter_info=self._examiner._filter_info),
            required_modules=["simplejson", "collections", "re"],
            map=self._map_fn,
            reduce=self._reduce_fn,
        )
        # values come back JSON-encoded; decode before handing them to the caller
        processed = {k: simplejson.loads(v) for k, v in disco.result_iterator(results)}
        yield processed
コード例 #3
0
ファイル: test_sort.py プロジェクト: saurabhdutta/disco
def run_disco(limit):
        """Run the 'test_sort' Disco job with the given in-memory sort limit
        and verify that the expected number of results comes back.

        limit: value passed through as mem_sort_limit to the job.
        Raises Exception when the result count differs from the expectation.
        """
        results = disco.job(sys.argv[1], "test_sort",
                            tserver.makeurl([""] * int(1e3)),
                            fun_map, reduce=fun_reduce, nr_reduces=50,
                            sort=True, mem_sort_limit=limit)
        k = len(list(disco.result_iterator(results)))
        if k != int(1e5):
                # BUG FIX: `raise "..."` (a string exception) is a TypeError in
                # Python >= 2.6; raise a real Exception instead.
                raise Exception("not enough results: Got %d, expected %d" % (k, 1e5))
コード例 #4
0
ファイル: naive_linreg.py プロジェクト: ak2consulting/disco
def predict(input, model, host="disco://localhost", map_reader=disco.chain_reader):
    """Score *input* against a fitted linear-regression *model* via a Disco map job.

    The job is map-only: each record is scored independently, so no reduce
    phase is configured. Returns the Disco job results handle.
    """
    return disco.job(host, name='naive_linear_regression_predict',
                     input_files=input,
                     map_reader=map_reader,
                     fun_map=predict_map,
                     params=model,
                     sort=False, clean=False)
コード例 #5
0
ファイル: perceptron.py プロジェクト: davin/disco
def predict(input, y_id, w, host="disco://localhost", map_reader=disco.chain_reader):
    """Run the perceptron prediction map job on Disco and return its results.

    input:  input files for the job
    y_id:   label column index, forwarded to the map function via params
    w:      trained weight vector, also forwarded via params
    """
    # Map-only job; weights and label column index travel in the params object.
    job_params = disco.Params(w=w, y_id=y_id)
    results = disco.job(host, name='perceptron_predict',
                        input_files=input,
                        map_reader=map_reader,
                        fun_map=predict_map,
                        params=job_params,
                        sort=False, clean=False)
    return results
コード例 #6
0
ファイル: naive_bayes.py プロジェクト: ak2consulting/disco
def predict(input, loglikelihoods, ys, splitter, host="disco://localhost", map_reader=disco.chain_reader):
    """Run naive Bayes prediction as a Disco map job and return its results.

    input:          input files for the job
    loglikelihoods: trained model terms, forwarded via params
    ys:             iterable of class labels; converted to a dict for O(1) lookups
    splitter:       key separator used by the map function
    """
    # FIX: build the membership dict idiomatically; the original comprehension
    # `dict([(id,1) for id in ys])` shadowed the builtin `id`.
    ys = dict.fromkeys(ys, 1)
    results = disco.job(host, name='naive_bayes_predict',
                        input_files=input,
                        map_reader=map_reader,
                        fun_map=predict_map,
                        params=disco.Params(loglikelihoods=loglikelihoods, ys=ys, splitter=splitter),
                        sort=False, clean=False)
    return results
コード例 #7
0
ファイル: widrowhoff.py プロジェクト: ak2consulting/disco
def predict(input, y_ids, w, host="disco://localhost", map_reader=disco.chain_reader):
    """Run Widrow-Hoff (LMS) prediction as a Disco map job and return its results.

    input:  input files for the job
    y_ids:  iterable of label column ids; converted to a dict for O(1) lookups
    w:      trained weight vectors, forwarded via params
    """
    y_ids = dict.fromkeys(y_ids, 1)
    # BUG FIX: removed stray `dropped.sort()` — `dropped` was never defined
    # anywhere, so the call raised NameError before the job could start.
    results = disco.job(host, name='widrow_hoff_predict',
                        input_files=input,
                        map_reader=map_reader,
                        fun_map=predict_map,
                        params=disco.Params(w=w, y_ids=y_ids),
                        sort=False, clean=False)
    return results
コード例 #8
0
ファイル: kmeans.py プロジェクト: ak2consulting/disco
def predict(input, centers, host="disco://localhost", map_reader=disco.chain_reader, nr_reduces=None):
    """Assign each input point to its nearest k-means center via a Disco job.

    input:      input files for the job
    centers:    trained cluster centers, forwarded via params with distance d2
    nr_reduces: reduce-task count; defaults to one per cluster
    """
    # FIX: compare to None with `is` (PEP 8); `== None` invokes __eq__ and is
    # non-idiomatic.
    if nr_reduces is None:
        nr_reduces = len(centers)

    results = disco.job(host, name='kmeans_output',
                        input_files=input,
                        map_reader=map_reader,
                        fun_map=predict_map,
                        nr_reduces=nr_reduces,
                        params=disco.Params(centers=centers, dist=d2),
                        sort=False, clean=True)
    return results
コード例 #9
0
ファイル: naive_bayes.py プロジェクト: ak2consulting/disco
def estimate(input, ys, splitter=' ', host="disco://localhost", map_reader=disco.chain_reader):
	"""Train a naive Bayes classifier with a Disco job and return log-likelihoods.

	input:    input files for the Disco job
	ys:       iterable of class labels (converted to a dict for O(1) lookups)
	splitter: separator used in "class<splitter>item" composite keys
	Returns a dict with two kinds of entries: per-class accumulated terms and
	per-"class<splitter>item" log-likelihood ratios.
	(Python 2 code: relies on has_key, iteritems and list-returning map.)
	"""
	# membership dict; NOTE(review): the comprehension variable shadows builtin `id`
	ys=dict([(id,1) for id in ys])

	# one map/combine/reduce pass counting items, classes and (class,item) pairs
	results = disco.job(host, name = 'naive_bayes_estimate',
			    input_files = input, 
			    map_reader = map_reader, 
			    fun_map = estimate_map,
			    combiner = estimate_combiner,
			    reduce = estimate_reduce,
			    params = disco.Params(ys=ys,splitter=splitter),
			    sort = False, clean = False)

	# demultiplex the job output by key shape:
	#   ''                        -> total record count
	#   <label in ys>             -> per-class count
	#   <other single token>      -> per-item count
	#   <class><splitter><item>   -> joint count
	total=0
	items={}
	classes={}
	pairs={}
	for key,value in disco.result_iterator(results):
		l=key.split(splitter)
		value=int(value)
		if len(l)==1: 
			if l[0]=='': total=value
			elif ys.has_key(l[0]): classes[l[0]]=value
			else: items[l[0]]=value
		else:
			pairs[key]=value

#counts[key]=[[c,i], [not c, i], [c, not i], [not c, not i]]
	# build a 2x2 contingency table for every (class, item) combination
	counts={}
	for i in items:
		for y in ys:
			key=y+splitter+i
			counts[key]=[0,0,0,0]
			if pairs.has_key(key): counts[key][0]=pairs[key]
			counts[key][1]=items[i]-counts[key][0]
			if not classes.has_key(y): counts[key][2]=0
			else: counts[key][2]=classes[y]-counts[key][0]
			counts[key][3]=total-sum(counts[key][:3])

			# add pseudocounts
			counts[key]=map(lambda x: x+1, counts[key])
	# keep `total` consistent with the four pseudocounts added per table
	total+=4

	import math
	# convert the contingency counts into log-likelihood terms:
	# per-class running sums plus a per-(class,item) log ratio
	loglikelihoods={}
	for key,value in counts.iteritems():
		log_c=math.log(value[0]+value[2])
		l=key.split(splitter)
		if not loglikelihoods.has_key(l[0]): loglikelihoods[l[0]]=0.0
		loglikelihoods[l[0]]+=math.log(value[2])-log_c
		loglikelihoods[key]=math.log(value[0])-math.log(value[2])

	return loglikelihoods
コード例 #10
0
ファイル: kmeans.py プロジェクト: ak2consulting/disco
def estimate(input, centers, k, iterations=10, host="disco://localhost", map_reader=disco.chain_reader, nr_reduces=None):
	"""Run k-means on Disco for *iterations* rounds and return the centers.

	input:      input files for the Disco jobs
	centers:    optional initial centers; None triggers a 'kmeans_init' job
	k:          number of clusters (overridden by len(centers) when given)
	nr_reduces: reduce-task count, defaulting to k
	Returns the list of cluster centers from the last completed update.
	"""
	# explicit initial centers win over the requested k
	if centers!=None: k=len(centers)
	if nr_reduces==None: nr_reduces=k

	results=None
	if centers==None:	
		# no seed centers: run an initialization job to pick k of them
		results = disco.job(host, name = 'kmeans_init',
				    input_files = input, 
				    map_reader = map_reader, 
				    fun_map = init_map, 
				    combiner = estimate_combiner, 
				    reduce = estimate_reduce,
				    nr_reduces = nr_reduces, 
				    params = disco.Params(k=k),
				    sort = False, clean = True)

	for i in range(iterations):
		if results!=None:
			# fold the previous job's output into fresh centers; each
			# value is space-separated floats, keyed by cluster index,
			# with the member count as the trailing component
			centers=[None]*k
			counts=[None]*k
			for key,value in disco.result_iterator(results):
				x=map(float,value.split(' '))
				centers[int(key)]=x[:-1]
				counts[int(key)]=x[-1]

		# one update pass assigning points to the current centers (distance d2)
		results = disco.job(host, name = 'kmeans_iterate_'+str(i),
				    input_files = input,
				    map_reader = map_reader,
				    fun_map = estimate_map, 
				    combiner = estimate_combiner,
				    reduce = estimate_reduce,
				    nr_reduces = nr_reduces, 
				    params = disco.Params(centers=centers,dist=d2),
				    sort = False, clean = True)
		
	# NOTE(review): the final job's results are never folded back, so the
	# returned centers come from the previous round — confirm this is intended
	return centers
コード例 #11
0
ファイル: GFFParser.py プロジェクト: boya888/oqtans_tools
 def _disco_process(self, gff_files, limit_info):
     """Parse GFF files in parallel on a Disco cluster and return the results.

     gff_files may mix plain paths with special "disco:" inputs; plain paths
     are made absolute so the remote workers can find them.
     """
     # Disco support is optional, so import its dependencies lazily.
     import simplejson
     import disco
     full_files = []
     for name in gff_files:
         if name.split(":")[0] == "disco":
             full_files.append(name)
         else:
             full_files.append(os.path.abspath(name))
     results = disco.job(self._disco_host, name="gff_reader",
             input=full_files,
             params=disco.Params(limit_info=limit_info, jsonify=True,
                 filter_info=self._filter_info),
             required_modules=["simplejson", "collections", "re"],
             map=self._map_fn, reduce=self._reduce_fn)
     # workers ship JSON strings back; decode each value before returning
     processed = {}
     for key, payload in disco.result_iterator(results):
         processed[key] = simplejson.loads(payload)
     return processed
コード例 #12
0
ファイル: perceptron.py プロジェクト: davin/disco
def estimate(input, y_id, w=None, learning_rate=1.0, iterations=10, host="disco://localhost", map_reader=disco.chain_reader):
	"""Train perceptron weights over *iterations* Disco passes (Python 2 code).

	input: input files for the Disco jobs
	y_id:  index of the label column in each record
	w:     optional initial weight vector; None seeds it from the first round
	Returns the final weight vector as a list of floats.
	"""
	for i in range(iterations):
		# one full map/combine/reduce pass; current weights travel in params
		results = disco.job(host, name = 'perceptron_estimate_' + str(i),
				    input_files = input, 
				    map_reader = map_reader, 
				    fun_map = estimate_map,
				    combiner = estimate_combiner,
				    reduce = estimate_reduce,
				    params = disco.Params(w = w, learning_rate=learning_rate,y_id=y_id),
				    sort = False, clean = True)

		# accumulate the per-partition weight updates (space-separated floats)
		for key,value in disco.result_iterator(results):
			v=map(float,value.split(' '))
			if w==None: w=v
			# NOTE(review): this Py2 list comp reuses `i`, clobbering the
			# outer loop variable; harmless here only because `i` is
			# re-bound at the top of each outer iteration
			else: w=[w[i]+v[i] for i in range(len(w))]

		print >>sys.stderr,w

	return w
コード例 #13
0
ファイル: widrowhoff.py プロジェクト: ak2consulting/disco
def estimate(input, y_ids, w=None, learning_rate=1.0, iterations=10, host="disco://localhost", map_reader=disco.chain_reader):
	"""Train Widrow-Hoff (LMS) weight vectors, one per label id, via Disco.

	input:  input files for the Disco jobs
	y_ids:  iterable of label column ids; converted to a dict for fast lookup
	w:      optional dict {label_id: weight list}; grown from job output if None
	Returns the dict of weight vectors after *iterations* passes.
	(Python 2 code: has_key, print-to-stderr statement, list-returning map.)
	"""
	# membership dict for O(1) lookups in the map function
	y_ids=dict([(y,1) for y in y_ids])

	for i in range(iterations):
		# one full pass; current weights travel to the workers in params
		results = disco.job(host, name = 'widrow_hoff_estimate_' + str(i),
				    input_files = input, 
				    map_reader = map_reader, 
				    fun_map = estimate_map,
				    combiner = estimate_combiner,
				    reduce = estimate_reduce,
				    params = disco.Params(w = w, learning_rate=learning_rate, y_ids=y_ids),
				    sort = False, clean = True)

		# fold per-label updates into w; keys are label ids, values are
		# space-separated float components
		if w==None: w={}
		for k,v in disco.result_iterator(results):
			k=int(k)
			v=map(float, v.split(' '))
			if not w.has_key(k): w[k]=v
			# NOTE(review): the Py2 list comp below rebinds `i`, shadowing
			# the outer loop variable (harmless: `i` is re-bound each pass)
			else: w[k]=[w[k][i]+v[i] for i in range(len(v))] 

		print >>sys.stderr, w

	return w
コード例 #14
0
ファイル: GFFParser.py プロジェクト: ratschlab/RNA-geeq
 def _disco_process(self, gff_files, limit_info):
     """Run GFF parsing as a Disco map/reduce job and collect the output."""
     # local imports; only needed when the Disco backend is in use
     import simplejson
     import disco
     # special "disco:" inputs pass through; everything else is made absolute
     def _resolve(fname):
         return fname if fname.split(":")[0] == "disco" else os.path.abspath(fname)
     full_files = [_resolve(f) for f in gff_files]
     job_params = disco.Params(limit_info=limit_info,
                               jsonify=True,
                               filter_info=self._filter_info)
     results = disco.job(
         self._disco_host,
         name="gff_reader",
         input=full_files,
         params=job_params,
         required_modules=["simplejson", "collections", "re"],
         map=self._map_fn,
         reduce=self._reduce_fn)
     # each value arrives JSON-encoded; decode into the result mapping
     return dict((k, simplejson.loads(v))
                 for k, v in disco.result_iterator(results))
コード例 #15
0
ファイル: naive_linreg.py プロジェクト: ak2consulting/disco
def estimate(input, y_id, host="disco://localhost", map_reader=disco.chain_reader):
	"""Fit per-feature simple linear regressions via sufficient statistics on Disco.

	input: input files for the Disco job
	y_id:  index of the dependent-variable column
	Returns a list of (a, b) intercept/slope pairs, one per feature.
	(Python 2 code: list-returning map, integer division when c is int-like.)
	"""
	# single map/combine pass; workers emit aggregated statistic vectors
	results = disco.job(host, name = 'naive_linear_regression_estimate',
			    input_files = input, 
			    map_reader = map_reader, 
			    fun_map = estimate_map,
			    combiner = estimate_combiner, 
			    params=disco.Params(y_id=y_id),
			    sort = False, clean = False)

	# accumulators, sized lazily from the first value; based on the indexing
	# below, each value is laid out as [x.. x2.. xy.. v[-2] v[-1]] where
	# presumably x=sum(x_i), x2=sum(x_i^2), xy=sum(x_i*y), v[-2]=sum(y),
	# v[-1]=record count — confirm against estimate_map upstream
	c=0
	y=0.0
	l=None
	x=None
	x2=None
	xy=None

	for key,value in disco.result_iterator(results):
		v=map(float,value.split(' '))
		
		if l==None:
			l=(len(v)-2)/3
			x=[0.0]*l
			x2=[0.0]*l
			xy=[0.0]*l

		c+=v[-1]
		y+=v[-2]
		for i in range(l):
			x[i]+=v[i]
			x2[i]+=v[l+i]
			xy[i]+=v[2*l+i]

	# closed-form per-feature solution.
	# NOTE(review): the textbook least-squares slope denominator is
	# (c*x2[i] - x[i]*x[i]); the `+` here looks like a sign error — confirm
	# against the upstream repository before relying on these fits.
	b = [ (c*xy[i] - x[i]*y)/(c*x2[i]+x[i]*x[i]) for i in range(l) ]
	a = [ (y-b[i]*x[i])/c for i in range(l) ]

	return zip(*(a,b))
コード例 #16
0
ファイル: test_50k.py プロジェクト: ak2consulting/disco
def fun_map(e, params):
        """Tokenize one line: non-word characters become spaces, then emit a
        (word, 1) pair for every lower-cased token."""
        cleaned = re.sub("\W", " ", e).lower()
        return [(token, 1) for token in cleaned.split()]

def fun_reduce(iter, out, params):
        """Sum the integer counts seen for each word and emit one total per word.

        iter:   iterable of (word, count-string) pairs
        out:    Disco output collector with an add(key, value) method
        params: unused job parameters
        """
        # FIX: dict.get avoids the double lookup of `if k in s`, and items()
        # replaces Py2-only iteritems() with identical behavior.
        totals = {}
        for word, count in iter:
                totals[word] = totals.get(word, 0) + int(count)
        for word, total in totals.items():
                out.add(word, total)

# End-to-end word-count check: run the job over 5e4 generated inputs and
# verify each of the three expected words reaches its expected total.
tserver.run_server(data_gen)
results = disco.job(sys.argv[1], "test_50k", tserver.makeurl([""] * int(5e4)),
                    fun_map, reduce=fun_reduce, nr_reduces=300,
                    sort=False)

# expected aggregate counts for the three words (presumably emitted by the
# data generator — confirm against data_gen above)
ANS = {"gutta": int(5e6), "cavat": int(1e7), "capidem": int(5e6)}
i = 0
for key, value in disco.result_iterator(results):
        i += 1
        if ANS[key] == int(value):
                print("Correct: %s %s" % (key, value))
        else:
                # BUG FIX: `raise "..."` (string exception) is a TypeError in
                # Python >= 2.6; raise a real Exception instead.
                raise Exception("Results don't match")
if i != 3:
        raise Exception("Too few results")

# drop the finished job's results from the cluster
disco.Disco(sys.argv[1]).purge(disco.util.jobname(results[0]))
コード例 #17
0
ファイル: test_params.py プロジェクト: nick-b/disco
        if x > 10:
                return 1
        else:
                return 0

def data_gen(path):
        """Return ten newline-joined copies of *path* minus its leading character."""
        line = path[1:]
        return "\n".join([line] * 10)

def fun_map(e, params):
        """Emit one (line, f1(int(line), x)) pair using the callables in params."""
        mapped = params.f1(int(e), params.x)
        return [(e, mapped)]

def fun_reduce(iter, out, params):
        """Transform every integer value with params.f2 and re-emit it under its key."""
        for key, raw in iter:
                out.add(key, params.f2(int(raw)))

# End-to-end check that job params (x, f1, f2) reach the map and reduce phases.
tserver.run_server(data_gen)

inputs = range(10)
results = disco.job(sys.argv[1], "test_params", tserver.makeurl(inputs),
                fun_map,
                params=disco.Params(x=5, f1=fun1, f2=fun2),
                reduce=fun_reduce,
                nr_reduces=1,
                sort=False)

# recompute the expected value locally; the original check assumes
# fun1(v, x) == v + x for the fun1 defined elsewhere in this file
for x, y in disco.result_iterator(results):
        if fun2(int(x) + 5) != int(y):
                # BUG FIX: raising a string is a TypeError in Python >= 2.6;
                # raise a real Exception with the same message.
                raise Exception("Invalid result: %s and %s" % (x, y))

print("ok")
コード例 #18
0
ファイル: test_simple.py プロジェクト: nick-b/disco
# Expected answer string: the product of every generated input line — each of
# the ten primes in `inputs` below repeated 10 times by data_gen — i.e.
# (3*5*7*11*13*17*19*23*29*31)**10, compared against the reduce output.
ANS = "1028380578493512611198383005758052057919386757620401"\
      "58350002406688858214958513887550465113168573010369619140625"

def data_gen(path):
        """Strip the leading character from *path* and repeat it on 10 lines."""
        return "\n".join(10 * [path[1:]])

def fun_map(e, params):
        """Key each input line by '=' plus the line itself; the value is the raw line."""
        tagged = '=' + e
        return [(tagged, e)]

def fun_reduce(iter, out, params):
        """Multiply all values together, verifying each key equals '=' + value."""
        product = 1
        for key, val in iter:
                if key != "=" + val:
                        raise Exception("Corrupted key")
                product *= int(val)
        out.add("result", product)

# End-to-end check: one reduce multiplies every generated line, so exactly
# one ("result", ANS) pair is expected back.
tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
results = disco.job(sys.argv[1], "test_simple", tserver.makeurl(inputs),
                fun_map,
                reduce=fun_reduce,
                nr_reduces=1,
                sort=False)

if list(disco.result_iterator(results)) != [("result", ANS)]:
        raise Exception("Invalid answer")

# FIX: use the call form of print, which behaves identically for a single
# argument in Python 2 and 3 (the bare `print "ok"` statement is Py2-only).
print("ok")