Example #1
 def _gff_process(self, gff_files, limit_info, target_lines=None):
     """Process GFF addition, using Disco to parallelize the process.
     """
     assert target_lines is None, "Cannot split parallelized jobs"
     # make these imports local; only need them when using disco
     import simplejson
     import disco
     # absolute path names unless they are special disco files
     full_files = []
     for f in gff_files:
         if f.split(":")[0] != "disco":
             full_files.append(os.path.abspath(f))
         else:
             full_files.append(f)
     results = disco.job(
         self._disco_host,
         name="gff_reader",
         input=full_files,
         params=disco.Params(limit_info=limit_info,
                             jsonify=True,
                             filter_info=self._examiner._filter_info),
         required_modules=["simplejson", "collections", "re"],
         map=self._map_fn,
         reduce=self._reduce_fn)
     processed = dict()
     for out_key, out_val in disco.result_iterator(results):
         processed[out_key] = simplejson.loads(out_val)
     yield processed
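The _map_fn and _reduce_fn callables passed to disco.job are not shown here. A minimal hedged sketch of what such a pair could look like, using the map/reduce signatures the other examples on this page use (map takes an input entry and params and returns key/value pairs; reduce writes through out.add). The column handling and names are assumptions, not the snippet's actual code:

def gff_map(line, params):
        # hypothetical sketch: key each GFF line by its seqid (first column)
        # and JSON-encode the rest, mirroring Params(jsonify=True) above
        import simplejson
        cols = line.rstrip("\n").split("\t")
        return [(cols[0], simplejson.dumps(cols[1:]))]

def gff_reduce(iter, out, params):
        # hypothetical sketch: group the encoded lines under each seqid
        import simplejson
        grouped = {}
        for key, value in iter:
                grouped.setdefault(key, []).append(simplejson.loads(value))
        for key, values in grouped.iteritems():
                out.add(key, simplejson.dumps(values))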
Example #2
    def _gff_process(self, gff_files, limit_info, target_lines=None):
        """Process GFF addition, using Disco to parallelize the process.
        """
        assert target_lines is None, "Cannot split parallelized jobs"
        # make these imports local; only need them when using disco
        import simplejson
        import disco

        # absolute path names unless they are special disco files
        full_files = []
        for f in gff_files:
            if f.split(":")[0] != "disco":
                full_files.append(os.path.abspath(f))
            else:
                full_files.append(f)
        results = disco.job(
            self._disco_host,
            name="gff_reader",
            input=full_files,
            params=disco.Params(limit_info=limit_info, jsonify=True, filter_info=self._examiner._filter_info),
            required_modules=["simplejson", "collections", "re"],
            map=self._map_fn,
            reduce=self._reduce_fn,
        )
        processed = dict()
        for out_key, out_val in disco.result_iterator(results):
            processed[out_key] = simplejson.loads(out_val)
        yield processed
Example #3
def run_disco(limit):
        results = disco.job(sys.argv[1], "test_sort",
                            tserver.makeurl([""] * int(1e3)),
                            fun_map, reduce = fun_reduce, nr_reduces = 50,
                            sort = True, mem_sort_limit = limit)
        k = len(list(disco.result_iterator(results)))
        if k != int(1e5):
                raise Exception("not enough results: got %d, expected %d" %
                        (k, int(1e5)))
Example #4
def estimate(input, ys, splitter=' ', host="disco://localhost", map_reader=disco.chain_reader):
	ys=dict([(id,1) for id in ys])

	results = disco.job(host, name = 'naive_bayes_estimate',
			    input_files = input, 
			    map_reader = map_reader, 
			    fun_map = estimate_map,
			    combiner = estimate_combiner,
			    reduce = estimate_reduce,
			    params = disco.Params(ys=ys,splitter=splitter),
			    sort = False, clean = False)

	total=0
	items={}
	classes={}
	pairs={}
	for key,value in disco.result_iterator(results):
		l=key.split(splitter)
		value=int(value)
		if len(l)==1: 
			if l[0]=='': total=value
			elif ys.has_key(l[0]): classes[l[0]]=value
			else: items[l[0]]=value
		else:
			pairs[key]=value

	# counts[key] = [[c,i], [not c, i], [c, not i], [not c, not i]]
	counts={}
	for i in items:
		for y in ys:
			key=y+splitter+i
			counts[key]=[0,0,0,0]
			if pairs.has_key(key): counts[key][0]=pairs[key]
			counts[key][1]=items[i]-counts[key][0]
			if not classes.has_key(y): counts[key][2]=0
			else: counts[key][2]=classes[y]-counts[key][0]
			counts[key][3]=total-sum(counts[key][:3])

			# add pseudocounts
			counts[key]=map(lambda x: x+1, counts[key])
	total+=4

	import math
	loglikelihoods={}
	for key,value in counts.iteritems():
		log_c=math.log(value[0]+value[2])
		l=key.split(splitter)
		if not loglikelihoods.has_key(l[0]): loglikelihoods[l[0]]=0.0
		loglikelihoods[l[0]]+=math.log(value[2])-log_c
		loglikelihoods[key]=math.log(value[0])-math.log(value[2])

	return loglikelihoods
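The returned dictionary mixes per-class terms (loglikelihoods[y], the accumulated log P(item absent | y)) with per-pair correction terms (loglikelihoods[y + splitter + item]). A hedged sketch of how a classifier could consume it; classify is a hypothetical helper, not part of the snippet:

def classify(item_ids, ys, loglikelihoods, splitter=' '):
        # hypothetical: score(y) = class term plus a pair term for each
        # item that is present, then pick the best-scoring class
        best_y, best_score = None, None
        for y in ys:
                score = loglikelihoods[y]
                for i in item_ids:
                        score += loglikelihoods.get(y + splitter + i, 0.0)
                if best_score is None or score > best_score:
                        best_y, best_score = y, score
        return best_y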
Example #5
def estimate(input, y_id, w=None, learning_rate=1.0, iterations=10, host="disco://localhost", map_reader=disco.chain_reader):
	for i in range(iterations):
		results = disco.job(host, name = 'perceptron_estimate_' + str(i),
				    input_files = input, 
				    map_reader = map_reader, 
				    fun_map = estimate_map,
				    combiner = estimate_combiner,
				    reduce = estimate_reduce,
				    params = disco.Params(w = w, learning_rate=learning_rate,y_id=y_id),
				    sort = False, clean = True)

		for key,value in disco.result_iterator(results):
			v=map(float,value.split(' '))
			if w==None: w=v
			else: w=[w[i]+v[i] for i in range(len(w))]

		print >>sys.stderr,w

	return w
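estimate returns the summed weight vector but no prediction helper is shown. A minimal hedged sketch (predict is a hypothetical name; it assumes examples are dense float vectors like the ones the reduce emits):

def predict(x, w):
        # hypothetical: linear threshold unit over the learned weights
        s = sum(w[i] * x[i] for i in range(len(w)))
        return 1 if s > 0 else -1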
Example #6
 def _disco_process(self, gff_files, limit_info):
     """Process GFF addition, using Disco to parallelize the process.
     """
     # make these imports local; only need them when using disco
     import simplejson
     import disco
     # absolute path names unless they are special disco files 
     full_files = [(os.path.abspath(f) if f.split(":")[0] != "disco" else f)
             for f in gff_files]
     results = disco.job(self._disco_host, name="gff_reader",
             input=full_files,
             params=disco.Params(limit_info=limit_info, jsonify=True,
                 filter_info=self._filter_info),
             required_modules=["simplejson", "collections", "re"],
             map=self._map_fn, reduce=self._reduce_fn)
     processed = dict()
     for out_key, out_val in disco.result_iterator(results):
         processed[out_key] = simplejson.loads(out_val)
     return processed
Example #7
def estimate(input, y_ids, w=None, learning_rate=1.0, iterations=10, host="disco://localhost", map_reader=disco.chain_reader):
	y_ids=dict([(y,1) for y in y_ids])

	for i in range(iterations):
		results = disco.job(host, name = 'widrow_hoff_estimate_' + str(i),
				    input_files = input, 
				    map_reader = map_reader, 
				    fun_map = estimate_map,
				    combiner = estimate_combiner,
				    reduce = estimate_reduce,
				    params = disco.Params(w = w, learning_rate=learning_rate, y_ids=y_ids),
				    sort = False, clean = True)

		if w==None: w={}
		for k,v in disco.result_iterator(results):
			k=int(k)
			v=map(float, v.split(' '))
			if not w.has_key(k): w[k]=v
			else: w[k]=[w[k][i]+v[i] for i in range(len(v))] 

		print >>sys.stderr, w

	return w
Example #8
 def _disco_process(self, gff_files, limit_info):
     """Process GFF addition, using Disco to parallelize the process.
     """
     # make these imports local; only need them when using disco
     import simplejson
     import disco
     # absolute path names unless they are special disco files
     full_files = [(os.path.abspath(f) if f.split(":")[0] != "disco" else f)
                   for f in gff_files]
     results = disco.job(
         self._disco_host,
         name="gff_reader",
         input=full_files,
         params=disco.Params(limit_info=limit_info,
                             jsonify=True,
                             filter_info=self._filter_info),
         required_modules=["simplejson", "collections", "re"],
         map=self._map_fn,
         reduce=self._reduce_fn)
     processed = dict()
     for out_key, out_val in disco.result_iterator(results):
         processed[out_key] = simplejson.loads(out_val)
     return processed
Example #9
def estimate(input, centers, k, iterations=10, host="disco://localhost", map_reader=disco.chain_reader, nr_reduces=None):
	if centers!=None: k=len(centers)
	if nr_reduces==None: nr_reduces=k

	results=None
	if centers==None:	
		results = disco.job(host, name = 'kmeans_init',
				    input_files = input, 
				    map_reader = map_reader, 
				    fun_map = init_map, 
				    combiner = estimate_combiner, 
				    reduce = estimate_reduce,
				    nr_reduces = nr_reduces, 
				    params = disco.Params(k=k),
				    sort = False, clean = True)

	for i in range(iterations):
		if results!=None:
			centers=[None]*k
			counts=[None]*k
			for key,value in disco.result_iterator(results):
				x=map(float,value.split(' '))
				centers[int(key)]=x[:-1]
				counts[int(key)]=x[-1]

		results = disco.job(host, name = 'kmeans_iterate_'+str(i),
				    input_files = input,
				    map_reader = map_reader,
				    fun_map = estimate_map, 
				    combiner = estimate_combiner,
				    reduce = estimate_reduce,
				    nr_reduces = nr_reduces, 
				    params = disco.Params(centers=centers,dist=d2),
				    sort = False, clean = True)
		
	return centers
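The job parameters reference a distance function d2 that the snippet never defines. A plausible definition, consistent with the usual k-means objective (this is an assumption, not the snippet's actual code):

def d2(x, c):
        # squared Euclidean distance between point x and center c
        return sum((x[i] - c[i]) ** 2 for i in range(len(x)))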
Example #10
def estimate(input, y_id, host="disco://localhost", map_reader=disco.chain_reader):
	results = disco.job(host, name = 'naive_linear_regression_estimate',
			    input_files = input, 
			    map_reader = map_reader, 
			    fun_map = estimate_map,
			    combiner = estimate_combiner, 
			    params=disco.Params(y_id=y_id),
			    sort = False, clean = False)

	c=0
	y=0.0
	l=None
	x=None
	x2=None
	xy=None

	for key,value in disco.result_iterator(results):
		v=map(float,value.split(' '))
		
		if l==None:
			l=(len(v)-2)/3
			x=[0.0]*l
			x2=[0.0]*l
			xy=[0.0]*l

		c+=v[-1]
		y+=v[-2]
		for i in range(l):
			x[i]+=v[i]
			x2[i]+=v[l+i]
			xy[i]+=v[2*l+i]

	# per-feature least-squares fit; note the minus in the denominator:
	# b = (n*Sxy - Sx*Sy) / (n*Sx2 - Sx**2)
	b = [ (c*xy[i] - x[i]*y)/(c*x2[i] - x[i]*x[i]) for i in range(l) ]
	a = [ (y - b[i]*x[i])/c for i in range(l) ]

	return zip(*(a,b))
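Each (a[i], b[i]) pair is the ordinary least-squares fit of y against feature i alone, computed from the accumulated sums (c = count, x = sum of x, x2 = sum of x squared, xy = sum of x*y, y = sum of y). A small local sketch of the same closed form for one feature, handy for checking the distributed result (fit_one is a hypothetical helper):

def fit_one(xs, ys):
        # closed-form simple linear regression: y ~ a + b*x
        n = float(len(xs))
        sx, sy = sum(xs), sum(ys)
        sxx = sum(x * x for x in xs)
        sxy = sum(x * y for x, y in zip(xs, ys))
        b = (n * sxy - sx * sy) / (n * sxx - sx * sx)
        a = (sy - b * sx) / n
        return a, b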
Example #11
                return 0

def data_gen(path):
        return "\n".join([path[1:]] * 10)

def fun_map(e, params):
        return [(e, params.f1(int(e), params.x))]

def fun_reduce(iter, out, params):
        for k, v in iter:
                out.add(k, params.f2(int(v)))

tserver.run_server(data_gen)

inputs = range(10)
job = Disco(sys.argv[1]).new_job(name = "test_params",
                input = tserver.makeurl(inputs),
                map = fun_map, 
                params = Params(x = 5, f1 = fun1, f2 = fun2),
                reduce = fun_reduce, 
                nr_reduces = 1,
                sort = False)

for x, y in result_iterator(job.wait()):
        if fun2(int(x) + 5) != int(y):
                raise Exception("Invalid result: %s and %s" % (x, y))

job.purge()

print "ok"
Example #12
                out.add("red_" + k, "red_" + v)
        
tserver.run_server(data_gen)

inputs = ["ape", "cat", "dog"]
params = {"test1": "1,2,3",\
          "one two three": "dim\ndam\n",\
          "dummy": "value"}

job = Disco(sys.argv[1]).new_job(
            name = "test_external",
            input = tserver.makeurl(inputs),
            map = external(["ext_test"]), 
            reduce = fun_reduce, 
            ext_params = params,
            nr_reduces = 1,
            sort = False)

results = sorted([(v, k) for k, v in result_iterator(job.wait())])
for i, e in enumerate(results): 
        v, k = e
        if k != "red_dkey" or v != "red_test_%s" % inputs[i / 3]:
                raise Exception("Invalid answer: %s, %s" % (k, v))

if len(results) != 9:
        raise Exception("Wrong number of results: %u vs. 9" % len(results))

job.purge()

print "ok"
Example #13
        if x > 10:
                return 1
        else:
                return 0

def data_gen(path):
        return "\n".join([path[1:]] * 10)

def fun_map(e, params):
        return [(e, params.f1(int(e), params.x))]

def fun_reduce(iter, out, params):
        for k, v in iter:
                out.add(k, params.f2(int(v)))

tserver.run_server(data_gen)

inputs = range(10)
results = disco.job(sys.argv[1], "test_params", tserver.makeurl(inputs),
                fun_map,
                params = disco.Params(x = 5, f1 = fun1, f2 = fun2),
                reduce = fun_reduce,
                nr_reduces = 1,
                sort = False)

for x, y in disco.result_iterator(results):
        if fun2(int(x) + 5) != int(y):
                raise Exception("Invalid result: %s and %s" % (x, y))

print "ok"
Example #14
def fun_map(e, params):
        k = str(int(math.ceil(float(e))) ** 2)
        return [(md5.new(k).hexdigest(), "")]

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

inputs = [1, 485, 3245]
job = disco.new_job(name = "test_reqmodules",
                nr_reduces = 1,
                input = tserver.makeurl(inputs),
                map = fun_map,
                required_modules = ["math", "md5"],
                sort = False)

res = list(result_iterator(job.wait()))
if len(res) != len(inputs):
        raise Exception("Too few results: Got: %d Should be %d" %
                (len(res), len(inputs)))

cor = map(lambda x: md5.new(str(int(math.ceil(x)) ** 2)).hexdigest(), inputs)

for k, v in res:
        if k not in cor:
                raise Exception("Invalid answer: %s" % k)
        cor.remove(k)	

job.purge()
print "ok"
Example #15
ANS = "1028380578493512611198383005758052057919386757620401"\
      "58350002406688858214958513887550465113168573010369619140625"

def data_gen(path):
        return "\n".join([path[1:]] * 10)

def fun_map(e, params):
        return [('=' + e, e)]

def fun_reduce(iter, out, params):
        s = 1
        for k, v in iter:
                if k != "=" + v:
                        raise Exception("Corrupted key")
                s *= int(v)
        out.add("result", s)

tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
results = disco.job(sys.argv[1], "test_simple", tserver.makeurl(inputs),
                fun_map,
                reduce = fun_reduce,
                nr_reduces = 1,
                sort = False)

if list(disco.result_iterator(results)) != [("result", ANS)]:
        raise Exception("Invalid answer")

print "ok"
Example #16
def fun_reduce(iter, out, params):
        for k, v in iter:
                out.add("red_" + k, "red_" + v)
        
tserver.run_server(data_gen)

inputs = ["ape", "cat", "dog"]
params = {"test1": "1,2,3",\
          "one two three": "dim\ndam\n",\
          "dummy": "value"}

results = Disco(sys.argv[1]).new_job(
                name = "test_external",
                input = tserver.makeurl(inputs),
                map = external(["ext_test"]), 
                reduce = fun_reduce, 
                ext_params = params,
                nr_reduces = 1,
                sort = False).wait()

results = sorted([(v, k) for k, v in result_iterator(results)])
for i, e in enumerate(results): 
        v, k = e
        if k != "red_dkey" or v != "red_test_%s" % inputs[i / 3]:
                raise Exception("Invalid answer: %s, %s" % (k, v))

if len(results) != 9:
        raise Exception("Wrong number of results: %u vs. 9" % len(results))

print "ok"
Example #17
        for k, v in iter:
                if k in s:
                        s[k] += int(v)
                else:
                        s[k] = int(v)
        for k, v in s.iteritems():
                out.add(k, v)

tserver.run_server(data_gen)
results = disco.job(sys.argv[1], "test_50k", tserver.makeurl([""] * int(5e4)),
                       fun_map, reduce = fun_reduce, nr_reduces = 300,
                       sort = False)

ANS = {"gutta": int(5e6), "cavat": int(1e7), "capidem": int(5e6)}
i = 0
for key, value in disco.result_iterator(results):
        i += 1
        if ANS[key] == int(value):
                print "Correct: %s %s" % (key, value)
        else:
                raise Exception("Results don't match")
if i != 3:
        raise Exception("Too few results")

disco.Disco(sys.argv[1]).purge(disco.util.jobname(results[0]))

print "ok"
Example #18
tserver.run_server(data_gen)

disco = Disco(sys.argv[1])
num = sum(x['max_workers'] for x in disco.nodeinfo()['available'])
print >> sys.stderr, num, "slots available"
inputs = tserver.makeurl(range(num * 10))
random.shuffle(inputs)

jobs = []
for i in range(5):
        jobs.append(disco.new_job(name = "test_async_%d" % i,
                       input = inputs[i * (num * 2):(i + 1) * (num * 2)],
                       map = fun_map, reduce = fun_reduce, nr_reduces = 11,
                       sort = False))
        time.sleep(1)

all = dict(("[%s]" % i, 0) for i in range(num * 10))
for job in jobs:
        results = job.wait()
        print "Job", job, "done"
        for k, v in result_iterator(results):
                all[k] += 1
        job.clean()

for v in all.values():
        if v != 10:
                raise Exception("Invalid results: %s" % all)

print "ok"

Example #19
        if x != "value:" + e:
                raise "Invalid value for key %s: %s" % (e, x)
        return [("good", "")]

tserver.run_server(data_gen)

inputs = list(string.ascii_lowercase)
disco = Disco(sys.argv[1])

job1 = disco.new_job(name = "test_oob1",
        input = tserver.makeurl(inputs),
        map = fun_map,
        reduce = fun_reduce,
        nr_reduces = 10)

res = list(result_iterator(job1.wait()))
if [("all", "ok")] * 10 != res:
        raise "Invalid result: %s" % res

keys = ["reduce:%d" % i for i in range(10)] + inputs
lst = job1.oob_list()

if len(lst) != len(keys):
        raise "Invalid number of OOB keys: got %d, expected %d" %\
                (len(lst), len(keys))

for key in job1.oob_list():
        if key not in keys:
                raise "Invalid key: %s" % key
        x = job1.oob_get(key)
        if x != "value:" + key:
Example #20
tserver.run_server(data_gen)

inputs = ["01/11/1965", "14/03/1983", "12/12/2002"]

job = Disco(sys.argv[1]).new_job(name = "test_objectrw",
                input = tserver.makeurl(inputs),
                map = fun_map,
                map_writer = func.object_writer,
                reduce = fun_reduce, 
                reduce_reader = func.object_reader,
                reduce_writer = func.object_writer,
                required_modules = ["math", "datetime", "time"],
                nr_reduces = 1,
                sort = False)

i = 0
for k, v in result_iterator(job.wait(), reader = func.object_reader):
        if k["PI2"] != math.pi:
                raise Exception("Invalid key: %s" % k)
        if v.strftime("%d/%m/%Y") not in inputs:
                raise Exception("Invalid value: %s" % v)
        i += 1

if i != 30:
        raise Exception("Wrong number of results, got %d, expected 30" % i)

job.purge()

print "ok"
Example #21
    params={"suffix": "0"},
).wait()

i = 1
while i < 10:
    nresults = disco.new_job(
        name="test_chain_%d" % i,
        input=results,
        map=fun_map,
        reduce=fun_reduce,
        nr_reduces=4,
        map_reader=chain_reader,
        sort=False,
        clean=True,
        params={"suffix": str(i)},
    ).wait()

    disco.purge(jobname(results[0]))
    results = nresults
    i += 1

for key, value in result_iterator(results):
    if key[:5] not in ani or key[5:] != "0-1-2-3-4-5-6-7-8-9-":
        raise Exception("Corrupted key: %s" % key)
    if value != "9":
        raise Exception("Corrupted value: %s" % value)

disco.purge(jobname(results[0]))

print "ok"
Example #22
        s = 1
        for k, v in iter:
                if k != "=" + v:
                        raise Exception("Corrupted key")
                s *= int(v)
        out.add("result", s)

tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]

job = Disco(sys.argv[1]).new_job(
                name = "test_writers", 
                input = tserver.makeurl(inputs),
                map = fun_map,
                map_writer = fun_map_writer,
                reduce = fun_reduce, 
                reduce_reader = fun_reduce_reader,
                reduce_writer = fun_reduce_writer,
                nr_reduces = 1,
                sort = False)

res = list(result_iterator(job.wait(), reader = result_reader))

if res != [ANS]:
        raise Exception("Invalid answer: %s" % res)

job.purge()

print "ok"
Example #23
inputs = []
for i in range(N):
        a = [i] * 10
        b = range(i, i + 10)
        inputs += ["%d:%d" % x for x in zip(a, b)]
        results[str(i)] = str(sum(b))

disco = Disco(sys.argv[1])
job = disco.new_job(
                name = "test_chunked",
                input = tserver.makeurl(inputs),
                map = fun_map,
                partition = fun_partition,
                reduce = fun_reduce,
                chunked = False,
                nr_reduces = N,
                sort = False)

for k, v in result_iterator(job.wait()):
        if results[k] != v:
                raise Exception("Invalid result, got %s, expected %s" %
                        (v, results[k]))
        del results[k]

if results:
        raise Exception("Not enough results")

job.purge()

print "ok"
Example #24
                out.add(k, v)

tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(
        name = "test_profile",
        input = tserver.makeurl([""] * int(100)),
        map = really_unique_function_name,
        reduce = fun_reduce,
        nr_reduces = 30,
        sort = False,
        profile = True)

ANS = {"gutta": int(1e4), "cavat": int(2e4), "capidem": int(1e4)}
i = 0
for key, value in result_iterator(job.wait()):
        i += 1
        if ANS[key] == int(value):
                print "Correct: %s %s" % (key, value)
        else:
                raise Exception("Results don't match (%s): Got %d expected %d" %
                        (key, int(value), ANS[key]))
if i != 3:
        raise Exception("Too few results")

buf = cStringIO.StringIO()
sys.stdout = buf
job.profile_stats().print_stats()
sys.stdout = sys.__stdout__

#stats = job.profile_stats()
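The excerpt stops after capturing the profile report into buf. Given the deliberately unique map-function name, the test plausibly continues by checking that the name appears in the captured stats; a hedged guess at the elided tail:

if "really_unique_function_name" not in buf.getvalue():
        raise Exception("Corrupted profile results")

job.purge()

print "ok"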