def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    """
    training_data - training samples.
    fitting_data - dataset to be fitted against the training data.
    tau - controls how quickly the weight of a training sample falls off with the distance of its x(i) from the query point x.
    samples_per_job - number of samples processed in a single MapReduce job. If 0, the algorithm computes a batch size itself.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be >= 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=fitting_data.params["input_chain"], init=simple_init, process=map_predict))
    ]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # precompute the 2*tau^2 denominator once
    counter = 0

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate the number of samples per job
            if len(x) <= 100:  # if there are at most 100 attributes
                samples_per_job = 100  # at most 100 samples per job
            else:
                # more than 100 attributes: shrink the batch size linearly
                samples_per_job = int(len(x) * -25 / 900.0 + 53)

        samples[test_id] = x
        counter += 1
        if counter >= samples_per_job:
            results.append(_fit_predict(training_data, samples, tau, save_results, show))
            counter = 0
            samples = {}

    if len(samples) > 0:  # if there are samples left in the dictionary
        results.append(_fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
Example 2
def submit(master, jobpack):
    from disco.settings import DiscoSettings
    from disco.core import Disco
    settings = DiscoSettings()
    dmaster = Disco(master)
    print "Submitting job to ", master
    status, response = json.loads(dmaster.request('/disco/job/new', jobpack))
    if status != 'ok':
        errmsg('Failed to start job. Server replied: %s' % response)
    print response
Example 3
    def __init__(self, *args, **kwargs):
        # load the defaults
        super(Settings, self).update(defaults)

        # override with the settings file
        path = kwargs.get('settings_file') or self['settings_file']
        if path and os.path.exists(path):
            try:
                import yaml
                self.update(yaml.load(open(path)))
            except:
                pass  # if ya can't ya can't

        # final overrides
        super(Settings, self).update(overrides)
        super(Settings, self).__init__(*args, **kwargs)

        # set up ddfs and disco
        if not self['server'].startswith('disco://'):
            self['server'] = 'disco://' + self['server']

        if 'ddfs' not in self:
            self['ddfs'] = DDFS(self['server'])
        self['server'] = Disco(self['server'])

        # set up worker
        if 'worker' not in self:
            worker_mod, _, worker_class = self['worker_class'].rpartition('.')
            mod = __import__(worker_mod, {}, {}, worker_mod)
            self['worker'] = getattr(mod, worker_class)()
Example 4
def get_disco_handle(server):
    from disco.core import Disco
    from disco.ddfs import DDFS

    if server and not server.startswith('disco://'):
        server = 'disco://' + server

    return Disco(server), DDFS(server)
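
For illustration (a hypothetical call, not from the indexed source; the address is a placeholder), the helper accepts a bare host:port and normalizes it before building both handles:

disco, ddfs = get_disco_handle("localhost:8989")
# both handles now point at disco://localhost:8989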
Example 5
File: job.py Project: sqs/freequery
 def __init__(self, spec, discodex, disco_addr="disco://localhost", profile=False):
     # TODO(sqs): refactoring potential with PagerankJob
     self.spec = spec
     self.discodex = discodex
     self.docset = Docset(spec.docset_name)
     self.disco = Disco(DiscoSettings()["DISCO_MASTER"])
     self.nr_partitions = 8
     self.profile = profile
Example 6
File: job.py Project: sqs/freequery
class IndexJob(object):
    def __init__(self, spec, discodex, disco_addr="disco://localhost", profile=False):
        # TODO(sqs): refactoring potential with PagerankJob
        self.spec = spec
        self.discodex = discodex
        self.docset = Docset(spec.docset_name)
        self.disco = Disco(DiscoSettings()["DISCO_MASTER"])
        self.nr_partitions = 8
        self.profile = profile

    def start(self):
        results = self.__run_job(self.__index_job())
        self.__run_discodex_index(results)

    def __run_job(self, job):
        results = job.wait()
        if self.profile:
            self.__profile_job(job)
        return results

    def __index_job(self):
        return self.disco.new_job(
            name="index_tfidf",
            input=["tag://" + self.docset.ddfs_tag],
            map_reader=docparse,
            map=TfIdf.map,
            reduce=TfIdf.reduce,
            sort=True,
            partitions=self.nr_partitions,
            partition=TfIdf.partition,
            merge_partitions=False,
            profile=self.profile,
            params=dict(doc_count=self.docset.doc_count),
        )

    def __run_discodex_index(self, results):
        opts = {
            "parser": "disco.func.chain_reader",
            "demuxer": "freequery.index.tf_idf.TfIdf_demux",
            "nr_ichunks": 1,  # TODO(sqs): after disco#181 fixed, increase this
        }
        ds = DataSet(input=results, options=opts)
        origname = self.discodex.index(ds)
        self.disco.wait(origname)  # origname is also the disco job name
        self.discodex.clone(origname, self.spec.invindex_name)
Example 7
 def __init__(self, spec, disco_addr="disco://localhost", alpha=0.15, niter=2, profile=False):
     self.spec = spec
     self.docset = Docset(spec.docset_name)
     self.disco = Disco("disco://localhost")
     self.alpha = alpha
     self.niter = niter
     self.nr_partitions = 16
     self.merge_partitions = False
     self.profile = profile
Example 8
class LinkParseJob(object):

    def __init__(self, spec, verbose=False, **kwargs):
        self.spec = spec
        self.docset = Docset(self.spec.docset_name)
        self.disco = Disco("disco://localhost")
        self.verbose = verbose

    def start(self):
        from disco import func
        job = self.disco.new_job(
            name="linkparse",
            input=self.docset.dump_uris(),
            map_reader=docparse,
            map=linkparse_map,
            map_output_stream=(func.map_output_stream,
                               func.disco_output_stream,
                               LinkFileOutputStream.disco_output_stream),
            partitions=0,
            save=True,
        )
        results = job.wait()

        self.__tag_results(results)

        if self.verbose:
            self.__print_results(results)

    def __tag_results(self, results):
        from disco.ddfs import DDFS
        ddfs = DDFS()
        results_tag = results[0]
        ddfs.put(self.docset.ddfs_link_file_tag, list(ddfs.blobs(results_tag)))

        # remove old, temporary tag
        ddfs.delete(results_tag)
            
    def __print_results(self, results):
        for doc in result_iterator(results, tempdir=False, reader=doclinksparse):
            print "%s\n\t%s" % (doc.uri, "\n\t".join(doc.link_uris))
Example 9
                              Results,
                              Query)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import ddfs_name, flatten, parse_dir

from discodb import Q

discodex_settings = settings.DiscodexSettings()
disco_master_url  = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix      = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix      = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file        = discodex_settings['DISCODEX_PURGE_FILE']
disco_master      = Disco(disco_master_url)
ddfs              = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'

class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
Example 10
    input = inputs or [
        maybe_list(line.split()) for line in fileinput.input(inputs)
    ]
    job = reify(jobclass)(program.disco, name)

    try:
        params = job.params
    except AttributeError:
        params = Params()
    params.__dict__.update(**dict(program.options.params))

    job.run(input=input, **program.option_parser.jobdict)
    print job.name


@Disco.command
def wait(program, jobname):
    """Usage: jobname

    Wait for the named job to complete.
    """
    program.disco.wait(jobname)


if __name__ == '__main__':
    Disco(option_parser=DiscoOptionParser()).main()

    # Workaround for "disco test" in Python2.5 which doesn't shutdown the
    # test_server thread properly.
    sys.exit(0)  # XXX still needed?
Example 11
def data_gen(path):
        return "\n".join(ani)

def fun_map(e, params):
        if type(e) == tuple:
                return [(e[0] + params['suffix'], int(e[1]) + 1)]
        else:
                return [(e + params['suffix'], 0)]

def fun_reduce(iter, out, params):
        for k, v in iter:
                out.add(k + "-", v)

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

results = disco.new_job(name = "test_chain_0", input = tserver.makeurl([""] * 100),
                map = fun_map, reduce = fun_reduce, nr_reduces = 4,
                sort = False, params = {'suffix': '0'}).wait()

i = 1
while i < 10:
        nresults = disco.new_job(name = "test_chain_%d" % i, input = results,
                map = fun_map, reduce = fun_reduce, nr_reduces = 4,
                map_reader = chain_reader, sort = False,
                params = {'suffix': str(i)}).wait()

        disco.purge(jobname(results[0]))
        results = nresults
        i += 1
Example 12
def fun_reduce(iter, out, params):
        for k, v in iter:
                out.add("red:" + k, v)

def data_gen(path):
        return path[1:]

tserver.run_server(data_gen)

inputs = ["apple", "orange", "pear"]

job = Disco(sys.argv[1]).new_job(
        name="test_streams",
        input=tserver.makeurl(inputs),
        map=fun_map,
        reduce=fun_reduce,
        nr_reduces=1,
        map_reader = map_reader,
        map_input_stream =
                [map_input_stream, map_input1, map_input2, map_input3],
        reduce_output_stream = [reduce_output1, reduce_output2])

for k, v in result_iterator(job.wait(),
                input_stream = [resultiter_input1, map_input_stream]):

        if not k.startswith("red:cba"):
                raise Exception("Invalid prefix in key. Got '%s' "\
                        "expected prefix 'red:cba'" % k)

        if k[7:] not in inputs:
                raise Exception("Invalid result '%s'" % k)
        inputs.remove(k[7:])
Example 13
        return [(w, 1) for w in re.sub("\W", " ", e).lower().split()]

def fun_reduce(iter, out, params):
        s = {}
        for k, v in iter:
                if k in s:
                        s[k] += int(v)
                else:
                        s[k] = int(v)
        for k, v in s.iteritems():
                out.add(k, v)

tserver.run_server(data_gen)
job = Disco(sys.argv[1]).new_job(name="test_50k",
                        input=tserver.makeurl([""] * int(5e4)),
                        map=fun_map,
                        reduce=fun_reduce,
                        nr_reduces=300,
                        sort=False)

ANS = {"gutta": int(5e6), "cavat": int(1e7), "capidem": int(5e6)}
i = 0
for key, value in result_iterator(job.wait()):
        i += 1
        if ANS[key] == int(value):
                print "Correct: %s %s" % (key, value)
        else:
                raise "Results don't match"
if i != 3:
        raise "Wrong number of results: Got %d expected 3" % i

job.purge()
Example 14
                if v != results[k]:
                        raise "%s: Invalid result for key %s, got %s, "\
                        "expected %s" % (job.name, k, v, results[k])

tserver.run_server(data_gen)

N = 10
results = {}
inputs = []
for i in range(N):
        a = [i] * 10
        b = range(i, i + 10)
        inputs += ["%d:%d" % x for x in zip(a, b)]
        results[str(i)] = sum(b)

disco = Disco(sys.argv[1])

# map results in individual files, one per input file (default mode)
job1 = disco.new_job(
                name = "test_partfile1",
                input = tserver.makeurl(inputs),
                map = fun_map)

# map results in one big partition file per host
job2 = disco.new_job(
                name = "test_partfile2",
                input = tserver.makeurl(inputs),
                map = fun_map,
                nr_reduces = 1)

check_results(job1)
Example 15
import sys
from disco.core import Disco, result_iterator

def fun_map(e, params):
        for i in range(3):
                msg("--special_test_string_%d--" % i)
        return [(e, "")]

inputs = ["raw://discoapi"]

job = Disco(sys.argv[1]).new_job(name = "test_discoapi",
        input = inputs,
        map = fun_map)

r = list(result_iterator(job.wait()))
if [("discoapi", "")] != r:
        raise Exception("Invalid result: <%s> " % r)

n = job.jobspec()["name"]
if not n.startswith("test_discoapi"):
        raise Exception("Invalid jobspec: Expected name prefix test_discoapi, "\
                        "got %s" % n)

events = [ev[2] for offs, ev in job.events()]

for i in range(3):
        m = "--special_test_string_%d--" % i
        if not [x for x in events if m in x]:
                raise Exception("Message '%s' not found in events" % m)

job.purge()
Example 16
 def __init__(self, spec, verbose=False, **kwargs):
     self.spec = spec
     self.docset = Docset(self.spec.docset_name)
     self.disco = Disco("disco://localhost")
     self.verbose = verbose
Example 17
import tserver, sys, time
from disco.core import Disco

def data_gen(path):
        return "1 2 3\n"

def fun_map(e, params):
        import time
        time.sleep(100)
        return []

disco = Disco(sys.argv[1])
num = sum(x['max_workers'] for x in disco.nodeinfo()['available'])
print >> sys.stderr, num, "slots available"
tserver.run_server(data_gen)
job = disco.new_job(name = "test_kill",
        input = tserver.makeurl([""] * num * 2), map = fun_map)

time.sleep(10)
print >> sys.stderr, "Killing", job.name
job.kill()
time.sleep(5)
if job.jobinfo()['active'] == "dead":
        print "ok"
        job.purge()
else:
        raise Exception("Killing failed")


Example 18
fail = ["1", "2", "3"]

def data_gen(path):
        lock.acquire()
        e = path[1:]
        if e in fail:
                fail.remove(e)
                lock.release()
                raise tserver.FailedReply()
        else:
                lock.release()
                return str(int(e) * 10) + "\n"

def fun_map(e, params):
        return [(int(e) * 10, "")]

tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(
        name = "test_tempfail",
        input = tserver.makeurl(map(str, range(10))),
        map = fun_map)

res = sum(int(x) for x, y in result_iterator(job.wait()))
if res != 4500:
        raise Exception("Invalid result: Got %d, expected 4500" % res)

job.purge()
print "ok"

Example 19
	                  2. Online ODAT; 3. Offline dim')
	parser.add_option('--post-fix',
	                  default=1,
	                  help='Does post-fixing for ODAT? (default=1): 1. Yes; 2. No')	
	parser.add_option('--go-live',
	                  default=1,
	                  help='Load offline dim data to DW DBMS? (default=1): 1. yes; 2. No')		
	parser.add_option('--profile',
	                  default=False,
	                  help='Profile (default=False)')
	parser.add_option('--config',
	                  default='conf/config.py',
	                  help='The path to config.py (default=conf/config.py)')

	(options, input_paths) = parser.parse_args()
	master = Disco("disco://"+options.disco_master)	
	
	load_method = odotetlmr
	seq_process = None
	post_fixing = -1
	load_step = int(options.load_step)
	if options.load_method=='2':
		load_method = odatetlmr
		if  load_step==1:
			post_fixing = int(options.post_fix)
			seq_process = multiprocessing.Process(target=seq_server)
			seq_process.start()		
	elif options.load_method=='3':
		load_method = offdimetlmr
		
	input_file_urls = []
Example 20
import sys
import json

from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

from disco import func
import time


from mapper import map
from reducer import reduce

name = "gap-%s" % int(time.time())
disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job (%s).." % name
print "Go to %s to see status of the job." % disco.master

results = disco.new_job(name=name,
        input=["tag://gap:1million"],
        map_input_stream=(
            func.map_input_stream,
            func.chain_reader,
        ),
        map=map, 
        reduce=reduce, 
        save=True).wait()

print "Job done. Results:"
f = open('data.js', 'w')
for time_of_day, scores in result_iterator(results):
Example 21
    except ValueError:
        print msg(line)

    # bad hack :-(
    time = timestamp.replace("'", "")
    date_obj = datetime.fromtimestamp(float(time[:-3])) # timestamp has milliseconds, shave em off
    nearest_minute = date_obj.replace(second=0, microsecond=0)  # truncate to the minute

    yield (nearest_minute, {'unique_id': uid, 'query': query, 'frequency': frequency})

def reduce(iter, params):
    # This doesn't work at all; it's from an old example.
    for unique_id, counts in kvgroup(sorted(iter)):
        yield unique_id, sum(counts)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master

"""
:clicks (ad id,people who clicked the ads)
"""
results = disco.new_job(name="bartekc",
        input=["tag://hackreduce:search:history"],
        map_input_stream=(
            func.map_input_stream,
            func.chain_reader,
        ),
        map=map, 
        reduce=reduce, 
        save=True).wait()
Example 22
import sys
from disco.core import Disco, result_iterator

def fun_map(e, params):
        return [("", e + ":map")]

inputs = ["raw://eeny", "raw://meeny", "raw://miny", "raw://moe"]

job = Disco(sys.argv[1]).new_job(name = "test_raw",
        input = inputs,
        map = fun_map)

res = dict((x[6:] + ":map", True) for x in inputs)

for x in result_iterator(job.wait()):
        if x[1] not in res:
                raise "Invalid result: <%s> " % x[1]
        del res[x[1]]

if res:
        raise "Invalid number of results %d" %\
                (len(inputs) - len(res))

job.purge()

print "ok"

Example 23
import sys
from disco.core import Disco

OK_STATUS = ['job_ready', 'job_died']

disco = Disco(sys.argv[1])

def op_show(n, s):
    print n

def op_kill(n, s):
    if s == "job_active":
        print "Killing", n
        disco.kill(n)

def op_clean(n, s):
    print "Cleaning", n
    disco.clean(n)

def op_purge(n, s):
    print "Purging", n
    disco.purge(n)

for t, s, name in disco.joblist():
    if sys.argv[3] in name:
        globals()["op_" + sys.argv[2]](name, s)
Example 24
        checkl("testfun4-norecurse", modutil.find_modules([testfun4],\
                recurse = False), [("mod1", abspath("extra/mod1.py"))])

        print "local tests ok"

local_tests()

def data_gen(path):
        return path[1:] + "\n"

def fun_map(e, params):
        x, y = map(float, e.split("|"))
        return [(mod1.plusceil(x, y) + math.ceil(1.5), "")]

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

inputs = ["0.5|1.2"]

print "disco tests.."

# default
job = disco.new_job(
        name = "test_modutil1",
        input = tserver.makeurl(inputs),
        map = fun_map)
checkl("test_modutil1", result_iterator(job.wait()), [("4.0", "")])
job.purge()
print "test_modutil1 ok"

job = disco.new_job(
Example 25
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings

def map(entry, params):
    for word in entry.split():
        yield word, 1

def reduce(iter, out, params):
    s = {}
    for word, freq in iter:
        s[word] = s.get(word, 0) + int(freq)
    for word, freq in s.iteritems():
        out.add(word, freq)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master
results = disco.new_job(name="wordcount",
                   input=["http://discoproject.org/chekhov.txt"],
                   map=map,
                   reduce=reduce).wait()
print "Job done. Results:"
for word, freq in result_iterator(results):
    print word, freq
Example 26
        s = 1
        for k, v in iter:
                if k != "=" + v:
                        raise Exception("Corrupted key")
                s *= int(v)
        out.add("result", s)

tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]

job = Disco(sys.argv[1]).new_job(
                name = "test_writers", 
                input = tserver.makeurl(inputs),
                map = fun_map,
                map_writer = fun_map_writer,
                reduce = fun_reduce, 
                reduce_reader = fun_reduce_reader,
                reduce_writer = fun_reduce_writer,
                nr_reduces = 1,
                sort = False)

res = list(result_iterator(job.wait(), reader = result_reader))

if res != [ANS]:
        raise Exception("Invalid answer: %s" % res)

job.purge()

print "ok"
Example 27
 def __init__(self, name=None, master=None, worker=None, settings=None):
     from disco.core import Disco
     self.name = name or type(self).__name__
     self.disco = master if isinstance(master, Disco) else Disco(master)
     self.worker = worker or self.Worker()
     self.settings = settings or DiscoSettings()
Example 28
def fun_map(e, params):
        return [({"PI": math.pi}, time.strptime(e, "%d/%m/%Y"))]

def fun_reduce(iter, out, params):
        for k, v in iter:
                out.add({"PI2": k["PI"]}, datetime.datetime(*v[0:6]))

tserver.run_server(data_gen)

inputs = ["01/11/1965", "14/03/1983", "12/12/2002"]

job = Disco(sys.argv[1]).new_job(name = "test_objectrw",
                input = tserver.makeurl(inputs),
                map = fun_map,
                map_writer = func.object_writer,
                reduce = fun_reduce, 
                reduce_reader = func.object_reader,
                reduce_writer = func.object_writer,
                required_modules = ["math", "datetime", "time"],
                nr_reduces = 1,
                sort = False)

i = 0
for k, v in result_iterator(job.wait(), reader = func.object_reader):
        if k["PI2"] != math.pi:
                raise "Invalid key: %s" % k
        if v.strftime("%d/%m/%Y") not in inputs:
                raise "Invalid value: %s" % v
        i += 1

if i != 30:
        raise "Wrong number of results, got %d, expected 30" % i
Example 29
import sys
from disco.core import Disco, result_iterator
from disco.settings import DiscoSettings


def map(line, params):
    for word in line.split():
        yield word, 1


def reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)


disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master
results = disco.new_job(
    name="wordcount",
    input=["http://discoproject.org/media/text/chekhov.txt"],
    map=map,
    reduce=reduce,
    save=True).wait()
print "Job done. Results:"
for word, count in result_iterator(results):
    print word, count
Example 30
from discodex import settings
from discodex.mapreduce import (Indexer, DiscoDBIterator)
from discodex.objects import (DataSet, IChunks, Indices, Index, Results, Dict)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import flatten, parse_dir

discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']
disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'


class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
Example 31
File: cli.py Project: tpeng/disco
 def disco(self):
     from disco.core import Disco
     return Disco(settings=self.settings)
Example 32
 def disco(self):
     return Disco(self.disco_master_url)
Example 33
        time.sleep(2)
        return []

def fun_map2(e, params):
        time.sleep(3)
        return []

def fun_map3(e, params):
        fail

def fun_map4(e, params):
        time.sleep(4)
        return []

tserver.run_server(data_gen)
disco = Disco(sys.argv[1])

jobs = []
for i, m in enumerate([fun_map1, fun_map2, fun_map3, fun_map4]):
        jobs.append(disco.new_job(
                name = "test_waitmany_%d" % (i + 1),
                input = tserver.makeurl([""] * 5),
                map = m))

res = []
while jobs:
        cont = False
        ready, jobs = disco.results(jobs, timeout = 2000)
        res += ready

for n, r in res:
Example 34
def fun_reduce(iter, out, params):
        s = {}
        for k, v in iter:
                if k in s:
                        s[k] += int(v)
                else:
                        s[k] = int(v)
        for k, v in s.iteritems():
                out.add(k, v)

tserver.run_server(data_gen)

job = Disco(sys.argv[1]).new_job(
        name = "test_profile",
        input = tserver.makeurl([""] * int(100)),
        map = really_unique_function_name,
        reduce = fun_reduce,
        nr_reduces = 30,
        sort = False,
        profile = True)

ANS = {"gutta": int(1e4), "cavat": int(2e4), "capidem": int(1e4)}
i = 0
for key, value in result_iterator(job.wait()):
        i += 1
        if ANS[key] == int(value):
                print "Correct: %s %s" % (key, value)
        else:
                raise "Results don't match (%s): Got %d expected %d" %\
                        (key, int(value), ANS[key])
if i != 3:
        raise "Too few results"
Example 35
 def disco(self):
     from disco.core import Disco
     return Disco(self.settings['DISCO_MASTER'])
Example 36
def data_gen(path):
    return "\n".join([path[1:]] * 10)


def fun_map(e, params):
    return [("=" + e, e)]


def fun_reduce(iter, out, params):
    s = 1
    for k, v in iter:
        if k != "=" + v:
            raise Exception("Corrupted key")
        s *= int(v)
    out.add("result", s)


tserver.run_server(data_gen)

inputs = [3, 5, 7, 11, 13, 17, 19, 23, 29, 31]
job = Disco(sys.argv[1]).new_job(
    name="test_simple", input=tserver.makeurl(inputs), map=fun_map, reduce=fun_reduce, nr_reduces=1, sort=False
)

if list(result_iterator(job.wait())) != [("result", ANS)]:
    raise Exception("Invalid answer")

job.purge()
print "ok"
Example 37
 def data():
     return Disco(self.master).jobpack(self.jobname)
Example 38
        distances = list(distances)
        newdistances = {}

        def minFrom(d, a):
            for k, v in a.items():
                d[k] = mymin(d.get(k, -1), v)

        for d in distances:
            if d.get("nodes"):
                nodes = d["nodes"]
            minFrom(newdistances, d["distances"])

        yield node, json.dumps([node, newdistances, nodes])


disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master
results = disco.new_job(name="shortestpath",
                        input=["file:///home/marko/tmp/disco/out.txt"],
                        map=map,
                        reduce=reduce,
                        save=True).wait()
print "Job done"

out = file("out.txt", "w")

for node, data in result_iterator(results):
    print >> out, data

out.close()
Example 39
 def disco(self):
     return Disco(settings=self.settings)
Example 40
def data_gen(path):
        return "\n".join([path[1:]] * 10)

def fun_map(e, params):
        import time, random
        time.sleep(random.randint(1, 3))
        return [(e, 0)]

def fun_reduce(iter, out, params):
        for k, v in iter:
                out.add("[%s]" % k, v)

tserver.run_server(data_gen)

disco = Disco(sys.argv[1])
num = sum(x['max_workers'] for x in disco.nodeinfo()['available'])
print >> sys.stderr, num, "slots available"
inputs = tserver.makeurl(range(num * 10))
random.shuffle(inputs)

jobs = []
for i in range(5):
        jobs.append(disco.new_job(name = "test_async_%d" % i,
                       input = inputs[i * (num * 2):(i + 1) * (num * 2)],
                       map = fun_map, reduce = fun_reduce, nr_reduces = 11,
                       sort = False))
        time.sleep(1)

all = dict(("[%s]" % i, 0) for i in range(num * 10))
while jobs:
Example 41
    import csv
    recordReader = csv.reader(output, delimiter=',', quotechar='"')
    for line in recordReader:
        title = line[-4]
        year = line[-1]
        yield year, title 

def reduce(iter, params):
    from disco.util import kvgroup
    for year, titles in kvgroup(sorted(iter)):
        romantic_titles = [title for title in titles if "love" in title.lower()]
        yield year, len(romantic_titles)
        
        
disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job.."
print "Go to %s to see status of the job." % disco.master
results = disco.new_job(name="song-titles",
                        input=["tag://hackreduce:millionsongs:subset"],
                        map=map,
                        reduce=reduce,
                        save=True).wait()
print "Job done. Results:"

chart_url = "http://chart.apis.google.com/chart?chxr=0,0,15&chxt=y&chbh=a,4,10&chs=738x220&cht=bvs&chco=4D89F9&chds=0,15&chd=t:"
res_list = []
# Print result to user
for year, titles in result_iterator(results):
    res_list.append(str(titles))
Example 42
def fun_map(e, params):
        msg(e)
        return []

def fun_map2(e, params):
        return []

def fun_map3(e, params):
        for i in range(100000):
            print "foobar"
        return []

tserver.run_server(data_gen)
inputs = tserver.makeurl([1])
job = Disco(sys.argv[1]).new_job(name = "test_ratelimit",
        input = inputs, map = fun_map)

time.sleep(5)
check_dead(job)

job = Disco(sys.argv[1]).new_job(name = "test_ratelimit2",
        input = inputs, map = fun_map2, status_interval = 1)

time.sleep(5)
check_dead(job)

job = Disco(sys.argv[1]).new_job(name = "test_ratelimit3",
        input = inputs, map = fun_map3, status_interval = 1)

time.sleep(5)
check_dead(job)
Example 43

from disco.core import Disco, result_iterator
from disco.util import external

ext_map_exec = "java_map.sh"
ext_reduce_exec = "java_reduce.sh"
map_class = "rmaus.disco.external.sample.WordCountMap"
reduce_class = "rmaus.disco.external.sample.WordCountReduce"

job = Disco("http://discomaster-dr-01:8989").new_job(
        name = "java_wordcount",
	input = ["raw://foo", "raw://bar", "raw://foo"],
	ext_params = { "mapFunction" : map_class, "reduceFunction" : reduce_class, "testKey" : "testValue" },
        map = external([ext_map_exec]),
        reduce = external([ext_reduce_exec]))

results = job.wait(show=True)

for result in sorted(result_iterator(results), key=lambda x:x[1]):
	print result
Example 44
from disco.core import Disco, result_iterator, Params
from disco.util import kvgroup
from disco.settings import DiscoSettings

def fun_map((key, value), params):
    bucket_range = (params.upper - params.lower) // params.num_buckets
    bucket = value // bucket_range
    if bucket >= params.num_buckets:
        yield params.num_buckets - 1, value
    else:
        yield bucket, value

def fun_reduce(iter, params):
    for k, v in kvgroup(sorted(iter)):
        yield k, sorted(v)

disco = Disco(DiscoSettings()['DISCO_MASTER'])
print "Starting Disco job..
"
results = disco.new_job(name = "Sorting job",
                        input = [(1, 1), (2, 2), (5, 5), (4, 4), (-1, -1)],
                        map = fun_map,
                        reduce = fun_reduce,
                        params = Params(lower = 0,
                                        upper = 10,
                                        num_buckets = 3)).wait()

print "Job done. Results:"
for k, v in result_iterator(results):
    print k, v

Example 45
    """
    Predict the closest clusters for the datapoints in input.
    """
    job = master.new_job(name='kcluster_predict',
                         input=input,
                         map_reader=map_reader,
                         map=predict_map,
                         params=Params(centers=centers, **center),
                         nr_reduces=0)

    return job.wait()


if __name__ == '__main__':
    parser = OptionParser(usage='%prog [options] inputs')
    parser.add_option('--disco-master',
                      default=getenv('DISCO_MASTER'),
                      help='Disco master')
    parser.add_option('--iterations', default=10, help='Numbers of iteration')
    parser.add_option('--clusters', default=10, help='Numbers of clusters')

    (options, input) = parser.parse_args()
    master = Disco(options.disco_master)

    centers = estimate(master, input, mean_point_center, int(options.clusters),
                       int(options.iterations))

    res = predict(master, input, mean_point_center, centers)

    print '\n'.join(res)
Example 46
def fit_predict(training_data,
                fitting_data,
                tau=1,
                samples_per_job=0,
                save_results=True,
                show=False):
    """
    training_data - training samples.
    fitting_data - dataset to be fitted against the training data.
    tau - controls how quickly the weight of a training sample falls off with the distance of its x(i) from the query point x.
    samples_per_job - number of samples processed in a single MapReduce job. If 0, the algorithm computes a batch size itself.
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be >= 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fitting_data.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau**2)  # precompute the 2*tau^2 denominator once
    counter = 0

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate the number of samples per job
            if len(x) <= 100:  # if there are at most 100 attributes
                samples_per_job = 100  # at most 100 samples per job
            else:
                # more than 100 attributes: shrink the batch size linearly
                samples_per_job = int(len(x) * -25 / 900. + 53)

        samples[test_id] = x
        counter += 1
        if counter >= samples_per_job:
            results.append(
                _fit_predict(training_data, samples, tau, save_results, show))
            counter = 0
            samples = {}

    if len(samples) > 0:  # if there are samples left in the dictionary
        results.append(
            _fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]