def funcify(maybe_curry):
    from disco.util import reify  # reify resolves a dotted name to the object it names
    if ':' in maybe_curry:
        from functools import partial
        from urllib import unquote
        dotted_name, arg = maybe_curry.split(':', 1)
        return partial(reify(dotted_name, globals=globals()), unquote(arg))
    return reify(maybe_curry, globals=globals())
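# Hedged usage sketch for funcify (the dotted names below are illustrative
# assumptions, not taken from the original): a plain dotted name resolves to
# the named object itself, while 'dotted.name:arg' yields a functools.partial
# with the URL-unquoted text after ':' bound as the first argument.
#
#   funcify('mypackage.readers.csv_reader')          # -> the csv_reader callable
#   funcify('mypackage.readers.csv_reader:%2Ctab')   # -> partial(csv_reader, ',tab')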
def chunk(program, tag, *urls):
    """Usage: tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.
    """
    from itertools import chain
    from disco.util import reify
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)

    def getSizeIfSupplied(value, default):
        if value is not None:
            from disco.fileutils import MB
            return int(float(value) * MB)
        else:
            return default

    from disco.fileutils import CHUNK_SIZE, MAX_RECORD_SIZE
    chunk_size = getSizeIfSupplied(program.options.size, CHUNK_SIZE)
    max_record_size = getSizeIfSupplied(program.options.max_record_size, MAX_RECORD_SIZE)
    reader = reify(program.options.reader or 'None')
    tag, blobs = program.ddfs.chunk(tag,
                                    chain(urls, program.blobs(*tags)),
                                    input_stream=stream,
                                    reader=reader,
                                    replicas=program.options.replicas,
                                    forceon=[] if not program.options.forceon else [program.options.forceon],
                                    chunk_size=chunk_size,
                                    max_record_size=max_record_size,
                                    update=program.options.update)
    for replicas in blobs:
        print('created: {0}'.format('\t'.join(replicas)))
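# Worked example of the size handling above (the value 64 is only an
# illustration): a size option of 64 makes getSizeIfSupplied return
# int(64.0 * MB) bytes, while leaving the option unset falls back to the
# CHUNK_SIZE and MAX_RECORD_SIZE defaults from disco.fileutils.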
def run(program, jobclass, *inputs):
    """Usage: jobclass [-n name] [--save] [--sort] [--profile] [--partitions P] [--sched_max_cores C] [--status_interval I] [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    import fileinput
    from disco.core import Params
    from disco.util import reify

    def maybe_list(seq):
        return seq[0] if len(seq) == 1 else seq

    name = program.options.name or jobclass.split('.')[-1]
    input = inputs or [maybe_list(line.split()) for line in fileinput.input(inputs)]
    job = reify(jobclass)(program.disco, name)
    try:
        params = job.params
    except AttributeError:
        params = Params()
    params.__dict__.update(**dict(program.options.params))
    job.run(input=input, **program.option_parser.jobdict)
    print job.name
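# Sketch of how stdin input lines are shaped by maybe_list (the urls below are
# illustrative assumptions): each line is whitespace-split, a single url stays
# a plain string, and multiple urls on one line become a list, which disco can
# treat as redundant locations of the same input.
#
#   'http://node1/part-0\n'                     -> 'http://node1/part-0'
#   'http://node1/part-0 http://node2/part-0\n' -> ['http://node1/part-0', 'http://node2/part-0']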
def check_reify(option, opt, val):
    from optparse import OptionValueError
    from disco.util import reify
    try:
        return reify(val)
    except Exception, e:
        raise OptionValueError("%s option: %s" % (opt, str(e)))
def chunk(program, tag, *urls):
    """Usage: tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.

    For chunking a file in the current directory, a './' must be prepended to
    the file name.  Otherwise, ddfs chunk assumes it is a tag name.

    The character '-' can be used to specify that input can be read from stdin,
    for example:
        cat chekhov.txt | ddfs chunk chekhov -
    is the same as:
        ddfs chunk chekhov ./chekhov.txt
    and both of them chunk the chekhov.txt file from the local directory into ddfs.
    """
    from itertools import chain
    from disco.util import reify
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)

    def getSizeIfSupplied(value, default):
        if value is not None:
            from disco.fileutils import MB
            return int(float(value) * MB)
        else:
            return default

    from disco.fileutils import CHUNK_SIZE, MAX_RECORD_SIZE
    chunk_size = getSizeIfSupplied(program.options.size, CHUNK_SIZE)
    max_record_size = getSizeIfSupplied(program.options.max_record_size, MAX_RECORD_SIZE)
    reader = reify(program.options.reader or 'None')
    tag, blobs = program.ddfs.chunk(tag,
                                    chain(urls, program.blobs(*tags)),
                                    input_stream=stream,
                                    reader=reader,
                                    replicas=program.options.replicas,
                                    forceon=[] if not program.options.forceon else [program.options.forceon],
                                    chunk_size=chunk_size,
                                    max_record_size=max_record_size,
                                    update=program.options.update)
    for replicas in blobs:
        print('created: {0}'.format('\t'.join(replicas)))
def chunk(program, tag, *urls):
    """Usage: [-n replicas] [-S stream] [-R reader] [-t token] [-u] tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.
    """
    from itertools import chain
    from disco.util import reify
    tags, urls = program.separate_tags(*urls)
    stream = reify(program.options.stream)
    reader = reify(program.options.reader or 'None')
    tag, blobs = program.ddfs.chunk(tag,
                                    chain(urls, program.blobs(*tags)),
                                    input_stream=stream,
                                    reader=reader,
                                    replicas=program.options.replicas,
                                    update=program.options.update)
    for replicas in blobs:
        print 'created: %s' % '\t'.join(replicas)
def chunk(program, tag, *urls):
    """Usage: tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.
    """
    from itertools import chain
    from disco.util import reify
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = reify(program.options.reader or 'None')
    tag, blobs = program.ddfs.chunk(tag,
                                    chain(urls, program.blobs(*tags)),
                                    input_stream=stream,
                                    reader=reader,
                                    replicas=program.options.replicas,
                                    update=program.options.update)
    for replicas in blobs:
        print('created: {0}'.format('\t'.join(replicas)))
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    for record in classic_iterator(chain(urls, program.blobs(*tags)),
                                   input_stream=stream,
                                   reader=reader):
        print('\t'.join('{0}'.format(e) for e in iterify(record)).rstrip())
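# Illustration of the output format (the sample record is an assumption, not
# from the original): each record is flattened with iterify and printed as one
# tab-separated line, so a ('word', 42) pair would come out as
#
#   word    42
#
# while a scalar record would be printed on its own line.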
def xcat(program, *urls):
    """Usage: [-i] [-p] [-R reader] [-t token] [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import RecordIter
    from disco.util import iterify, reify
    tags, urls = program.separate_tags(*urls)
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    for record in RecordIter(chain(urls, program.blobs(*tags)),
                             input_stream=stream,
                             reader=reader):
        print '\t'.join('%s' % (e,) for e in iterify(record)).rstrip()
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls, input_stream=stream, reader=reader):
        print '\t'.join('%s' % (e,) for e in iterify(record)).rstrip()
def run(program, jobclass, *inputs):
    """Usage: jobclass [-n name] [--save] [--sort] [--profile] [--partitions P] [--sched_max_cores C] [--status_interval I] [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    import fileinput
    from disco.util import reify

    def maybe_list(seq):
        return seq[0] if len(seq) == 1 else seq

    name = program.options.name or jobclass.split('.')[-1]
    input = inputs or [maybe_list(line.split()) for line in fileinput.input(inputs)]
    job = reify(jobclass)(program.disco, name)
    job.run(input=input, **program.option_parser.jobdict)
    print job.name
def xcat(program, *urls):
    """Usage: [-i] [-p] [-R reader] [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import result_iterator
    from disco.util import iterify, reify
    tags, urls = program.separate_tags(*urls)
    reader = reify(program.options.reader or 'disco.func.chain_reader')
    for result in result_iterator(chain(urls, program.blobs(*tags)), reader=reader):
        print '\t'.join(map(str, iterify(result)))
def run(program, jobclass, *inputs):
    """Usage: jobclass [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    import sys
    from disco.util import reify
    sys.path.insert(0, "")
    job = reify(jobclass)(name=program.options.name,
                          master=program.disco,
                          settings=program.settings)
    input = program.input(*inputs)
    if any(input):
        program.options.jobargs["input"] = input
    if program.options.scheduler:
        program.options.jobargs["scheduler"] = program.scheduler
    job.run(**program.options.jobargs)
    print(job.name)
def check_reify(option, opt, val):
    from optparse import OptionValueError
    from disco.util import reify
    try:
        return reify(val)
    except Exception as e:
        raise OptionValueError('{0} option: {1}'.format(opt, str(e)))
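# Minimal sketch of how a checker like check_reify is typically registered as a
# custom optparse type; the ReifyOption class and the '--reader' flag below are
# assumptions for illustration, not taken from the original.
from optparse import Option, OptionParser

class ReifyOption(Option):
    # Declare a 'reify' option type whose values are resolved by check_reify,
    # so e.g. "--reader disco.func.chain_reader" arrives as the callable itself.
    TYPES = Option.TYPES + ('reify',)
    TYPE_CHECKER = dict(Option.TYPE_CHECKER, reify=check_reify)

parser = OptionParser(option_class=ReifyOption)
parser.add_option('-R', '--reader', type='reify')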