Exemple #1
0
def funcify(maybe_curry):
    """Resolve a name, optionally curried with one argument, via ``reify``.

    ``maybe_curry`` is either a plain name, which is handed to ``reify``
    unchanged, or has the form ``name:arg``.  In the second form the text
    after the first ``:`` is URL-unquoted and bound with
    ``functools.partial`` as the first positional argument of whatever
    ``reify`` returns for ``name``.

    Both forms pass this module's ``globals()`` to ``reify``.
    """
    if ':' not in maybe_curry:
        return reify(maybe_curry, globals=globals())

    from functools import partial
    try:
        # Python 3 location of unquote.
        from urllib.parse import unquote
    except ImportError:
        # Python 2 keeps unquote directly in the urllib module.
        from urllib import unquote

    # Split on the first ':' only, so the argument may itself contain colons.
    dotted_name, arg = maybe_curry.split(':', 1)
    return partial(reify(dotted_name, globals=globals()), unquote(arg))
Exemple #2
0
def chunk(program, tag, *urls):
    """Usage: tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.
    """
    from itertools import chain
    from disco.util import reify

    tags, urls = program.separate_tags(*program.input(*urls))
    input_stream = reify(program.options.stream)

    def resolve_size(supplied, fallback):
        # An option left at None falls back to the given constant; otherwise
        # the value is converted with float(), multiplied by
        # disco.fileutils.MB and truncated by int().
        if supplied is None:
            return fallback
        from disco.fileutils import MB
        return int(float(supplied) * MB)

    from disco.fileutils import CHUNK_SIZE, MAX_RECORD_SIZE
    chunk_size = resolve_size(program.options.size, CHUNK_SIZE)
    max_record_size = resolve_size(program.options.max_record_size,
                                   MAX_RECORD_SIZE)

    # With no reader option, the literal name 'None' is what gets reified.
    record_reader = reify(program.options.reader or 'None')

    # Look up the method first, then build its arguments in call order.
    ddfs_chunk = program.ddfs.chunk
    sources = chain(urls, program.blobs(*tags))
    replica_count = program.options.replicas
    forceon = [program.options.forceon] if program.options.forceon else []
    tag, blobs = ddfs_chunk(tag,
                            sources,
                            input_stream=input_stream,
                            reader=record_reader,
                            replicas=replica_count,
                            forceon=forceon,
                            chunk_size=chunk_size,
                            max_record_size=max_record_size,
                            update=program.options.update)

    # One output line per blob: its replica locations, tab-separated.
    for replica_set in blobs:
        print('created: {0}'.format('\t'.join(replica_set)))
Exemple #3
0
def run(program, jobclass, *inputs):
    """Usage: jobclass [-n name] [--save] [--sort] [--profile] [--partitions P] [--sched_max_cores C] [--status_interval I] [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    from disco.core import Params
    from disco.util import reify

    def maybe_list(seq):
        # A line holding a single token is passed as that token rather than
        # as a one-element list.
        return seq[0] if len(seq) == 1 else seq

    # Default the job name to the last component of the dotted class name.
    name = program.options.name or jobclass.split('.')[-1]
    # Without input arguments, each line read through fileinput becomes one
    # input (split on whitespace).
    job_input = inputs or [
        maybe_list(line.split()) for line in fileinput.input(inputs)
    ]
    job = reify(jobclass)(program.disco, name)

    try:
        params = job.params
    except AttributeError:
        # NOTE(review): this fresh Params object is never attached to the job,
        # so the command-line params only take effect when job.params already
        # exists -- confirm whether that is intended.
        params = Params()
    params.__dict__.update(**dict(program.options.params))

    job.run(input=job_input, **program.option_parser.jobdict)
    # Single-argument parenthesised print: valid as both the Python 2
    # statement and the Python 3 function, with identical output.
    print(job.name)
Exemple #4
0
def check_reify(option, opt, val):
    """Resolve an option value with ``disco.util.reify``.

    Returns whatever ``reify(val)`` returns.  Any ``Exception`` raised while
    resolving is replaced by an ``OptionValueError`` whose message names the
    option string ``opt`` and carries the original error text.

    ``option`` is accepted but not used in the body.
    """
    from disco.util import reify

    try:
        return reify(val)
    # ``as`` form is accepted by Python 2.6+ and Python 3; the comma form is
    # a syntax error on Python 3.
    except Exception as e:
        raise OptionValueError("%s option: %s" % (opt, str(e)))
Exemple #5
0
def chunk(program, tag, *urls):
    """Usage: tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.
    For chunking a file in the current directory, a './' must be prepended to the
    file name.  Otherwise, ddfs chunk assumes it is a tag name.
    The character '-' can be used to specify that input can be read from stdin
    for example:
        cat chekhov.txt | ddfs chunk chekhov -
    is the same as:
        ddfs chunk chekhov ./chekhov.txt

    and both of them chunk the chekhov.txt file from the local directory into
    ddfs.
    """
    from itertools import chain
    from disco.util import reify

    tags, urls = program.separate_tags(*program.input(*urls))
    input_stream = reify(program.options.stream)

    def resolve_size(supplied, fallback):
        # An option left at None falls back to the given constant; otherwise
        # the value is converted with float(), multiplied by
        # disco.fileutils.MB and truncated by int().
        if supplied is None:
            return fallback
        from disco.fileutils import MB
        return int(float(supplied) * MB)

    from disco.fileutils import CHUNK_SIZE, MAX_RECORD_SIZE
    chunk_size = resolve_size(program.options.size, CHUNK_SIZE)
    max_record_size = resolve_size(program.options.max_record_size,
                                   MAX_RECORD_SIZE)

    # With no reader option, the literal name 'None' is what gets reified.
    record_reader = reify(program.options.reader or 'None')

    # Look up the method first, then build its arguments in call order.
    ddfs_chunk = program.ddfs.chunk
    sources = chain(urls, program.blobs(*tags))
    replica_count = program.options.replicas
    forceon = [program.options.forceon] if program.options.forceon else []
    tag, blobs = ddfs_chunk(tag,
                            sources,
                            input_stream=input_stream,
                            reader=record_reader,
                            replicas=replica_count,
                            forceon=forceon,
                            chunk_size=chunk_size,
                            max_record_size=max_record_size,
                            update=program.options.update)

    # One output line per blob: its replica locations, tab-separated.
    for replica_set in blobs:
        print('created: {0}'.format('\t'.join(replica_set)))
Exemple #6
0
def chunk(program, tag, *urls):
    """Usage: tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.
    For chunking a file in the current directory, a './' must be prepended to the
    file name.  Otherwise, ddfs chunk assumes it is a tag name.
    The character '-' can be used to specify that input can be read from stdin
    for example:
        cat chekhov.txt | ddfs chunk chekhov -
    is the same as:
        ddfs chunk chekhov ./chekhov.txt

    and both of them chunk the chekhov.txt file from the local directory into
    ddfs.
    """
    from itertools import chain
    from disco.util import reify

    tags, urls = program.separate_tags(*program.input(*urls))
    input_stream = reify(program.options.stream)

    def size_or_default(option_value, default_size):
        # None means the option was not given: keep the supplied default.
        # Anything else goes through float(), is multiplied by
        # disco.fileutils.MB, and is truncated by int().
        if option_value is None:
            return default_size
        from disco.fileutils import MB
        return int(float(option_value) * MB)

    from disco.fileutils import CHUNK_SIZE, MAX_RECORD_SIZE
    chunk_size = size_or_default(program.options.size, CHUNK_SIZE)
    max_record_size = size_or_default(program.options.max_record_size,
                                      MAX_RECORD_SIZE)

    # A missing reader option results in reifying the literal name 'None'.
    record_reader = reify(program.options.reader or 'None')

    # Resolve the method first, then evaluate its arguments in call order.
    push_chunks = program.ddfs.chunk
    sources = chain(urls, program.blobs(*tags))
    replica_count = program.options.replicas
    if program.options.forceon:
        forceon = [program.options.forceon]
    else:
        forceon = []
    tag, blobs = push_chunks(tag,
                             sources,
                             input_stream=input_stream,
                             reader=record_reader,
                             replicas=replica_count,
                             forceon=forceon,
                             chunk_size=chunk_size,
                             max_record_size=max_record_size,
                             update=program.options.update)

    # Report each created blob as a tab-separated list of its replicas.
    for replica_set in blobs:
        print('created: {0}'.format('\t'.join(replica_set)))
Exemple #7
0
def chunk(program, tag, *urls):
    """Usage: [-n replicas] [-S stream] [-R reader] [-t token] [-u] tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.
    """
    # Imported locally so the function does not depend on a module-level
    # ``chain`` being in scope.
    from itertools import chain
    from disco.util import reify

    tags, urls = program.separate_tags(*urls)
    stream = reify(program.options.stream)
    # With no reader option, the literal name 'None' is what gets reified.
    reader = reify(program.options.reader or 'None')
    tag, blobs = program.ddfs.chunk(tag,
                                    chain(urls, program.blobs(*tags)),
                                    input_stream=stream,
                                    reader=reader,
                                    replicas=program.options.replicas,
                                    update=program.options.update)
    for replicas in blobs:
        # Single-argument parenthesised print: valid as both the Python 2
        # statement and the Python 3 function, with identical output.
        print('created: %s' % '\t'.join(replicas))
Exemple #8
0
def chunk(program, tag, *urls):
    """Usage: tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.
    """
    from itertools import chain
    from disco.util import reify

    tags, urls = program.separate_tags(*program.input(*urls))
    input_stream = reify(program.options.stream)
    # With no reader option, the literal name 'None' is what gets reified.
    record_reader = reify(program.options.reader or 'None')

    # Look up the method first, then build its arguments in call order.
    ddfs_chunk = program.ddfs.chunk
    sources = chain(urls, program.blobs(*tags))
    tag, blobs = ddfs_chunk(tag,
                            sources,
                            input_stream=input_stream,
                            reader=record_reader,
                            replicas=program.options.replicas,
                            update=program.options.update)

    # One output line per blob: its replica locations, tab-separated.
    for replica_set in blobs:
        print('created: {0}'.format('\t'.join(replica_set)))
Exemple #9
0
def chunk(program, tag, *urls):
    """Usage: tag [url ...]

    Chunks the contents of the urls, pushes the chunks to ddfs and tags them.
    """
    from itertools import chain
    from disco.util import reify

    tags, urls = program.separate_tags(*program.input(*urls))
    input_stream = reify(program.options.stream)
    # A missing reader option results in reifying the literal name 'None'.
    record_reader = reify(program.options.reader or 'None')

    # Resolve the method first, then evaluate its arguments in call order.
    push_chunks = program.ddfs.chunk
    sources = chain(urls, program.blobs(*tags))
    tag, blobs = push_chunks(tag,
                             sources,
                             input_stream=input_stream,
                             reader=record_reader,
                             replicas=program.options.replicas,
                             update=program.options.update)

    # Report each created blob as a tab-separated list of its replicas.
    for replica_set in blobs:
        print('created: {0}'.format('\t'.join(replica_set)))
Exemple #10
0
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify

    tags, urls = program.separate_tags(*program.input(*urls))
    input_stream = reify(program.options.stream)

    # Only an unset (None) reader option selects the default reader name.
    reader_name = program.options.reader
    if reader_name is None:
        reader_name = 'disco.func.chain_reader'
    record_reader = reify(reader_name)

    # Plain urls come first, then the blobs reachable from the tags.
    records = classic_iterator(chain(urls, program.blobs(*tags)),
                               input_stream=input_stream,
                               reader=record_reader)
    for record in records:
        # Format every element, join with tabs, drop trailing whitespace.
        fields = ['{0}'.format(field) for field in iterify(record)]
        print('\t'.join(fields).rstrip())
Exemple #11
0
def xcat(program, *urls):
    """Usage: [-i] [-p] [-R reader] [-t token] [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    # Imported locally so the function does not depend on a module-level
    # ``chain`` being in scope.
    from itertools import chain
    from disco.core import RecordIter
    from disco.util import iterify, reify

    tags, urls = program.separate_tags(*urls)
    stream = reify(program.options.stream)
    # Only an unset (None) reader option selects the default reader name.
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)

    for record in RecordIter(chain(urls, program.blobs(*tags)),
                             input_stream=stream,
                             reader=reader):
        # Single-argument parenthesised print: valid as both the Python 2
        # statement and the Python 3 function, with identical output.
        print('\t'.join('%s' % (e,) for e in iterify(record)).rstrip())
Exemple #12
0
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify

    tags, urls = program.separate_tags(*program.input(*urls))
    input_stream = reify(program.options.stream)

    # The default reader name applies only when the option is exactly None.
    chosen_reader = program.options.reader
    if chosen_reader is None:
        record_reader = reify('disco.func.chain_reader')
    else:
        record_reader = reify(chosen_reader)

    # Non-tag urls are iterated before the blobs reachable from the tags.
    sources = chain(urls, program.blobs(*tags))
    for record in classic_iterator(sources,
                                   input_stream=input_stream,
                                   reader=record_reader):
        # Tab-join the formatted elements and strip trailing whitespace.
        line = '\t'.join(['{0}'.format(item) for item in iterify(record)])
        print(line.rstrip())
Exemple #13
0
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url

    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    # Only an unset (None) reader option selects the default reader name.
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    # Each replica set is rewritten url by url: urlresolve first, then
    # proxy_url with to_master=False.
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls,
                                   input_stream=stream,
                                   reader=reader):
        # Single-argument parenthesised print: valid as both the Python 2
        # statement and the Python 3 function, with identical output.
        print('\t'.join('%s' % (e,) for e in iterify(record)).rstrip())
Exemple #14
0
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url

    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    # Only an unset (None) reader option selects the default reader name.
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    # Each replica set is rewritten url by url: urlresolve first, then
    # proxy_url with to_master=False.
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls,
                                   input_stream=stream,
                                   reader=reader):
        # Single-argument parenthesised print: valid as both the Python 2
        # statement and the Python 3 function, with identical output.
        print('\t'.join('%s' % (e, ) for e in iterify(record)).rstrip())
Exemple #15
0
def run(program, jobclass, *inputs):
    """Usage: jobclass [-n name] [--save] [--sort] [--profile] [--partitions P] [--sched_max_cores C] [--status_interval I] [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    from disco.util import reify

    def maybe_list(seq):
        # A line holding a single token is passed as that token rather than
        # as a one-element list.
        return seq[0] if len(seq) == 1 else seq

    # Default the job name to the last component of the dotted class name.
    name = program.options.name or jobclass.split('.')[-1]
    # Without input arguments, each line read through fileinput becomes one
    # input (split on whitespace).
    job_input = inputs or [maybe_list(line.split())
                           for line in fileinput.input(inputs)]
    job = reify(jobclass)(program.disco, name)
    job.run(input=job_input, **program.option_parser.jobdict)
    # Single-argument parenthesised print: valid as both the Python 2
    # statement and the Python 3 function, with identical output.
    print(job.name)
Exemple #16
0
def xcat(program, *urls):
    """Usage: [-i] [-p] [-R reader] [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    # Imported locally so the function does not depend on a module-level
    # ``chain`` being in scope.
    from itertools import chain
    from disco.core import result_iterator
    from disco.util import iterify, reify

    tags, urls = program.separate_tags(*urls)
    # Any falsy reader option selects the default reader name.
    reader = reify(program.options.reader or 'disco.func.chain_reader')

    for result in result_iterator(chain(urls, program.blobs(*tags)),
                                  reader=reader):
        # Single-argument parenthesised print: valid as both the Python 2
        # statement and the Python 3 function, with identical output.
        print('\t'.join(map(str, iterify(result))))
Exemple #17
0
def run(program, jobclass, *inputs):
    """Usage: jobclass [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    from disco.util import reify

    # Put an empty entry at the front of the import path before the job
    # class is resolved.
    sys.path.insert(0, "")

    job_factory = reify(jobclass)
    job = job_factory(name=program.options.name,
                      master=program.disco,
                      settings=program.settings)

    # Only override the "input" job argument when at least one input is truthy.
    job_inputs = program.input(*inputs)
    if any(job_inputs):
        program.options.jobargs["input"] = job_inputs

    # A scheduler option swaps in the program-level scheduler value.
    if program.options.scheduler:
        program.options.jobargs["scheduler"] = program.scheduler

    job.run(**program.options.jobargs)
    print(job.name)
Exemple #18
0
def run(program, jobclass, *inputs):
    """Usage: jobclass [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    from disco.util import reify

    # The empty string goes to the head of sys.path before jobclass is
    # resolved.
    sys.path.insert(0, '')

    job_type = reify(jobclass)
    job = job_type(name=program.options.name,
                   master=program.disco,
                   settings=program.settings)

    # The 'input' job argument is set only if some collected input is truthy.
    collected = program.input(*inputs)
    if any(collected):
        program.options.jobargs['input'] = collected

    # With a scheduler option present, pass along program.scheduler.
    if program.options.scheduler:
        program.options.jobargs['scheduler'] = program.scheduler

    job.run(**program.options.jobargs)
    print(job.name)
Exemple #19
0
def check_reify(option, opt, val):
    """Resolve an option value with ``disco.util.reify``.

    Returns the result of ``reify(val)``.  Any ``Exception`` raised by the
    lookup is replaced by an ``OptionValueError`` whose message combines the
    option string ``opt`` with the original error text.  ``option`` is not
    used in the body.
    """
    from disco.util import reify
    try:
        resolved = reify(val)
    except Exception as err:
        message = '{0} option: {1}'.format(opt, str(err))
        raise OptionValueError(message)
    return resolved
Exemple #20
0
def check_reify(option, opt, val):
    """Turn an option value into an object via ``disco.util.reify``.

    On success the reified object is returned.  If ``reify`` raises any
    ``Exception``, an ``OptionValueError`` is raised instead, with a message
    built from ``opt`` and the text of the original error.  The ``option``
    argument is accepted but unused.
    """
    from disco.util import reify
    try:
        target = reify(val)
    except Exception as failure:
        reason = str(failure)
        raise OptionValueError('{0} option: {1}'.format(opt, reason))
    else:
        return target
Exemple #21
0
def check_reify(option, opt, val):
    """Resolve an option value with ``disco.util.reify``.

    Returns whatever ``reify(val)`` returns.  Any ``Exception`` raised while
    resolving is replaced by an ``OptionValueError`` whose message names the
    option string ``opt`` and carries the original error text.

    ``option`` is accepted but not used in the body.
    """
    from disco.util import reify
    try:
        return reify(val)
    # ``as`` form is accepted by Python 2.6+ and Python 3; the comma form is
    # a syntax error on Python 3.
    except Exception as e:
        raise OptionValueError('%s option: %s' % (opt, str(e)))