Ejemplo n.º 1
0
 def test_chunk(self):
     """Chunk a remote text file into DDFS and compare it with the source.

     The file at ``url`` is chunked into the tag ``disco:test:chunk`` with a
     100 KiB chunk size. The test then checks that the tag holds between one
     and four blobs and that iterating over the tag yields the same records
     as iterating over the original URL with ``reader=None``.
     """
     from disco.core import classic_iterator
     url = 'http://discoproject.org/media/text/chekhov.txt'
     tag = 'disco:test:chunk'
     self.ddfs.chunk(tag, [url], chunk_size=100*1024)
     try:
         self.assertTrue(0 < len(list(self.ddfs.blobs(tag))) <= 4)
         # assertEqual really compares the two lists. The previous
         # ``assert_(a, b)`` call used the second list only as the failure
         # message, so the content of the chunks was never verified.
         self.assertEqual(list(classic_iterator(['tag://' + tag])),
                          list(classic_iterator([url], reader=None)))
     finally:
         # Remove the tag even when an assertion fails so that a failed run
         # does not leave data behind for the next one.
         self.ddfs.delete(tag)
Ejemplo n.º 2
0
 def test_chunk(self):
     """Chunk a remote text file into DDFS and compare it with the source.

     The file at ``url`` is chunked into the tag ``disco:test:chunk`` with a
     100 KiB chunk size. The test then checks that the tag holds between one
     and four blobs and that iterating over the tag yields the same records
     as iterating over the original URL with ``reader=None``.
     """
     from disco.core import classic_iterator
     url = 'http://discoproject.org/media/text/chekhov.txt'
     tag = 'disco:test:chunk'
     self.ddfs.chunk(tag, [url], chunk_size=100*1024)
     try:
         self.assertTrue(0 < len(list(self.ddfs.blobs(tag))) <= 4)
         # assertEqual really compares the two lists. The previous
         # ``assert_(a, b)`` call used the second list only as the failure
         # message, so the content of the chunks was never verified.
         self.assertEqual(list(classic_iterator(['tag://' + tag])),
                          list(classic_iterator([url], reader=None)))
     finally:
         # Remove the tag even when an assertion fails so that a failed run
         # does not leave data behind for the next one.
         self.ddfs.delete(tag)
Ejemplo n.º 3
0
def create(name, input):
    """Build the database ``name`` from ``input`` and return the created keys.

    Any existing directory for ``name`` is moved to a backup location first
    and only removed once the new database has been built and loaded.

    :param name: database name; passed to ``dirname``/``bckname`` to obtain
                 the working and backup directories, and to ``load``.
    :param input: iterable of input URLs handed to the ``CreateDB`` job.
    :return: list of the keys found in the job's results.
    """
    # Move the existing dir to a backup dir.
    target_dir = dirname(name)
    backup_dir = bckname(name)
    if os.path.exists(target_dir):
        shutil.move(target_dir, backup_dir)
    os.makedirs(target_dir)

    input_size = sum(util.result_size(url) for url in input)
    # Floor division keeps the partition count an integer on Python 3 as
    # well; with ``/`` it would become a float such as 4.37 there.
    partitions = 1 + (input_size // partition_size)  # close enough
    with open(os.path.join(target_dir, 'partitions'), 'w') as handle:
        handle.write(str(partitions))
    job = CreateDB().run(
        input=input,
        partitions=partitions,
        params={'name': name, 'partitions': partitions},
    )
    created = [key for key, _ in classic_iterator(job.wait())]

    load(name)

    # Successful - purge job and delete the backup dir.
    job.purge()
    if os.path.exists(backup_dir):
        shutil.rmtree(backup_dir)

    return created
Ejemplo n.º 4
0
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify

    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)

    # Fall back to the chain reader when no reader option was given.
    reader_name = program.options.reader
    if reader_name is None:
        reader_name = 'disco.func.chain_reader'
    reader = reify(reader_name)

    # Plain urls come first, followed by the blobs reachable from the tags.
    sources = chain(urls, program.blobs(*tags))
    records = classic_iterator(sources, input_stream=stream, reader=reader)
    for record in records:
        # One tab-separated line per record, trailing whitespace removed.
        fields = ['{0}'.format(e) for e in iterify(record)]
        print('\t'.join(fields).rstrip())
Ejemplo n.º 5
0
Archivo: ddfscli.py Proyecto: yuj/disco
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify

    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)

    # Fall back to the chain reader when no reader option was given.
    reader_name = program.options.reader
    if reader_name is None:
        reader_name = 'disco.func.chain_reader'
    reader = reify(reader_name)

    # Plain urls come first, followed by the blobs reachable from the tags.
    sources = chain(urls, program.blobs(*tags))
    records = classic_iterator(sources, input_stream=stream, reader=reader)
    for record in records:
        # One tab-separated line per record, trailing whitespace removed.
        fields = ['{0}'.format(e) for e in iterify(record)]
        print('\t'.join(fields).rstrip())
Ejemplo n.º 6
0
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url

    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    # Fall back to the chain reader when no reader option was given.
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    # Each input is iterated as a set of replica urls; every replica is
    # resolved and passed through proxy_url with to_master=False.
    # NOTE(review): assumes the non-tag urls are replica lists as well, not
    # bare strings -- confirm against program.separate_tags.
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls,
                                   input_stream=stream,
                                   reader=reader):
        # Single-argument call form: prints the same text on Python 2 and is
        # not a syntax error on Python 3, unlike the print statement.
        print('\t'.join('%s' % (e,) for e in iterify(record)).rstrip())
Ejemplo n.º 7
0
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url

    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    # Fall back to the chain reader when no reader option was given.
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    # Each input is iterated as a set of replica urls; every replica is
    # resolved and passed through proxy_url with to_master=False.
    # NOTE(review): assumes the non-tag urls are replica lists as well, not
    # bare strings -- confirm against program.separate_tags.
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls,
                                   input_stream=stream,
                                   reader=reader):
        # Single-argument call form: prints the same text on Python 2 and is
        # not a syntax error on Python 3, unlike the print statement.
        print('\t'.join('%s' % (e,) for e in iterify(record)).rstrip())
Ejemplo n.º 8
0
    def run(self, map, reduce, **jobargs):
        """Run a map-reduce job with either ``input_uri`` or ``output_uri``
        as a "mongodb://..." URI.

        :param map: map function, stored as ``jobargs['map']``.
        :param reduce: reduce function, stored as ``jobargs['reduce']``.
        :param jobargs: remaining job options, forwarded to the parent
            ``run``. Keys inspected here: ``input_uri``, ``output_uri``,
            ``input``, ``required_modules``, ``print_to_stdout`` and
            ``job_wait``.
        :return: ``self``.

        .. todo:

            parameter docs
            consider "input" and "output" (sans _uri)
        """

        if not any(uri in jobargs for uri in ('input_uri', 'output_uri')):
            logging.info('You did not specify "input_uri" or "output_uri" '
                         'with MongoJob. This may be in error.')

        if 'mongodb://' in jobargs.get('input_uri', ''):
            jobargs['map_input_stream'] = mongodb_input_stream

        if 'mongodb://' in jobargs.get('output_uri', ''):
            jobargs['reduce_output_stream'] = mongodb_output_stream

        jobargs['map'] = map
        jobargs['reduce'] = reduce
        # Only compute splits when the caller did not supply an input;
        # setdefault() would evaluate calculate_splits() unconditionally.
        if 'input' not in jobargs:
            jobargs['input'] = calculate_splits(jobargs)
        # Build a new list instead of extending the caller's in place, so
        # repeated runs with the same list do not accumulate duplicates.
        jobargs['required_modules'] = list(
            jobargs.get('required_modules') or []) + [
            'mongodisco.mongodb_io',
            'mongodisco.mongodb_input',
            'mongodisco.mongodb_output',
            'mongodisco.mongo_util',
        ]

        super(MongoJob, self).run(**jobargs)

        if jobargs.get('print_to_stdout'):
            for key, value in classic_iterator(self.wait(show=True)):
                # Single-argument call form works on Python 2 and Python 3.
                print('%s %s' % (key, value))

        elif jobargs.get('job_wait', False):
            self.wait(show=True)

        return self
Ejemplo n.º 9
0
 def chunk_iter(replicas):
     """Return the chunks produced from the records of one replica set.

     ``chunk_size`` and ``kwargs`` are taken from the enclosing scope;
     ``kwargs`` is forwarded unchanged to ``classic_iterator``.
     """
     return Chunker(chunk_size=chunk_size).chunks(
         classic_iterator([replicas], **kwargs))
Ejemplo n.º 10
0
Archivo: ddfs.py Proyecto: wquan/disco
 def chunk_iter(replicas):
     """Return the chunks produced from the records of one replica set.

     ``chunk_size`` and ``kwargs`` are taken from the enclosing scope;
     ``kwargs`` is forwarded unchanged to ``classic_iterator``.
     """
     return Chunker(chunk_size=chunk_size).chunks(
         classic_iterator([replicas], **kwargs))
Ejemplo n.º 11
0
    def run(self, map=None, reduce=None, **jobargs):
        """Run a map-reduce job with either ``input_uri`` or ``output_uri``
        as a "mongodb://..." URI.

        BSON files are selected with the ``bson_input`` / ``bson_output``
        options instead of the URIs.

        :param map: optional map function; stored as ``jobargs['map']`` when
            truthy.
        :param reduce: optional reduce function; stored as
            ``jobargs['reduce']`` when truthy.
        :param jobargs: remaining job options, forwarded to the parent
            ``run``. Keys inspected here: ``input_uri``, ``output_uri``,
            ``bson_input``, ``bson_output``, ``input``, ``params``,
            ``job_output_key``, ``job_output_value``, ``add_action``,
            ``add_upsert``, ``base_doc``, ``required_modules``,
            ``print_to_stdout`` and ``job_wait``.
        :raises Exception: if ``params`` is given and is not a dict while a
            MongoDB ``output_uri`` is used.
        :return: ``self``.

        .. todo:

            parameter docs
            consider "input" and "output" (sans _uri)
        """

        io_options = ('input_uri', 'output_uri', 'bson_input', 'bson_output')
        if not any(option in jobargs for option in io_options):
            # The message lists every option the check above accepts.
            logging.info('You did not specify "input_uri", "output_uri", '
                         '"bson_input" or "bson_output" '
                         'with MongoJob. This may be in error.')

        if 'mongodb://' in jobargs.get('input_uri', ''):
            jobargs['map_input_stream'] = mongodb_input_stream
            jobargs.setdefault('input', calculate_splits(jobargs))
        elif jobargs.get('bson_input', False):
            jobargs['map_input_stream'] = bsonfile_input_stream

        if 'mongodb://' in jobargs.get('output_uri', ''):
            jobargs['reduce_output_stream'] = mongodb_output_stream
            output_params = {
                'output_uri': jobargs['output_uri'],
                'job_output_key': jobargs.get('job_output_key', '_id'),
                'job_output_value': jobargs.get('job_output_value', 'value'),
                'add_action': jobargs.get('add_action', 'insert'),
                'add_upsert': jobargs.get('add_upsert', False),
                'base_doc': jobargs.get('base_doc', {})
            }

            # The output settings are stored under params['mongodb']; a
            # params dict supplied by the caller is updated in place.
            params = jobargs.get('params', {})
            if not isinstance(params, dict):
                raise Exception('params option must be a dict')
            params['mongodb'] = output_params
            jobargs['params'] = params

        elif jobargs.get('bson_output', False):
            jobargs['reduce_output_stream'] = bsonfile_output_stream

        if map:
            jobargs['map'] = map
        if reduce:
            jobargs['reduce'] = reduce

        # Build a new list instead of extending the caller's in place, so
        # repeated runs with the same list do not accumulate duplicates.
        jobargs['required_modules'] = list(
            jobargs.get('required_modules') or []) + [
            'mongodisco.mongodb_io',
            'mongodisco.mongodb_input',
            'mongodisco.mongodb_output',
            'mongodisco.mongo_util',
            'mongodisco.bsonfile_io',
            'mongodisco.bsonfile_input',
            'mongodisco.bsonfile_output',
        ]

        super(MongoJob, self).run(**jobargs)

        if jobargs.get('print_to_stdout'):
            for key, value in classic_iterator(self.wait(show=True)):
                # Single-argument call form works on Python 2 and Python 3.
                print('%s %s' % (key, value))

        elif jobargs.get('job_wait', False):
            self.wait(show=True)

        return self