Example #1
def batch_accumulate(max_batch_size, a_generator, cooperator=None):
    """
    Start a Deferred whose callBack arg is a deque of the accumulation
    of the values yielded from a_generator which is iterated over
    in batches the size of max_batch_size.

    It should be more efficient to iterate over the generator in
     batches and still provide enough speed for non-blocking execution.

    :param max_batch_size: The number of iterations of the generator
     to consume at a time.
    :param a_generator: An iterator which yields some not None values.
    :return: A Deferred to which the next callback will be called with
     the yielded contents of the generator function.
    """
    if cooperator:
        own_cooperate = cooperator.cooperate
    else:
        own_cooperate = cooperate

    spigot = ValueBucket()
    items = stream_tap((spigot,), a_generator)

    d = own_cooperate(i_batch(max_batch_size, items)).whenDone()
    d.addCallback(accumulation_handler, spigot)
    return d
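A minimal usage sketch: it assumes batch_accumulate is importable and run under the Twisted reactor, and, per the docstring, that the Deferred fires with a deque of everything the generator yielded. The numbers generator and the report callback are hypothetical.

from twisted.internet import task

def numbers(n):
    # A plain generator standing in for real, non-None-yielding work.
    for i in range(n):
        yield i

def report(accumulated):
    # The accumulated values arrive as the callback argument.
    print(len(accumulated))

def main(reactor):
    d = batch_accumulate(100, numbers(100000))
    d.addCallback(report)
    return d

# task.react(main)  # drives the reactor until the Deferred above fires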
Example #2
def distribute_run_to_runners(items_func, in_url, reader=None, batch_size=1100):
    """
    With a multi-process pool, map batches of items from
    file to an items processing function.

    The reader callable should be as fast as possible to
    reduce data feeder cpu usage. It should do the minimal
    to produce discrete units of data, save any decoding
    for the items function.

    :param items_func: Callable that takes multiple items of the data.
    :param reader: URL reader callable.
    :param in_url: Url of content
    :param batch_size: size of batches.
    """
    from concurrent.futures import ProcessPoolExecutor

    if not reader:
        reader = i_read_buffered_binary_file

    stream = reader(in_url)
    batches = i_batch(batch_size, stream)

    with ProcessPoolExecutor() as pool:
        return list(pool.map(items_func, batches))
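A usage sketch with a hypothetical items function and input path; the items function must be a module-level callable so the process pool can pickle it.

def count_in_batch(items):
    # items is one batch of raw binary lines produced by the reader
    return sum(1 for _ in items)

# batch_counts = distribute_run_to_runners(count_in_batch, "data/big_file.txt")
# total_lines = sum(batch_counts)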
Example #3
def certain_kind_tap(data_items):
    """
    As the stream of data items go by, get different
    kinds of information from them, in this case,
    the things that are fruit and metal, collecting
    each kind with a different spigot.

    stream_tap doesn't consume the data_items iterator
    by itself, it's a generator and must be consumed
    by something else. In this case, it's consuming
    the items by casting the iterator to a tuple,
    but doing it in batches.

    Since each batch is not referenced by anything
    the memory can be freed by the garbage collector,
    so no matter the size of the data_items, only a little
    memory is needed. The only things retained
    are the results, which should just be a subset
    of the items and in this case, the getter functions
    only return a portion of each item it matches.


    :param data_items: A sequence of unicode strings
    """
    fruit_spigot = Bucket(get_fruit)
    metal_spigot = Bucket(get_metal)

    items = stream_tap((fruit_spigot, metal_spigot), data_items)

    for batch in i_batch(100, items):
        tuple(batch)

    return fruit_spigot.contents(), metal_spigot.contents()
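For context, a sketch of the kind of getters this example assumes: the real get_fruit and get_metal presumably return the matching portion of an item, or None so nothing is kept for it. The stand-ins and data below are hypothetical.

def get_fruit(item):
    # Hypothetical stand-in: keep the word when it names a fruit.
    return item if item in (u"apple", u"pear", u"orange") else None

def get_metal(item):
    # Hypothetical stand-in: keep the word when it names a metal.
    return item if item in (u"iron", u"copper", u"tin") else None

# data_items = iter([u"apple", u"iron", u"pear", u"granite"])
# fruits, metals = certain_kind_tap(data_items)
# fruits would hold "apple" and "pear"; metals would hold "iron".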
Example #5
def split_file_output(name, data, out_dir, max_lines=1100,
                      buffering=FILE_BUFFER_SIZE):
    """
    Split an iterable lines into groups and write each to
    a shard.

    :param name: Each shard will use this in it's name.
    :type name: str
    :param data: Iterable of data to write.
    :type data: iter
    :param out_dir: Path to directory to write the shards.
    :type out_dir: str
    :param max_lines: Max number of lines per shard.
    :type max_lines: int
    :param buffering: number of bytes to buffer files
    :type buffering: int
    """
    batches = i_batch(max_lines, data)

    if is_py3():
        join_str = b''
    else:
        join_str = ''

    index = count()
    for group in batches:
        file_path = os.path.join(out_dir,
                                 "{0}_{1}".format(next(index), name))
        with open(file_path, 'wb', buffering=buffering) as shard_file:
            shard_file.write(join_str.join(group))
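A sketch of calling it: because the shards are opened in binary mode and each group is joined with an empty separator, each item should be a bytes object that already ends in a newline. The data and paths here are illustrative.

# lines = ("{0}\n".format(i).encode("utf-8") for i in range(5000))
# split_file_output("lines.txt", lines, "out", max_lines=1000)
# -> out/0_lines.txt, out/1_lines.txt, ... each holding up to 1000 lines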
Example #6
def split_file_output_json(filename, dict_list, out_dir=None, max_lines=1100,
                           buffering=FILE_BUFFER_SIZE):
    """
    Split an iterable of JSON serializable rows of data
     into groups and write each to a shard.

    :param buffering: number of bytes to buffer files
    :type buffering: int
    """
    dirname = os.path.abspath(os.path.dirname(filename))
    if out_dir is None:
        out_dir = dirname
    basename = os.path.basename(filename)

    batches = i_batch(max_lines, dict_list)

    index = count()
    for group in batches:
        write_as_json(
            group,
            os.path.join(out_dir, "{0}_{1}".format(next(index), basename)),
            buffering=buffering)
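A sketch of calling it, assuming write_as_json serializes each group to the given path; the rows and path are made up.

# rows = ({"id": i, "ok": True} for i in range(3000))
# split_file_output_json("/tmp/records.json", rows, max_lines=1000)
# -> /tmp/0_records.json, /tmp/1_records.json, /tmp/2_records.json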
Example #7
def write_as_csv(items, file_name, append=False,
                 line_buffer_size=None, buffering=FILE_BUFFER_SIZE,
                 get_csv_row_writer=get_csv_row_writer):
    """
    Writes out items to a csv file in groups.

    :param items: An iterable collection of collections.
    :param file_name: path to the output file.
    :param append: whether to append or overwrite the file.
    :param line_buffer_size: number of lines to write at a time.
    :param buffering: number of bytes to buffer files
    :type buffering: int
    :param get_csv_row_writer: callable that returns a csv row writer function;
     customize this for non-default options:
     `custom_writer = partial(get_csv_row_writer, delimiter="|")`
     `write_as_csv(items, 'my_out_file', get_csv_row_writer=custom_writer)`
    """
    if line_buffer_size is None:
        line_buffer_size = LINE_BUFFER_SIZE
    if append:
        mode = 'a'
    else:
        mode = 'w'

    kwargs = dict(buffering=buffering)
    if is_py3():
        mode += 't'
        kwargs.update(dict(newline=''))
    else:
        mode += 'b'

    with open(file_name, mode, **kwargs) as csv_file:
        write_row = get_csv_row_writer(csv_file)
        batches = i_batch(line_buffer_size, items)
        for batch in batches:
            for row in batch:
                write_row(row)
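A sketch of both the default call and the customized writer the docstring mentions; the rows and file names are illustrative.

from functools import partial

rows = [("a", 1), ("b", 2)]

# Default dialect:
# write_as_csv(rows, "out.csv")

# Pipe-delimited, following the docstring's hint:
# custom_writer = partial(get_csv_row_writer, delimiter="|")
# write_as_csv(rows, "out.psv", get_csv_row_writer=custom_writer)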
Example #8
def distribute_multi_run_to_runners(items_func, in_dir,
                                    reader=None,
                                    walker=None,
                                    batch_size=1100,
                                    filter_func=None):
    """
    With a multi-process pool, map batches of items from
    multiple files to an items processing function.

    The reader callable should be as fast as possible to
    reduce data feeder cpu usage. It should do the minimal
    to produce discrete units of data, save any decoding
    for the items function.

    :param items_func: Callable that takes multiple items of the data.
    :param reader: URL reader callable.
    :param walker: A generator that takes the in_dir URL and emits
     url, name tuples.
    :param batch_size: size of batches.
    :param filter_func: a function that returns True for desired paths names.
    """
    from concurrent.futures import ProcessPoolExecutor
    from multiprocessing import cpu_count
    from time import sleep

    if not reader:
        reader = i_read_buffered_binary_file

    if not walker:
        walker = i_walk_dir_for_filepaths_names

    paths_names = walker(in_dir)
    if filter_func:
        paths_names_final = ifilter(filter_func, paths_names)
    else:
        paths_names_final = paths_names

    stream = chain.from_iterable(
        (reader(in_url) for in_url, name in paths_names_final))
    batches = i_batch(batch_size, stream)

    n_cpus = cpu_count()
    # Leave one CPU for the process that feeds data to the workers.
    max_workers = (n_cpus - 1) or 1
    max_in_queue = int(n_cpus * 1.5)
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        futures = []
        while True:
            # Throttle submission so only a few batches wait in the pool's
            # internal queue at once. _pending_work_items is a private
            # attribute of ProcessPoolExecutor.
            if len(pool._pending_work_items) < max_in_queue:
                try:
                    batch = next(batches)
                    futures.append(pool.submit(items_func, batch))
                except StopIteration:
                    break
            else:
                # Back off briefly instead of spinning while the queue drains.
                sleep(0.1)

    def results():
        """Generator that yield results of futures
        that are done. If not done yet, it skips it.
        """
        while futures:
            for index, future in enumerate(futures):
                if future.done():
                    yield future.result()
                    del futures[index]
                    break

    return results()
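A usage sketch; the directory, the filter, and summarize are assumptions, and the items function must live at module level so it can be pickled for the worker processes.

def summarize(items):
    # items is one batch of raw lines drawn from all matched files
    return sum(1 for _ in items)

# The filter sees the (path, name) tuples the walker yields.
# results_gen = distribute_multi_run_to_runners(
#     summarize, "data/",
#     filter_func=lambda path_name: path_name[1].endswith(".csv"),
#     batch_size=500)
# total_lines = sum(results_gen)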