def batch_accumulate(max_batch_size, a_generator, cooperator=None):
    """
    Cooperatively drain a generator in batches and collect what it yields.

    The generator is routed through ``stream_tap`` with a single
    ``ValueBucket`` spigot, split into batches of ``max_batch_size`` by
    ``i_batch`` and handed to a cooperate callable. Once the cooperative
    task is done, ``accumulation_handler`` is invoked with the spigot.

    :param max_batch_size: The number of iterations of the generator to
        consume at a time.
    :param a_generator: An iterator which yields some not None values.
    :param cooperator: Optional object providing a ``cooperate`` method;
        when falsy the module-level ``cooperate`` is used instead.
    :return: A Deferred whose callback chain ends with
        ``accumulation_handler`` (given the spigot); per the original
        documentation the next callback receives the accumulated values.
    """
    schedule = cooperator.cooperate if cooperator else cooperate

    bucket = ValueBucket()
    tapped = stream_tap((bucket,), a_generator)
    task = schedule(i_batch(max_batch_size, tapped))

    finished = task.whenDone()
    finished.addCallback(accumulation_handler, bucket)
    return finished
def distribute_run_to_runners(items_func, in_url, reader=None,
                              batch_size=1100):
    """
    Map batches of items read from one URL over a multi-process pool.

    The reader should be as cheap as possible so the feeding process uses
    little CPU: it should only split the content into discrete units and
    leave any decoding to ``items_func``.

    :param items_func: Callable that takes multiple items of the data.
    :param in_url: Url of content
    :param reader: URL reader callable; when falsy,
        ``i_read_buffered_binary_file`` is used.
    :param batch_size: size of batches.
    :return: List with the result of ``items_func`` for every batch, in
        batch order.
    """
    from concurrent.futures import ProcessPoolExecutor

    read = reader or i_read_buffered_binary_file
    grouped = i_batch(batch_size, read(in_url))

    with ProcessPoolExecutor() as executor:
        outcome = executor.map(items_func, grouped)
        # Materialise inside the ``with`` so results are gathered before
        # the pool is shut down.
        return list(outcome)
def batch_accumulate(max_batch_size, a_generator, cooperator=None):
    """
    Start a Deferred that fires once ``a_generator`` has been consumed.

    Iteration happens in batches of ``max_batch_size`` under a cooperate
    callable, so that the generator is drained in chunks rather than one
    item at a time. Values are tapped into a ``ValueBucket`` while they
    stream by, and ``accumulation_handler`` is attached as a callback
    with that bucket as its extra argument.

    :param max_batch_size: The number of iterations of the generator to
        consume at a time.
    :param a_generator: An iterator which yields some not None values.
    :param cooperator: Optional object exposing ``cooperate``; the
        module-level ``cooperate`` is used when this is falsy.
    :return: A Deferred to which the next callback will be called with
        the outcome of ``accumulation_handler`` (per the original
        documentation, the yielded contents of the generator).
    """
    if cooperator:
        run_cooperatively = cooperator.cooperate
    else:
        run_cooperatively = cooperate

    collector = ValueBucket()
    batched_items = i_batch(
        max_batch_size, stream_tap((collector,), a_generator))

    done = run_cooperatively(batched_items).whenDone()
    done.addCallback(accumulation_handler, collector)
    return done
def certain_kind_tap(data_items):
    """
    Collect the fruit and metal information from a stream of items.

    Two buckets are created, one built from ``get_fruit`` and one from
    ``get_metal``, and both are attached to the stream via
    ``stream_tap``. ``stream_tap`` is lazy, so something has to pull the
    items through it; here that is done by turning each batch of 100
    into a throw-away tuple. Nothing keeps a reference to those tuples,
    so they can be reclaimed immediately and only the buckets' contents
    are retained.

    :param data_items: A sequence of unicode strings
    :return: A ``(fruit contents, metal contents)`` pair.
    """
    fruits = Bucket(get_fruit)
    metals = Bucket(get_metal)
    spigots = (fruits, metals)

    tapped = stream_tap(spigots, data_items)
    for chunk in i_batch(100, tapped):
        # Consuming the chunk is the whole point; the tuple is discarded.
        tuple(chunk)

    return fruits.contents(), metals.contents()
def split_file_output(name, data, out_dir, max_lines=1100,
                      buffering=FILE_BUFFER_SIZE):
    """
    Write an iterable of lines to numbered shard files.

    The data is grouped with ``i_batch`` and each group is concatenated
    and written, in binary mode, to ``<out_dir>/<index>_<name>`` where
    the index starts at 0.

    :param name: Each shard will use this in its name.
    :type name: str
    :param data: Iterable of data to write.
    :type data: iter
    :param out_dir: Path to directory to write the shards.
    :type out_dir: str
    :param max_lines: Max number of lines per shard.
    :type max_lines: int
    :param buffering: number of bytes to buffer files
    :type buffering: int
    """
    groups = i_batch(max_lines, data)
    # Files are opened in binary mode, so the joiner must be bytes on
    # Python 3 and a plain str on Python 2.
    glue = b'' if is_py3() else ''

    for shard_number, lines in enumerate(groups):
        shard_name = "{0}_{1}".format(shard_number, name)
        shard_path = os.path.join(out_dir, shard_name)
        with open(shard_path, 'wb', buffering=buffering) as out_file:
            out_file.write(glue.join(lines))
def split_file_output_json(filename, dict_list, out_dir=None,
                           max_lines=1100, buffering=FILE_BUFFER_SIZE):
    """
    Write an iterable of JSON serializable rows to numbered shard files.

    Rows are grouped with ``i_batch`` and every group is passed to
    ``write_as_json`` together with the path
    ``<out_dir>/<index>_<basename of filename>``; the index starts at 0.

    :param filename: Path whose base name is used in each shard's name
        and whose directory is the default output directory.
    :param dict_list: Iterable of rows to write.
    :param out_dir: Directory for the shards; defaults to the absolute
        directory of ``filename`` when ``None``.
    :param max_lines: Max number of rows per shard.
    :param buffering: number of bytes to buffer files
    :type buffering: int
    """
    source_dir = os.path.abspath(os.path.dirname(filename))
    target_dir = source_dir if out_dir is None else out_dir
    shard_suffix = os.path.basename(filename)

    for shard_number, rows in enumerate(i_batch(max_lines, dict_list)):
        shard_path = os.path.join(
            target_dir, "{0}_{1}".format(shard_number, shard_suffix))
        write_as_json(rows, shard_path, buffering=buffering)
def write_as_csv(items, file_name, append=False, line_buffer_size=None,
                 buffering=FILE_BUFFER_SIZE,
                 get_csv_row_writer=get_csv_row_writer):
    """
    Write rows to a csv file, pulling them from ``items`` in groups.

    :param items: An iterable collection of collections.
    :param file_name: path to the output file.
    :param append: whether to append or overwrite the file.
    :param line_buffer_size: number of lines to write at a time;
        ``LINE_BUFFER_SIZE`` is used when ``None``.
    :param buffering: number of bytes to buffer files
    :type buffering: int
    :param get_csv_row_writer: callable that returns a csv row writer
        function, customize this for non-default options:
        `custom_writer = partial(get_csv_row_writer, delimiter="|");`
        `write_as_csv(items, 'my_out_file',
        get_csv_row_writer=custom_writer)`
    """
    if line_buffer_size is None:
        line_buffer_size = LINE_BUFFER_SIZE

    open_options = {'buffering': buffering}
    file_mode = 'a' if append else 'w'
    if is_py3():
        # Text mode with newline translation disabled on Python 3.
        file_mode += 't'
        open_options['newline'] = ''
    else:
        # Binary mode on Python 2.
        file_mode += 'b'

    with open(file_name, file_mode, **open_options) as out_file:
        emit_row = get_csv_row_writer(out_file)
        for group in i_batch(line_buffer_size, items):
            for record in group:
                emit_row(record)
def distribute_multi_run_to_runners(items_func, in_dir, reader=None,
                                    walker=None, batch_size=1100,
                                    filter_func=None):
    """
    With a multi-process pool, map batches of items from multiple files to
    an items processing function.

    The reader callable should be as fast as possible to reduce data
    feeder cpu usage. It should do the minimal to produce discrete units
    of data, save any decoding for the items function.

    Submission is throttled: at most ``int(cpu_count() * 1.5)`` batches
    are in flight at once. When that limit is reached the feeder blocks
    until at least one batch has finished, instead of polling.

    :param items_func: Callable that takes multiple items of the data.
    :param in_dir: URL of the directory handed to ``walker``.
    :param reader: URL reader callable; when falsy,
        ``i_read_buffered_binary_file`` is used.
    :param walker: A generator that takes the in_dir URL and emits
        url, name tuples; when falsy, ``i_walk_dir_for_filepaths_names``
        is used.
    :param batch_size: size of batches.
    :param filter_func: a function that returns True for desired paths
        names.
    :return: A generator yielding the result of ``items_func`` for each
        batch, in submission order. An exception raised while processing
        a batch is re-raised when that batch's result is reached.
    """
    from collections import deque
    from concurrent.futures import (
        FIRST_COMPLETED, ProcessPoolExecutor, wait)
    from multiprocessing import cpu_count

    if not reader:
        reader = i_read_buffered_binary_file
    if not walker:
        walker = i_walk_dir_for_filepaths_names

    paths_names = walker(in_dir)
    if filter_func:
        paths_names = ifilter(filter_func, paths_names)

    # One continuous stream of items across all files, cut into batches.
    stream = chain.from_iterable(
        reader(in_url) for in_url, _name in paths_names)
    batches = i_batch(batch_size, stream)

    n_cpus = cpu_count()
    max_workers = (n_cpus - 1) or 1
    max_in_queue = int(n_cpus * 1.5)

    futures = deque()  # every submitted future, in submission order
    in_flight = set()  # submitted futures not yet known to be finished
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        for batch in batches:
            future = pool.submit(items_func, batch)
            futures.append(future)
            in_flight.add(future)
            if len(in_flight) >= max_in_queue:
                # Block until capacity frees up before pulling the next
                # batch; this replaces a busy-wait on the executor's
                # private ``_pending_work_items`` attribute.
                _done, in_flight = wait(
                    in_flight, return_when=FIRST_COMPLETED)
    # Leaving the ``with`` block shuts the pool down and waits for all
    # outstanding work, so every future is finished from here on.

    def results():
        """Yield each future's result, releasing futures as they are used."""
        while futures:
            yield futures.popleft().result()

    return results()