async def branch(self, coros, limit=10):
    """Consume an async generator of awaitables in slices, limiting concurrency."""
    while True:
        xs = stream.preserve(coros)
        # The preserved generator resumes where the previous slice stopped,
        # so each pass takes the next `limit` items from the front.
        ys = xs[0:limit]
        t = await stream.list(ys)
        if not t:
            break
        await asyncio.create_task(asyncio.wait(t))
async def test_preserve(assert_run, event_loop):
    async def agen():
        yield 1
        yield 2

    # By default the generator is closed when the first stream run ends
    xs = stream.iterate(agen())[0]
    await assert_run(xs, [1])
    await assert_run(xs, [], IndexError('Index out of range'))

    # stream.preserve keeps the generator open between runs
    ys = stream.preserve(agen())[0]
    await assert_run(ys, [1])
    await assert_run(ys, [2])
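# A minimal standalone sketch of the same check, using plain asserts instead of
# the assert_run fixture; check_preserve is a made-up name and this is not part
# of the original test suite.
import asyncio
from aiostream import stream

async def check_preserve():
    async def agen():
        yield 1
        yield 2

    xs = stream.iterate(agen())[0]
    assert await stream.list(xs) == [1]    # first run yields item 0 only
    try:
        await stream.list(xs)              # generator was closed by the first run
    except IndexError:
        pass                               # no item 0 left, so IndexError is raised

    ys = stream.preserve(agen())[0]
    assert await stream.list(ys) == [1]    # first run still yields 1
    assert await stream.list(ys) == [2]    # preserved generator resumes at 2

asyncio.run(check_preserve())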
async def branch(coros, limit=10):
    '''
    Use the aiostream module to consume an async generator in slices,
    running at most `limit` (10 by default) awaitables at a time.
    :param coros: async generator yielding awaitables
    :param limit: maximum number of concurrent awaitables
    :return:
    '''
    while True:
        xs = stream.preserve(coros)
        # The preserved generator resumes where the previous slice stopped,
        # so each pass takes the next `limit` items from the front.
        ys = xs[0:limit]
        t = await stream.list(ys)
        if not t:
            break
        await asyncio.ensure_future(asyncio.wait(t))
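# A hedged usage sketch for the branch() helper above; fetch(), coro_gen() and
# the example URLs are hypothetical stand-ins, not part of the original code.
# The generator yields tasks lazily, so at most `limit` of them are in flight
# per slice.
import asyncio
from aiostream import stream

async def fetch(url):
    await asyncio.sleep(0.1)      # stand-in for real network I/O
    return url

async def coro_gen(urls):
    for url in urls:
        # wrap in a future here so asyncio.wait() accepts it on all Python versions
        yield asyncio.ensure_future(fetch(url))

async def main():
    urls = [f'https://example.com/{i}' for i in range(25)]
    await branch(coro_gen(urls), limit=10)

asyncio.run(main())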
async def branch(self, tasks, limit=950):
    """Consume an async generator of awaitables in slices, limiting concurrency."""
    while True:
        xs = stream.preserve(tasks)
        ys = xs[0:limit]
        t = await stream.list(ys)
        if not t:
            break
        await asyncio.create_task(asyncio.wait(t))
        print('sleeping for 300 s')
        await asyncio.sleep(300)
        print('sleep finished')
    self.r.close()
    await self.r.wait_closed()
    self.client.close()
async def main():
    async def agen():
        yield 1
        yield 2
        yield 3

    # The xs stream does not preserve the generator
    xs = stream.iterate(agen())
    print(await xs[0])            # Print 1
    print(await stream.list(xs))  # Print [] (2 and 3 have never been yielded)

    # The xs stream does preserve the generator
    xs = stream.preserve(agen())
    print(await xs[0])            # Print 1
    print(await stream.list(xs))  # Print [2, 3]

    # Transform agen into a stream operator
    agen_stream = operator(agen)
    xs = agen_stream()  # agen is now reusable
    print(await stream.list(xs))  # Print [1, 2, 3]
    print(await stream.list(xs))  # Print [1, 2, 3]
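# The snippet above assumes these imports; `operator` is the aiostream decorator
# that turns agen into a reusable source operator. A minimal way to run it:
import asyncio
from aiostream import stream
from aiostream.core import operator

asyncio.run(main())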
async def data_stream_async(url, files, columns=None, map_func=None,
                            reduce_func=None, initializer=None,
                            producer_num=2, data_handler_num=2,
                            executor_type='process'):
    data_type = files[0].split('.')[-1]
    columns_mapping = get_parquet_mapping() if data_type == 'parquet' else None
    if columns:
        # Translate requested column names through the parquet mapping and keep
        # any requested columns that have no mapping entry.
        c = [k for k, v in columns_mapping.items() if v in columns]
        c = c + list(set(columns).difference(set(list(columns_mapping.values()))))
    else:
        c = columns

    global pbar_handler
    pbar_handler = PbarHandler(len(files))

    global executor
    if executor_type == 'process':
        executor = ProcessPoolExecutor(max_workers=data_handler_num)
    elif executor_type == 'thread':
        executor = ThreadPoolExecutor(max_workers=data_handler_num)

    if map_func:
        map_task = partial(data_load, func=map_func)
    else:
        map_task = data_load
    pbar_handler.register(mapper, len(files))

    if reduce_func:
        reduce_task = partial(reducer, func=reduce_func)
    else:
        reduce_task = partial(reducer, func=concat)
        initializer = pd.DataFrame()

    # Split the file list across producer_num preserved file streams and merge
    # them into one pipeline with a bounded number of concurrent mappers.
    file_streams = [
        stream.preserve(file_stream(files[i::producer_num], url))
        for i in range(producer_num)
    ]
    file_list = []
    aws = (stream.merge(*file_streams)
           | pipe.map(async_(lambda x: mapper(x, map_task, c, columns_mapping)),
                      task_limit=data_handler_num)
           | pipe.map(async_(lambda x: file_list.append(x[0]) or x[1]),
                      task_limit=data_handler_num))

    if reduce_func:
        pbar_handler.register(reducer, len(files) - 1)
        rs = stream.reduce(aws, async_(reduce_task), initializer)
        reduced = await stream.takelast(rs, 1)
        return reduced
    else:
        data_list = await asyncio.gather(stream.list(aws))
        data_list = data_list[0]
        # Restore the original file order before returning.
        tmp_list = list(zip(file_list, data_list))
        tmp_list = sorted(tmp_list, key=lambda pair: files.index(pair[0]))
        if map_func:
            return tmp_list
        else:
            return pd.concat(list(map(lambda pair: pair[1], tmp_list)), axis=0)
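# A hedged usage sketch for data_stream_async(). The endpoint URL, file names,
# column list and row_count helper are made-up placeholders, and the function
# also depends on helpers from the original module that are not shown here
# (file_stream, data_load, mapper, reducer, PbarHandler, get_parquet_mapping).
import asyncio
import pandas as pd

def row_count(df: pd.DataFrame) -> int:
    # example map_func applied to each loaded DataFrame
    return len(df)

async def main():
    files = ['2020-01.parquet', '2020-02.parquet']
    result = await data_stream_async(
        url='http://127.0.0.1:8000/data',   # assumed data service endpoint
        files=files,
        columns=['open', 'close'],
        map_func=row_count,                  # per-file transformation
        producer_num=2,
        data_handler_num=2,
        executor_type='thread',
    )
    # With map_func set and no reduce_func, the result is a list of
    # (file, mapped value) pairs in the original file order.
    print(result)

asyncio.run(main())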