Example #1
import asyncio

from aiostream import stream


# Method of a crawler-style class in the source project; shown with the
# module-level imports it relies on.
async def branch(self, coros, limit=10):
    """Consume an async generator of awaitables in slices, limiting concurrency."""
    while True:
        # `coros` is preserved, so each pass resumes the generator where the
        # previous batch stopped: slicing [0:limit] always grabs the next batch.
        xs = stream.preserve(coros)
        ys = xs[0:limit]
        t = await stream.list(ys)
        if not t:
            break
        # asyncio.wait() requires futures/tasks on recent Python versions, so
        # wrap the collected awaitables before waiting on the batch.
        await asyncio.wait([asyncio.ensure_future(c) for c in t])
Example #2
from aiostream import stream


# assert_run and event_loop are pytest fixtures provided by aiostream's test
# utilities; assert_run checks a stream's output (and optional final exception).
async def test_preserve(assert_run, event_loop):
    async def agen():
        yield 1
        yield 2

    # Without preserve, the first run closes agen, so the second run finds
    # nothing at index 0 and ends with an IndexError.
    xs = stream.iterate(agen())[0]
    await assert_run(xs, [1])
    await assert_run(xs, [], IndexError('Index out of range'))

    # With preserve, agen stays open and resumes where the first run stopped.
    ys = stream.preserve(agen())[0]
    await assert_run(ys, [1])
    await assert_run(ys, [2])
Example #4
import asyncio

from aiostream import stream


async def branch(coros, limit=10):
    '''
    Use the aiostream module to slice an async generator of awaitables,
    running at most `limit` of them concurrently (10 by default).
    :param coros: async generator yielding awaitables
    :param limit: number of awaitables to run per batch
    :return:
    '''
    while True:
        # `coros` is preserved, so each pass resumes the generator where the
        # previous batch stopped: slicing [0:limit] always grabs the next batch.
        xs = stream.preserve(coros)
        ys = xs[0:limit]
        t = await stream.list(ys)
        if not t:
            break
        # asyncio.wait() requires futures/tasks on recent Python versions, so
        # wrap the collected awaitables before waiting on the batch.
        await asyncio.wait([asyncio.ensure_future(c) for c in t])
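For context, here is a minimal sketch of how a branch() coroutine like the one above might be driven, assuming the caller supplies an async generator that yields coroutines; fake_fetch and make_requests are hypothetical stand-ins for the source project's request code.

import asyncio


async def fake_fetch(i):
    # Hypothetical stand-in for an HTTP request.
    await asyncio.sleep(0.1)
    return i


async def make_requests(n):
    # Hypothetical async generator yielding one coroutine per request.
    for i in range(n):
        yield fake_fetch(i)


async def main():
    # Process 25 coroutines in batches of at most 10, using branch() as
    # defined in the example above.
    await branch(make_requests(25), limit=10)


if __name__ == '__main__':
    asyncio.run(main())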
Example #5

    async def branch(self, tasks, limit=950):
        """Consume the async generator in slices, limiting concurrency."""
        while True:
            # `tasks` is preserved, so slicing [0:limit] grabs the next batch
            # on every pass.
            xs = stream.preserve(tasks)
            ys = xs[0:limit]
            t = await stream.list(ys)
            if not t:
                break
            await asyncio.wait([asyncio.ensure_future(c) for c in t])

            print('sleeping for 300 s')
            await asyncio.sleep(300)
            print('done sleeping')

        # Close the Redis connection and the client once all batches are done.
        self.r.close()
        await self.r.wait_closed()
        self.client.close()
Example #6
import asyncio

from aiostream import stream, operator


async def main():
    async def agen():
        yield 1
        yield 2
        yield 3

    # The xs stream does not preserve the generator
    xs = stream.iterate(agen())
    print(await xs[0])            # Print 1
    print(await stream.list(xs))  # Print [] (2 and 3 were never yielded)

    # The xs stream does preserve the generator
    xs = stream.preserve(agen())
    print(await xs[0])            # Print 1
    print(await stream.list(xs))  # Print [2, 3]

    # Transform agen into a stream operator
    agen_stream = operator(agen)
    xs = agen_stream()            # agen is now reusable
    print(await stream.list(xs))  # Print [1, 2, 3]
    print(await stream.list(xs))  # Print [1, 2, 3]
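Building on the last part of this example, here is a minimal sketch of operator used as a decorator on a parametrized async generator, which yields a reusable, pipeable source; the countdown operator and the times-ten mapping are hypothetical illustrations, not part of the original example.

import asyncio

from aiostream import operator, pipe, stream


@operator
async def countdown(n):
    # Reusable source: each iteration restarts the generator.
    for i in range(n, 0, -1):
        yield i


async def main():
    xs = countdown(3) | pipe.map(lambda x: x * 10)
    print(await stream.list(xs))  # Print [30, 20, 10]
    print(await stream.list(xs))  # Print [30, 20, 10] (the source is reusable)


if __name__ == '__main__':
    asyncio.run(main())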
Example #8
import asyncio
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from functools import partial

import pandas as pd
from aiostream import stream, pipe

# Project-level helpers assumed to be defined elsewhere in the source package:
# get_parquet_mapping, PbarHandler, data_load, mapper, reducer, concat,
# file_stream and the async_ wrapper used below.


async def data_stream_async(url,
                            files,
                            columns=None,
                            map_func=None,
                            reduce_func=None,
                            initializer=None,
                            producer_num=2,
                            data_handler_num=2,
                            executor_type='process'):
    # Infer the data type from the first file's extension; only parquet files
    # need the column-name mapping.
    data_type = files[0].split('.')[-1]
    columns_mapping = get_parquet_mapping() if data_type == 'parquet' else None

    if columns:
        c = [k for k, v in columns_mapping.items() if v in columns]
        c = c + list(
            set(columns).difference(set(list(columns_mapping.values()))))
    else:
        c = columns

    global pbar_handler
    pbar_handler = PbarHandler(len(files))

    global executor
    if executor_type == 'process':
        executor = ProcessPoolExecutor(max_workers=data_handler_num)
    elif executor_type == 'thread':
        executor = ThreadPoolExecutor(max_workers=data_handler_num)

    if map_func:
        map_task = partial(data_load, func=map_func)
    else:
        map_task = data_load
    pbar_handler.register(mapper, len(files))

    if reduce_func:
        reduce_task = partial(reducer, func=reduce_func)
    else:
        reduce_task = partial(reducer, func=concat)
        initializer = pd.DataFrame()

    # Split the files round-robin across `producer_num` preserved producer
    # streams, merge them, and map each file through the loader with a bounded
    # number of concurrent tasks.
    file_streams = [
        stream.preserve(file_stream(files[i::producer_num], url))
        for i in range(producer_num)
    ]
    file_list = []

    aws = (stream.merge(*file_streams)
           | pipe.map(
               async_(lambda x: mapper(x, map_task, c, columns_mapping)),
               task_limit=data_handler_num)
           | pipe.map(async_(lambda x: file_list.append(x[0]) or x[1]),
                      task_limit=data_handler_num))

    if reduce_func:
        pbar_handler.register(reducer, len(files) - 1)
        rs = stream.reduce(aws, async_(reduce_task), initializer)
        reduced = await stream.takelast(rs, 1)
        return reduced
    else:
        data_list = await asyncio.gather(stream.list(aws))
        data_list = data_list[0]
        tmp_list = list(zip(file_list, data_list))
        tmp_list = sorted(tmp_list, key=lambda pair: files.index(pair[0]))
        if map_func:
            return tmp_list
        else:
            return pd.concat(list(map(lambda pair: pair[1], tmp_list)), axis=0)
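To isolate the aiostream pattern at the heart of data_stream_async(), here is a minimal self-contained sketch: several preserved producer streams are merged, and the merged stream is mapped with a bounded task_limit. fake_file_stream and fake_parse are hypothetical stand-ins for file_stream and the mapper step.

import asyncio

from aiostream import pipe, stream


async def fake_file_stream(names):
    # Hypothetical stand-in for file_stream(): yields one file name at a time.
    for name in names:
        await asyncio.sleep(0)
        yield name


async def fake_parse(name):
    # Hypothetical stand-in for the mapper step.
    await asyncio.sleep(0)
    return name.upper()


async def main():
    files = ['a.parquet', 'b.parquet', 'c.parquet', 'd.parquet']
    # Two preserved producer streams, each handling every other file.
    producers = [
        stream.preserve(fake_file_stream(files[i::2]))
        for i in range(2)
    ]
    # Merge the producers and parse at most two files concurrently.
    merged = (stream.merge(*producers)
              | pipe.map(fake_parse, task_limit=2))
    print(await stream.list(merged))  # e.g. ['A.PARQUET', 'B.PARQUET', ...]


if __name__ == '__main__':
    asyncio.run(main())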