Beispiel #1
0
def rawquery(
    query: str,
    start_date: str = None,
    end_date: str = None):
    """Run the scraping pipeline for ``query`` and return the collected items.

    Args:
        query: Search expression handed to ``__generate_search_url_by_range``.
        start_date: Date string parsed with ``arrow.get``. When omitted, the
            current ``arrow.get()`` timestamp formatted with
            ``SHORT_DATE_FORMAT`` is used.
        end_date: Date string parsed with ``arrow.get``. When omitted, the
            current timestamp shifted by -15 days is used.

    Returns:
        A list with every item produced by the last pipeline stage
        (``_read_statuses``). Duplicates are not removed.
    """
    # Resolve the defaults on every call. Computing them in the signature
    # would freeze the dates at import time, which is wrong for any process
    # that stays alive across a day boundary.
    # NOTE(review): the default start (now) is later than the default end
    # (15 days ago) -- confirm __generate_search_url_by_range expects that.
    if start_date is None:
        start_date = arrow.get().format(SHORT_DATE_FORMAT)
    if end_date is None:
        end_date = arrow.get().shift(days=-15).format(SHORT_DATE_FORMAT)

    logger.debug("Converting dates from string")
    init_date = arrow.get(start_date)
    finish_date = arrow.get(end_date)

    logger.info("🐦 Scrapping with:[%s] From 🗓️:[%s] ➡️ To 🗓️:[%s]" %
                (query, init_date.format('YYYY-MM-DD'),
                 finish_date.format('YYYY-MM-DD')))

    # Create day urls
    urls = __generate_search_url_by_range(query, init_date, finish_date)

    # Each stage consumes the previous one; flat_map flattens the iterables
    # returned by the stage function into a single stream.
    stage_results = fetch_all(urls)
    stage_results = aio.flat_map(_get_page_branches,
                                 stage_results,
                                 workers=MAX_WORKERS)
    stage_results = th.flat_map(_get_branch_walk,
                                stage_results,
                                workers=MAX_WORKERS)
    stage_results = th.flat_map(_read_statuses,
                                stage_results,
                                workers=MAX_WORKERS)

    # Materialising the final stage into a list drains the whole pipeline.
    results = list(stage_results)

    logger.info(f"💬 Captured {len(results)}")

    return results
Beispiel #2
0
def rawquery(
        query: str,
        start_date: str = None,
        end_date: str = None,
        hydrate: int = 0,
        kafka: bool = False):
    """Run the scraping pipeline for ``query`` and return JSON strings.

    Args:
        query: Search expression handed to ``__generate_search_url_by_range``.
        start_date: Date string parsed with ``arrow.get``. When omitted, the
            current ``arrow.get()`` timestamp formatted with
            ``SHORT_DATE_FORMAT`` is used.
        end_date: Date string parsed with ``arrow.get``. When omitted, the
            current timestamp shifted by -15 days is used.
        hydrate: Selects the status stage. ``0`` maps ``__get_statuses``;
            ``1`` maps ``_read_statuses`` followed by
            ``_update_status_stats``.
        kafka: When true, every item is additionally passed through
            ``_send_kafka`` before serialisation.

    Returns:
        The result of ``list_no_dupes`` applied to the list of
        ``json.dumps(item, indent=4)`` strings produced by the pipeline.

    Raises:
        NotImplementedError: If ``hydrate`` is neither ``0`` nor ``1``.
    """
    # Resolve the defaults on every call. Computing them in the signature
    # would freeze the dates at import time, which is wrong for any process
    # that stays alive across a day boundary.
    # NOTE(review): the default start (now) is later than the default end
    # (15 days ago) -- confirm __generate_search_url_by_range expects that.
    if start_date is None:
        start_date = arrow.get().format(SHORT_DATE_FORMAT)
    if end_date is None:
        end_date = arrow.get().shift(days=-15).format(SHORT_DATE_FORMAT)

    logger.debug("Converting dates from string")
    init_date = arrow.get(start_date)
    finish_date = arrow.get(end_date)

    logger.info("🐦 Scrapping with:[%s] From 🗓️:[%s] ➡️ To 🗓️:[%s]" %
                (query, init_date.format('YYYY-MM-DD'),
                 finish_date.format('YYYY-MM-DD')))

    # Create day urls
    urls = __generate_search_url_by_range(query, init_date, finish_date)

    stage_results = fetch_all(urls)

    stage_results = aio.flat_map(_get_page_branches,
                                 stage_results,
                                 workers=MAX_WORKERS)
    stage_results = th.flat_map(_get_branch_walk,
                                stage_results,
                                workers=MAX_WORKERS)
    if hydrate == 0:
        stage_results = th.flat_map(__get_statuses,
                                    stage_results,
                                    workers=MAX_WORKERS)
    elif hydrate == 1:
        stage_results = th.flat_map(_read_statuses,
                                    stage_results,
                                    workers=MAX_WORKERS)
        stage_results = th.map(_update_status_stats,
                               stage_results,
                               workers=MAX_WORKERS)
    else:
        # Say which value was rejected instead of raising a bare exception.
        raise NotImplementedError(
            f"unsupported hydrate level: {hydrate!r} (expected 0 or 1)")

    if kafka:
        stage_results = th.map(_send_kafka, stage_results, workers=MAX_WORKERS)

    # Serialise every item to a pretty-printed JSON string.
    stage_results = th.map(lambda s: json.dumps(s, indent=4),
                           stage_results,
                           workers=MAX_WORKERS)

    # List conversion executes pipeline
    results = list(stage_results)
    results = list_no_dupes(results)

    logger.info(f"💬 Getted {len(results)}")

    return results
Beispiel #3
0
def test_flat_map_square(nums):
    """aio.map + aio.flat_map must equal map + cz.mapcat, order included."""

    def _expand(value):
        # Emit the value itself, then its next two successors.
        yield value
        for step in (1, 2):
            yield value + step

    # Reference result built with the plain iterator tools.
    expected = list(cz.mapcat(_expand, map(lambda x: x**2, nums)))

    # Same computation expressed as an aio pipeline.
    squared_stage = aio.map(lambda x: x**2, nums)
    actual = list(aio.flat_map(_expand, squared_stage))

    assert actual == expected
Beispiel #4
0
def test_flat_map_square_workers(nums):
    """With workers=3, aio.flat_map must yield the same items as cz.mapcat.

    Both results are sorted before comparison, so item order is not checked.
    """

    def _expand(value):
        # Emit the value itself, then its next two successors.
        yield value
        for step in (1, 2):
            yield value + step

    # Reference result built with the plain iterator tools.
    expected = list(cz.mapcat(_expand, map(lambda x: x**2, nums)))

    # Same computation as an aio pipeline with three flat_map workers.
    squared_stage = aio.map(lambda x: x**2, nums)
    actual = list(aio.flat_map(_expand, squared_stage, workers=3))

    assert sorted(actual) == sorted(expected)
Beispiel #5
0
def test_flat_map_square_filter_workers_pipe(nums):
    """The ``|`` pipe form of map/flat_map/filter must match the cz reference.

    Both results are sorted before comparison, so item order is not checked.
    """

    def _expand(value):
        # Emit the value itself, then its next two successors.
        yield value
        for step in (1, 2):
            yield value + step

    # Reference result built with the plain iterator tools.
    squared = map(lambda x: x**2, nums)
    expanded = cz.mapcat(_expand, squared)
    expected = list(cz.filter(lambda x: x > 1, expanded))

    # Same computation written as a single piped expression.
    actual = (
        nums
        | aio.map(lambda x: x**2)
        | aio.flat_map(_expand, workers=3)
        | aio.filter(lambda x: x > 1)
        | list
    )

    assert sorted(actual) == sorted(expected)
Beispiel #6
0
from pypeln import asyncio_task as aio

# Module-level accumulator shared by every call to batch().
list_acc = []


def batch(x, n):
    """Collect ``x`` into the shared accumulator and yield full batches.

    This is a generator: ``x`` is stored when the returned generator is
    iterated. Once the accumulator holds ``n`` items (or more), a copy of it
    is yielded and the accumulator is emptied.

    Items of a trailing, incomplete batch stay in ``list_acc`` and are never
    yielded by this function.

    Args:
        x: The item to add to the current batch.
        n: Batch size; a batch is emitted as soon as it reaches this length.

    Yields:
        A list with the accumulated items, at most once per call.
    """
    # Store the item first. Checking for a full batch before storing would
    # drop the element that arrives while the accumulator is already full.
    list_acc.append(x)

    if len(list_acc) >= n:
        list_out = list(list_acc)
        list_acc.clear()
        yield list_out


# Push 0..99 through the pipeline: every number goes through batch(x, 10),
# each list that batch yields is reduced with sum, and the sums are collected.
batch_sums = (
    range(100)
    | aio.from_iterable()
    | aio.flat_map(lambda x: batch(x, 10))
    | aio.map(sum)
    | list
)
print(batch_sums)
Beispiel #7
0
from pypeln import asyncio_task as aio


def batch(x, list_acc, n):
    """Collect ``x`` into ``list_acc`` and yield the batch once it is full.

    This is a generator: ``x`` is stored when the returned generator is
    iterated. Once ``list_acc`` holds ``n`` items (or more), a copy of it is
    yielded and ``list_acc`` is emptied in place.

    Items of a trailing, incomplete batch stay in ``list_acc`` and are never
    yielded by this function.

    Args:
        x: The item to add to the current batch.
        list_acc: Mutable list used as the accumulator; modified in place.
        n: Batch size; a batch is emitted as soon as it reaches this length.

    Yields:
        A list with the accumulated items, at most once per call.
    """
    # Store the item first. Checking for a full batch before storing would
    # drop the element that arrives while the accumulator is already full.
    list_acc.append(x)

    if len(list_acc) >= n:
        list_out = list(list_acc)
        list_acc.clear()
        yield list_out


# Push 0..99 through the pipeline: flat_map is given an on_start callable
# returning a fresh list, and its stage function receives that value as
# list_acc alongside each number. Every list batch yields is reduced with sum.
batch_sums = (
    range(100)
    | aio.map(lambda x: x)
    | aio.flat_map(lambda x, list_acc: batch(x, list_acc, 10),
                   on_start=lambda: [])
    | aio.map(sum)
    | list
)
print(batch_sums)