Beispiel #1
0
def rawquery(
    query: str,
    start_date: str = None,
    end_date: str = None):
    """Run the scraping pipeline for *query* over a date range.

    Search URLs for the range are produced by
    ``__generate_search_url_by_range``, fetched with ``fetch_all`` and then
    passed through the ``_get_page_branches``, ``_get_branch_walk`` and
    ``_read_statuses`` stages.

    Args:
        query: Search query, forwarded to the URL generator.
        start_date: Date string parseable by ``arrow.get``. When ``None``
            (the default), the current date formatted with
            ``SHORT_DATE_FORMAT`` is used.
        end_date: Date string parseable by ``arrow.get``. When ``None``
            (the default), the date 15 days before now formatted with
            ``SHORT_DATE_FORMAT`` is used.

    Returns:
        A list with every item produced by the final pipeline stage.
    """
    # Resolve the defaults at call time. Calling ``arrow.get()`` in the
    # signature would evaluate it only once, when the function is defined,
    # leaving a long-running process with stale dates.
    # NOTE(review): the default start_date is later than the default
    # end_date -- confirm the URL generator expects that ordering.
    if start_date is None:
        start_date = arrow.get().format(SHORT_DATE_FORMAT)
    if end_date is None:
        end_date = arrow.get().shift(days=-15).format(SHORT_DATE_FORMAT)

    logger.debug("Converting dates from string")
    init_date = arrow.get(start_date)
    finish_date = arrow.get(end_date)

    logger.info("🐦 Scrapping with:[%s] From 🗓️:[%s] ➡️ To 🗓️:[%s]" %
                (query, init_date.format('YYYY-MM-DD'),
                 finish_date.format('YYYY-MM-DD')))

    # Create day urls
    urls = __generate_search_url_by_range(query, init_date, finish_date)

    stage_results = fetch_all(urls)
    stage_results = aio.flat_map(_get_page_branches,
                                 stage_results,
                                 workers=MAX_WORKERS)
    stage_results = th.flat_map(_get_branch_walk,
                                stage_results,
                                workers=MAX_WORKERS)
    stage_results = th.flat_map(_read_statuses,
                                stage_results,
                                workers=MAX_WORKERS)

    # Collect everything the last stage yields; no de-duplication is applied.
    results = list(stage_results)

    logger.info(f"💬 Captured {len(results)}")

    return results
Beispiel #2
0
def rawquery(
        query: str,
        start_date: str = None,
        end_date: str = None,
        hydrate: int = 0,
        kafka: bool = False):
    """Run the scraping pipeline for *query* and return JSON strings.

    Search URLs for the range are produced by
    ``__generate_search_url_by_range``, fetched with ``fetch_all`` and then
    passed through ``_get_page_branches`` and ``_get_branch_walk`` before the
    stage(s) selected by ``hydrate``.

    Args:
        query: Search query, forwarded to the URL generator.
        start_date: Date string parseable by ``arrow.get``. When ``None``
            (the default), the current date formatted with
            ``SHORT_DATE_FORMAT`` is used.
        end_date: Date string parseable by ``arrow.get``. When ``None``
            (the default), the date 15 days before now formatted with
            ``SHORT_DATE_FORMAT`` is used.
        hydrate: ``0`` runs the ``__get_statuses`` stage; ``1`` runs
            ``_read_statuses`` followed by ``_update_status_stats``.
        kafka: When true, every item is additionally passed through
            ``_send_kafka``.

    Returns:
        The list returned by ``list_no_dupes`` for the collected items, each
        item serialised with ``json.dumps(..., indent=4)``.

    Raises:
        NotImplementedError: If ``hydrate`` is neither ``0`` nor ``1``.
    """
    # Reject unsupported modes before any URL is generated or fetched, so a
    # bad argument cannot trigger wasted work.
    if hydrate not in (0, 1):
        raise NotImplementedError(f"hydrate={hydrate!r} is not supported")

    # Resolve the defaults at call time. Calling ``arrow.get()`` in the
    # signature would evaluate it only once, when the function is defined,
    # leaving a long-running process with stale dates.
    # NOTE(review): the default start_date is later than the default
    # end_date -- confirm the URL generator expects that ordering.
    if start_date is None:
        start_date = arrow.get().format(SHORT_DATE_FORMAT)
    if end_date is None:
        end_date = arrow.get().shift(days=-15).format(SHORT_DATE_FORMAT)

    logger.debug("Converting dates from string")
    init_date = arrow.get(start_date)
    finish_date = arrow.get(end_date)

    logger.info("🐦 Scrapping with:[%s] From 🗓️:[%s] ➡️ To 🗓️:[%s]" %
                (query, init_date.format('YYYY-MM-DD'),
                 finish_date.format('YYYY-MM-DD')))

    # Create day urls
    urls = __generate_search_url_by_range(query, init_date, finish_date)

    stage_results = fetch_all(urls)

    stage_results = aio.flat_map(_get_page_branches,
                                 stage_results,
                                 workers=MAX_WORKERS)
    stage_results = th.flat_map(_get_branch_walk,
                                stage_results,
                                workers=MAX_WORKERS)
    if hydrate == 0:
        stage_results = th.flat_map(__get_statuses,
                                    stage_results,
                                    workers=MAX_WORKERS)
    else:
        # hydrate == 1 (any other value was rejected above).
        stage_results = th.flat_map(_read_statuses,
                                    stage_results,
                                    workers=MAX_WORKERS)
        stage_results = th.map(_update_status_stats,
                               stage_results,
                               workers=MAX_WORKERS)

    if kafka:
        stage_results = th.map(_send_kafka, stage_results, workers=MAX_WORKERS)

    stage_results = th.map(lambda s: json.dumps(s, indent=4),
                           stage_results,
                           workers=MAX_WORKERS)

    # List conversion executes pipeline
    results = list(stage_results)
    results = list_no_dupes(results)

    logger.info(f"💬 Getted {len(results)}")

    return results
Beispiel #3
0
def test_flat_map_square(nums):
    """Check that ``th.map`` + ``th.flat_map`` match ``map`` + ``cz.mapcat``.

    Each number is squared and then expanded into itself and its next two
    successors; the two result lists must be equal, order included.
    """
    def _square(value):
        return value**2

    def _expand(value):
        # Emit the value followed by value + 1 and value + 2.
        yield from (value, value + 1, value + 2)

    # Reference result built with the builtin map and cz.mapcat.
    expected = list(cz.mapcat(_expand, map(_square, nums)))

    # Same computation expressed with the th stages.
    squared_stage = th.map(_square, nums)
    actual = list(th.flat_map(_expand, squared_stage))

    assert actual == expected
Beispiel #4
0
def test_flat_map_square_workers(nums):
    """Check ``th.flat_map`` with ``workers=3`` against ``cz.mapcat``.

    Both result lists are sorted before comparison, so only the produced
    items are checked, not the order in which they arrive.
    """
    def _square(value):
        return value**2

    def _expand(value):
        # Emit the value followed by value + 1 and value + 2.
        yield from (value, value + 1, value + 2)

    # Reference result built with the builtin map and cz.mapcat.
    expected = list(cz.mapcat(_expand, map(_square, nums)))

    # Same computation with the flat_map stage using three workers.
    squared_stage = th.map(_square, nums)
    actual = list(th.flat_map(_expand, squared_stage, workers=3))

    assert sorted(actual) == sorted(expected)
Beispiel #5
0
def test_flat_map_square_filter_workers_pipe(nums):
    """Check a ``|``-composed ``th`` pipeline against the ``cz`` equivalent.

    The pipeline squares each number, expands it into itself and its next
    two successors, and keeps only items greater than 1. Both result lists
    are sorted before comparison, so arrival order is not checked.
    """
    def _square(value):
        return value**2

    def _expand(value):
        # Emit the value followed by value + 1 and value + 2.
        yield from (value, value + 1, value + 2)

    def _above_one(value):
        return value > 1

    # Reference result built with the builtin map, cz.mapcat and cz.filter.
    expanded = cz.mapcat(_expand, map(_square, nums))
    expected = list(cz.filter(_above_one, expanded))

    # Same computation composed with the pipe operator.
    actual = (
        nums
        | th.map(_square)
        | th.flat_map(_expand, workers=3)
        | th.filter(_above_one)
        | list
    )

    assert sorted(actual) == sorted(expected)