def rawquery(
    query: str,
    start_date: str = None,
    end_date: str = None,
):
    """Scrape all statuses matching *query* between the two dates.

    BUG FIX: the original signature evaluated ``arrow.get()...format(...)``
    directly in the ``def`` line, so the default dates were computed ONCE at
    import time and then frozen — a long-running process kept scraping the
    same stale window forever. Defaults are now resolved at call time.

    :param query: search query string passed to the URL generator.
    :param start_date: date string in ``SHORT_DATE_FORMAT``; defaults to
        today (resolved at call time).
    :param end_date: date string in ``SHORT_DATE_FORMAT``; defaults to
        15 days before today (resolved at call time).
    :returns: list of captured statuses.
    """
    # Resolve the date defaults now, not at import time (see docstring).
    if start_date is None:
        start_date = arrow.get().format(SHORT_DATE_FORMAT)
    if end_date is None:
        end_date = arrow.get().shift(days=-15).format(SHORT_DATE_FORMAT)

    logger.debug("Converting dates from string")
    init_date = arrow.get(start_date)
    finish_date = arrow.get(end_date)
    # Lazy %-style args: formatting is skipped when INFO is disabled.
    logger.info(
        "🐦 Scrapping with:[%s] From 🗓️:[%s] ➡️ To 🗓️:[%s]",
        query,
        init_date.format('YYYY-MM-DD'),
        finish_date.format('YYYY-MM-DD'),
    )

    # Create one search URL per day in the range.
    urls = __generate_search_url_by_range(query, init_date, finish_date)
    stage_results = fetch_all(urls)
    stage_results = aio.flat_map(_get_page_branches, stage_results, workers=MAX_WORKERS)
    stage_results = th.flat_map(_get_branch_walk, stage_results, workers=MAX_WORKERS)
    stage_results = th.flat_map(_read_statuses, stage_results, workers=MAX_WORKERS)

    # results = list_no_dupes(stage_results)
    # List conversion executes the lazy pipeline.
    results = list(stage_results)
    logger.info(f"💬 Captured {len(results)}")
    return results
def rawquery(
    query: str,
    start_date: str = None,
    end_date: str = None,
    hydrate: int = 0,
    kafka: bool = False,
):
    """Scrape statuses matching *query* between the two dates, with optional
    hydration and Kafka publishing.

    BUG FIX: the original signature evaluated ``arrow.get()...format(...)``
    directly in the ``def`` line, so the default dates were computed ONCE at
    import time and then frozen — a long-running process kept scraping the
    same stale window forever. Defaults are now resolved at call time.

    :param query: search query string passed to the URL generator.
    :param start_date: date string in ``SHORT_DATE_FORMAT``; defaults to
        today (resolved at call time).
    :param end_date: date string in ``SHORT_DATE_FORMAT``; defaults to
        15 days before today (resolved at call time).
    :param hydrate: 0 → fetch statuses only; 1 → read statuses and update
        their stats. Any other value raises ``NotImplementedError``.
    :param kafka: when True, push each status to Kafka and JSON-serialize it.
    :returns: list of captured statuses, de-duplicated.
    :raises NotImplementedError: for unsupported ``hydrate`` values.
    """
    # Resolve the date defaults now, not at import time (see docstring).
    if start_date is None:
        start_date = arrow.get().format(SHORT_DATE_FORMAT)
    if end_date is None:
        end_date = arrow.get().shift(days=-15).format(SHORT_DATE_FORMAT)

    logger.debug("Converting dates from string")
    init_date = arrow.get(start_date)
    finish_date = arrow.get(end_date)
    # Lazy %-style args: formatting is skipped when INFO is disabled.
    logger.info(
        "🐦 Scrapping with:[%s] From 🗓️:[%s] ➡️ To 🗓️:[%s]",
        query,
        init_date.format('YYYY-MM-DD'),
        finish_date.format('YYYY-MM-DD'),
    )

    # Create one search URL per day in the range.
    urls = __generate_search_url_by_range(query, init_date, finish_date)
    stage_results = fetch_all(urls)
    stage_results = aio.flat_map(_get_page_branches, stage_results, workers=MAX_WORKERS)
    stage_results = th.flat_map(_get_branch_walk, stage_results, workers=MAX_WORKERS)

    if hydrate == 0:
        stage_results = th.flat_map(__get_statuses, stage_results, workers=MAX_WORKERS)
    elif hydrate == 1:
        stage_results = th.flat_map(_read_statuses, stage_results, workers=MAX_WORKERS)
        stage_results = th.map(_update_status_stats, stage_results, workers=MAX_WORKERS)
    else:
        # Was a bare `raise NotImplementedError` — now names the bad value.
        raise NotImplementedError(f"Unsupported hydrate level: {hydrate!r}")

    if kafka:
        stage_results = th.map(_send_kafka, stage_results, workers=MAX_WORKERS)
        stage_results = th.map(
            lambda s: json.dumps(s, indent=4), stage_results, workers=MAX_WORKERS
        )

    # List conversion executes the lazy pipeline.
    results = list(stage_results)
    results = list_no_dupes(results)
    # Was "Getted" — fixed the ungrammatical log message.
    logger.info(f"💬 Got {len(results)}")
    return results
def test_flat_map_square(nums):
    """Single-worker flat_map must match toolz mapcat exactly, order included."""

    def _expand(value):
        # Emit the value followed by its two successors.
        for offset in range(3):
            yield value + offset

    # Reference result via plain map + toolz mapcat.
    expected = list(cz.mapcat(_expand, map(lambda v: v ** 2, nums)))

    # Same computation through the th pipeline (no workers → order preserved).
    squared = th.map(lambda v: v ** 2, nums)
    actual = list(th.flat_map(_expand, squared))

    assert actual == expected
def test_flat_map_square_workers(nums):
    """Multi-worker flat_map yields the same elements, in any order."""

    def _expand(value):
        # Emit the value followed by its two successors.
        for offset in range(3):
            yield value + offset

    # Reference result via plain map + toolz mapcat.
    expected = list(cz.mapcat(_expand, map(lambda v: v ** 2, nums)))

    # Same computation through the th pipeline with 3 workers; workers may
    # interleave output, so compare as multisets via sorting.
    squared = th.map(lambda v: v ** 2, nums)
    actual = list(th.flat_map(_expand, squared, workers=3))

    assert sorted(actual) == sorted(expected)
def test_flat_map_square_filter_workers_pipe(nums):
    """The | pipe syntax (square → expand → filter > 1) matches toolz."""

    def _expand(value):
        # Emit the value followed by its two successors.
        for offset in range(3):
            yield value + offset

    # Reference result: map, mapcat, then filter, all via toolz/builtins.
    expected = list(
        cz.filter(lambda v: v > 1, cz.mapcat(_expand, map(lambda v: v ** 2, nums)))
    )

    # Same stages composed with the pipe operator; workers may reorder.
    actual = (
        nums
        | th.map(lambda v: v ** 2)
        | th.flat_map(_expand, workers=3)
        | th.filter(lambda v: v > 1)
        | list
    )

    assert sorted(actual) == sorted(expected)