def rawquery(
    query: str,
    start_date: "str | None" = None,
    end_date: "str | None" = None,
    hydrate: int = 0,
    kafka: bool = False,
):
    """Run the scraping pipeline for *query* over a date range.

    Args:
        query: Search expression forwarded to the URL generator.
        start_date: Date string parsed with ``arrow.get``. When omitted,
            the current date formatted with ``SHORT_DATE_FORMAT`` is used.
        end_date: Date string parsed with ``arrow.get``. When omitted,
            the date 15 days before now formatted with
            ``SHORT_DATE_FORMAT`` is used.
        hydrate: Selects the status stage: ``0`` uses ``__get_statuses``;
            ``1`` uses ``_read_statuses`` followed by
            ``_update_status_stats``.
        kafka: When true, every status is additionally passed through
            ``_send_kafka``.

    Returns:
        The value returned by ``list_no_dupes`` for the list of
        JSON-serialised (``indent=4``) statuses.

    Raises:
        NotImplementedError: If ``hydrate`` is neither ``0`` nor ``1``.
    """
    # The defaults are resolved here, at call time.  Computing them in the
    # signature would freeze "now" at import time, so a long-running process
    # would keep querying a stale date window.
    # NOTE(review): the default start is later than the default end; this
    # mirrors the original defaults -- confirm the range generator expects it.
    if start_date is None or end_date is None:
        now = arrow.get()
        if start_date is None:
            start_date = now.format(SHORT_DATE_FORMAT)
        if end_date is None:
            end_date = now.shift(days=-15).format(SHORT_DATE_FORMAT)

    logger.debug("Converting dates from string")
    init_date = arrow.get(start_date)
    finish_date = arrow.get(end_date)
    # Lazy %-style arguments: the message is only rendered if INFO is enabled.
    logger.info(
        "🐦 Scrapping with:[%s] From 🗓️:[%s] ➡️ To 🗓️:[%s]",
        query,
        init_date.format('YYYY-MM-DD'),
        finish_date.format('YYYY-MM-DD'),
    )

    # Create day urls
    urls = __generate_search_url_by_range(query, init_date, finish_date)

    stage_results = fetch_all(urls)
    stage_results = aio.flat_map(_get_page_branches, stage_results, workers=MAX_WORKERS)
    stage_results = th.flat_map(_get_branch_walk, stage_results, workers=MAX_WORKERS)

    if hydrate == 0:
        stage_results = th.flat_map(__get_statuses, stage_results, workers=MAX_WORKERS)
    elif hydrate == 1:
        stage_results = th.flat_map(_read_statuses, stage_results, workers=MAX_WORKERS)
        stage_results = th.map(_update_status_stats, stage_results, workers=MAX_WORKERS)
    else:
        raise NotImplementedError(f"Unsupported hydrate level: {hydrate!r}")

    if kafka:
        stage_results = th.map(_send_kafka, stage_results, workers=MAX_WORKERS)

    stage_results = th.map(lambda s: json.dumps(s, indent=4), stage_results, workers=MAX_WORKERS)

    # List conversion executes pipeline
    results = list(stage_results)
    results = list_no_dupes(results)

    logger.info(f"💬 Getted {len(results)}")
    return results
def test_concat_basic(nums):
    """Concatenating two mapped branches matches the pure-Python result."""
    # Reference values computed with plain comprehensions.
    incremented = [x + 1 for x in nums]
    expected = [x**2 for x in incremented] + [-x for x in incremented]

    # Same computation expressed as one shared stage feeding two branches.
    shared_stage = th.map(lambda x: x + 1, nums)
    squared_branch = th.map(lambda x: x**2, shared_stage)
    negated_branch = th.map(lambda x: -x, shared_stage)
    merged = th.concat([squared_branch, negated_branch])

    # Only the multiset of values is compared, not the order.
    assert sorted(merged) == sorted(expected)
def test_map_square_event_end(nums):
    """``on_done`` runs after ``on_start`` and sees a finished stage."""
    shared = th._get_namespace()
    shared.x = 0
    shared.done = False
    shared.active_workers = -1

    def mark_started():
        shared.x = 1

    def record_done(stage_status):
        # Snapshot what the stage reports when a worker finishes.
        shared.x = 2
        shared.active_workers = stage_status.active_workers
        shared.done = stage_status.done

    stage = th.map(
        lambda x: x**2,
        nums,
        workers=3,
        on_start=mark_started,
        on_done=record_done,
    )
    # Drain the stage so the callbacks fire.
    list(stage)

    assert shared.x == 2
    assert shared.done == True
    assert shared.active_workers == 0
def test_map_id(nums):
    """Mapping the identity function returns the input unchanged, in order."""
    result = list(th.map(lambda x: x, nums))

    assert result == nums
def test_map_square_workers(nums):
    """Squaring with two workers yields the same values as builtin ``map``."""
    expected = [x**2 for x in nums]

    actual = list(th.map(lambda x: x**2, nums, workers=2))

    # Compared after sorting, so ordering is not asserted.
    assert sorted(actual) == sorted(expected)
def test_map_square(nums):
    """Squaring with default workers matches builtin ``map``, order included."""
    expected = [x**2 for x in nums]

    actual = list(th.map(lambda x: x**2, nums))

    assert actual == expected
def test_map_id_pipe(nums):
    """The ``|`` pipe syntax with an identity map returns the input list."""
    identity_stage = th.map(lambda x: x)

    piped = nums | identity_stage | list

    assert piped == nums
def test_concat_multiple(nums):
    """A stage can appear several times in (nested) ``concat`` calls."""
    base = list(map(lambda x: x + 1, nums))
    expected_twice = base * 2
    expected_thrice = base * 3

    stage = th.map(lambda x: x + 1, nums)
    twice = th.concat([stage, stage])
    thrice = th.concat([twice, stage])

    # Values are compared as sorted lists; ordering is not asserted.
    assert sorted(expected_twice) == sorted(twice)
    assert sorted(expected_thrice) == sorted(thrice)
def test_from_to_iterable(nums):
    """A stage built by ``from_iterable`` can be consumed as a plain iterable."""
    # Pipeline side: wrap, chunk in groups of 10, then sum each chunk.
    chunked_stage = cz.partition_all(10, th.from_iterable(nums))
    pipeline_sums = list(th.map(sum, chunked_stage))

    # Reference side: same chunking and summing without the pipeline.
    expected_sums = [sum(chunk) for chunk in cz.partition_all(10, nums)]

    assert expected_sums == pipeline_sums
def test_flat_map_square(nums):
    """``flat_map`` after ``map`` matches ``cz.mapcat`` on squares, in order."""

    def _expand(x):
        # Each input produces three consecutive values.
        yield from (x, x + 1, x + 2)

    expected = list(cz.mapcat(_expand, (x**2 for x in nums)))

    squares = th.map(lambda x: x**2, nums)
    actual = list(th.flat_map(_expand, squares))

    assert actual == expected
def test_flat_map_square_workers(nums):
    """``flat_map`` with three workers yields the same values as ``cz.mapcat``."""

    def _expand(x):
        # Each input produces three consecutive values.
        yield from (x, x + 1, x + 2)

    expected = list(cz.mapcat(_expand, (x**2 for x in nums)))

    squares = th.map(lambda x: x**2, nums)
    actual = list(th.flat_map(_expand, squares, workers=3))

    # Compared after sorting, so ordering is not asserted.
    assert sorted(actual) == sorted(expected)
def test_map_square_event_start(nums):
    """``on_start`` is invoked and the mapped output is still correct."""
    expected = [x**2 for x in nums]

    shared = th._get_namespace()
    shared.x = 0

    def mark_started():
        shared.x = 1

    actual = list(th.map(lambda x: x**2, nums, on_start=mark_started))

    assert actual == expected
    assert shared.x == 1
def test_error_handling():
    """An exception raised inside a mapped function reaches the consumer."""

    def _always_fail(x):
        raise MyError()

    failing_stage = th.map(_always_fail, range(10))

    caught = None
    try:
        # Consuming the stage is what triggers the failure.
        list(failing_stage)
    except MyError as exc:
        caught = exc

    assert isinstance(caught, MyError)
def test_flat_map_square_filter_workers_pipe(nums):
    """map -> flat_map -> filter chained with ``|`` matches the reference."""

    def _expand(x):
        # Each input produces three consecutive values.
        yield from (x, x + 1, x + 2)

    squares = (x**2 for x in nums)
    expected = [value for value in cz.mapcat(_expand, squares) if value > 1]

    square_stage = th.map(lambda x: x**2)
    expand_stage = th.flat_map(_expand, workers=3)
    keep_gt_one = th.filter(lambda x: x > 1)
    actual = nums | square_stage | expand_stage | keep_gt_one | list

    # Compared after sorting, so ordering is not asserted.
    assert sorted(actual) == sorted(expected)
def test_worker_info():
    """Values produced via ``worker_info.index`` stay within ``range(n_workers)``."""
    n_workers = 4
    nums = range(100)

    def _start_with_index(worker_info):
        # Whatever on_start returns is what the mapped function receives
        # as its second argument in this test.
        return worker_info.index

    def _emit_index(x, index):
        return index

    seen = set(
        th.map(
            _emit_index,
            nums,
            on_start=_start_with_index,
            workers=n_workers,
        )
    )

    assert seen.issubset(set(range(n_workers)))
###################
# from_to_iterable
###################


@hp.given(nums=st.lists(st.integers()))
@hp.settings(max_examples=MAX_EXAMPLES)
def test_from_to_iterable(nums):
    """A stage built by ``from_iterable`` can be consumed as a plain iterable."""
    # Pipeline side: wrap, chunk in groups of 10, then sum each chunk.
    chunked_stage = cz.partition_all(10, th.from_iterable(nums))
    pipeline_sums = list(th.map(sum, chunked_stage))

    # Reference side: same chunking and summing without the pipeline.
    expected_sums = [sum(chunk) for chunk in cz.partition_all(10, nums)]

    assert expected_sums == pipeline_sums


if __name__ == '__main__':
    # Manual run: build a stage whose function always raises and consume it.
    # NOTE(review): nothing catches MyError here, so this looks like a
    # deliberate way to inspect the propagated traceback -- confirm.
    error = None

    def raise_error(x):
        raise MyError()

    stage = th.map(raise_error, range(10))

    list(stage)