def regularsearch(area, category, sort, cache, cachedir, executor, get, **kwargs):
    """Yield posts from a regular (HTML) craigslist search, paging through all results.

    Fetches the first results page, reads the total number of posts and the
    number of posts actually present on the page, then walks subsequent pages
    by offset until the total is covered.

    :param area: craigslist area (subdomain), e.g. 'washingtondc'
    :param category: craigslist category code, e.g. 'apa'
    :param sort: sort order passed through to get_query_url
    :param cache: unused here; kept for interface parity with the other
        search entry points  # NOTE(review): confirm against callers
    :param cachedir: unused here; see above
    :param executor: unused here; see above
    :param get: callable(url) -> HTML text of the response
    :yields: posts extracted by get_posts_from_response
    """
    doc = lxml.html.fromstring(
        get(get_query_url(area, category, 'search', offset=0, sort=sort,
                          **kwargs)))
    num_total_posts = get_num_total_posts_from_response(doc)
    num_posts_on_page = get_number_of_posts_on_current_page_from_response(doc)
    yield from get_posts_from_response(doc, area)
    # Stride by the page size the site actually returned rather than a
    # hard-coded 100 (the value was previously computed but never used).
    per_page = num_posts_on_page
    if per_page <= 0:
        # Empty first page: nothing more to fetch, and range() would raise
        # ValueError on a zero step.
        return
    for offset in range(per_page, num_total_posts, per_page):
        doc = lxml.html.fromstring(
            get(get_query_url(area, category, 'search', offset=offset,
                              sort=sort, **kwargs)))
        yield from get_posts_from_response(doc, area)
def regularsearch(area, category, sort, cache, cachedir, executor, get, **kwargs):
    """Yield posts from a regular (HTML) craigslist search, paging through all results.

    Fetches the first results page, reads the total number of posts and the
    per-page count, then pages by offset in strides of the observed page size.

    :param area: craigslist area (subdomain), e.g. 'washingtondc'
    :param category: craigslist category code, e.g. 'apa'
    :param sort: sort order passed through to get_query_url
    :param cache: unused here; kept for interface parity with the other
        search entry points  # NOTE(review): confirm against callers
    :param cachedir: unused here; see above
    :param executor: unused here; see above
    :param get: callable(url) -> HTML text of the response
    :yields: posts extracted by get_posts_from_response
    """
    doc = lxml.html.fromstring(
        get(get_query_url(area, category, 'search', offset=0, sort=sort,
                          **kwargs)))
    num_total_posts = get_num_total_posts_from_response(doc)
    num_posts_on_page = get_number_of_posts_on_current_page_from_response(doc)
    # Lazy %-style args: the message is only formatted if DEBUG is enabled.
    logger.debug(
        'downloaded first page: num_total_posts: %s | num_posts_on_page: %s',
        num_total_posts, num_posts_on_page)
    yield from get_posts_from_response(doc, area)
    per_page = num_posts_on_page
    if per_page <= 0:
        # A zero-result search would otherwise hit range() with step 0,
        # which raises ValueError.
        return
    for offset in range(per_page, num_total_posts, per_page):
        doc = lxml.html.fromstring(
            get(get_query_url(area, category, 'search', offset=offset,
                              sort=sort, **kwargs)))
        yield from get_posts_from_response(doc, area)
async def jsonsearch_async(area, category, sort, cache, cachedir, get,
                           as_completed=asyncio.as_completed, **kwargs):
    """Asynchronously yield posts from a craigslist map ("jsonsearch") query.

    Fetches the root jsonsearch URL, yields its posts, then recursively
    expands each map cluster concurrently via `as_completed`, yielding posts
    as cluster downloads finish (order is completion order, not input order).

    :param area: craigslist area (subdomain)
    :param category: craigslist category code
    :param sort: sort order passed through to get_query_url
    :param cache: unused here — presumably kept for interface parity with the
        sync entry points; TODO confirm against callers
    :param cachedir: unused here; see above
    :param get: async callable used by process_cluster_url_async to fetch URLs
    :param as_completed: injectable for testing; defaults to
        asyncio.as_completed
    :yields: post objects as produced by process_cluster_url_async
    """
    async def process_clusters(clusters):
        # One coroutine per cluster; as_completed drives them concurrently.
        futures = [
            process_cluster_url_async(cluster.url, get)
            for cluster in clusters
        ]
        try:
            for future in as_completed(futures):
                # NOTE: `clusters` is deliberately rebound (shadowed) here to
                # this cluster's sub-clusters, which are expanded by the
                # recursive call below before moving to the next future.
                posts, clusters = await future
                for post in posts:
                    yield post
                async for post in process_clusters(clusters):
                    yield post
        except KeyboardInterrupt:  # pragma: no cover
            # Best-effort cancellation of the outstanding work on Ctrl-C.
            for future in futures:
                future.cancel()

    # map=1 asks craigslist for the clustered map view of the results.
    url = get_query_url(area, category, "jsonsearch", sort=sort, map=1,
                        **kwargs)
    posts, clusters = await process_cluster_url_async(url, get)
    for post in posts:
        yield post
    async for post in process_clusters(clusters):
        yield post
def jsonsearch(area, category, sort, cache, cachedir, executor, get,
               as_completed=concurrent.futures.as_completed, **kwargs):
    """Yield posts from a craigslist map ("jsonsearch") query using a thread pool.

    Fetches the root jsonsearch URL, yields its posts, then recursively
    expands each map cluster via `executor`, yielding posts as cluster
    downloads complete (completion order, not input order).

    :param area: craigslist area (subdomain)
    :param category: craigslist category code
    :param sort: sort order passed through to get_query_url
    :param cache: unused here — presumably kept for interface parity with the
        other search entry points; TODO confirm against callers
    :param cachedir: unused here; see above
    :param executor: concurrent.futures executor used to fetch clusters
    :param get: callable used by process_cluster_url to fetch URLs
    :param as_completed: injectable for testing; defaults to
        concurrent.futures.as_completed
    :yields: post objects as produced by process_cluster_url
    """
    def process_clusters(clusters, executor):
        # Materialize the futures so the KeyboardInterrupt branch cancels the
        # SAME futures as_completed consumed; the original generator would be
        # (partially) exhausted and would submit fresh work just to cancel it.
        futures = [executor.submit(process_cluster_url, cluster.url, get)
                   for cluster in clusters]
        try:
            for future in as_completed(futures):
                posts, sub_clusters = future.result()
                yield from posts
                # BUG FIX: the recursive generator was previously created and
                # discarded (no `yield from`), so nested clusters were never
                # expanded and their posts were silently dropped.
                yield from process_clusters(sub_clusters, executor)
        except KeyboardInterrupt:  # pragma: no cover
            # Best-effort cancellation of outstanding downloads on Ctrl-C.
            for future in futures:
                future.cancel()

    # map=1 asks craigslist for the clustered map view of the results.
    url = get_query_url(area, category, "jsonsearch", sort=sort, map=1,
                        **kwargs)
    posts, clusters = process_cluster_url(url, get)
    yield from posts
    yield from process_clusters(clusters, executor)
def test_get_query_url():
    """get_query_url builds a jsonsearch URL for a known area/category and
    raises ValueError for an unknown category."""
    from craigslist._search import get_query_url
    assert "https://washingtondc.craigslist.org/jsonsearch/apa" in get_query_url(
        'washingtondc', 'apa', 'jsonsearch')
    # The exception object was bound (`as e_info`) but never inspected;
    # the bare context manager states the intent more clearly.
    with pytest.raises(ValueError):
        get_query_url('washingtondc', 'errorerrorerror', 'jsonsearch')