def test_simple():
    """Check coarse timing of rate-limited calls.

    With a fresh ``ratelimit(3, 0.1)`` wrapper, 1 or 3 calls are expected to
    take under 10 ms while 5 calls are expected to take more than 200 ms.
    With a single shared ``ratelimit(1, 0.2)`` wrapper, the first call is
    expected to be fast and the following two calls to take more than 200 ms.
    """

    def fresh_limited():
        # A new limiter per measurement so earlier calls do not count.
        return ratelimit(3, 0.1)(foo)

    one_call = timeit(fresh_limited(), 1)
    assert one_call < 0.01
    three_calls = timeit(fresh_limited(), 3)
    assert three_calls < 0.01
    five_calls = timeit(fresh_limited(), 5)
    assert five_calls > 0.2

    # This wrapper is deliberately reused across both measurements.
    shared_limited = ratelimit(1, 0.2)(foo)
    first = timeit(shared_limited, 1)
    assert first < 0.01
    second = timeit(shared_limited, 2)
    assert second > 0.2
def run_scraper(state: TruecarState, args: Namespace) -> "Generator[TruecarState, None, None]":
    """Scrape listings in ascending mileage windows, yielding checkpoints.

    The scrape walks the mileage axis from ``start_mileage`` up to a cap of
    500,000. Each window is fetched through ``get_listings_shard_sqlite`` and
    handed to a single-worker executor that runs ``insert_listings``. The
    window width is rescaled after every fetch to aim for roughly 1000
    listings per window.

    Args:
        state: Progress record. It is mutated in place
            (``scrape_started_unix``, ``start_mileage``,
            ``scrape_finished_unix``) and yielded after each update.
        args: Parsed options; only ``args.ratelimit`` is read and passed as
            the second argument of ``ratelimit``.

    Yields:
        ``state`` after every window, and once more after the finish time is
        recorded. If the cursor is already at or past the cap and the window
        is empty, a single ``state.new()`` is yielded instead and the
        generator ends.
    """
    LOG.info(f"Starting scrape with state {state}")

    # A finish time older than the start time means the previous run did not
    # complete, so continue from the recorded position.
    resuming = state.scrape_finished_unix < state.scrape_started_unix
    start_mileage = state.start_mileage if resuming else 1

    mileage_delta = 10
    target_total = 1000
    mileage_cap = 500_000

    conn = sql.Connection(CAR_DB)
    register(conn.close)
    state.scrape_started_unix = int(time.time())
    limiter = ratelimit(3, args.ratelimit)
    session = Session()
    insert_executor = ThreadPoolExecutor(max_workers=1)

    # Leaving the block shuts the executor down and waits for queued inserts,
    # so the finish timestamp below is only written once they have run.
    with session, insert_executor:
        while True:
            max_mileage = min(start_mileage + mileage_delta, mileage_cap)
            listings = list(
                get_listings_shard_sqlite(
                    session,
                    limiter,
                    min_mileage=start_mileage,
                    max_mileage=max_mileage,
                ))

            # Use the local cursor, not state.start_mileage: on a fresh run
            # the state still holds the previous run's final position.
            if not listings and start_mileage >= mileage_cap:
                yield state.new()
                return

            # The list now belongs to the worker thread; it must not be
            # mutated here (the old ``listings.clear()`` could empty it
            # before the insert ran).
            insert_executor.submit(insert_listings, listings)
            LOG.info(
                f"Inserted {len(listings)} listings: "
                f"mileage {start_mileage}->{max_mileage}")

            start_mileage += mileage_delta
            # mileage_delta < 5 is bugged on the server side
            # An empty window is treated like a one-listing window so the
            # rescale cannot divide by zero.
            mileage_delta = max(
                5, int(mileage_delta * target_total / max(len(listings), 1)))

            state.start_mileage = start_mileage
            yield state

            if max_mileage >= mileage_cap:
                break

    state.scrape_finished_unix = int(time.time())
    yield state
def test_basic():
    """Check argument validation and a single smoke call.

    ``ratelimit(1, 0)`` and ``ratelimit(0, 1)`` must each raise
    ``ValueError``; a wrapper built with ``ratelimit(1, 1)`` must be callable
    once without raising.
    """
    rejected_args = ((1, 0), (0, 1))
    for bad_args in rejected_args:
        with pytest.raises(ValueError):
            ratelimit(*bad_args)

    # test we don't crash
    limited = ratelimit(1, 1)(foo)
    limited()
def test_microfuzz():
    """Run the limiter with small random second arguments.

    For every combination of first argument 1..3 and call count 1..3, five
    fresh wrappers are built with a random second argument in
    ``[1e-6, 1e-6 + 1e-3)`` and timed. Nothing is asserted; the test only
    checks that no call raises.
    """
    pools = range(1, 4)
    call_counts = range(1, 4)
    repeats = 5

    for pool in pools:
        for n_calls in call_counts:
            for _ in range(repeats):
                interval = 1e-6 + random() * 1e-3
                limited = ratelimit(pool, interval)(foo)
                timeit(limited, n_calls)
def scrape(st: AutotraderState, args: Namespace = None) -> Generator[AutotraderState, None, None]:
    """Scrape listings one price sector at a time, yielding checkpoints.

    A sector is the price window
    ``[st.cur_min_price, st.cur_min_price + delta]``. Each sector is first
    requested without an offset to learn its total result count, then split
    into shards of 25 records. If a sector holds 1000 or more results the
    window is halved; once it cannot shrink further the sector is sharded by
    the body types in ``AT_BODIES`` instead. Every fetched page is passed
    through ``handle_listing`` and queued on a single-worker executor that
    runs ``insert_listings``.

    Args:
        st: Progress record; mutated in place and yielded as a checkpoint.
            Replaced by ``st.new()`` when a restart is forced or the stored
            minimum price is already above 1,000,000.
        args: Optional parsed options. Only ``force_restart`` is read;
            ``None`` or a namespace without that attribute means "do not
            force a restart".

    Yields:
        The current state: once up front, at the top of every outer
        iteration, and after each empty sector is skipped. The generator
        ends once ``st.cur_min_price`` exceeds 1,000,000.
    """
    max_price = 1_000_000
    page_size = 25
    offset_limit = 1000

    # ``args or Namespace`` used to bind the class itself, which has no
    # ``force_restart`` attribute; getattr tolerates None and a missing flag.
    force_restart = getattr(args, "force_restart", False)
    if force_restart or st.cur_min_price > max_price:
        st = st.new()
    yield st

    sess = Session()
    sess.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) "
                      "Gecko/20100101 Firefox/98.0"
    })
    insert_pool = ThreadPoolExecutor(max_workers=1)
    limiter = ratelimit(3, 1)
    http_get = limiter(partial(sess.get, timeout=30))
    delta = 256
    inserted = processed = 0

    # Leaving the block closes the session and waits for queued inserts,
    # including when the consumer closes the generator early.
    with sess, insert_pool:
        while True:
            yield st

            # determine a suitable price range that will keep us under offset
            # 1000, which is the backend allowed limit
            while True:
                # NOTE(review): "allLisingType" looks like a typo for
                # "allListingType"; left unchanged because the backend's
                # expected parameter name is not visible here - confirm.
                params = dict(
                    allLisingType="USED",
                    sellerTypes="d",
                    searchRadius=0,
                    numRecords=page_size,
                    minPrice=st.cur_min_price,
                    maxPrice=st.cur_min_price + delta,
                )
                if st.cur_shards:
                    # TODO formalize/generalize parameter sector/sharding logic
                    body, ofs = st.cur_shards[-1]
                    params |= dict(firstRecord=ofs)
                    if body:
                        params |= dict(vehicleStyleCodes=body)

                # without shards this tries to get the whole sector, which
                # lets us know how to shard
                nd = http_get(BASE_URL, params=params).json()
                tot_results = nd["totalResultCount"]
                LOG.debug(f"Got page: firstRecord={params.get('firstRecord', 0)}, "
                          f"{tot_results=}; n_shards={len(st.cur_shards)}")

                if st.cur_shards:
                    # ``nd`` is the page for the last pending shard.
                    break
                elif 0 < tot_results < offset_limit:
                    st.cur_shards = [
                        (None, ofs)
                        for ofs in range(0, tot_results, page_size)
                    ]
                elif tot_results == 0:
                    st.next_sector(delta)
                    yield st
                    # Without this check a run of empty sectors at the top of
                    # the price range would be probed indefinitely.
                    if st.cur_min_price > max_price:
                        return
                elif delta > 0:
                    delta //= 2
                else:
                    LOG.info("Still too many results, sharding by body type.")
                    shards = {}
                    for bt in AT_BODIES:
                        params["vehicleStyleCodes"] = bt
                        shards[bt] = http_get(
                            BASE_URL, params=params).json()["totalResultCount"]
                    st.cur_shards = [
                        (key, ofs)
                        for key, val in shards.items()
                        for ofs in range(0, val, page_size)
                    ]

            raw_listings = prepare_listing_dict(nd)
            listings = [handle_listing(sess, ld) for ld in raw_listings]
            inserted += sum(it is not None for it in listings)
            processed += len(listings)
            insert_pool.submit(insert_listings, listings)

            st.cur_shards.pop()
            if not st.cur_shards:
                LOG.info(
                    f"Inserted {inserted} listings: "
                    f"price {st.cur_min_price}->{st.cur_min_price + delta}; "
                    f"{inserted/processed if processed > 0 else 1:.3f} accept rate."
                )
                st.next_sector(delta)
                inserted = processed = 0
                # aim for 750 results; floor at 1 because a delta of 0 can
                # never grow again (0 * anything == 0)
                delta = max(1, int(delta * 750 / (100 + tot_results)))

            if st.cur_min_price > max_price:
                return