def test_get_new_identifiers(self):
    saved_bid = factories.BidFactory()
    identifiers = [saved_bid.identifier, "new-identifier"]
    new_identifiers = get_new_identifiers(
        self.session, identifiers, Bid.Site.COMMBUYS)
    assert len(new_identifiers) == 1
    assert "new-identifier" in new_identifiers

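# For context, a minimal sketch of what the get_new_identifiers helper under
# test might look like. This is an assumption inferred from the test and the
# scrape() methods below, not the project's actual implementation; it assumes
# Bid has `identifier` and `site` columns and that `session` is a SQLAlchemy
# session.
def get_new_identifiers(session, identifiers, site):
    """Returns the subset of `identifiers` not yet stored for `site`."""
    existing = {
        row.identifier
        for row in session.query(Bid.identifier).filter(
            Bid.site == site, Bid.identifier.in_(identifiers))
    }
    return [i for i in identifiers if i not in existing]
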
def scrape(self, session):
    """Iterates through all of Commbuys and extracts bids.

    This is implemented as follows, starting on the first results page:
    1. Download the results page.
    2. Extract the bid identifiers from this page.
    3. Check which of those identifiers are not yet in our database.
    4. For each of the identifiers not yet in our database:
       4.1. Download the detail page for each identifier.
       4.2. Extract the fields we are interested in.
       4.3. Create a Bid object and store it in the database.
    5. Go to the next page. Repeat from step #1.
    """
    current_page = 1
    # Emergency hack: CommBuys is returning a very large number of bids, so
    # scrape only the first 30 pages. The bids are not strictly date ordered,
    # but they are mostly date ordered, so the most recent 30 pages should
    # give us the recent / active bids.
    while current_page <= 30:
        page = self.scraper.post(self.results_url, data={
            'mode': 'navigation',
            'currentPage': current_page
        })
        bid_ids = self.scrape_results_page(page.content)
        log.info("Results page {} found bid ids: {}".format(
            current_page, bid_ids))
        if not bid_ids:
            log.info("Page {} has no results. Done scraping.".format(
                current_page))
            break
        new_ids = get_new_identifiers(session, bid_ids, self.get_site())
        # Scrape the newly found bid ids in parallel. Any underlying
        # exceptions are logged and then re-raised, aborting the entire
        # scraping process.
        arg_tuples = [(self.scrape_bid_page, bid_id) for bid_id in new_ids]
        try:
            bids = execute_parallel(arg_tuples)
        except Exception as err:
            log.error(
                "Caught exception during bid detail scraping: {}".format(err))
            raise
        # Save all the new bids from this results page in one db call.
        session.bulk_save_objects(bids)
        session.commit()
        current_page += 1

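# A minimal sketch of an execute_parallel helper consistent with how it is
# called above: it takes (callable, argument) tuples, runs them on a thread
# pool, and collects the results. Calling future.result() re-raises any
# exception raised inside a worker, which is how errors propagate to the
# caller. The pool size and exact signature here are assumptions.
from concurrent.futures import ThreadPoolExecutor


def execute_parallel(arg_tuples, max_workers=10):
    """Runs each (func, arg) tuple in a worker thread and returns the results."""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(func, arg) for func, arg in arg_tuples]
        return [future.result() for future in futures]
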
def scrape(self):
    """Iterates through a single results page and extracts bids.

    This is implemented as follows:
    1. Download the results page.
    2. Extract the bid identifiers from this page.
    3. Check which of those identifiers are not yet in our database.
    4. For each of the identifiers not yet in our database:
       4.1. Download the detail page for each identifier.
       4.2. Extract the fields we are interested in.
       4.3. Create a Bid object and store it in the database.
    """
    session = Session()
    page = self.scraper.get(self.results_url)
    bid_ids = self.scrape_results_page(page.content)
    log.info("Found bid ids: {}".format(bid_ids))
    new_ids = get_new_identifiers(session, bid_ids, self.get_site())
    arg_tuples = [(self.scrape_bid_page, bid_id) for bid_id in new_ids]
    bids = execute_parallel(arg_tuples)
    session.bulk_save_objects(bids)
    session.commit()

def scrape(self):
    """Iterates through all of Commbuys and extracts bids.

    This is implemented as follows, starting on the first results page:
    1. Download the results page.
    2. Extract the bid identifiers from this page.
    3. Check which of those identifiers are not yet in our database.
    4. For each of the identifiers not yet in our database:
       4.1. Download the detail page for each identifier.
       4.2. Extract the fields we are interested in.
       4.3. Create a Bid object and store it in the database.
    5. Go to the next page. Repeat from step #1.
    """
    current_page = 1
    session = Session()
    while True:
        page = self.scraper.post(self.results_url, data={
            'mode': 'navigation',
            'currentPage': current_page
        })
        bid_ids = self.scrape_results_page(page.content)
        log.info("Results page {} found bid ids: {}".format(
            current_page, bid_ids))
        if not bid_ids:
            log.info("Page {} has no results. Done scraping.".format(
                current_page))
            break
        new_ids = get_new_identifiers(session, bid_ids, self.get_site())
        # Scrape the newly found bid ids in parallel. Any underlying
        # exceptions are allowed to propagate to the caller, and will abort
        # the entire scraping process.
        arg_tuples = [(self.scrape_bid_page, bid_id) for bid_id in new_ids]
        bids = execute_parallel(arg_tuples)
        # Save all the new bids from this results page in one db call.
        session.bulk_save_objects(bids)
        session.commit()
        current_page += 1

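# A hypothetical sketch of the scrape_results_page helper used above. It
# assumes each bid on the results page is linked to its detail page through a
# URL carrying a "bidId" query parameter; the real selectors and parameter
# name depend on the actual CommBuys markup, so treat this only as an
# illustration of the parsing step.
from urllib.parse import parse_qs, urlparse

from bs4 import BeautifulSoup


def scrape_results_page(self, content):
    """Returns the bid identifiers found on one results page."""
    soup = BeautifulSoup(content, "html.parser")
    bid_ids = []
    for link in soup.find_all("a", href=True):
        query = parse_qs(urlparse(link["href"]).query)
        if "bidId" in query:
            bid_ids.append(query["bidId"][0])
    return bid_ids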