# Imports the two methods below rely on. Stamp, Stamped and Scraper
# are project-internal; their import paths are not reproduced here.
from datetime import datetime, timedelta
from os import listdir
from os.path import dirname, join, normpath
from pprint import PrettyPrinter
from random import sample
from re import match, search

from bs4 import BeautifulSoup


def testScraper(self):
    """
    Test the Scraper class by feeding it a sample of HTML files
    randomly picked from the download directory.
    """
    path_ = dirname(__file__)
    downloads = normpath(join(path_, '../downloads/m-134'))

    files = listdir(downloads)
    files = [file for file in files if 'NO_JOBS' not in file]

    # Sample over all valid indices (the old range started at 1 and
    # silently skipped the first file), and cap the sample size so
    # that fewer than 100 files doesn't make sample() raise.
    size = min(100, len(files))
    samples = sample(range(len(files)), size)

    soup_items = list()
    for i in samples:
        filepath = join(downloads, files[i])
        with open(filepath, 'r') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')

        # The filename carries both the job uuid and the date.
        uuid_match = search(r'uuid-(\d{7})', files[i])
        uuid = uuid_match.group(1)

        date_match = match(r'(\d{4}-\d{2}-\d{2})', files[i])
        sdate = date_match.group(1)
        date_ = datetime.strptime(sdate, '%Y-%m-%d').date()

        stamp = Stamp(date_, uuid)
        soup_item = Stamped(stamp, soup)
        soup_items.append(soup_item)

    s = Scraper()
    serial_items = s.scrape(soup_items)
    self.assertIsNotNone(serial_items)
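# Aside: the two regexes above assume download filenames that begin
# with an ISO date and carry a 7-digit uuid field. A minimal sketch of
# that parsing, using a made-up example filename (helper name and
# filename are hypothetical, not from the project):
def _parse_filename_example():
    name = '2014-03-12-uuid-1234567.html'  # hypothetical example
    sdate = match(r'(\d{4}-\d{2}-\d{2})', name).group(1)
    uuid = search(r'uuid-(\d{7})', name).group(1)
    date_ = datetime.strptime(sdate, '%Y-%m-%d').date()
    return date_, uuid  # (datetime.date(2014, 3, 12), '1234567')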
def mine(self, date: datetime):
    """
    If that date hasn't been scraped before, scrape it!
    """
    date_string = date.strftime('%d-%m-%Y')

    # Been there, done that: bail out before we even
    # bother switching on the engine.
    if date in self._miners:
        self._rec('{} has already been mined', date_string)
        return

    # Switch on the engine
    m = Scraper(date=date, session=self._session, server=self._server)

    # Go browse the web summary page for that day
    # and scrape off the job uuid request parameters.
    jobs = m.scrape_uuids()

    # I don't work on weekends
    if not jobs:
        self._rec('No jobs found for {}', date_string)
        return

    for j in jobs:
        # Grab the job's web page, regex it and store
        # the collected fields in a sensible manner.
        # We don't pickle the data yet: instead, we
        # pickle multiple days at once before exit.
        soup = m._get_job(j)
        raw_data = m._scrape_job(soup)
        m.process_job(raw_data)

        # So wanna see results?
        pp = PrettyPrinter()
        pp.pprint(m.raw_data[0])    # Job details
        pp.pprint(m.raw_data[1])    # Price table
        for d in m.raw_data[2]:     # Addresses
            pp.pprint(d)

    # We're never gonna scrape with a 100% success
    # rate, but let's do better next time!
    # TODO Hopefully remove this debug message later
    self._rec('Mined {} successfully!', date_string)
    for message in m._warnings:
        self._rec(message)
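# Usage sketch (assumed): mine() belongs to the miner class that owns
# _session, _server, _miners and _rec(); that class and its constructor
# are not shown above, so this driver function is hypothetical.
def mine_range(miner, start: datetime, days: int):
    # Walk the range one day at a time; mine() itself skips
    # dates that have already been scraped.
    for offset in range(days):
        miner.mine(start + timedelta(days=offset))

# e.g. mine_range(miner, datetime(2014, 3, 1), days=31)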