Example #1
    def testScraper(self):
        """
        Test the Scraper class by feeding it a sample of html
        files randomly picked from the download directory.
        """

        # Resolve the sample download directory relative to this test file.
        path_ = dirname(__file__)
        downloads = normpath(join(path_, '../downloads/m-134'))

        # Ignore placeholder files saved for days without jobs.
        files = listdir(downloads)
        files = [file for file in files if 'NO_JOBS' not in file]

        # Draw a random sample of file indices; start at 0 so the first
        # file in the listing can also be picked.
        min_ = 0
        max_ = len(files)
        size = 100

        samples = sample(range(min_, max_), size)
        soup_items = []

        for i in samples:
            filepath = join(downloads, files[i])

            with open(filepath, 'r') as f:
                # The context manager closes the file; no explicit close() is needed.
                html = f.read()
                soup = BeautifulSoup(html, 'html.parser')

                # Recover the job uuid and the date from the filename.
                uuid_match = search(r'uuid-(\d{7})', files[i])
                uuid = uuid_match.group(1)

                date_match = match(r'(\d{4}-\d{2}-\d{2})', files[i])
                sdate = date_match.group(1)
                date_ = datetime.strptime(sdate, '%Y-%m-%d').date()

                stamp = Stamp(date_, uuid)
                soup_item = Stamped(stamp, soup)
                soup_items.append(soup_item)

        s = Scraper()
        serial_items = s.scrape(soup_items)

        self.assertIsNotNone(serial_items)
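
For reference, here is a minimal sketch of the module-level imports this test appears to rely on. The standard-library and bs4 imports follow directly from the calls in the method; the module that provides Scraper, Stamp and Stamped is not shown in the example, so that import is left as a labelled placeholder.

# Sketch of the imports assumed by the test above. The project module that
# exposes Scraper, Stamp and Stamped is not shown in the example, so its
# import path is a placeholder.
from datetime import datetime
from os import listdir
from os.path import dirname, join, normpath
from random import sample
from re import match, search

from bs4 import BeautifulSoup

# from <project module> import Scraper, Stamp, Stamped   # hypothetical import path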
Example #2
    def mine(self, date: datetime):
        """ If that date hasn't been scraped before, scrape it! """
        date_string = date.strftime('%d-%m-%Y')
        # Switch on the engine
        m = Scraper(date=date, session=self._session, server=self._server)

        # Been there, done that
        if date in self._miners:
            self._rec('{} has already been mined', date_string)
            m.close()

        else:
            # Go browse the web summary page for that day
            # and scrape off the job uuid request parameters.
            jobs = m.scrape_uuids()

            # I don't work on weekends
            if not jobs:
                self._rec('No jobs found for {}', date_string)

            else:
                for j in jobs:
                    # Grab the job's web page, regex it and store
                    # the collected fields in a sensible manner.
                    # We don't pickle the data yet: instead, we
                    # pickle multiple days at once before exit.
                    soup = m._get_job(j)
                    raw_data = m._scrape_job(soup)
                    m.process_job(raw_data)

                    # So wanna see results?
                    pp = PrettyPrinter()
                    pp.pprint(m.raw_data[0])                    # Job details
                    pp.pprint(m.raw_data[1])                    # Price table
                    for d in m.raw_data[2]:                     # Addresses
                        pp.pprint(d)

                # We're never gonna scrape with a 100% success
                # rate, but let's do better next time!
                # TODO Hopefully remove this debug message later
                self._rec('Mined {} successfully', date_string)
                for message in m._warnings:
                    self._rec(message)
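
The mine() method takes a single datetime, so a small driver loop can run it over several consecutive days. The mine_period helper below and the way the miner object is constructed are assumptions for illustration; only the mine(date) call comes from the code above.

# Hypothetical driver for a Miner-like object exposing mine(date).
# Only the mine(date) call is taken from the example above; the helper
# name and the miner construction are assumptions.
from datetime import datetime, timedelta


def mine_period(miner, start: datetime, days: int):
    """Call miner.mine() once per day, beginning at 'start'."""
    for offset in range(days):
        miner.mine(start + timedelta(days=offset))


# Usage (the miner object itself would be built elsewhere):
# mine_period(miner, datetime(2017, 3, 1), days=5)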