def testScrape(self):
    # Scrape a known-good site and verify both result sets are non-empty
    url = "https://www.bbc.com"
    s = Scrape()
    s.setUrl(url)

    keywords, links = s.scrape()

    self.assertTrue(keywords, msg='No keywords found')
    self.assertTrue(links, msg='No links found')
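The Scrape class itself is not shown in these examples; the sketch below is a hypothetical stand-in that matches the interface the test exercises (only the setUrl and scrape names come from the code above, the body is an assumption):

import re
import urllib.request

class Scrape:
    """Hypothetical stand-in: fetch a page and return (keywords, links)."""

    def setUrl(self, url):
        self.url = url

    def scrape(self):
        # Fetch the page, then pull out absolute links and bare words
        html = urllib.request.urlopen(self.url).read().decode('utf-8', 'replace')
        links = re.findall(r'href="(https?://[^"]+)"', html)
        keywords = re.findall(r'[A-Za-z]{4,}', html)
        return keywords, links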
Example #2
    # Run until stopTime; a negative stopTime means loop indefinitely
    while stopTime < 0 or time.time() < stopTime:

        # Wait to receive a source from the master
        source = comm.recv(source=0)
        links = list()
        keywords = list()

        if source == '':
            # The master had no work to hand out; wait briefly, then ask again
            time.sleep(1)
        else:
            source = source.strip()
            # 'scheme://host' is everything up to the third '/',
            # e.g. 'https://example.com/a/b' -> 'https://example.com'
            parts = source.split('/')
            baseurl = '/'.join(parts[0:3])
            # Assumes 'from urllib import robotparser' among the imports
            rp = robotparser.RobotFileParser()
            rp.set_url(baseurl + '/robots.txt')

            # Note: read() raises on network errors if robots.txt is unreachable
            rp.read()

            if rp.can_fetch('*', source):
                # s is assumed to be a Scrape instance created before this loop
                s.setUrl(source)
                keywords, links = s.scrape()

                # Persist keywords to the database
                # (disabled: this write is a significant bottleneck)
                # s.submitWords(keywords)

        # Report the keywords and any newly found links back to the master
        comm.send((keywords, links), dest=0)
        time.sleep(1)
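
The master side of this exchange does not appear in the example; the following is a minimal sketch of a matching master loop under mpi4py, assuming rank 0 coordinates the workers. Only the send/recv pairing mirrors the worker above; the seed URL, the dedup set, and the queue handling are assumptions:

from collections import deque
from mpi4py import MPI

comm = MPI.COMM_WORLD
nworkers = comm.Get_size() - 1
queue = deque(['https://www.bbc.com'])   # hypothetical seed URL
seen = set(queue)

# Prime every worker with an initial source ('' when the queue is dry)
for rank in range(1, nworkers + 1):
    comm.send(queue.popleft() if queue else '', dest=rank)

status = MPI.Status()
while True:
    # Collect (keywords, links) from whichever worker reports first
    keywords, links = comm.recv(source=MPI.ANY_SOURCE, status=status)
    for link in links:
        if link not in seen:
            seen.add(link)
            queue.append(link)
    # Hand that worker its next source
    comm.send(queue.popleft() if queue else '', dest=status.Get_source())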