def testScrape(self):
    url = "https://www.bbc.com"
    s = Scrape()
    s.setUrl(url)
    keywords, links = s.scrape()
    self.assertTrue(keywords, msg='No Keywords found')
    self.assertTrue(links, msg='No links found')
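# The Scrape class under test is not defined in this section. Below is a
# minimal sketch, not the project's actual implementation, assuming only the
# interface the test exercises: setUrl(url) and scrape() returning a
# (keywords, links) pair. Here it pulls keywords from a <meta name="keywords">
# tag and links from absolute anchor hrefs.
from html.parser import HTMLParser
from urllib.request import urlopen

class Scrape(HTMLParser):
    def setUrl(self, url):
        self.url = url

    def scrape(self):
        # Fetch the page and collect keywords and outgoing links.
        self.keywords, self.links = [], []
        html = urlopen(self.url).read().decode('utf-8', errors='replace')
        self.feed(html)
        return self.keywords, self.links

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'meta' and attrs.get('name') == 'keywords':
            content = attrs.get('content', '')
            if content:
                self.keywords.extend(k.strip() for k in content.split(','))
        elif tag == 'a' and attrs.get('href', '').startswith('http'):
            self.links.append(attrs['href'])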
# Worker loop. Assumes comm is an mpi4py communicator, s is a Scrape
# instance, and robotparser is imported (urllib.robotparser in Python 3;
# the original referenced a misspelled "roboparser" module).
while stopTime < 0 or time.time() < stopTime:
    # Wait to receive a source URL from the master
    source = comm.recv(source=0)
    links = list()
    keywords = list()
    if source == '':
        # We got a blank link; wait for a while, then ask again
        time.sleep(1)
    else:
        source = source.strip()
        # Derive the site root ("scheme://host") to locate robots.txt
        parts = source.split('/')
        baseurl = '/'.join(parts[0:3])
        rp = robotparser.RobotFileParser()
        rp.set_url(baseurl + '/robots.txt')
        rp.read()
        if rp.can_fetch('*', source):
            s.setUrl(source)
            keywords, links = s.scrape()
            # Persist keywords to the database.
            # Commented out because this is such a bottleneck:
            # s.submitWords(keywords)
    # Send the keywords and any new links back to the master queue
    comm.send((keywords, links), dest=0)
    time.sleep(1)
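# For context, a minimal sketch of the master (rank 0) side of the send/recv
# protocol the worker loop above implies, assuming mpi4py. The frontier and
# seen names and the seed URL are hypothetical, and real termination logic
# is omitted; this is not the project's actual master implementation.
from collections import deque
from mpi4py import MPI

comm = MPI.COMM_WORLD
frontier = deque(['https://www.bbc.com'])  # hypothetical seed URL
seen = set(frontier)

while True:  # a real master would also enforce a stop condition
    # Hand every worker a URL, or '' when the frontier is empty ...
    for worker in range(1, comm.Get_size()):
        comm.send(frontier.popleft() if frontier else '', dest=worker)
    # ... then collect each worker's (keywords, links) reply and
    # enqueue any links we have not seen yet.
    for worker in range(1, comm.Get_size()):
        keywords, links = comm.recv(source=worker)
        for link in links:
            if link not in seen:
                seen.add(link)
                frontier.append(link)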