def test_getterTypes():
    """Check the type of HTML returned by each supported getter type.

    'urlopen' and 'requests' are expected to return bytes, while
    'chromedriver' is expected to return str.
    """
    testurl = 'https://news.ycombinator.com/news'
    expected_html_types = (
        ('urlopen', bytes),
        ('chromedriver', str),
        ('requests', bytes),
    )

    # Keep every Getter referenced until the test returns, so object
    # lifetimes match having one local variable per getter.
    getters = []
    for getter_type, html_type in expected_html_types:
        getter = Getter(getter_type)
        getters.append(getter)
        html = getter.get_html(testurl)
        assert isinstance(html, html_type)
class WebStash:
    """Fetch web pages through a Getter and store the results in a Cacher.

    Pages are looked up in the cache by URL; on a miss the page HTML and a
    screenshot are fetched, wrapped in a WebData object and cached.
    """

    def __init__(self, getterType='urlopen', waitTimeBeforeScraping=0):
        """Create the cache, the config and the getter used for fetching.

        Args:
            getterType: Name of the getter implementation, passed straight
                to Getter (defaults to 'urlopen').
            waitTimeBeforeScraping: Forwarded to Getter as the keyword of
                the same name (defaults to 0).
        """
        self.cacher = Cacher()
        self.config = Config()
        self.getter = Getter(
            getterType, waitTimeBeforeScraping=waitTimeBeforeScraping)

    def get_web_data(self, url):
        """Return the cached data for ``url``, fetching it on a cache miss.

        Args:
            url: Address of the page; also used as the cache key.

        Returns:
            Whatever the cache holds for ``url``. On a miss this is the
            freshly built WebData entry, read back from the cache.
        """
        try:
            return self.cacher[url]
        except KeyError:
            self.config.debugPrint('Getting webData...')
            filename = self.cacher.getFilename(url)
            html = self.getter.get_html(url)
            # The screenshot is stored next to the cache file as a PNG.
            screenshotLocation = self.getter.get_screenshot(
                url, filename + '.png')
            webData = WebData(
                filename, url, html, screenshotLocation=screenshotLocation)
            self.cacher[url] = webData
            # Read back through the cache so hit and miss paths return the
            # result of the same lookup.
            return self.cacher[url]

    def delete(self, url):
        """Remove the cache entry for ``url``.

        Errors raised by the cache when deleting the key (for example for a
        URL that is not cached) propagate to the caller.
        """
        del self.cacher[url]

    def clean(self):
        """Delegate to the cache's ``clean()`` method."""
        self.cacher.clean()
def test_getter_wait_before_scraping():
    """Check the scraping delay and the unsupported-getter-type error.

    First fetches the same page several times with a configured wait and
    asserts that the total elapsed time is at least the sum of the waits.
    Then verifies that constructing a Getter with an unknown type raises
    GetterImplementationError with the expected message.
    """
    import datetime

    waitTimeBeforeScraping = 1
    fetchCount = 3
    testSleep = Getter(
        'urlopen', waitTimeBeforeScraping=waitTimeBeforeScraping)

    startTime = datetime.datetime.now()
    for _ in range(fetchCount):
        testSleep.get_html('https://news.ycombinator.com/news')
    elapsed = datetime.datetime.now() - startTime
    # total_seconds() keeps the fractional part. timedelta.seconds truncates
    # to whole seconds, so a strict "> 3" on it needed 4 s to pass and
    # depended on how long the network requests took.
    assert elapsed.total_seconds() >= fetchCount * waitTimeBeforeScraping

    try:
        Getter('this is not a getter type')
    except GetterImplementationError as e:
        assert str(
            e) == 'this is not a getter type is not a supported getter type'
    else:
        # Without this branch the test would pass silently when no error
        # is raised at all.
        raise AssertionError(
            'Getter did not raise GetterImplementationError for an '
            'unsupported getter type')