# Selenium (Firefox) version of the crawler.
# Assumed imports for this version (may already be present at the top of the file):
import datetime
import random
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Extractor and writeData are project helpers defined elsewhere in this codebase.


def main():
    base = 'https://www.owler.com'
    # path = '/sector/industrial-machinery-equipment-companies'
    # path = '/industry/industrial-goods-services-companies'
    path = '/industry/industrial-goods-services-companies?p=1319'
    url = base + path

    driver = webdriver.Firefox()
    time.sleep(7)
    wait = WebDriverWait(driver, 20)
    driver.get(url)
    time.sleep(10)

    # with open('sample.txt', 'r') as f:  # Mocked
    #     sHtml = f.read()                # Mocked

    # Parse the first results page and write it out.
    resultsInfo = Extractor(driver.page_source)
    sdf = resultsInfo.getData()
    # writeData(sdf, 'biggertest')  # Mocked
    writeData(sdf, 'companies')
    n = resultsInfo.nResults()
    print(n, 'this is main N')

    # Countdown before the crawl starts.
    for i in range(5, 0, -1):
        time.sleep(1)
        print('%s seconds - Crawl will begin' % (i))

    # Pages show 15 results each; resume the crawl from page 1320.
    # for v in range(2, (int(n / 15) + 1)):
    for v in range(1320, (int(n / 15) + 1)):
        # Random pause between pages to look less like a bot.
        randomPause = random.randint(8, 13)
        for i in range(randomPause, 0, -1):
            time.sleep(1)
            # print('%s seconds - Next page will begin' % (i))

        # Wait for the "next" control, then click through to the next page.
        wait = WebDriverWait(driver, 20)
        wait.until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="next-15"]')))
        # Selenium 3-style locator; on Selenium 4+ use
        # driver.find_element(By.XPATH, '//*[@id="next-15"]').click()
        driver.find_element_by_xpath('//*[@id="next-15"]').click()

        html = driver.page_source
        info = Extractor(html)
        df = info.getData()
        # writeData(df, 'biggertest')  # Mocked
        writeData(df, 'companies')
        print('Page %s of %s' % (v, int(n / 15)))

        # Owler's anti-bot interstitial: bail out cleanly if we hit it.
        if info.title() == 'Pardon Our Interruption':
            print('wait: %s, p: %s of %s' % (randomPause, v, str(int(n / 15) + 1)))
            print(datetime.datetime.now())
            driver.quit()
            raise SystemExit("They're onto us! Ghost out!")

    driver.quit()
# Alternative: dryscrape-based version of main() (earlier approach; only the
# most recently defined main() is the one that actually runs).
# Additional imports assumed for this version:
import sys

import dryscrape


def main():
    if 'linux' in sys.platform:
        # Start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()

    # sPage = requests.get(startUrl)
    # sHtml = sPage.text
    # sPage.raise_for_status()

    # Headless WebKit session; skip image loading to speed things up.
    sess = dryscrape.Session(base_url='https://www.owler.com')
    sess.set_attribute('auto_load_images', False)
    sess.visit('/sector/industrial-machinery-equipment-companies')
    print(sess.status_code(), sess.headers())
    sHtml = sess.body()

    # with open('sample.txt', 'r') as f:  # Mocked
    #     sHtml = f.read()                # Mocked

    # Parse the first results page and write it out.
    resultsInfo = Extractor(sHtml)
    sdf = resultsInfo.getData()
    print(type(sdf))
    # writeData(sdf, 'companies')
    writeData(sdf, 'runcompanies')  # Mocked
    n = resultsInfo.nResults()

    # Countdown before the crawl starts.
    for i in range(5, 0, -1):
        time.sleep(1)
        print('%s seconds - Next page will begin' % (i))

    # Pages show 15 results each; walk the remaining pages by URL.
    for v in range(2, int(n / 15)):
        nextone = '/sector/industrial-machinery-equipment-companies?p=%s' % (v)
        print(nextone)
        # page = requests.get(nextpage)
        # page.raise_for_status()
        # html = page.text
        sess.visit(nextone)
        print(sess.status_code(), sess.headers())
        html = sess.body()
        info = Extractor(html)
        # info = Extractor(sHtml)  # Mocked
        df = info.getData()
        # writeData(df, 'companies')
        writeData(df, 'runcompanies')  # Mocked

        # Fixed 20-second pause between pages.
        for i in range(20, 0, -1):
            time.sleep(1)
            print('%s seconds - Next page will begin' % (i))
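
# A minimal entry-point guard (an assumption, not part of the original script).
# Because both versions above define a function named main(), only the most
# recently defined one (the dryscrape version) is invoked here.
if __name__ == '__main__':
    main()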