Beispiel #1
0
def main():
    """Crawl the Owler industrial-goods company listing with Selenium/Firefox.

    Loads the listing at a fixed start page (``?p=1319``), runs ``Extractor``
    on the page source and hands the result to ``writeData`` under the name
    'companies'. It then repeatedly clicks the "next" control (element id
    ``next-15``) and does the same for every following page.

    The browser is always shut down on exit, whether the crawl finishes,
    is aborted, or fails with an exception.

    Raises:
        SystemExit: when a page's title is 'Pardon Our Interruption'; the
            crawl stops and that page's data is not written.
    """
    blocked_title = 'Pardon Our Interruption'
    next_button = (By.XPATH, '//*[@id="next-15"]')

    base = 'https://www.owler.com'
    path = '/industry/industrial-goods-services-companies?p=1319'
    url = base + path

    driver = webdriver.Firefox()
    try:
        time.sleep(7)
        # One waiter is enough; it is reused for every "next" click below.
        wait = WebDriverWait(driver, 20)
        driver.get(url)
        time.sleep(10)

        resultsInfo = Extractor(driver.page_source)
        sdf = resultsInfo.getData()
        writeData(sdf, 'companies')
        n = resultsInfo.nResults()
        print(n, 'this is main N')

        for i in range(5, 0, -1):
            time.sleep(1)
            print('%s seconds - Crawl will begin' % (i))

        # NOTE(review): presumably 15 results per page; int() truncates, so a
        # trailing partial page is not visited - confirm whether it should be.
        last_page = int(n / 15)
        for v in range(1320, last_page + 1):
            # Random pause between page loads.
            randomPause = random.randint(8, 13)
            time.sleep(randomPause)

            # wait.until() returns the located element, so click it directly
            # instead of looking it up a second time.
            wait.until(EC.visibility_of_element_located(next_button)).click()

            info = Extractor(driver.page_source)
            df = info.getData()
            # Check for the interruption page *before* writing, so its
            # contents never end up in the output.
            if info.title() == blocked_title:
                print('wait: %s, p: %s of %s' %
                      (randomPause, v, str(last_page + 1)))
                print(datetime.datetime.now())
                raise SystemExit('They\'re onto us! Ghost out!')
            writeData(df, 'companies')
            print('Page %s of %s' % (v, last_page))
    finally:
        # Runs on normal completion, SystemExit and unexpected errors alike,
        # so the Firefox process is never left behind.
        driver.quit()
Beispiel #2
0
def main():
    """Crawl the Owler industrial-machinery sector listing with dryscrape.

    Visits the first listing page, runs ``Extractor`` on the page body and
    hands the result to ``writeData`` under the name 'runcompanies'. It then
    visits ``?p=2`` through the last page and does the same for each,
    counting down 20 seconds between page loads.
    """
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()

    listing = '/sector/industrial-machinery-equipment-companies'
    sess = dryscrape.Session(base_url='https://www.owler.com')
    sess.set_attribute('auto_load_images', False)
    sess.visit(listing)
    print(sess.status_code(), sess.headers())
    sHtml = sess.body()

    resultsInfo = Extractor(sHtml)
    sdf = resultsInfo.getData()
    print(type(sdf))
    writeData(sdf, 'runcompanies')  # Mocked
    n = resultsInfo.nResults()

    for i in range(5, 0, -1):
        time.sleep(1)
        print('%s seconds - Next page will begin' % (i))

    # Ceiling division: a partial final page still counts as a page, and the
    # range below is inclusive of it. The previous bound, int(n / 15) used as
    # an exclusive stop, skipped the last page(s).
    # NOTE(review): assumes 15 results per page - confirm against the site.
    last_page = int(-(-n // 15))
    for v in range(2, last_page + 1):
        nextone = '%s?p=%s' % (listing, v)
        print(nextone)
        sess.visit(nextone)
        print(sess.status_code(), sess.headers())
        html = sess.body()
        info = Extractor(html)
        df = info.getData()
        writeData(df, 'runcompanies')  # Mocked
        if v == last_page:
            # Nothing left to fetch, so there is no point in waiting.
            break
        for i in range(20, 0, -1):
            time.sleep(1)
            print('%s seconds - Next page will begin' % (i))