コード例 #1
0
def return_html_code(url, use_proxy):
    """Load *url* in PhantomJS, scroll until no more tweets load, return the HTML.

    The page is considered to contain tweets when at least one element
    matching ``li[data-item-id]`` becomes visible.  The function then keeps
    scrolling to the last such element until the number of matching elements
    stops growing within the wait timeout.

    Args:
        url: Address of the page to open.
        use_proxy: When truthy, route PhantomJS through a proxy picked at
            random from ``get_proxy_fastest()`` (defined elsewhere;
            presumably returns a sequence of ``host:port`` strings --
            TODO confirm).

    Returns:
        The full page source as returned by ``driver.page_source``, or
        ``False`` when no tweet element became visible within 10 seconds.
    """
    # Local import so the block does not depend on a module-level `import sys`.
    import sys

    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

    dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"

    if use_proxy:
        proxy_address = random.choice(get_proxy_fastest())
        proxy_type = 'https'
        print('%s %s' % (proxy_address, proxy_type))
        service_args = [
            '--proxy=' + proxy_address,
            '--proxy-type=' + proxy_type,
        ]
        driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args)
    else:
        driver = webdriver.PhantomJS(desired_capabilities=dcap)

    # Everything past this point may raise (navigation errors, script errors,
    # stale elements ...); the finally clause guarantees the PhantomJS
    # process is shut down on every exit path.
    try:
        driver.maximize_window()
        driver.get(url)
        print('Loading initial page')

        # Initial wait for the tweets to load.
        wait = WebDriverWait(driver, 10)
        try:
            wait.until(EC.visibility_of_element_located(tweet_locator))
        except TimeoutException:
            print('No tweets here')
            return False

        # Scroll down to the last tweet until no more tweets are loaded.
        print('Scrolling tweets')
        while True:
            tweets = driver.find_elements(*tweet_locator)
            if not tweets:
                # Nothing left to scroll to; avoid IndexError on tweets[-1].
                break
            number_of_tweets = len(tweets)
            # Progress counter, kept on a single line separated by spaces.
            sys.stdout.write('%d ' % number_of_tweets)

            # Move to the top and then to the bottom 5 times in a row.
            for _ in range(5):
                driver.execute_script("window.scrollTo(0, 0)")
                driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
                time.sleep(0.5)

            try:
                wait.until(wait_for_more_than_n_elements_to_be_present(tweet_locator, number_of_tweets))
            except TimeoutException:
                # The count did not grow within the timeout: assume the end.
                break

        html_full_source = driver.page_source
    finally:
        driver.quit()

    print("_" * 15)
    return html_full_source
コード例 #2
0
def return_html_code(url, proxy_use):
    """Load *url* in Firefox under Xvfb, scroll until no more tweets load, return the HTML.

    A virtual display is started for the duration of the call.  If
    ``get_proxy_fastest()`` (defined elsewhere) yields at least one address,
    one is picked at random and configured as Firefox's manual HTTP proxy;
    otherwise Firefox is started without a proxy.

    Args:
        url: Address of the page to open.
        proxy_use: Accepted for interface compatibility; this function does
            not consult it -- the proxy is used whenever one is available.

    Returns:
        The full page source as returned by ``driver.page_source``, or
        ``False`` when no ``li[data-item-id]`` element became visible within
        30 seconds.
    """
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

    vdisplay = Xvfb()
    vdisplay.start()
    # The outer finally stops the virtual display on every exit path,
    # including the early "no tweets" return and any unexpected exception.
    try:
        proxy_address_list = get_proxy_fastest()
        # Truthiness covers False, None and an empty list (random.choice
        # would raise IndexError on the latter).
        if proxy_address_list:
            proxy_address = random.choice(proxy_address_list)
            # assumes addresses look like "host:port" -- TODO confirm
            ip, port = proxy_address.split(":")
            print('%s %s' % (ip, port))
            profile = webdriver.FirefoxProfile()
            # 1 selects manual proxy configuration; the preference name is
            # "network.proxy.type" (dots throughout).
            profile.set_preference("network.proxy.type", 1)
            profile.set_preference("network.proxy.http", ip)
            # Firefox stores the port as an integer preference.
            profile.set_preference("network.proxy.http_port", int(port))
            # NOTE(review): only the HTTP proxy is configured; HTTPS traffic
            # would additionally need network.proxy.ssl / ssl_port -- confirm
            # whether that is wanted.
            driver = webdriver.Firefox(firefox_profile=profile)
        else:
            print("Using localhost, unable to get proxy")
            driver = webdriver.Firefox()

        # quit() (rather than close()) ends the whole browser session, and
        # the finally clause runs it on every exit path.
        try:
            driver.maximize_window()
            driver.get(url)

            # Initial wait for the tweets to load.
            wait = WebDriverWait(driver, 30)
            try:
                wait.until(EC.visibility_of_element_located(tweet_locator))
            except TimeoutException:
                return False

            # Scroll down to the last tweet until no more tweets are loaded.
            while True:
                tweets = driver.find_elements(*tweet_locator)
                number_of_tweets = len(tweets)
                print(number_of_tweets)
                if not tweets:
                    # Nothing left to scroll to; avoid IndexError on tweets[-1].
                    break
                driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
                try:
                    wait.until(
                        wait_for_more_than_n_elements_to_be_present(tweet_locator, number_of_tweets)
                    )
                except TimeoutException:
                    # The count did not grow within the timeout: assume the end.
                    break

            return driver.page_source
        finally:
            driver.quit()
    finally:
        vdisplay.stop()