Beispiel #1
0
def get_user_info(user, fail_time=0):
    print u'找到用户',user ,u'的评论, 正在查询', user, u'的星级'
    base_url = config.STAR_INFO_URL
    url = base_url #+ quote(user.encode('utf-8', 'ignore'))
    allow_star = range(1, config.MAX_STAR + 1)
    try:
        time.sleep(1)
        driver = config.DRIVER
        driver.get(url)
        element = driver.find_element_by_id('txt_name')
        element.send_keys(user, Keys.ENTER)
        WebDriverWait(driver, config.TIMEOUT).until(
            EC.visibility_of_element_located((By.ID, "buyer_ratecount"))
        )

        html = driver.page_source
        time.sleep(3)
        pattern = re.compile('<span id="buyer_ratecount.*?src="(.*?)gif', re.S)
        result = re.search(pattern, html)
        print result.group(0)
        src = result.group(0)
        #driver.get('http://www.baidu.com')
        for allow in allow_star:
            if 'b_red_'+ str(allow) in src:
                print u'该用户', user, u'星级符合要求'
                return True
        print u'该用户', user, u'星级不符合要求'
        return False
    except TimeoutException:
        print u'查询失败, 正在重试'
        print u'请打开 http://www.taoyitu.com/ 输入验证码,即可迅速解决问题'
        fail_time = fail_time + 1
        if fail_time == 3:
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此用户'
            return False
        return get_user_info(user, fail_time)
    except (socket.error, urllib2.URLError):
        print u'查询失败, 跳过该用户'
        return False
    except NoSuchElementException:
        print u'查询星级失败, 正在重试'
        print u'请打开 http://www.taoyitu.com/ 输入验证码,即可迅速解决问题'
        if fail_time >=2 :
            print u'请求超时, 正在切换代理, 继续重试'
            update_proxy_pool()
            new_proxy_driver()
        else:
            print u'请求超时,正在切换会话重试'
            new_driver()
        time.sleep(3)
        fail_time = fail_time + 1
        if fail_time == 5:
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此用户'
            return False
        return get_user_info(user, fail_time)
Beispiel #2
0
def scrap(url, fail_time=0, use_driver=True):
    timeout = config.TIMEOUT

    print u'正在请求', url, u', 请稍后...'

    try:
        if use_driver:
            driver = config.DRIVER
            driver.get(url)
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.ID, "content"))
                # EC.presence_of_element_located((By.ID, "J_TabRecommends"))
            )
            result = get_recommends(driver, config.MAX_TRY)
            if result:
                html = driver.page_source
        else:
            html = requests.get(url).text
        if html:
            print u'查找成功'
            print "html:"
            print html
            return html
        else:
            print u'请求超时, 获取失败, 此页面不存在相应内容'
    except TimeoutException:
        if fail_time >= 2:
            print u'请求超时, 正在切换代理, 继续重试'
            update_proxy_pool()
            new_proxy_driver()
        else:
            print u'请求超时,正在切换会话重试'
            new_driver()
        fail_time = fail_time + 1
        if config.CONSOLE_OUTPUT:
            print u'当前页面请求失败数', fail_time
        if fail_time == config.MAX_FAIL:
            update_proxy_pool()
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此请求'
            return False
        scrap(url, fail_time)
    except (socket.error, urllib2.URLError):
        print u'请求页面过于频繁, 请求被中断, 正在切换会话重试'
        new_driver()
        fail_time = fail_time + 1
        if config.CONSOLE_OUTPUT:
            print u'当前页面请求失败数', fail_time
        if fail_time == config.MAX_FAIL:
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此请求'
            return False
        scrap(url, fail_time)
    except (WindowsError, OSError, Exception):
        print u'未知错误, 跳过继续运行'
Beispiel #3
0
def scrap(url, fail_time=0):
    timeout = config.TIMEOUT

    print u'正在请求', url, u', 请稍后...'

    try:
        driver = config.DRIVER
        driver.get(url)
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, "J_TabRecommends"))
        )
        result = get_recommends(driver, config.MAX_TRY)
        if result:
            print u'查找成功'
            html = driver.page_source
            parse_content(html)
        else:
            print u'请求超时, 获取失败, 此页面不存在相应内容'
    except TimeoutException:
        if fail_time >=2 :
            print u'请求超时, 正在切换代理, 继续重试'
            update_proxy_pool()
            new_proxy_driver()
        else:
            print u'请求超时,正在切换会话重试'
            new_driver()
        fail_time = fail_time + 1
        if config.CONSOLE_OUTPUT:
            print u'当前页面请求失败数', fail_time
        if fail_time == config.MAX_FAIL:
            update_proxy_pool()
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此请求'
            return False
        scrap(url, fail_time)
    except (socket.error, urllib2.URLError):
        print u'请求页面过于频繁, 请求被中断, 正在切换会话重试'
        new_driver()
        fail_time = fail_time + 1
        if config.CONSOLE_OUTPUT:
            print u'当前页面请求失败数', fail_time
        if fail_time == config.MAX_FAIL:
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此请求'
            return False
        scrap(url, fail_time)
    except (WindowsError, OSError, Exception):
        print u'未知错误, 跳过继续运行'
Beispiel #4
0
def get_product(url, fail_time=0):
    try:
        print u'当前宝贝在第', config.NOW_COUNT + 1, u'个网页, 共', config.TOTAL_COUNT, u'个网页'
        time.sleep(1)
        driver = config.DRIVER
        driver.get(url)
        WebDriverWait(driver, config.TIMEOUT).until(
            EC.presence_of_element_located((By.TAG_NAME, "title"))
        )
        html = driver.page_source
        doc = pq(html)
        title = doc('title').text()
        return title
    except TimeoutException:
        if fail_time >= 2:
            print u'请求超时, 正在切换代理, 继续重试'
            update_proxy_pool()
            new_proxy_driver()
        else:
            print u'请求超时,正在切换会话重试'
            new_driver()
        fail_time = fail_time + 1
        if config.CONSOLE_OUTPUT:
            print u'当前失败次数', fail_time
        if fail_time == config.MAX_FAIL:
            update_proxy_pool()
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此请求'
            return False
        get_product(url, fail_time)
    except (socket.error, urllib2.URLError):
        print u'请求宝贝过于频繁, 请求被中断, 正在切换会话重试'
        new_driver()
        fail_time = fail_time + 1
        if config.CONSOLE_OUTPUT:
            print u'当前失败次数', fail_time
        if fail_time == config.MAX_FAIL:
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此请求'
            return False
        get_product(url, fail_time)
    except (WindowsError, OSError, Exception):
        print u'未知错误, 跳过继续运行'
Beispiel #5
0
def get_product(url, fail_time=0):
    try:
        print u'当前宝贝在第', config.NOW_COUNT + 1, u'个网页, 共', config.TOTAL_COUNT, u'个网页'
        time.sleep(1)
        driver = config.DRIVER
        driver.get(url)
        WebDriverWait(driver, config.TIMEOUT).until(
            EC.presence_of_element_located((By.TAG_NAME, "title")))
        html = driver.page_source
        doc = pq(html)
        title = doc('title').text()
        return title
    except TimeoutException:
        if fail_time >= 2:
            print u'请求超时, 正在切换代理, 继续重试'
            update_proxy_pool()
            new_proxy_driver()
        else:
            print u'请求超时,正在切换会话重试'
            new_driver()
        fail_time = fail_time + 1
        if config.CONSOLE_OUTPUT:
            print u'当前失败次数', fail_time
        if fail_time == config.MAX_FAIL:
            update_proxy_pool()
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此请求'
            return False
        get_product(url, fail_time)
    except (socket.error, urllib2.URLError):
        print u'请求宝贝过于频繁, 请求被中断, 正在切换会话重试'
        new_driver()
        fail_time = fail_time + 1
        if config.CONSOLE_OUTPUT:
            print u'当前失败次数', fail_time
        if fail_time == config.MAX_FAIL:
            if config.CONSOLE_OUTPUT:
                print u'失败次数过多, 跳过此请求'
            return False
        get_product(url, fail_time)
    except (WindowsError, OSError, Exception):
        print u'未知错误, 跳过继续运行'
Beispiel #6
0
# -*- coding: utf-8 -*-
from proxy.getproxy import update_proxy_pool
from proxy.updateproxy import update_proxy

# Script entry point: when this file is executed directly (not imported),
# call update_proxy_pool() once.
# NOTE(review): presumably this refreshes the stored proxy list - confirm
# against proxy.getproxy.
if __name__ == "__main__":
    update_proxy_pool()