Example #1
0
def VisitGoodsPage(mongo_collection, driver, key, brand):
    """Load one brand page, scroll through it and pass its HTML to ``dealWith``.

    Args:
        mongo_collection: Forwarded unchanged to ``dealWith``.
        driver: Browser driver used to load and scroll the page.
        key: Forwarded unchanged to ``dealWith``.
        brand: Mapping that must contain ``'original_url'``; also forwarded
            to ``dealWith``.

    Returns:
        None.
    """
    target_url = brand['original_url']

    try:
        driver.get(target_url)
    except WebDriverException as exc:
        # Back off, log the failure, then retry once with a fresh session.
        time.sleep(10)
        Logger.info('Error!' + str(exc))
        driver.quit()
        # NOTE(review): this rebinding is local only; the caller keeps its
        # reference to the driver that was just quit.
        driver = loginTmall.login_tmall()
        driver.get(target_url)
        time.sleep(random.uniform(2, 4))

    time.sleep(random.uniform(0.5, 1))

    for progress_message in ('准备访问商品页面', '商品详细信息'):
        print(progress_message)

    time.sleep(random.uniform(2, 4))
    # Step down the page with a randomised pause after every jump
    # (presumably so lazily loaded content is present before parsing).
    for scroll_script in ("scrollTo(0,1000)",
                          "scrollTo(0,5000)",
                          "scrollTo(0,10000)"):
        driver.execute_script(scroll_script)
        time.sleep(random.uniform(1, 2))

    page_soup = BeautifulSoup(driver.page_source, 'lxml')
    dealWith(mongo_collection, page_soup, key, brand)

    print("done..")
def VisitGoodsPage(mongo_collection, driver, key, brand):
    """Crawl every listing page of one brand and pass each page to ``dealWith``.

    The function loads ``brand['original_url']`` and returns early when:

    * the browser ends up on a ``https://sec.taobao.com`` URL (the URL is
      recorded through ``writeToCsv`` first),
    * ``isNoItem(driver)`` is truthy, or
    * ``getMaxPage(driver)`` returns ``0``.

    Otherwise the first page is scrolled and parsed, then the "next" link is
    clicked repeatedly until the page reported by ``getCurrentPage`` passes
    the maximum page, or until pagination stops advancing.

    Args:
        mongo_collection: Forwarded to ``dealWith``; its ``name`` attribute is
            passed to ``writeToCsv`` when the verification page is hit.
        driver: Browser driver used for navigation and scrolling.
        key: Forwarded unchanged to ``dealWith``.
        brand: Mapping that must contain ``'original_url'``. Its
            ``'original_name'`` (or, failing that, ``'name'``) is logged when
            the last page has been crawled.

    Returns:
        None.
    """
    # Scroll positions visited on every page before its source is parsed.
    scroll_scripts = ("scrollTo(0,1000)",
                      "scrollTo(0,5000)",
                      "scrollTo(0,10000)")
    # Consecutive pagination attempts that may fail to advance before the
    # loop gives up instead of spinning forever on the same page.
    max_stalled_attempts = 3

    url = brand['original_url']

    try:
        driver.get(url)
    except WebDriverException as e:
        # Back off, log the failure, then retry once with a fresh session.
        time.sleep(10)
        Logger.info('Error!' + str(e))
        driver.quit()
        # NOTE(review): this rebinding is local only; the caller keeps its
        # reference to the driver that was just quit.
        driver = loginTmall.login_tmall()
        driver.get(url)
        time.sleep(random.uniform(2, 4))

    time.sleep(random.uniform(0.5, 1))

    # Check whether we were redirected to the verification (captcha) page.
    if 'https://sec.taobao.com' in driver.current_url:
        writeToCsv(url, brand, mongo_collection.name)
        time.sleep(random.uniform(2, 4))
        return

    # Check whether the listing has no items.
    if isNoItem(driver):
        return

    max_page = getMaxPage(driver)
    if max_page == 0:
        return
    Logger.info('最大页数:' + str(max_page))
    print('准备访问商品页面')
    print('商品详细信息')

    # First page: randomised pauses between scroll steps.
    time.sleep(random.uniform(2, 4))
    for script in scroll_scripts:
        driver.execute_script(script)
        time.sleep(random.uniform(1, 2))

    bs_obj = BeautifulSoup(driver.page_source, 'lxml')
    dealWith(mongo_collection, bs_obj, key, brand)

    last_page = int(max_page)
    next_page = 2
    stalled_attempts = 0
    while next_page <= last_page:
        time.sleep(2)

        # ``find_element('xpath', ...)`` is available on both old and new
        # Selenium releases, unlike the removed ``find_element_by_xpath``.
        next_link = WebDriverWait(driver, 60).until(
            lambda d: d.find_element('xpath', "//a[@class='ui-page-s-next']"))
        next_link.click()
        time.sleep(2)
        for script in scroll_scripts:
            driver.execute_script(script)
            time.sleep(1)
        Logger.info(driver.current_url)

        bs_obj = BeautifulSoup(driver.page_source, 'lxml')
        dealWith(mongo_collection, bs_obj, key, brand)

        current_page = getCurrentPage(driver)
        Logger.info('完成当前页爬取:' + str(current_page))
        page_number = int(current_page)
        if page_number == last_page:
            # Callers in this file build ``brand`` with either key, so fall
            # back to 'name' instead of raising KeyError after a full crawl.
            Logger.info(brand.get('original_name', brand.get('name', '')))
            Logger.info('''
                                    #########################################################################
                                    |                            最大页数爬取完毕                               |
                                    #########################################################################
                                ''')

        # Re-synchronise with the page the site says we are on.
        following_page = page_number + 1
        if following_page <= next_page:
            # The click did not move us forward; tolerate a few retries only.
            stalled_attempts += 1
            if stalled_attempts >= max_stalled_attempts:
                Logger.info('Pagination stalled on page ' + str(page_number) +
                            '; giving up on ' + str(url))
                break
        else:
            stalled_attempts = 0
        next_page = following_page

    print("done..")
Example #3
0
    dataBase = mongo_conn['db_ysld']  # Database

    key_list = []
    for cat in urls_collections_config.keys():
        key_list.append(cat)

    for key in key_list:
        collection_name = urls_collections_config[key][1]
        collection = dataBase[collection_name]  # collection

        name = urls_collections_config[key][3]
        url = urls_collections_config[key][0]
        category_id = urls_collections_config[key][2]
        category = key
        brand = {}
        brand['category_id'] = category_id

        brand['id'] = ''
        brand['original_url'] = url
        brand['name'] = name
        brand['category'] = category
        brand['store_id'] = 1


        Logger.info(name)
        Logger.info(url)
        VisitGoodsPage(collection, driver, key, brand)
        time.sleep(random.uniform(8, 15))
    driver.quit()
    mongo_conn.close()
    output.close()
    cursors.close()
    return res


if __name__ == '__main__':
    # Re-crawl the brand pages listed in fail_url.txt. Each usable line has
    # the form "<collection name>/*/<brand dict repr>".

    driver = loginTmall.login_tmall()
    mongo_conn = None
    try:
        time.sleep(1)
        mongo_conn = connect_mongo(mongodb_host, mongodb_port,
                                   mongodb_username, mongodb_password)
        dataBase = mongo_conn['power']  # Database

        url_list = []
        with open('fail_url.txt', 'r', encoding='utf-8') as fc:
            for line in fc:
                if not line.strip():
                    continue  # ignore blank lines
                # Split once only, so a '/*/' inside the brand repr is kept.
                fields = line.split('/*/', 1)
                if len(fields) != 2:
                    Logger.info('Skipping malformed line: ' + line.strip())
                    continue
                url_list.append(fields)

        # The file is closed before the (slow) crawl starts.
        for url_item in url_list:
            collection_name = url_item[0].strip()
            collection = dataBase[collection_name]
            # Dropping the 'ObjectId' token leaves a plain parenthesised
            # string where ObjectId('...') appeared in the repr.
            brand_str = url_item[1].replace('ObjectId', '')
            # SECURITY NOTE(review): eval() executes whatever fail_url.txt
            # contains. Only run this against a trusted file; consider
            # ast.literal_eval if every stored value is a plain literal.
            brand = eval(brand_str)

            Logger.info(brand['original_name'])
            Logger.info(brand['original_url'])
            VisitGoodsPage(collection, driver, brand['category'], brand)
            time.sleep(random.uniform(5, 20))
    finally:
        # Release the browser and connections even when a crawl raises.
        driver.quit()
        if mongo_conn is not None:
            mongo_conn.close()
        # NOTE(review): ``output`` is not defined in this block; presumably a
        # module-level handle opened elsewhere in the file.
        output.close()