def VisitGoodsPage(mongo_collection, driver, key, brand):
    """Load a brand's page, scroll through it and pass the HTML to ``dealWith``.

    If the first ``driver.get`` raises ``WebDriverException`` the browser is
    quit, a new session is obtained from ``loginTmall.login_tmall()`` and the
    URL is loaded again with that session.

    Args:
        mongo_collection: Forwarded unchanged to ``dealWith``.
        driver: Browser driver used to load and scroll the page.
        key: Forwarded unchanged to ``dealWith``.
        brand: Mapping that must contain ``'original_url'``; it is also
            forwarded to ``dealWith``.

    Returns:
        The driver that is in use when the function finishes. This is the
        ``driver`` argument unless the recovery path replaced it. Earlier
        versions returned ``None`` and silently dropped the replacement,
        which left the caller holding a driver that had already been quit.
    """
    url = brand['original_url']
    try:
        driver.get(url)
    except WebDriverException as e:
        time.sleep(10)
        Logger.info('Error!' + str(e))
        # The old session is discarded; everything below uses the new one.
        driver.quit()
        driver = loginTmall.login_tmall()
        driver.get(url)
        # NOTE(review): indentation was lost in the source; this pause is
        # assumed to belong to the recovery path only -- confirm.
        time.sleep(random.uniform(2, 4))
    time.sleep(random.uniform(0.5, 1))

    print('准备访问商品页面')
    print('商品详细信息')
    time.sleep(random.uniform(2, 4))

    # Scroll down in three steps with a short random pause after each one.
    for offset in (1000, 5000, 10000):
        driver.execute_script("scrollTo(0,%d)" % offset)
        time.sleep(random.uniform(1, 2))

    bs_obj = BeautifulSoup(driver.page_source, 'lxml')
    dealWith(mongo_collection, bs_obj, key, brand)
    print("done..")
    return driver
def VisitGoodsPage(mongo_collection, driver, key, brand):
    """Scrape every result page of a brand listing and feed it to ``dealWith``.

    The first page is loaded from ``brand['original_url']``; further pages are
    reached by clicking the ``ui-page-s-next`` link until the page number
    reported by ``getCurrentPage`` reaches the value from ``getMaxPage``.

    Nothing is scraped when the browser was redirected to
    ``https://sec.taobao.com`` (the URL is recorded through ``writeToCsv``
    instead), when ``isNoItem`` reports true, or when ``getMaxPage``
    returns ``0``.

    Args:
        mongo_collection: Forwarded to ``dealWith``; its ``name`` attribute is
            passed to ``writeToCsv`` on the redirect path.
        driver: Browser driver used to load, scroll and paginate.
        key: Forwarded unchanged to ``dealWith``.
        brand: Mapping with at least ``'original_url'`` and
            ``'original_name'``; also forwarded to ``dealWith``.

    Returns:
        The driver that is in use when the function finishes. This is the
        ``driver`` argument unless the ``WebDriverException`` recovery path
        replaced it. Earlier versions returned ``None`` and dropped the
        replacement, leaving the caller with a driver that had been quit.
    """
    # Give up on pagination after this many consecutive clicks that did not
    # move to the expected page; the original loop would spin forever.
    max_stalled_clicks = 3
    scroll_offsets = (1000, 5000, 10000)

    url = brand['original_url']
    try:
        driver.get(url)
    except WebDriverException as e:
        time.sleep(10)
        Logger.info('Error!' + str(e))
        # The old session is discarded; everything below uses the new one.
        driver.quit()
        driver = loginTmall.login_tmall()
        driver.get(url)
        # NOTE(review): indentation was lost in the source; this pause is
        # assumed to belong to the recovery path only -- confirm.
        time.sleep(random.uniform(2, 4))
    time.sleep(random.uniform(0.5, 1))

    # Check whether we were redirected to the verification (captcha) host.
    current_url = driver.current_url
    if 'https://sec.taobao.com' in current_url:
        collection_name = mongo_collection.name
        writeToCsv(url, brand, collection_name)
        time.sleep(random.uniform(2, 4))
        return driver

    # Check whether the listing has no items at all.
    if isNoItem(driver):
        return driver

    max_page = getMaxPage(driver)
    if max_page == 0:
        return driver
    Logger.info('最大页数:' + str(max_page))

    print('准备访问商品页面')
    print('商品详细信息')
    time.sleep(random.uniform(2, 4))
    for offset in scroll_offsets:
        driver.execute_script("scrollTo(0,%d)" % offset)
        time.sleep(random.uniform(1, 2))
    bs_obj = BeautifulSoup(driver.page_source, 'lxml')
    dealWith(mongo_collection, bs_obj, key, brand)

    # Same value as the original triple-quoted banner, spelled with explicit
    # escapes so the line break inside it cannot be lost by reformatting.
    finished_banner = (
        ' ######################################################################### | 最大页数爬取完毕 | \n'
        '######################################################################### '
    )

    last_page = int(max_page)
    stalled_clicks = 0
    N = 2  # page number we expect to be on after the next click
    while N <= last_page:
        time.sleep(2)
        element = WebDriverWait(driver, 60).until(
            lambda d: d.find_element_by_xpath("//a[@class='ui-page-s-next']"))
        element.click()
        time.sleep(2)
        for offset in scroll_offsets:
            driver.execute_script("scrollTo(0,%d)" % offset)
            time.sleep(1)
        Logger.info(driver.current_url)
        bs_obj = BeautifulSoup(driver.page_source, 'lxml')
        dealWith(mongo_collection, bs_obj, key, brand)

        current_page = getCurrentPage(driver)
        Logger.info('完成当前页爬取:' + str(current_page))
        reached_page = int(current_page)
        if reached_page == last_page:
            Logger.info(brand['original_name'])
            Logger.info(finished_banner)

        if reached_page < N:
            # The click did not bring us to the expected page.
            stalled_clicks += 1
            if stalled_clicks >= max_stalled_clicks:
                Logger.info('Pagination stalled on page ' + str(reached_page)
                            + '; giving up on ' + url)
                break
        else:
            stalled_clicks = 0
        N = reached_page + 1

    print("done..")
    return driver
# Visit every category configured in ``urls_collections_config`` and store the
# results in the ``db_ysld`` database. ``driver``, ``mongo_conn`` and ``output``
# are created earlier in the file; they are released in the ``finally`` clauses
# below so a failure while crawling does not leave the browser running or the
# connection open.
try:
    dataBase = mongo_conn['db_ysld']  # Database
    key_list = list(urls_collections_config.keys())
    for key in key_list:
        # Each config entry is indexed as: 0 = url, 1 = collection name,
        # 2 = category id, 3 = display name.
        entry = urls_collections_config[key]
        collection_name = entry[1]
        collection = dataBase[collection_name]  # collection
        name = entry[3]
        url = entry[0]
        category_id = entry[2]
        category = key
        brand = {
            'category_id': category_id,
            'id': '',
            'original_url': url,
            'name': name,
            'category': category,
            'store_id': 1,
        }
        Logger.info(name)
        Logger.info(url)
        # VisitGoodsPage can quit the driver and log in again internally.
        # When it hands a driver back, keep using that one; a None result
        # leaves the current driver in place.
        driver = VisitGoodsPage(collection, driver, key, brand) or driver
        time.sleep(random.uniform(8, 15))
finally:
    # Nested so that a failure in one cleanup step does not skip the others.
    try:
        driver.quit()
    finally:
        try:
            mongo_conn.close()
        finally:
            output.close()
cursors.close() return res if __name__ == '__main__': driver = loginTmall.login_tmall() time.sleep(1) mongo_conn = connect_mongo(mongodb_host, mongodb_port, mongodb_username, mongodb_password) dataBase = mongo_conn['power'] # Database url_list = [] with open('fail_url.txt', 'r', encoding='utf-8') as fc: for i in fc: temp = i.split('/*/') url_list.append(temp) for url_item in url_list: collection_name = url_item[0].strip() collection = dataBase[collection_name] brand_str = url_item[1].replace('ObjectId', '') brand = eval(brand_str) Logger.info(brand['original_name']) Logger.info(brand['original_url']) VisitGoodsPage(collection, driver, brand['category'], brand) time.sleep(random.uniform(5, 20)) driver.quit() mongo_conn.close() output.close()