def form():
    """Render the Amazon track form (GET) or submit a new track task (POST)."""
    if request.method == 'GET':
        # Group choices are the section names of the shared ini file.
        config = configparser.RawConfigParser()
        config.read("group-answer.ini", encoding="utf-8")
        return render_template("amz/track-form.html",
                               groups=config.sections(),
                               user=session.get('user'),
                               active="amzTrackForm")
    if request.method == 'POST':
        payload = json.loads(request.get_data().decode("utf-8"))
        pool = MysqlPool()
        insert_sql = ("insert into tb_amz_track_pro(user_id,keyword,asin,status,"
                      "page_size,add_time) values(%s,%s,%s,%s,%s,now())")
        values = [
            session.get('user')['id'],
            payload.get("keyword"),
            payload.get("asin"),
            "1",  # new tasks always start in status 1
            payload.get('page_size'),
        ]
        try:
            pool.insert(insert_sql, values)
            result = {"code": "0000", "message": "已成功提交追踪任务"}
        except IntegrityError as e:
            result = {"code": "1000", "message": "%s" % e}
        return jsonify(result)
def updateComment():
    """Overwrite a comment's content by id; always answers code 0000."""
    payload = json.loads(request.get_data().decode("utf-8"))
    MysqlPool().insert(
        "update tb_comment set content=%s where id=%s",
        [payload.get('content'), payload.get('id')],
    )
    return jsonify({"code": "0000"})
def addComment():
    """Insert a new comment row stamped with the current time."""
    payload = json.loads(request.get_data().decode("utf-8"))
    MysqlPool().insert(
        "insert into tb_comment(content,add_time) values(%s,%s)",
        [payload.get('content'), datetime.now()],
    )
    return jsonify({"code": "0000"})
def reviewForm():
    """Render the review-task form (GET) or create a review task (POST).

    POST expects a JSON body. On success one tb_review_task row is written,
    plus one tb_task_asin row per '|'-separated ASIN in the "asin" field.
    Returns JSON: code 0000 on success, 9999 with the error text on failure.
    """
    mp = MysqlPool()
    if request.method == 'GET':
        user = session.get('user')
        user_list = None
        # Only level-1 (admin) users get the full active-user picker.
        if user['level'] == 1:
            user_list = mp.fetch_all("select * from tb_user where status=1", None)
        return render_template("review/review-form.html", user=user,
                               user_list=user_list, active="reviewForm")
    if request.method == 'POST':
        json_data = json.loads(request.get_data().decode("utf-8"))
        sql = "insert into tb_review_task(user_id,asin,brand,country,img,keyword,kw_page,store," \
              "price,days_order,total_order,is_vp,note,add_time,name) values(%s,%s,%s,'us',%s,%s," \
              "%s,%s,%s,%s,%s,1,%s,now(),%s)"
        # Fix: dict.get never raises, so the old try/except fallback here was
        # dead code.  Fall back to the session user whenever no user_id was
        # posted (None/empty), which matches the old `if not user_id` check.
        user_id = json_data.get("user_id") or session.get('user')['id']
        param = [
            user_id,
            json_data.get("asin"),
            json_data.get("brand"),
            json_data.get("img"),
            json_data.get("keyword"),
            json_data.get("kw_page"),
            json_data.get("store"),
            json_data.get("price"),
            json_data.get("days_order"),
            json_data.get("total_order"),
            json_data.get("note"),
            json_data.get("name"),
        ]
        try:
            task_id = mp.insert(sql, param)
            asin_sql = "insert into tb_task_asin(asin,task_id,status,is_put) values(%s,%s,%s,%s)"
            # One tracking row per ASIN; status=1 (pending), is_put=0.
            for asin in str(json_data.get("asin")).split("|"):
                mp.insert(asin_sql, [asin, task_id, 1, 0])
            res_json = {"code": "0000", "message": "已成功提交刷单任务"}
        except Exception as e:
            res_json = {"code": "9999", "message": "提交失败%s" % e}
        return jsonify(res_json)
def form():
    """Render the FB post form (GET) or create one post task per group (POST).

    POST expects a JSON body whose "group_id" is a list; one tb_post row is
    inserted per group.  Returns JSON: code 0000 with a count on success,
    1000 with the error text on an IntegrityError.
    """
    if request.method == 'GET':
        config = configparser.RawConfigParser()
        config.read("group-answer.ini", encoding="utf-8")
        return render_template("fb/form.html", groups=config.sections(),
                               user=session.get('user'), active="fbForm")
    if request.method == 'POST':
        json_data = json.loads(request.get_data().decode("utf-8"))
        mp = MysqlPool()
        group_ids = json_data.get('group_id')
        # Fix: the SQL text is loop-invariant — hoisted out of the loop; the
        # old enumerate() index was never used.
        sql = ("insert into tb_post(group_id,keyword,nums,share_num,done_num,"
               "done_share,content,user_id,status,add_time,accounts) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,'')")
        try:
            for group_id in group_ids:
                # done_num/done_share start at "0"; status starts as 'working'.
                param = [group_id, json_data.get("keyword"), json_data.get("nums"),
                         json_data.get("share_num"), "0", "0",
                         json_data.get('content'), session.get('user')['id'],
                         'working', datetime.now()]
                mp.insert(sql, param)
            res_json = {"code": "0000", "message": "已成功提交%s个任务" % (len(group_ids))}
        except IntegrityError as e:
            res_json = {"code": "1000", "message": "%s" % e}
        return jsonify(res_json)
def getKeyword():
    """Scrape the amz123 US top-keywords ranking pages into amz123_keyword.

    Walks the paginated list by clicking the '▶' next-page control until it
    disappears, inserting one row per keyword (keyword, current rank, last
    rank).  NOTE(review): both bare excepts retry/skip silently; if the page
    never loads, the outer `while`+`except: continue` can spin forever.
    """
    mp = MysqlPool()
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    # Skip image downloads to speed up page loads.
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
    driver = webdriver.Chrome(options=options)
    driver.get(
        "https://www.amz123.com/usatopkeywords-1-1-.htm?rank=0&uprank=0")
    # '▶' is the next-page glyph; loop while the last pager item still shows it.
    txt = '▶'
    while txt == '▶':
        try:
            data_lists = driver.find_elements_by_xpath(
                '//div[@class="listdata"]')
            for data in data_lists:
                try:
                    # Columns: keyword, current rank, previous rank.
                    keyword = data.find_element_by_xpath('./div').text
                    cur_rank = data.find_element_by_xpath('./div[2]').text
                    last_rank = data.find_element_by_xpath('./div[3]').text
                    sql = "insert into amz123_keyword set keyword=%s,cur_rank=%s,last_rank=%s,add_time=now()"
                    param = [keyword, cur_rank, last_rank]
                    mp.insert(sql, param)
                    print("---%s入库成功---" % keyword)
                except:
                    # Skip a malformed/stale row and keep scraping.
                    continue
            sleep(1)
            next_page = driver.find_element_by_xpath('//nav/ul/li[last()]')
            txt = next_page.text
            if next_page.text == '▶':
                next_page.click()
        except:
            # Page-level hiccup: retry the whole page.
            continue
    print("采集完毕")
    driver.close()
    driver.quit()
def getProData(ip, product_list):
    """Track keyword rankings on Amazon for each product in *product_list*.

    For every product (dict with at least 'id', 'keyword', 'asin',
    'page_size' — assumed from usage; confirm against caller) this searches
    Amazon for the keyword, pages through results until the ASIN is found or
    page_size is reached, opens the product page, scrapes price/star/review/
    brand/QA/rank data and inserts one tb_amz_track_data row.

    :param ip: optional "host:port" HTTP proxy, or falsy for a direct
        connection.
    :param product_list: products to track.
    :returns: the list of products that were fully processed (found and
        stored, or exhausted their page budget); on captcha/network failure
        the remaining products are NOT in the returned list so the caller
        can retry them.
    """
    all_log.logger.info("***start***ip=%s,product_list=%s***" % (ip, len(product_list)))
    ua = UserAgent(verify_ssl=False).chrome
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    if ip:
        options.add_argument(('--proxy-server=http://' + ip))
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    # No images: faster page loads for scraping.
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
    driver = webdriver.Chrome(options=options)
    res_success_list = []
    try:
        # Warm up via baidu so cookies can be attached before hitting Amazon.
        driver.get("https://www.baidu.com")
        WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.ID, 'su')))
        # Canned Amazon session cookies (US locale, USD) to look like a
        # returning visitor.  NOTE(review): hard-coded session values expire.
        cookies = [{'domain': 'www.amazon.com', 'expiry': 1632329890, 'httpOnly': False, 'name': 'csm-hit', 'path': '/', 'secure': False, 'value': 'tb:s-TW8A7SAQXE5512HEHN3F|1602089889292&t:1602089890223&adb:adblk_no'},
                   {'domain': '.amazon.com', 'expiry': 2082787202, 'httpOnly': False, 'name': 'lc-main', 'path': '/', 'secure': False, 'value': 'en_US'},
                   {'domain': '.amazon.com', 'expiry': 1633625853, 'httpOnly': False, 'name': 'session-token', 'path': '/', 'secure': True, 'value': '3QBwaC0p4MPUmPmkTggA/5KFuQV86y0YLrdo7ONa0Jj32bh7dV8URjqYgcRBuBz3ADk9Svq0h89qS1OuCpZy+uA1IYfO1TNpiYJaP6z6zHy2O/AO4FlwdTphm7+S2ahm1LBYNUTY+xDrwGQmgF8u6Dqx7nXqXJNSOkBCdVrQZ6a30LnhBpQgwinDvWxMFeKNsbK8LnDO+tARUPQiRm0va3zvb4gqiUAPSBe8RxIeunmQvASbwAR4Yc1WHotY6utU'},
                   {'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'ubid-main', 'path': '/', 'secure': True, 'value': '134-4542133-6572654'},
                   {'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id-time', 'path': '/', 'secure': False, 'value': '2082787201l'},
                   {'domain': '.amazon.com', 'expiry': 1633625846, 'httpOnly': False, 'name': 'i18n-prefs', 'path': '/', 'secure': False, 'value': 'USD'},
                   {'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id', 'path': '/', 'secure': True, 'value': '132-8928912-9834042'}]
        for cookie in cookies:
            driver.add_cookie(cookie_dict=cookie)
        sleep(1)
        for index, product in enumerate(product_list):  # index is unused
            all_log.logger.info("---开始跟踪%s(%s)---" % (product['keyword'], product['asin']))
            driver.get("https://www.amazon.com/s?k=" + product['keyword'] + "&ref=nb_sb_noss")
            pro_num = 0       # running position of the product in the results
            page_num = 1
            break_flag = False    # set when this product is done (stop paging)
            success_flag = True   # stays True only if the ASIN was never stored
            while True:  # one iteration per search-results page
                try:
                    WebDriverWait(driver, 15).until(
                        EC.visibility_of_element_located((By.XPATH, '//ul[@class="a-pagination"]')))
                except:
                    # Pagination missing: either a captcha page or a bad load.
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.visibility_of_element_located((By.XPATH, '//h4[contains(text(),"characters you see")]')))
                        error_log.logger.error(
                            "***ip=%s,keyword=%s,asin=%s出现验证码,结束当前采集***" % (ip, product['keyword'], product['asin']))
                        driver.quit()
                        return res_success_list
                    except:
                        pass
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.visibility_of_element_located((By.XPATH, '//ul[@class="a-pagination"]')))
                    except:
                        error_log.logger.error(
                            "***ip=%s,keyword=%s,asin=%s页面采集错误,结束当前采集***" % (ip, product['keyword'], product['asin']))
                        driver.quit()
                        return res_success_list
                divs = driver.find_elements_by_xpath('//div[contains(@class,"s-main-slot")]/div')
                for div in divs:
                    pro_asin = div.get_attribute('data-asin')
                    if pro_asin:
                        pro_num += 1
                        if pro_asin in str(product['asin']):
                            try:
                                # Skip sponsored/ad results.
                                div.find_element_by_xpath('.//div[@class="a-row a-spacing-micro"]')
                                continue
                            except:
                                pass
                            try:
                                price = div.find_element_by_xpath('.//span[@data-a-color="base"]/span').get_attribute("innerText").replace(
                                    "$", "")
                            except:
                                price = None
                            try:
                                star = div.find_element_by_xpath('.//div[@class="a-row a-size-small"]/span').get_attribute('aria-label').replace(" out of 5 stars", "")
                            except:
                                star = None
                            try:
                                review = div.find_element_by_xpath('.//div[@class="a-row a-size-small"]/span[2]').get_attribute(
                                    'aria-label').replace(",", "")
                            except:
                                review = "0"
                            try:
                                # "Fulfilled by Amazon" badge present -> FBA.
                                div.find_element_by_xpath('.//span[contains(text(),"by Amazon")]')
                                fba = "1"
                            except:
                                fba = "0"
                            # Open the product detail page in a second tab.
                            pro_url = div.find_element_by_xpath('.//h2/a').get_attribute("href")
                            js = 'window.open("' + pro_url + '")'
                            driver.execute_script(js)
                            driver.switch_to.window(driver.window_handles[1])
                            try:
                                WebDriverWait(driver, 15).until(
                                    EC.visibility_of_element_located((By.ID, 'bylineInfo_feature_div')))
                            except:
                                try:
                                    WebDriverWait(driver, 5).until(
                                        EC.visibility_of_element_located((By.XPATH, '//span[contains(text(),"未连接到互联网")]')))
                                    error_log.logger.error("网络连接断开")
                                    return res_success_list
                                except:
                                    error_log.logger.error("-----%s(%s)采集出错-----" % (product['keyword'], product['asin']))
                                    driver.close()
                                    driver.switch_to.window(driver.window_handles[0])
                                    break_flag = True
                                    success_flag = False
                                    break
                            try:
                                brand = driver.find_element_by_xpath('//a[@id="bylineInfo"]').text.replace('Brand: ', '')
                            except:
                                brand = None
                            try:
                                qa = driver.find_element_by_xpath('//*[@id="askATFLink"]/span').get_attribute(
                                    'innerText').replace(" answered questions", "").replace(",", "").replace("+", "")
                            except:
                                qa = "0"
                            seller = None
                            try:
                                # NOTE(review): seller starts as None, so
                                # `seller += fu` raises TypeError on the first
                                # digit and the bare except leaves seller None
                                # — follow-up seller count is never captured.
                                follow_up_text = driver.find_element_by_xpath(
                                    '//div[@class="olp-text-box"]/span').get_attribute('innerText')
                                follow_up_list = re.findall("\d", follow_up_text)
                                for fu in follow_up_list:
                                    seller += fu
                            except:
                                pass
                            # Rank scraping: rank_type 0 = table layout via
                            # getRank(); rank_type 1 = detail-bullets layout.
                            br_error_num = 0
                            rank_type = 0
                            big_rank_txt = ""
                            big_rank = 0
                            mid_rank_txt = ""
                            mid_rank = 0
                            small_rank_txt = ""
                            small_rank = 0
                            for_break_flag = False
                            while big_rank_txt == "":  # retry until text or 5 failures
                                if rank_type == 1:
                                    try:
                                        big_rank_txt = driver.find_element_by_xpath(
                                            '//div[@id="detailBullets_feature_div"]/following-sibling::ul').get_attribute(
                                            'innerText')
                                        if big_rank_txt == "":
                                            br_error_num += 1
                                    except:
                                        br_error_num += 1
                                        sleep(1)
                                        big_rank_txt = ""
                                else:
                                    try:
                                        big_rank_txt = getRank(driver, 1)
                                    except:
                                        try:
                                            # Fall back to the bullet layout.
                                            WebDriverWait(driver, 5).until(
                                                EC.visibility_of_element_located(
                                                    (By.ID, 'detailBulletsWrapper_feature_div')))
                                            rank_type = 1
                                            big_rank_txt = driver.find_element_by_xpath(
                                                '//div[@id="detailBullets_feature_div"]/following-sibling::ul').get_attribute(
                                                'innerText')
                                        except:
                                            br_error_num += 1
                                            sleep(1)
                                            big_rank_txt = ""
                                if br_error_num == 5:
                                    print("未采集到大类排名%s次,退出" % br_error_num)
                                    for_break_flag = True
                                    break_flag = True
                                    success_flag = False
                                    break
                            if for_break_flag:
                                break
                            if big_rank_txt != "":
                                if rank_type == 0:
                                    # Strip the "(see Top 100 …)" suffix, keep digits.
                                    big_rank_txt = re.sub("\(.*", "", big_rank_txt).strip()
                                    big_rank_list = re.findall("\d", big_rank_txt)
                                    big_rank = ""
                                    for br in big_rank_list:
                                        big_rank += br
                                else:
                                    # Bullet layout packs all ranks into one
                                    # '#'-separated string: big/mid/small.
                                    for br_i, br in enumerate(big_rank_txt.split("#")):
                                        rank_txt = "#" + br.strip()
                                        if br_i == 1:
                                            big_rank_txt = re.sub("\(.*", "", rank_txt).strip()
                                            big_rank_list = re.findall("\d", big_rank_txt)
                                            big_rank = ""
                                            for br_1 in big_rank_list:
                                                big_rank += br_1
                                        elif br_i == 2:
                                            mid_rank_txt = rank_txt
                                            mid_rank_list = re.findall("\d", mid_rank_txt)
                                            mid_rank = ""
                                            for mr in mid_rank_list:
                                                mid_rank += mr
                                        elif br_i == 3:
                                            small_rank_txt = rank_txt
                                            small_rank_list = re.findall("\d", small_rank_txt)
                                            small_rank = ""
                                            for sr in small_rank_list:
                                                small_rank += sr
                            else:
                                big_rank = 0
                            if rank_type == 0:
                                # Table layout: mid/small ranks come from rows 2 and 3.
                                try:
                                    mid_rank_txt = getRank(driver, 2)
                                except:
                                    mid_rank_txt = ""
                                if mid_rank_txt != "":
                                    mid_rank_txt = re.sub("\(.*", "", mid_rank_txt).strip()
                                    mid_rank_list = re.findall("\d", mid_rank_txt)
                                    mid_rank = ""
                                    for mr in mid_rank_list:
                                        mid_rank += mr
                                else:
                                    mid_rank = 0
                                try:
                                    small_rank_txt = getRank(driver, 3)
                                except:
                                    small_rank_txt = ""
                                if small_rank_txt != "":
                                    small_rank_txt = re.sub("\(.*", "", small_rank_txt).strip()
                                    small_rank_list = re.findall("\d", small_rank_txt)
                                    small_rank = ""
                                    for sr in small_rank_list:
                                        small_rank += sr
                                else:
                                    small_rank = 0
                            rank = pro_num
                            sql = "insert into tb_amz_track_data(pro_id,rank,page_num,price,fba,star,review,brand,qa,seller,big_rank_txt,big_rank,mid_rank_txt,mid_rank,small_rank_txt,small_rank,add_time) " \
                                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())"
                            sql_param = [product['id'], rank, page_num, price, fba, star, review, brand, qa, seller, big_rank_txt,
                                         big_rank, mid_rank_txt, mid_rank, small_rank_txt, small_rank]
                            # One retry on pool construction before giving up.
                            try:
                                mp = MysqlPool()
                            except:
                                try:
                                    mp = MysqlPool()
                                except:
                                    error_log.logger.error("-----数据库连接失败-----")
                                    success_flag = False
                                    break_flag = True
                                    break
                            try:
                                mp.insert(sql, sql_param)
                                all_log.logger.info("***%s(%s)入库成功***" % (product['asin'], product['keyword']))
                                success_flag = False
                            except Exception:
                                error_log.logger.error("入库异常%s" % sql_param)
                                success_flag = False
                                break_flag = True
                                break
                            # Close the detail tab and mark this product done.
                            driver.close()
                            driver.switch_to.window(driver.window_handles[0])
                            res_success_list.append(product)
                            break_flag = True
                            break
                if break_flag:
                    break
                if page_num == product['page_size']:
                    break
                try:
                    WebDriverWait(driver, 5).until(
                        EC.visibility_of_element_located(
                            (By.XPATH, './/li[@class="a-last"]')))
                    driver.find_element_by_class_name('a-last').click()
                    page_num += 1
                except TimeoutException:
                    print("已到最后一页第%s页" % page_num)
                    break
            if success_flag:
                # ASIN never found within the page budget — still counts as
                # processed so the caller does not retry it.
                error_log.logger.error("---%s在%s的%s页内未找到---" % (product['asin'], product['keyword'], page_num))
                res_success_list.append(product)
    except Exception as e:
        traceback.print_exc()
        error_log.logger.error(e)
    finally:
        all_log.logger.info("---end---ip=%s,product_list=%s---" % (ip, product_list))
        driver.quit()
    return res_success_list
def getProData(ip, keyword):
    """Scrape the first Amazon search-results page for *keyword* into tb_amz_pro.

    Opens each result's detail page in a second tab and stores price, title,
    FBA flag, star/review counts, brand, seller id, BSR ranks and release
    date.  Duplicate rows (IntegrityError) are skipped silently.

    :param ip: optional "host:port" HTTP proxy, or falsy for direct.
    :param keyword: search term.
    :returns: (ok, keyword) — ok is False on captcha/page failure, or when a
        network break happened with <=20 ASINs already stored.
    """
    all_log.logger.info("***start***ip=%s,keyword=%s***" % (ip, keyword))
    ua = UserAgent().chrome
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    if ip:
        options.add_argument(('--proxy-server=http://' + ip))
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("log-level=3")
    # No images: faster page loads for scraping.
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
    driver = webdriver.Chrome(options=options)
    try:
        # Warm up via baidu so the canned cookies can be attached first.
        driver.get("https://www.baidu.com")
        WebDriverWait(driver, 15).until(EC.visibility_of_element_located(
            (By.ID, 'su')))
        # Canned Amazon session cookies (US locale, USD).
        # NOTE(review): hard-coded session values expire.
        cookies = [{'domain': 'www.amazon.com', 'expiry': 1632329890, 'httpOnly': False, 'name': 'csm-hit', 'path': '/', 'secure': False, 'value': 'tb:s-TW8A7SAQXE5512HEHN3F|1602089889292&t:1602089890223&adb:adblk_no'},
                   {'domain': '.amazon.com', 'expiry': 2082787202, 'httpOnly': False, 'name': 'lc-main', 'path': '/', 'secure': False, 'value': 'en_US'},
                   {'domain': '.amazon.com', 'expiry': 1633625853, 'httpOnly': False, 'name': 'session-token', 'path': '/', 'secure': True, 'value': '3QBwaC0p4MPUmPmkTggA/5KFuQV86y0YLrdo7ONa0Jj32bh7dV8URjqYgcRBuBz3ADk9Svq0h89qS1OuCpZy+uA1IYfO1TNpiYJaP6z6zHy2O/AO4FlwdTphm7+S2ahm1LBYNUTY+xDrwGQmgF8u6Dqx7nXqXJNSOkBCdVrQZ6a30LnhBpQgwinDvWxMFeKNsbK8LnDO+tARUPQiRm0va3zvb4gqiUAPSBe8RxIeunmQvASbwAR4Yc1WHotY6utU'},
                   {'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'ubid-main', 'path': '/', 'secure': True, 'value': '134-4542133-6572654'},
                   {'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id-time', 'path': '/', 'secure': False, 'value': '2082787201l'},
                   {'domain': '.amazon.com', 'expiry': 1633625846, 'httpOnly': False, 'name': 'i18n-prefs', 'path': '/', 'secure': False, 'value': 'USD'},
                   {'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id', 'path': '/', 'secure': True, 'value': '132-8928912-9834042'}]
        for cookie in cookies:
            driver.add_cookie(cookie_dict=cookie)
        sleep(1)
        driver.get("https://www.amazon.com/s?k=" + keyword + "&ref=nb_sb_noss")
        try:
            WebDriverWait(driver, 20).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
        except:
            # Results missing: distinguish captcha from a slow/bad load.
            try:
                WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located(
                        (By.XPATH, '//h4[contains(text(),"characters you see")]')))
                error_log.logger.error("***ip=%s,keyword=%s,出现验证码,结束当前采集***" % (ip, keyword))
                driver.quit()
                return False, keyword
            except:
                pass
            try:
                WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located(
                        (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
            except:
                error_log.logger.error("***ip=%s,keyword=%s,页面采集错误,结束当前采集***" % (ip, keyword))
                driver.quit()
                return False, keyword
        divs = driver.find_elements_by_xpath(
            '//div[contains(@class,"s-main-slot")]/div')
        success_num = 0
        error_num = 0
        for div in divs:
            # Abort after 3 per-item failures; >20 stored still counts as ok.
            if error_num > 2:
                error_log.logger.error("-----%s采集出错超过%s次,退出采集-----" % (keyword, error_num))
                all_log.logger.info("-----已采集%s条ASIN-----" % success_num)
                if success_num > 20:
                    return True, keyword
                else:
                    return False, keyword
            try:
                asin = div.get_attribute('data-asin')
            except:
                sleep(1)
                error_num += 1
                continue
            if asin:
                try:
                    # Ad badge present -> sponsored result.
                    div.find_element_by_xpath(
                        './/div[@class="a-row a-spacing-micro"]')
                    sponsored = "1"
                except:
                    pass
                # NOTE(review): this unconditionally resets sponsored to "0",
                # clobbering the "1" set above — looks like an indentation
                # bug (probably meant to live in the except branch); confirm
                # against the original source before relying on this column.
                sponsored = "0"
                try:
                    price = div.find_element_by_xpath(
                        './/span[@data-a-color="base"]/span').get_attribute(
                        "innerText").replace("$", "")
                except:
                    price = None
                try:
                    img = div.find_element_by_xpath('.//img').get_attribute(
                        'src')
                except:
                    img = None
                try:
                    title = div.find_element_by_xpath(
                        './/h2/a/span').get_attribute("innerText")
                except:
                    title = None
                try:
                    # "Fulfilled by Amazon" badge present -> FBA.
                    div.find_element_by_xpath(
                        './/span[contains(text(),"by Amazon")]')
                    fba = "1"
                except:
                    fba = "0"
                try:
                    star = div.find_element_by_xpath(
                        './/div[@class="a-row a-size-small"]/span'
                    ).get_attribute('aria-label').replace(
                        " out of 5 stars", "")
                except:
                    star = None
                try:
                    review = div.find_element_by_xpath(
                        './/div[@class="a-row a-size-small"]/span[2]'
                    ).get_attribute('aria-label').replace(",", "")
                except:
                    review = "0"
                # Open the detail page in a second tab.
                pro_url = div.find_element_by_xpath('.//h2/a').get_attribute(
                    "href")
                js = 'window.open("' + pro_url + '")'
                driver.execute_script(js)
                driver.switch_to.window(driver.window_handles[1])
                try:
                    WebDriverWait(driver, 15).until(
                        EC.visibility_of_element_located(
                            (By.ID, 'bylineInfo_feature_div')))
                except:
                    try:
                        WebDriverWait(driver, 5).until(
                            EC.visibility_of_element_located(
                                (By.XPATH, '//span[contains(text(),"未连接到互联网")]')))
                        error_log.logger.error("网络连接断开")
                        all_log.logger.info("-----已采集%s条ASIN-----" % success_num)
                        if success_num > 20:
                            return True, keyword
                        else:
                            return False, keyword
                    except:
                        pass
                    error_log.logger.error("-----%s(%s)采集出错-----" % (keyword, asin))
                    error_num += 1
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                    continue
                try:
                    brand = driver.find_element_by_xpath(
                        '//a[@id="bylineInfo"]').text.replace('Brand: ', '')
                except:
                    brand = None
                try:
                    qa = driver.find_element_by_xpath(
                        '//*[@id="askATFLink"]/span').get_attribute(
                        'innerText').replace(" answered questions",
                                             "").replace(",", "").replace(
                        "+", "")
                except:
                    qa = "0"
                try:
                    seller_id = driver.find_element_by_id(
                        'merchantID').get_attribute("value")
                except:
                    seller_id = None
                # Rank scraping: rank_type 0 = table layout via getRank();
                # rank_type 1 = detail-bullets layout.
                br_error_num = 0
                rank_type = 0
                big_rank_txt = ""
                big_rank = 0
                mid_rank_txt = ""
                mid_rank = 0
                small_rank_txt = ""
                small_rank = 0
                while big_rank_txt == "":  # retry until text or 3 failures
                    if rank_type == 1:
                        try:
                            big_rank_txt = driver.find_element_by_xpath(
                                '//div[@id="detailBullets_feature_div"]/following-sibling::ul'
                            ).get_attribute('innerText')
                            if big_rank_txt == "":
                                br_error_num += 1
                        except:
                            br_error_num += 1
                            sleep(1)
                            big_rank_txt = ""
                    else:
                        try:
                            big_rank_txt = getRank(driver, 1)
                        except:
                            try:
                                # Fall back to the bullet layout.
                                WebDriverWait(driver, 5).until(
                                    EC.visibility_of_element_located(
                                        (By.ID, 'detailBulletsWrapper_feature_div')))
                                rank_type = 1
                                big_rank_txt = driver.find_element_by_xpath(
                                    '//div[@id="detailBullets_feature_div"]/following-sibling::ul'
                                ).get_attribute('innerText')
                            except:
                                br_error_num += 1
                                sleep(1)
                                big_rank_txt = ""
                    if br_error_num == 3:
                        print("未采集到大类排名%s次,跳过" % br_error_num)
                        break
                if big_rank_txt != "":
                    if rank_type == 0:
                        # Strip "(see Top 100 …)" suffix, keep digits only.
                        big_rank_txt = re.sub("\(.*", "", big_rank_txt).strip()
                        big_rank_list = re.findall("\d", big_rank_txt)
                        big_rank = ""
                        for br in big_rank_list:
                            big_rank += br
                    else:
                        # Bullet layout packs all ranks into one '#'-separated
                        # string: big/mid/small category ranks.
                        for br_i, br in enumerate(big_rank_txt.split("#")):
                            rank_txt = "#" + br.strip()
                            if br_i == 1:
                                big_rank_txt = re.sub("\(.*", "", rank_txt).strip()
                                big_rank_list = re.findall("\d", big_rank_txt)
                                big_rank = ""
                                for br_1 in big_rank_list:
                                    big_rank += br_1
                            elif br_i == 2:
                                mid_rank_txt = rank_txt
                                mid_rank_list = re.findall("\d", mid_rank_txt)
                                mid_rank = ""
                                for mr in mid_rank_list:
                                    mid_rank += mr
                            elif br_i == 3:
                                small_rank_txt = rank_txt
                                small_rank_list = re.findall(
                                    "\d", small_rank_txt)
                                small_rank = ""
                                for sr in small_rank_list:
                                    small_rank += sr
                else:
                    big_rank = 0
                if rank_type == 0:
                    # Table layout: mid/small ranks come from rows 2 and 3.
                    try:
                        mid_rank_txt = getRank(driver, 2)
                    except:
                        mid_rank_txt = ""
                    if mid_rank_txt != "":
                        mid_rank_txt = re.sub("\(.*", "", mid_rank_txt).strip()
                        mid_rank_list = re.findall("\d", mid_rank_txt)
                        mid_rank = ""
                        for mr in mid_rank_list:
                            mid_rank += mr
                    else:
                        mid_rank = 0
                    try:
                        small_rank_txt = getRank(driver, 3)
                    except:
                        small_rank_txt = ""
                    if small_rank_txt != "":
                        small_rank_txt = re.sub("\(.*", "", small_rank_txt).strip()
                        small_rank_list = re.findall("\d", small_rank_txt)
                        small_rank = ""
                        for sr in small_rank_list:
                            small_rank += sr
                    else:
                        small_rank = 0
                try:
                    # Release date, e.g. "January 2, 2020" -> "2020-01-02".
                    put_date = driver.find_element_by_xpath(
                        '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[4]/td'
                    ).get_attribute('innerText')
                    if put_date:
                        put_date = datetime.strptime(
                            put_date, '%B %d, %Y').strftime("%Y-%m-%d")
                except:
                    put_date = None
                sql = "insert into tb_amz_pro(keyword,asin,img,sponsored,price,title,fba,star,review,brand,seller_id,qa,big_rank_txt,big_rank,mid_rank_txt,mid_rank,small_rank_txt,small_rank,put_date,add_date) " \
                      "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())"
                sql_param = [
                    keyword, asin, img, sponsored, price, title, fba, star,
                    review, brand, seller_id, qa, big_rank_txt, big_rank,
                    mid_rank_txt, mid_rank, small_rank_txt, small_rank,
                    put_date
                ]
                # One retry on pool construction before skipping this item.
                try:
                    mp = MysqlPool()
                except:
                    try:
                        mp = MysqlPool()
                    except:
                        error_log.logger.error("-----数据库连接失败-----")
                        continue
                try:
                    mp.insert(sql, sql_param)
                    all_log.logger.info("***%s(%s)入库成功***" % (asin, keyword))
                except pymysql.err.IntegrityError:
                    # Duplicate row — already scraped previously.
                    print("重复入库")
                    pass
                except Exception:
                    error_log.logger.error("入库异常%s" % sql_param)
                    pass
                success_num += 1
                # Close the detail tab and go back to the results tab.
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
        return True, keyword
    except Exception as e:
        traceback.print_exc()
        error_log.logger.error(e)
        return False, keyword
    finally:
        all_log.logger.info("---end---ip=%s,keyword=%s---" % (ip, keyword))
        driver.quit()
# -*- codeing = utf-8 -*-
# @Time : 2020/10/27 23:41
# @Author : Cj
# @File : test2.py.py
# @Software : PyCharm
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from db import MysqlPool

if __name__ == "__main__":
    # Backfill helper: explode every review task's '|'-separated ASIN string
    # into one tb_task_asin row per ASIN, each with status 1.
    pool = MysqlPool()
    tasks = pool.fetch_all("select * from tb_review_task", None)
    insert_sql = "insert into tb_task_asin(task_id,asin,status) values(%s,%s,1)"
    for task in tasks:
        for one_asin in str(task['asin']).split("|"):
            pool.insert(insert_sql, [task['id'], one_asin])
def getData():
    """Scrape refunded-order customer details from cashbackbase into MySQL.

    Logs in via canned session cookies, forces the US site if the UK one is
    active, then walks the refunded-orders table page by page, opening each
    order's detail tab to capture the buyer's PayPal account, display name
    and profile link.  Each row is inserted into tb_cbb_customer; duplicate
    order ids are reported and skipped.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    # No images: faster page loads for scraping.
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
    driver = webdriver.Chrome(options=options)
    # Warm up on an arbitrary page so cookies can be attached before the
    # target site is opened.  NOTE(review): these cookies must belong to the
    # target domain; adding .cashbackbase.com cookies while on baidu.com
    # depends on driver behavior — confirm they actually take effect.
    driver.get("https://www.baidu.com/")
    # Canned cashbackbase session cookies.  NOTE(review): hard-coded session
    # values expire.
    cookies = [{'domain': '.cashbackbase.com', 'expiry': 1599363219, 'httpOnly': False, 'name': '_gat_gtag_UA_119767146_3', 'path': '/', 'secure': False, 'value': '1'},
               {'domain': '.cashbackbase.com', 'expiry': 1757043117, 'httpOnly': True, 'name': 'cash-ab-test', 'path': '/', 'secure': False, 'value': 'eyJpdiI6InFRZHJVdDNYUHA0UXpPell2MFZFM2c9PSIsInZhbHVlIjoieXFZSjRManpHb00zR01vaGpiNlY5Zz09IiwibWFjIjoiMDU5ZGY3YjA3OGVlNjM2MWRhMjk1YmIxNDJiZTNhOTkxMzRkN2UzNGVhYzViYjM3NGIzMWViZTU2OGY3MGViMyJ9'},
               {'domain': '.cashbackbase.com', 'expiry': 1599363180, 'httpOnly': False, 'name': '_gat_gtag_UA_119767146_1', 'path': '/', 'secure': False, 'value': '1'},
               {'domain': '.cashbackbase.com', 'expiry': 1599449559, 'httpOnly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.731395315.1599363120'},
               {'domain': 'www.cashbackbase.com', 'httpOnly': False, 'name': 'current-page', 'path': '/', 'secure': False, 'value': 'https%3A%2F%2Fwww.cashbackbase.com%2Fseller-central'},
               {'domain': '.cashbackbase.com', 'httpOnly': True, 'name': 'cashbackbasev6_session', 'path': '/', 'secure': False, 'value': 'eyJpdiI6IlNveW42QlNCN29ndXdicE96RVVRTFE9PSIsInZhbHVlIjoidWQ0MmtvS2c5RUg4SVN4YlY4NzNjN2h2Wjl0MGFaOW5CK0FFbU5YOFBoMmJHYlZPQzdmUDFDWEtkU2xEaFppQyIsIm1hYyI6ImM3YTI4YWEyNzEzYTI2ZWIyZTMyOWU5YTc5MzNhMWI5ZTViNGZiZDgzZGYyNmJjNmUxYTUzY2MzZmIzNmUzYmQifQ%3D%3D'},
               {'domain': '.cashbackbase.com', 'expiry': 1662435159, 'httpOnly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.606942347.1599363120'}]
    for cookie in cookies:
        driver.add_cookie(cookie_dict=cookie)
    driver.get(
        "https://www.cashbackbase.com/seller/order?key=amz_order_id&value=&status=refunded"
    )
    # Dismiss the notification popup if one appears.
    try:
        WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located((By.ID, 'msg-notify')))
        driver.find_element_by_xpath(
            '//*[@id="msg-notify"]//button[@class="close"]').click()
    except:
        pass
    sleep(0.5)
    # Switch from the UK site to the US site when necessary.
    country = driver.find_element_by_xpath(
        '//*[@id="navbar"]/ul[1]/li[9]/div/a/img').get_attribute('title')
    if country == "UK":
        driver.find_element_by_xpath(
            '//*[@id="navbar"]/ul[1]/li[9]/div/a').click()
        WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//*[@id="navbar"]/ul[1]/li[9]/div/ul')))
        driver.find_element_by_xpath(
            '//*[@id="navbar"]/ul[1]/li[9]/div/ul/li[1]').click()
        WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located((By.XPATH, '//img[@title="US"]')))
    # The last pager item's class is empty while a next page exists
    # (e.g. it becomes "disabled" on the final page).
    # NOTE(review): last_class is read once before the loop and never
    # refreshed, so pagination relies on it being stable — confirm.
    last_class = driver.find_element_by_xpath(
        '//ul[@class="pagination"]/li[last()]').get_attribute('class')
    while True:
        trs = driver.find_elements_by_xpath('//tbody/tr')
        for tr in trs:
            try:
                order_id = tr.find_element_by_xpath('./td').text
                asin = tr.find_element_by_xpath('.//strong').text
                img = tr.find_element_by_xpath('.//img').get_attribute("src")
                # Open the order-detail page in a second tab.
                url = tr.find_element_by_xpath(
                    './td[last()]/p/a').get_attribute('href')
                js = 'window.open("' + url + '")'
                driver.execute_script(js)
                driver.switch_to.window(driver.window_handles[1])
                WebDriverWait(driver, 5).until(
                    EC.visibility_of_element_located(
                        (By.CLASS_NAME, 'line-1')))
                paypal = driver.find_element_by_class_name(
                    'line-1').text.replace("PayPal Account:", "").strip()
                customer_name = driver.find_element_by_xpath(
                    '//div[@class="deal-info-title"]/span').text
                try:
                    profile = driver.find_element_by_xpath(
                        '//div[@class="deal-info-title"]//a').get_attribute(
                        'href')
                except:
                    profile = ""
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                mp = MysqlPool()
                sql = "insert into tb_cbb_customer(order_id,asin,img,customer_name,paypal,profile,add_time) values(%s,%s,%s,%s,%s,%s,now())"
                param = [order_id, asin, img, customer_name, paypal, profile]
                try:
                    mp.insert(sql, param)
                    print("%s入库成功" % order_id)
                except:
                    # Insert failure is treated as "row already stored".
                    print("%s已存在" % order_id)
                sleep(0.5)
            except:
                traceback.print_exc()
                print("-----采集出错-----")
        if not last_class:
            # Next page exists: click the last pager item.
            driver.find_element_by_xpath(
                '//ul[@class="pagination"]/li[last()]/a').click()
            sleep(1)
        else:
            break
    driver.quit()
def collectData():
    """Pull search-trend and related-keyword data from asinseed into MySQL.

    Pass 1: for every selected_products row missing trend_data, look up the
    keyword on asinseed and store its monthly-searches series.
    Pass 2: for every selected_products row with no asin_searches children,
    look up the ASIN and bulk-insert one asin_searches row per related
    keyword.
    """
    ua = UserAgent().chrome
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    # No images: faster page loads for scraping.
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_argument("--disable-gpu")
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
    driver = webdriver.Chrome(options=options)
    # Warm up via baidu so the asinseed login cookies can be attached first.
    driver.get("https://www.baidu.com")
    try:
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
            (By.ID, 'su')))
    except:
        all_log.logger.error("---打开百度失败---")
    # Canned asinseed login cookies.  NOTE(review): hard-coded session
    # values expire.
    cookies = [{'domain': 'www.asinseed.com', 'httpOnly': True, 'name': 'JSESSIONID', 'path': '/', 'secure': False, 'value': 'B0141BDB986A2D91ADCE21BCD1ACA3D2'},
               {'domain': 'www.asinseed.com', 'expiry': 1609251926, 'httpOnly': False, 'name': 'asinseed-login-user', 'path': '/', 'secure': False, 'value': '4291529061IrZXNTSoIlHhPKyHGfg/7TMbw6xY7YpCjminsqgfQO1ekWtRZ9/kAs/qVnCI5AMe'},
               {'domain': '.asinseed.com', 'expiry': 1638195927, 'httpOnly': False, 'name': 'ecookie', 'path': '/', 'secure': False, 'value': 'dWcWHqqTU5LL9saj_CN'},
               {'domain': 'www.asinseed.com', 'expiry': 1606660198, 'httpOnly': False, 'name': 'crisp-client%2Fsocket%2Fb43aa37b-4c35-4551-a9d4-ad983960d40c', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '0'},
               {'domain': '.asinseed.com', 'expiry': 1669731927, 'httpOnly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.1615561945.1606659387'},
               {'domain': '.asinseed.com', 'expiry': 1622427931, 'httpOnly': False, 'name': 'crisp-client%2Fsession%2Fb43aa37b-4c35-4551-a9d4-ad983960d40c', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'session_f9e04788-6bf4-48fa-8a09-883989976e41'},
               {'domain': '.asinseed.com', 'expiry': 1606659960, 'httpOnly': False, 'name': '_gat_gtag_UA_125163434_1', 'path': '/', 'secure': False, 'value': '1'},
               {'domain': '.asinseed.com', 'expiry': 1606746327, 'httpOnly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.1043797262.1606659387'},
               {'domain': '.asinseed.com', 'expiry': 1922019384, 'httpOnly': False, 'name': 'w_guest', 'path': '/', 'secure': False, 'value': 'NpicHiupaa1M_201129-223501'}]
    for cookie in cookies:
        driver.add_cookie(cookie_dict=cookie)
    sleep(0.5)
    mp = MysqlPool()
    # Pass 1: products whose trend series has not been stored yet.
    trend_sql = "select t.* from selected_products t where t.trend_data is null or t.trend_data=''"
    trend_data_list = mp.fetch_all(trend_sql, None)
    for trend_data in trend_data_list:
        driver.get("https://www.asinseed.com/en/US?q=%s" % trend_data['keyword'])
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//div[@class="morris-table-inline"]')))
        trs = driver.find_elements_by_xpath(
            '//div[@class="morris-table-inline"]/../..')
        searches = ''
        # Prefer the row whose keyword column matches exactly; otherwise
        # fall back to the first trend widget on the page.
        # SECURITY(review): eval() on scraped page data executes arbitrary
        # code from the remote site — should be ast.literal_eval/json.loads.
        for tr in trs:
            if trend_data['keyword'] == tr.find_element_by_xpath(
                    './td[2]').text:
                searches = eval(
                    tr.find_element_by_xpath('./td[3]/div').get_attribute(
                        "data-y"))
        if searches == '':
            searches = eval(
                driver.find_element_by_xpath(
                    '//div[@class="morris-table-inline"]').get_attribute(
                    "data-y"))
        update_sql = "update selected_products set trend_data=%s where id=%s"
        update_param = [str(searches), trend_data['id']]
        mp.insert(update_sql, update_param)
        all_log.logger.info("---%s趋势采集成功---" % trend_data['asin'])
        sleep(1)
    # Pass 2: products with no related-keyword rows yet.
    asin_sql = "select t.* from selected_products t where t.id not in (select t2.main_id from asin_searches t2 where t2.main_id=t.id)"
    asin_data_list = mp.fetch_all(asin_sql, None)
    for asin_data in asin_data_list:
        driver.get("https://www.asinseed.com/en/US?q=%s" % asin_data['asin'])
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//td[@class="text-right"]')))
        trs = driver.find_elements_by_xpath('//td[@class="text-right"]/..')
        # Build one multi-row VALUES insert for all related keywords.
        insert_sql = "insert into asin_searches(main_id,asin,keyword,searches,add_time) values"
        update_param = []
        for tr in trs:
            keyword = tr.find_element_by_xpath('./td').text
            searches = tr.find_element_by_xpath('./td[2]').text.replace(
                ",", "")
            if searches is None or searches == "":
                searches = 0
            insert_sql += "(%s,%s,%s,%s,now()),"
            update_param.append(asin_data['id'])
            update_param.append(asin_data['asin'])
            update_param.append(keyword)
            update_param.append(searches)
        # Drop the trailing comma before executing.
        if insert_sql.endswith(","):
            insert_sql = insert_sql[:-1]
        mp.insert(insert_sql, update_param)
        all_log.logger.info("---%s关联关键词成功---" % asin_data['asin'])
        sleep(1)
def getProData():
    """Scrape Amazon search results for pending keywords into ``tb_amz_pro_1129``.

    Workflow per keyword (up to 2000 rows of ``amz123_keyword_left9`` whose
    status is NULL/0):
      1. kill leftover Chrome processes, start Chrome with a random
         proxy-auth extension and a random user-agent;
      2. inject canned amazon.com cookies via baidu.com (a page must be open
         before ``add_cookie`` works), then load the Amazon search page with
         captcha / load-failure detection;
      3. for each result whose ASIN starts with "B": read listing fields,
         skip items with review > 70, price > 40 or a sponsored badge, then
         open the product page in a second tab and scrape detail fields
         (brand, store, qa, seller, best-seller ranks, extra images);
      4. insert one row per product; mark the keyword status=1 once its
         result page was walked without error.

    Side effects: Windows ``taskkill``, a Selenium Chrome session, MySQL
    writes, logging.  The bare ``except`` clauses are deliberate best-effort
    fallbacks around individual scraping lookups.

    Fixes: ``sponsored`` used to be reset to "0" unconditionally right after
    detection, so the sponsored filter below never fired; the reset now lives
    only in the except branch.  The "open baidu failed" branch now quits the
    browser before ``continue`` instead of leaking it.
    """
    mp = MysqlPool()
    data_sql = "select * from amz123_keyword_left9 where status is null or status=0 order by id limit 2000"
    data_list = mp.fetch_all(data_sql, None)
    for data in data_list:
        # Make sure no stale Chrome (with its previous proxy extension) survives.
        os.system("taskkill /f /im chrome.exe /t")
        # Random proxy-auth extension; assumes at least 5 files in the folder -- TODO confirm.
        proxy = "C:\\py_file\\proxyauth\\%s" % os.listdir("C:\\py_file\\proxyauth")[random.randint(0, 4)]
        all_log.logger.info("---ip=%s,keyword=%s开始采集---" % (proxy, data['keyword']))
        ua = UserAgent().chrome
        options = webdriver.ChromeOptions()
        options.add_extension(proxy)
        options.add_argument("user-agent=" + ua)
        options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
        options.add_argument("--disable-gpu")
        options.add_argument("log-level=3")
        options.add_experimental_option('useAutomationExtension', False)
        options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
        driver = webdriver.Chrome(options=options)
        driver.set_window_size(600, 600)
        # Canned amazon.com session/locale cookies injected before hitting Amazon.
        cookies = [{
            'domain': 'www.amazon.com', 'expiry': 1632329890, 'httpOnly': False, 'name': 'csm-hit', 'path': '/', 'secure': False,
            'value': 'tb:s-TW8A7SAQXE5512HEHN3F|1602089889292&t:1602089890223&adb:adblk_no'
        }, {
            'domain': '.amazon.com', 'expiry': 2082787202, 'httpOnly': False, 'name': 'lc-main', 'path': '/', 'secure': False, 'value': 'en_US'
        }, {
            'domain': '.amazon.com', 'expiry': 1633625853, 'httpOnly': False, 'name': 'session-token', 'path': '/', 'secure': True,
            'value': '3QBwaC0p4MPUmPmkTggA/5KFuQV86y0YLrdo7ONa0Jj32bh7dV8URjqYgcRBuBz3ADk9Svq0h89qS1OuCpZy+uA1IYfO1TNpiYJaP6z6zHy2O/AO4FlwdTphm7+S2ahm1LBYNUTY+xDrwGQmgF8u6Dqx7nXqXJNSOkBCdVrQZ6a30LnhBpQgwinDvWxMFeKNsbK8LnDO+tARUPQiRm0va3zvb4gqiUAPSBe8RxIeunmQvASbwAR4Yc1WHotY6utU'
        }, {
            'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'ubid-main', 'path': '/', 'secure': True, 'value': '134-4542133-6572654'
        }, {
            'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id-time', 'path': '/', 'secure': False, 'value': '2082787201l'
        }, {
            'domain': '.amazon.com', 'expiry': 1633625846, 'httpOnly': False, 'name': 'i18n-prefs', 'path': '/', 'secure': False, 'value': 'USD'
        }, {
            'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id', 'path': '/', 'secure': True, 'value': '132-8928912-9834042'
        }]
        driver.get("https://www.baidu.com")
        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.ID, 'su')))
        except:
            error_log.logger.error("---%s打开百度失败---" % proxy)
            # Fix: release the browser before moving on (it was previously leaked).
            driver.quit()
            continue
        for cookie in cookies:
            driver.add_cookie(cookie_dict=cookie)
        sleep(0.5)
        driver.get("https://www.amazon.com/s?k=" + data['keyword'] + "&ref=nb_sb_noss")
        try:
            WebDriverWait(driver, 15).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
        except:
            # Result grid missing: either a captcha page or a broken load.
            try:
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located(
                        (By.XPATH, '//h4[contains(text(),"characters you see")]')))
                error_log.logger.error("***ip=%s,keyword=%s,出现验证码,结束当前采集***" % (proxy, data['keyword']))
                driver.quit()
                continue
            except:
                pass
            try:
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located(
                        (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
            except:
                error_log.logger.error("***ip=%s,keyword=%s,页面采集错误,结束当前采集***" % (proxy, data['keyword']))
                driver.quit()
                continue
        divs = driver.find_elements_by_xpath('//div[contains(@class,"s-main-slot")]/div')
        try:
            success_num = 0  # kept for parity; not read after the loop
            update_sql = "update amz123_keyword_left9 set status=1 where id=%s"
            for div in divs:
                asin = div.get_attribute('data-asin')
                if asin and str(asin).startswith("B"):
                    # Sponsored results carry an extra badge row; default to "0"
                    # only when it is absent (bug fix: was always reset to "0").
                    try:
                        div.find_element_by_xpath('.//div[@class="a-row a-spacing-micro"]')
                        sponsored = "1"
                    except:
                        sponsored = "0"
                    try:
                        price = div.find_element_by_xpath(
                            './/span[@data-a-color="base"]/span').get_attribute("innerText").replace("$", "")
                    except:
                        price = None
                    try:
                        img1 = div.find_element_by_xpath('.//img').get_attribute('src')
                    except:
                        img1 = None
                    try:
                        title = div.find_element_by_xpath('.//h2/a/span').get_attribute("innerText")
                    except:
                        title = None
                    try:
                        div.find_element_by_xpath('.//span[contains(text(),"by Amazon")]')
                        fba = "1"
                    except:
                        fba = "0"
                    try:
                        star = div.find_element_by_xpath(
                            './/div[@class="a-row a-size-small"]/span'
                        ).get_attribute('aria-label').replace(" out of 5 stars", "")
                    except:
                        star = None
                    try:
                        review = div.find_element_by_xpath(
                            './/div[@class="a-row a-size-small"]/span[2]'
                        ).get_attribute('aria-label').replace(",", "")
                    except:
                        review = "0"
                    # Selection filters: low review count, price cap, no ads.
                    try:
                        if int(review) > 70:
                            all_log.logger.info("---%s评价数为%s,跳过---" % (asin, review))
                            continue
                        if float(price) > 40:
                            all_log.logger.info("---%s价格为%s,跳过---" % (asin, price))
                            continue
                        if sponsored == "1":
                            all_log.logger.info("---%s为广告,跳过---" % asin)
                            continue
                    except:
                        all_log.logger.info("---%s过滤报错,跳过---" % asin)
                        continue
                    # Open the product page in a second tab for the detail fields.
                    pro_url = div.find_element_by_xpath('.//h2/a').get_attribute("href")
                    js = 'window.open("' + pro_url + '")'
                    driver.execute_script(js)
                    driver.switch_to.window(driver.window_handles[1])
                    try:
                        WebDriverWait(driver, 15).until(
                            EC.visibility_of_element_located((By.ID, 'bylineInfo_feature_div')))
                        try:
                            brand = driver.find_element_by_xpath(
                                '//a[@id="bylineInfo"]').text.replace('Brand: ', '').replace(
                                    'Visit the ', '').replace('Store', '').strip()
                        except:
                            brand = None
                        try:
                            store = filter_str(driver.find_element_by_id('sellerProfileTriggerId').text)
                        except:
                            store = None
                        try:
                            qa = driver.find_element_by_xpath(
                                '//*[@id="askATFLink"]/span').get_attribute(
                                    'innerText').replace(" answered questions", "")
                        except:
                            qa = "0"
                        try:
                            seller_id = driver.find_element_by_id('merchantID').get_attribute("value")
                        except:
                            seller_id = None
                        try:
                            seller_num = driver.find_element_by_xpath(
                                '//div[@id="olp-upd-new-freeshipping-threshold"]//a/span').text
                            seller_num = re.findall("\((.*)\)", seller_num)[0]
                        except:
                            seller_num = 0
                        # Best-seller rank: layout 0 uses the getRank() helper; layout 1
                        # (detail-bullets pages) packs all ranks into one "#..#..#.." text.
                        br_error_num = 0
                        rank_type = 0
                        big_rank_txt = ""
                        big_rank = 0
                        mid_rank_txt = ""
                        mid_rank = 0
                        small_rank_txt = ""
                        small_rank = 0
                        while big_rank_txt == "":
                            if rank_type == 1:
                                try:
                                    big_rank_txt = driver.find_element_by_xpath(
                                        '//div[@id="detailBullets_feature_div"]/following-sibling::ul'
                                    ).get_attribute('innerText')
                                    if big_rank_txt == "":
                                        br_error_num += 1
                                except:
                                    br_error_num += 1
                                    sleep(1)
                                    big_rank_txt = ""
                            else:
                                try:
                                    big_rank_txt = getRank(driver, 1)
                                except:
                                    try:
                                        WebDriverWait(driver, 5).until(
                                            EC.visibility_of_element_located(
                                                (By.ID, 'detailBulletsWrapper_feature_div')))
                                        rank_type = 1
                                        big_rank_txt = driver.find_element_by_xpath(
                                            '//div[@id="detailBullets_feature_div"]/following-sibling::ul'
                                        ).get_attribute('innerText')
                                    except:
                                        br_error_num += 1
                                        sleep(1)
                                        big_rank_txt = ""
                            if br_error_num == 3:
                                # Give up on the rank after three failed attempts.
                                all_log.logger.error("%s未采集到大类排名%s次" % (asin, br_error_num))
                                big_rank_txt = ""
                                break
                        if big_rank_txt != "":
                            if rank_type == 0:
                                big_rank_txt = re.sub("\(.*", "", big_rank_txt).strip()
                                big_rank_list = re.findall("\d", big_rank_txt)
                                big_rank = ""
                                for br in big_rank_list:
                                    big_rank += br
                            else:
                                # Combined "#big #mid #small" text: split on '#' and
                                # digit-join each segment into its rank number.
                                for br_i, br in enumerate(big_rank_txt.split("#")):
                                    rank_txt = "#" + br.strip()
                                    if br_i == 1:
                                        big_rank_txt = re.sub("\(.*", "", rank_txt).strip()
                                        big_rank_list = re.findall("\d", big_rank_txt)
                                        big_rank = ""
                                        for br_1 in big_rank_list:
                                            big_rank += br_1
                                    elif br_i == 2:
                                        mid_rank_txt = rank_txt
                                        mid_rank_list = re.findall("\d", mid_rank_txt)
                                        mid_rank = ""
                                        for mr in mid_rank_list:
                                            mid_rank += mr
                                    elif br_i == 3:
                                        small_rank_txt = rank_txt
                                        small_rank_list = re.findall("\d", small_rank_txt)
                                        small_rank = ""
                                        for sr in small_rank_list:
                                            small_rank += sr
                        else:
                            big_rank = 0
                        if rank_type == 0:
                            # Layout 0 exposes mid/small ranks via getRank(driver, 2|3).
                            try:
                                mid_rank_txt = getRank(driver, 2)
                            except:
                                mid_rank_txt = ""
                            if mid_rank_txt != "":
                                mid_rank_txt = re.sub("\(.*", "", mid_rank_txt).strip()
                                mid_rank_list = re.findall("\d", mid_rank_txt)
                                mid_rank = ""
                                for mr in mid_rank_list:
                                    mid_rank += mr
                            else:
                                mid_rank = 0
                            try:
                                small_rank_txt = getRank(driver, 3)
                            except:
                                small_rank_txt = ""
                            if small_rank_txt != "":
                                small_rank_txt = re.sub("\(.*", "", small_rank_txt).strip()
                                small_rank_list = re.findall("\d", small_rank_txt)
                                small_rank = ""
                                for sr in small_rank_list:
                                    small_rank += sr
                            else:
                                small_rank = 0
                        try:
                            put_date = driver.find_element_by_xpath(
                                '//th[contains(text(),"Date First Available")]/following-sibling::td[1]'
                            ).get_attribute('innerText')
                            if put_date:
                                put_date = datetime.strptime(put_date, '%B %d, %Y').strftime("%Y-%m-%d")
                        except:
                            put_date = None
                        # Rank filter: keep only products ranked 1..15000 in their top category.
                        if big_rank == '' or int(big_rank) == 0 or int(big_rank) > 15000:
                            all_log.logger.info("---%s大类排名为%s,跳过---" % (asin, big_rank))
                            driver.close()
                            driver.switch_to.window(driver.window_handles[0])
                            continue
                        # Grab images 2 and 3 from the lazily-loaded thumbnail strip;
                        # bounded retries on both the click and the wait.
                        img2 = ''
                        img3 = ''
                        img2_num = 0
                        img2_click_num = 0
                        img3_num = 0
                        img3_click_num = 0
                        while img2 == '' and img2_click_num < 40 and img2_num < 5:
                            sleep(0.5)
                            try:
                                driver.find_element_by_xpath(
                                    '//div[@id="altImages"]/ul//li[@class="a-spacing-small template"]/following-sibling::li[2]'
                                ).click()
                            except:
                                img2_click_num += 1
                            try:
                                WebDriverWait(driver, 5).until(
                                    EC.visibility_of_element_located(
                                        (By.XPATH, '//li[contains(@class,"itemNo1")]')))
                                img2 = driver.find_element_by_xpath(
                                    '//li[contains(@class,"itemNo1")]//img').get_attribute("src")
                            except:
                                img2_num += 1
                        while img3 == '' and img3_click_num < 40 and img3_num < 5:
                            sleep(0.5)
                            try:
                                driver.find_element_by_xpath(
                                    '//div[@id="altImages"]/ul//li[@class="a-spacing-small template"]/following-sibling::li[3]'
                                ).click()
                            except:
                                img3_click_num += 1
                            try:
                                WebDriverWait(driver, 5).until(
                                    EC.visibility_of_element_located(
                                        (By.XPATH, '//li[contains(@class,"itemNo2")]')))
                                img3 = driver.find_element_by_xpath(
                                    '//li[contains(@class,"itemNo2")]//img').get_attribute("src")
                            except:
                                img3_num += 1
                        sql = "insert into tb_amz_pro_1129(keyword,asin,img1,img2,img3,sponsored,price,title,fba,star,review,brand,store,qa,seller_id,seller_num," \
                              "big_rank_txt,big_rank,mid_rank_txt,mid_rank,small_rank_txt,small_rank,put_date,add_date) " \
                              "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())"
                        sql_param = [
                            data['keyword'], asin, img1, img2, img3, sponsored, price, title, fba, star, review,
                            brand, store, qa, seller_id, seller_num, big_rank_txt, big_rank, mid_rank_txt, mid_rank,
                            small_rank_txt, small_rank, put_date
                        ]
                        try:
                            mp.insert(sql, sql_param)
                            all_log.logger.info("-----%s(%s)入库成功-----" % (asin, data['keyword']))
                            success_num += 1
                        except IntegrityError:
                            # Duplicate row: treated as handled.
                            all_log.logger.info("-----%s(%s)已存在-----" % (asin, data['keyword']))
                            success_num += 1
                        except Exception as e:
                            error_log.logger.error("-----%s(%s)入库失败%s-----" % (asin, data['keyword'], e))
                    except:
                        traceback.print_exc()
                        error_log.logger.error("-----%s---%s采集出错-----" % (data['keyword'], proxy))
                    # Close the product tab and return to the result page.
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
            mp.update(update_sql, (data['id'], ))
        except:
            traceback.print_exc()
            error_log.logger.error("-----%s---%s出错-----" % (data['keyword'], proxy))
        finally:
            all_log.logger.info("---end---ip=%s,keyword=%s---" % (proxy, data['keyword']))
            driver.quit()
def collectionGroup(acc, keywrods):
    """Search Facebook groups per keyword and store qualifying groups.

    For each keyword, runs two group searches (public groups and closed
    groups, encoded as base64 filter query params), scrolls the result list
    to load entries, opens every group in a second tab, reads its name,
    member count and admin names, skips groups with fewer than 4000 members,
    and inserts the rest into ``tb_group`` (duplicates are logged and
    skipped via the insert failing).

    Args:
        acc: account record with 'user-agent' and 'cookies' entries --
            presumably a DB row; verify against caller.
        keywrods: iterable of search keywords (name kept as-is: it is part
            of the public signature).

    Side effects: drives a Chrome session, MySQL inserts, logging.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + acc['user-agent'])
    # Block browser notification prompts, which would steal focus.
    prefs = {'profile.default_content_setting_values': {'notifications': 2}}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--start-maximized")
    # options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get("https://www.baidu.com/")
    # Cookies may be stored either as a repr string or as a list.
    # NOTE(review): eval() on stored cookie text is only safe if the DB
    # content is trusted -- consider json/ast.literal_eval instead.
    if type('') is type(acc['cookies']):
        cookie_list = eval(acc['cookies'])
    else:
        cookie_list = acc['cookies']
    for cookie in cookie_list:
        driver.add_cookie(cookie_dict=cookie)
    for keyword in keywrods:
        # Search filters: public groups & closed (non-public) groups.
        group_types = [
            "&epa=FILTERS&filters=eyJncm91cHNfc2hvd19vbmx5Ijoie1wibmFtZVwiOlwicHVibGljX2dyb3Vwc1wiLFwiYXJnc1wiOlwiXCJ9In0%3D",
            "&epa=FILTERS&filters=eyJncm91cHNfc2hvd19vbmx5Ijoie1wibmFtZVwiOlwiY2xvc2VkX2dyb3Vwc1wiLFwiYXJnc1wiOlwiXCJ9In0%3D"
        ]
        for group_num, group in enumerate(group_types):
            driver.get("https://www.facebook.com/search/groups/?q=%s" % keyword + group)
            WebDriverWait(driver, 5).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//div[@id="BrowseResultsContainer"]')))
            # Scroll up to 15 times; after 10 scrolls, stop early once the
            # "end of results" footer is visible.
            for i in range(15):
                ActionChains(driver).send_keys(Keys.END).perform()
                sleep(1.5)
                if i > 10:
                    try:
                        WebDriverWait(driver, 5).until(
                            EC.visibility_of_element_located(
                                (By.ID, 'browse_end_of_results_footer')))
                        break
                    except:
                        pass
            divs = driver.find_elements_by_xpath(
                '//div[@id="BrowseResultsContainer"]/../div')
            for j, div in enumerate(divs):
                # Result containers use different nesting depths per position.
                if j == 0:
                    proDivs = div.find_elements_by_xpath('./div')
                elif j == 1:
                    proDivs = div.find_elements_by_xpath('./div/div/div')
                else:
                    proDivs = div.find_elements_by_xpath('./div/div')
                if len(proDivs) <= 1:
                    break
                for pro in proDivs:
                    pro_url = pro.find_element_by_tag_name('a').get_attribute("href")
                    if group_num == 0:
                        # Public groups: jump straight to the members page.
                        pro_url = re.sub("\?.*", "members/", pro_url)
                    js = 'window.open("' + pro_url + '")'
                    driver.execute_script(js)
                    driver.switch_to.window(driver.window_handles[1])
                    # NOTE(review): sleep(1000) pauses ~17 minutes per group --
                    # looks like a typo for a short pause; confirm intent.
                    sleep(1000)
                    try:
                        if group_num == 0:
                            WebDriverWait(driver, 5).until(
                                EC.visibility_of_element_located(
                                    (By.XPATH, '//div[@id="groupsMemberBrowser"]')))
                        else:
                            WebDriverWait(driver, 5).until(
                                EC.visibility_of_element_located(
                                    (By.XPATH, '//div[@id="content_container"]')))
                        group_name = driver.find_element_by_xpath(
                            '//div[@data-testid="group_sidebar_nav"]//a').text
                        group_admin = ""
                        # Member count and admin list live in different places on
                        # the members page (public) vs. the about page (closed).
                        if group_num == 0:
                            nums = driver.find_element_by_xpath(
                                '//div[@id="groupsMemberBrowser"]//span'
                            ).text.replace(",", "")
                            admins_div = driver.find_elements_by_xpath(
                                '//div[@data-testid="GroupAdminGrid"]/ul/div')
                            for admin_div in admins_div:
                                group_admin += admin_div.find_element_by_tag_name(
                                    'img').get_attribute("aria-label") + "|"
                        else:
                            nums = driver.find_element_by_xpath(
                                '//div[@id="pagelet_group_about"]/div[2]//span'
                            ).text.replace("成员 · ", "").replace(",", "")
                            admins_div = driver.find_elements_by_xpath(
                                '//div[@id="pagelet_group_about"]/div[2]/div[2]/div[2]/a'
                            )
                            for admin_div in admins_div:
                                group_admin += admin_div.find_element_by_tag_name(
                                    'img').get_attribute("aria-label") + "|"
                        # Minimum audience threshold: skip groups under 4000 members.
                        if int(nums) < 4000:
                            error_log.logger.error("-----%s人数为%s,跳过-----" % (group_name, nums))
                            driver.close()
                            driver.switch_to.window(driver.window_handles[0])
                            continue
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
                        mp = MysqlPool()
                        sql = "insert into tb_group(name,nums,admins,type,url,add_time) values(%s,%s,%s,%s,%s,now())"
                        param = [
                            filter_str(group_name), nums,
                            filter_str(group_admin), group_num, pro_url
                        ]
                        try:
                            mp.insert(sql, param)
                            all_log.logger.info("-----%s入库成功-----" % group_name)
                        except:
                            # Insert failure is treated as "already stored".
                            error_log.logger.error("-----%s已入库,跳过-----" % group_name)
                        sleep(1)
                    except:
                        traceback.print_exc()
                        error_log.logger.error("*****获取%s信息出错*****" % pro_url)
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
def getSellerId(asin_list, process_name):
    """Worker loop: resolve Amazon seller IDs for a shared list of ASINs.

    Fetches a fresh proxy IP from the ipjldl API, starts headless Chrome
    through it with canned amazon.com cookies, then repeatedly loads the
    product page of the first entry in ``asin_list``, reads the hidden
    ``merchantID`` field and inserts it into ``tb_seller_id`` (duplicate
    seller IDs are silently ignored).  Processed (or not-found) entries are
    popped from the front of the list.  On any error the proxy/browser is
    rotated; if the same URL fails twice in a row it is dropped.

    Args:
        asin_list: mutable list of rows each exposing ['asin']; consumed
            in place (shared with other workers -- presumably; confirm).
        process_name: zero-based worker index, used only for log output.

    NOTE(review): the loop condition ``len(asin_list) > 1`` leaves the final
    ASIN unprocessed -- looks like an off-by-one (``> 0``?); confirm intent.
    """
    ua = UserAgent(verify_ssl=False).chrome
    # Randomize the Chrome major version in the UA string.
    ua = re.sub("Chrome/\d{2}", "Chrome/" + str(random.randint(49, 85)), ua)
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    # Proxy-provider API returning one IP:Port as JSON.
    url = "http://ip.ipjldl.com/index.php/api/entry?method=proxyServer.tiqu_api_url&packid=0&fa=0&dt=0&groupid=0&fetch_key=&qty=1&time=1&port=1&format=json&ss=5&css=&dt=0&pro=&city=&usertype=6"
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches',
                                    ['enable-logging', 'enable-automation'])
    driver = None
    error_url = ""  # last URL that errored; used to drop repeat offenders
    while True:
        asin_url = ""
        try:
            ip_data = urllib.request.urlopen(url).read()
            print(ip_data)
            json_list = list(json.loads(ip_data)['data'])
            ip = "%s:%s" % (json_list[0]['IP'], json_list[0]['Port'])
            # NOTE(review): add_argument accumulates on each retry, so older
            # --proxy-server flags stay in the options list; confirm Chrome
            # honors the last one.
            options.add_argument(('--proxy-server=http://' + ip))
            driver = webdriver.Chrome(options=options)
            # Sanity-check the proxy by loading baidu, then inject cookies.
            driver.get("https://www.baidu.com")
            WebDriverWait(driver, 15).until(
                EC.visibility_of_element_located((By.ID, 'su')))
            # Canned amazon.com session/locale cookies.
            cookies = [{
                'domain': 'www.amazon.com', 'expiry': 1633407103, 'httpOnly': False, 'name': 'csm-hit', 'path': '/', 'secure': False,
                'value': 'tb:s-Z135Q1Y24PZMTTNF8DDZ|1603167100870&t:1603167103192&adb:adblk_no'
            }, {
                'domain': '.amazon.com', 'expiry': 2082787201, 'httpOnly': False, 'name': 'lc-main', 'path': '/', 'secure': False, 'value': 'en_US'
            }, {
                'domain': '.amazon.com', 'expiry': 1634703091, 'httpOnly': False, 'name': 'session-token', 'path': '/', 'secure': True,
                'value': 'fxzmNhMySgaV1gVga7nbDig972AmQGFxhFgyEZISkgU6//KEtZqCk54TxZV/ttWlmA+5gxnaUgZzFBKseUNhVdQgTHbVI7sDvNIFguqFFGDHATp9swCwfYcd3ViRzafe3d9YkzdIfga0G4kRm5SyB8MRExx3AnOc6jNxeMYPpYxuhaZX8Pe3viZFX6OK551eUxMz5vMEzje8b4ugkSCVV5OKFaJsgqL/iFHyHqnntlRSPPiPwK1eZ2gUicC09p3Q'
            }, {
                'domain': '.amazon.com', 'expiry': 1634703109, 'httpOnly': False, 'name': 'session-id-time', 'path': '/', 'secure': False, 'value': '2082787201l'
            }, {
                'domain': '.amazon.com', 'httpOnly': False, 'name': 'skin', 'path': '/', 'secure': False, 'value': 'noskin'
            }, {
                'domain': '.amazon.com', 'expiry': 1634703109, 'httpOnly': False, 'name': 'ubid-main', 'path': '/', 'secure': True, 'value': '130-0463586-1564060'
            }, {
                'domain': '.amazon.com', 'expiry': 1634703086, 'httpOnly': False, 'name': 'i18n-prefs', 'path': '/', 'secure': False, 'value': 'USD'
            }, {
                'domain': '.amazon.com', 'expiry': 1634703109, 'httpOnly': False, 'name': 'session-id', 'path': '/', 'secure': True, 'value': '147-0153722-0121323'
            }]
            for cookie in cookies:
                driver.add_cookie(cookie_dict=cookie)
            while len(asin_list) > 1:
                print("---第%s个线程剩余asin数量%s---" % (process_name + 1, len(asin_list)))
                sleep(1)
                asin_url = "https://www.amazon.com/dp/" + str(asin_list[0]['asin'])
                driver.get(asin_url)
                try:
                    WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located(
                            (By.ID, 'bylineInfo_feature_div')))
                except:
                    # Not a product page: if it is a 404, drop this ASIN and
                    # move on; any other failure escapes to the outer except.
                    WebDriverWait(driver, 5).until(EC.title_contains('Page Not Found'))
                    error_log.logger.error("%s页面未找到" % asin_list[0]['asin'])
                    asin_list.pop(0)
                    continue
                sleep(0.5)
                # Seller ID is exposed in a hidden form field.
                seller_id = driver.find_element_by_id('merchantID').get_attribute("value")
                if seller_id:
                    insert_sql = "insert into tb_seller_id(seller_id,add_time) values(%s,now())"
                    insert_param = [seller_id]
                    try:
                        insert_mp = MysqlPool()
                        insert_mp.insert(insert_sql, insert_param)
                    except pymysql.err.IntegrityError:
                        # Seller already recorded; ignore the duplicate.
                        pass
                asin_list.pop(0)
        except:
            error_log.logger.error("***第%s个线程%s报错***" % (process_name + 1, asin_url))
            # A URL that fails twice in a row is abandoned; otherwise remember
            # it and retry with a fresh proxy/browser.
            if error_url == asin_url:
                asin_list.pop(0)
            else:
                error_url = asin_url
            if driver:
                driver.quit()
            continue
        break
    all_log.logger.info("---第%s个线程运行结束---" % (process_name + 1))
def getProData(ip, keyword):
    """Scrape one Amazon keyword search (optionally via a proxy) into ``tb_amz_pro``.

    Loads the Amazon search page for *keyword* through Chrome (proxied via
    ``ip`` when given), handles captcha / failed loads, and for every result
    with an ASIN scrapes the listing fields plus product-page details
    (brand, qa, best-seller ranks from the details table, release date),
    inserting one row per product into ``tb_amz_pro``.

    Args:
        ip: "host:port" proxy string, or a falsy value for a direct connection.
        keyword: search term.

    Returns:
        (True, keyword) on a completed pass; (False, keyword) on captcha,
        page error, or an unexpected exception.  The browser is always quit.

    NOTE(review): this module defines two functions named ``getProData``; the
    later definition shadows the earlier one at import time -- confirm intended.

    Bug fix: ``sponsored`` was unconditionally reset to "0" right after being
    detected; the reset now happens only when the sponsored badge is absent.
    """
    all_log.logger.info(
        "***start***ip=%s,keyword=%s,%s***" %
        (ip, keyword, datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
    ua = UserAgent().chrome
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    if ip:
        options.add_argument(('--proxy-server=http://' + ip))
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(options=options)
    # Canned amazon.com session/locale cookies, injected after loading
    # baidu.com (a page must be open before add_cookie works).
    cookies = [{
        'domain': 'www.amazon.com', 'expiry': 1632329890, 'httpOnly': False, 'name': 'csm-hit', 'path': '/', 'secure': False,
        'value': 'tb:s-TW8A7SAQXE5512HEHN3F|1602089889292&t:1602089890223&adb:adblk_no'
    }, {
        'domain': '.amazon.com', 'expiry': 2082787202, 'httpOnly': False, 'name': 'lc-main', 'path': '/', 'secure': False, 'value': 'en_US'
    }, {
        'domain': '.amazon.com', 'expiry': 1633625853, 'httpOnly': False, 'name': 'session-token', 'path': '/', 'secure': True,
        'value': '3QBwaC0p4MPUmPmkTggA/5KFuQV86y0YLrdo7ONa0Jj32bh7dV8URjqYgcRBuBz3ADk9Svq0h89qS1OuCpZy+uA1IYfO1TNpiYJaP6z6zHy2O/AO4FlwdTphm7+S2ahm1LBYNUTY+xDrwGQmgF8u6Dqx7nXqXJNSOkBCdVrQZ6a30LnhBpQgwinDvWxMFeKNsbK8LnDO+tARUPQiRm0va3zvb4gqiUAPSBe8RxIeunmQvASbwAR4Yc1WHotY6utU'
    }, {
        'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'ubid-main', 'path': '/', 'secure': True, 'value': '134-4542133-6572654'
    }, {
        'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id-time', 'path': '/', 'secure': False, 'value': '2082787201l'
    }, {
        'domain': '.amazon.com', 'expiry': 1633625846, 'httpOnly': False, 'name': 'i18n-prefs', 'path': '/', 'secure': False, 'value': 'USD'
    }, {
        'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id', 'path': '/', 'secure': True, 'value': '132-8928912-9834042'
    }]
    driver.get("https://www.baidu.com")
    sleep(0.5)
    for cookie in cookies:
        driver.add_cookie(cookie_dict=cookie)
    driver.get("https://www.amazon.com/s?k=" + keyword + "&ref=nb_sb_noss")
    try:
        WebDriverWait(driver, 15).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
    except:
        # Result grid missing: either a captcha page or a broken load.
        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//h4[contains(text(),"characters you see")]')))
            error_log.logger.error("***ip=%s,keyword=%s,出现验证码,结束当前采集***" % (ip, keyword))
            driver.quit()
            return False, keyword
        except:
            pass
        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
        except:
            error_log.logger.error("***ip=%s,keyword=%s,页面采集错误,结束当前采集***" % (ip, keyword))
            driver.quit()
            return False, keyword
    divs = driver.find_elements_by_xpath('//div[contains(@class,"s-main-slot")]/div')
    try:
        for div in divs:
            asin = div.get_attribute('data-asin')
            if asin:
                # Sponsored results carry an extra badge row; default to "0"
                # only when it is absent (bug fix: was always reset to "0").
                try:
                    div.find_element_by_xpath('.//div[@class="a-row a-spacing-micro"]')
                    sponsored = "1"
                except:
                    sponsored = "0"
                try:
                    price = div.find_element_by_xpath(
                        './/span[@data-a-color="base"]/span').get_attribute("innerText").replace("$", "")
                except:
                    price = None
                try:
                    img = div.find_element_by_xpath('.//img').get_attribute('src')
                except:
                    img = None
                try:
                    title = div.find_element_by_xpath('.//h2/a/span').get_attribute("innerText")
                except:
                    title = None
                try:
                    div.find_element_by_xpath('.//span[contains(text(),"by Amazon")]')
                    fba = "1"
                except:
                    fba = "0"
                try:
                    star = div.find_element_by_xpath(
                        './/div[@class="a-row a-size-small"]/span'
                    ).get_attribute('aria-label').replace(" out of 5 stars", "")
                except:
                    star = None
                try:
                    review = div.find_element_by_xpath(
                        './/div[@class="a-row a-size-small"]/span[2]'
                    ).get_attribute('aria-label').replace(",", "")
                except:
                    review = "0"
                # Open the product page in a second tab for the detail fields.
                pro_url = div.find_element_by_xpath('.//h2/a').get_attribute("href")
                js = 'window.open("' + pro_url + '")'
                driver.execute_script(js)
                driver.switch_to.window(driver.window_handles[1])
                try:
                    WebDriverWait(driver, 15).until(
                        EC.visibility_of_element_located((By.ID, 'bylineInfo_feature_div')))
                    brand = driver.find_element_by_xpath(
                        '//a[@id="bylineInfo"]').text.replace('Brand: ', '')
                    try:
                        qa = driver.find_element_by_xpath(
                            '//*[@id="askATFLink"]/span').get_attribute(
                                'innerText').replace(" answered questions", "")
                    except:
                        qa = "0"
                    # Best-seller ranks live in spans 1..3 of row 3 of the
                    # product-details table; two xpath variants for the lookup.
                    try:
                        big_rank_txt = driver.find_element_by_xpath(
                            '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[3]/td/span/span[1]'
                        ).get_attribute('innerText')
                    except:
                        try:
                            big_rank_txt = driver.find_element_by_xpath(
                                '//*[@id="productDetails_detailBullets_sections1"]/tbody/tr[3]/td/span/span[1]'
                            ).get_attribute('innerText')
                        except:
                            big_rank_txt = ""
                    if big_rank_txt:
                        big_rank_txt = re.sub("\(.*", "", big_rank_txt).strip()
                        big_rank_list = re.findall("\d", big_rank_txt)
                        big_rank = ""
                        for br in big_rank_list:
                            big_rank += br
                    else:
                        big_rank = 0
                    try:
                        mid_rank_txt = driver.find_element_by_xpath(
                            '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[3]/td/span/span[2]'
                        ).get_attribute('innerText')
                    except:
                        mid_rank_txt = ""
                    if mid_rank_txt:
                        mid_rank_txt = re.sub("\(.*", "", mid_rank_txt).strip()
                        mid_rank_list = re.findall("\d", mid_rank_txt)
                        mid_rank = ""
                        for mr in mid_rank_list:
                            mid_rank += mr
                    else:
                        mid_rank = 0
                    try:
                        small_rank_txt = driver.find_element_by_xpath(
                            '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[3]/td/span/span[3]'
                        ).get_attribute('innerText')
                    except:
                        small_rank_txt = ""
                    if small_rank_txt:
                        small_rank_txt = re.sub("\(.*", "", small_rank_txt).strip()
                        small_rank_list = re.findall("\d", small_rank_txt)
                        small_rank = ""
                        for sr in small_rank_list:
                            small_rank += sr
                    else:
                        small_rank = 0
                    try:
                        put_date = driver.find_element_by_xpath(
                            '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[4]/td'
                        ).get_attribute('innerText')
                        if put_date:
                            put_date = datetime.strptime(put_date, '%B %d, %Y').strftime("%Y-%m-%d")
                    except:
                        put_date = None
                    mp = MysqlPool()
                    sql = "insert into tb_amz_pro(keyword,asin,img,sponsored,price,title,fba,star,review,brand,qa,big_rank_txt,big_rank,mid_rank_txt,mid_rank,small_rank_txt,small_rank,put_date,add_date) " \
                          "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())"
                    sql_param = [
                        keyword, asin, img, sponsored, price, title, fba, star, review, brand, qa,
                        big_rank_txt, big_rank, mid_rank_txt, mid_rank, small_rank_txt, small_rank, put_date
                    ]
                    try:
                        mp.insert(sql, sql_param)
                        all_log.logger.info("-----%s(%s)入库成功-----" % (asin, keyword))
                    except:
                        # Insert failure is treated as "already stored".
                        traceback.print_exc()
                        all_log.logger.info("-----%s(%s)已存在-----" % (asin, keyword))
                except:
                    traceback.print_exc()
                    error_log.logger.error("-----%s---%s采集出错-----" % (keyword, asin))
                # Close the product tab and return to the result page.
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
        return True, keyword
    except:
        traceback.print_exc()
        return False, keyword
    finally:
        all_log.logger.info(
            "---end---ip=%s,keyword=%s,%s---" %
            (ip, keyword, datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        driver.quit()