Example #1
def form():
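    # GET: render the Amazon keyword-tracking form with the group sections read
    # from group-answer.ini; POST: insert a new keyword/ASIN tracking task into
    # tb_amz_track_pro for the logged-in user.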
    if request.method == 'GET':
        config = configparser.RawConfigParser()
        config.read("group-answer.ini", encoding="utf-8")
        return render_template("amz/track-form.html",
                               groups=config.sections(),
                               user=session.get('user'),
                               active="amzTrackForm")
    if request.method == 'POST':
        data = request.get_data()
        json_data = json.loads(data.decode("utf-8"))
        mp = MysqlPool()
        sql = "insert into tb_amz_track_pro(user_id,keyword,asin,status,page_size,add_time) values(%s,%s,%s,%s,%s,now())"
        param = [
            session.get('user')['id'],
            json_data.get("keyword"),
            json_data.get("asin"), "1",
            json_data.get('page_size')
        ]
        try:
            mp.insert(sql, param)
            res_json = {"code": "0000", "message": "已成功提交追踪任务"}
        except IntegrityError as e:
            res_json = {"code": "1000", "message": "%s" % e}
        return jsonify(res_json)
Example #2
def updateComment():
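    # Update the content of an existing tb_comment row by id (MysqlPool.insert
    # is reused here to execute the UPDATE statement).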
    data = request.get_data()
    json_data = json.loads(data.decode("utf-8"))
    mp = MysqlPool()
    sql = "update tb_comment set content=%s where id=%s"
    mp.insert(sql, [json_data.get('content'), json_data.get('id')])
    res_json = {"code": "0000"}
    return jsonify(res_json)
Example #3
def addComment():
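    # Insert a new comment with the current timestamp into tb_comment.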
    data = request.get_data()
    json_data = json.loads(data.decode("utf-8"))
    mp = MysqlPool()
    sql = "insert into tb_comment(content,add_time) values(%s,%s)"
    mp.insert(sql, [json_data.get('content'), datetime.now()])
    res_json = {"code": "0000"}
    return jsonify(res_json)
Example #4
def reviewForm():
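    # GET: render the review-task form (level-1 users also receive the full list
    # of active users); POST: create a tb_review_task row plus one tb_task_asin
    # row per "|"-separated ASIN.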
    mp = MysqlPool()
    if request.method == 'GET':
        user = session.get('user')
        user_sql = "select * from tb_user where status=1"
        user_list = None
        if user['level'] == 1:
            user_list = mp.fetch_all(user_sql, None)
        return render_template("review/review-form.html",
                               user=user,
                               user_list=user_list,
                               active="reviewForm")
    if request.method == 'POST':
        data = request.get_data()
        json_data = json.loads(data.decode("utf-8"))
        sql = "insert into tb_review_task(user_id,asin,brand,country,img,keyword,kw_page,store," \
              "price,days_order,total_order,is_vp,note,add_time,name) values(%s,%s,%s,'us',%s,%s," \
              "%s,%s,%s,%s,%s,1,%s,now(),%s)"
        # dict.get never raises, so simply fall back to the session user when
        # the field is missing or empty.
        user_id = json_data.get("user_id") or session.get('user')['id']
        param = [
            user_id,
            json_data.get("asin"),
            json_data.get("brand"),
            json_data.get("img"),
            json_data.get("keyword"),
            json_data.get("kw_page"),
            json_data.get("store"),
            json_data.get("price"),
            json_data.get("days_order"),
            json_data.get("total_order"),
            json_data.get("note"),
            json_data.get("name")
        ]
        try:
            task_id = mp.insert(sql, param)
            asin_sql = "insert into tb_task_asin(asin,task_id,status,is_put) values(%s,%s,%s,%s)"
            for asin in str(json_data.get("asin")).split("|"):
                asin_param = [asin, task_id, 1, 0]
                mp.insert(asin_sql, asin_param)
            res_json = {"code": "0000", "message": "已成功提交刷单任务"}
        except Exception as e:
            res_json = {"code": "9999", "message": "提交失败%s" % e}
        return jsonify(res_json)
Example #5
def form():
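    # GET: render the Facebook group-posting form; POST: create one tb_post task
    # per selected group_id.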
    if request.method == 'GET':
        config = configparser.RawConfigParser()
        config.read("group-answer.ini", encoding="utf-8")
        return render_template("fb/form.html", groups=config.sections(), user=session.get('user'), active="fbForm")
    if request.method == 'POST':
        data = request.get_data()
        json_data = json.loads(data.decode("utf-8"))
        mp = MysqlPool()
        try:
            for group_id in json_data.get('group_id'):
                sql = "insert into tb_post(group_id,keyword,nums,share_num,done_num,done_share,content,user_id,status,add_time,accounts) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,'')"
                param = [group_id, json_data.get("keyword"), json_data.get("nums"), json_data.get("share_num"), "0", "0",
                         json_data.get('content'), session.get('user')['id'], 'working', datetime.now()]
                mp.insert(sql, param)
            res_json = {"code": "0000", "message": "已成功提交%s个任务" % len(json_data.get('group_id'))}
        except IntegrityError as e:
            res_json = {"code": "1000", "message": "%s" % e}
        return jsonify(res_json)
Example #6
def getKeyword():
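    # Scrape the US top-keywords ranking from amz123.com with headless Chrome,
    # paging via the "▶" button, and insert each keyword row (keyword, current
    # rank, previous rank) into amz123_keyword.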
    mp = MysqlPool()
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches',
                                    ['enable-logging', 'enable-automation'])
    driver = webdriver.Chrome(options=options)
    driver.get(
        "https://www.amz123.com/usatopkeywords-1-1-.htm?rank=0&uprank=0")
    txt = '▶'
    while txt == '▶':
        try:
            data_lists = driver.find_elements_by_xpath(
                '//div[@class="listdata"]')
            for data in data_lists:
                try:
                    keyword = data.find_element_by_xpath('./div').text
                    cur_rank = data.find_element_by_xpath('./div[2]').text
                    last_rank = data.find_element_by_xpath('./div[3]').text
                    sql = "insert into amz123_keyword set keyword=%s,cur_rank=%s,last_rank=%s,add_time=now()"
                    param = [keyword, cur_rank, last_rank]
                    mp.insert(sql, param)
                    print("---%s入库成功---" % keyword)
                except:
                    continue
            sleep(1)
            next_page = driver.find_element_by_xpath('//nav/ul/li[last()]')
            txt = next_page.text
            if next_page.text == '▶':
                next_page.click()
        except:
            continue
    print("采集完毕")
    driver.close()
    driver.quit()
Example #7
def getProData(ip,product_list):
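    # For each product, search Amazon for its keyword (optionally through a
    # proxy, with preset session cookies), locate the matching ASIN in the
    # results, collect price/star/review/rank details from the product page and
    # store them in tb_amz_track_data; products that were fully processed
    # (stored, or confirmed missing after paging) are returned to the caller.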
    all_log.logger.info("***start***ip=%s,product_list=%s***"%(ip,len(product_list)))
    ua = UserAgent(verify_ssl=False).chrome
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    if ip:
        options.add_argument(('--proxy-server=http://' + ip))
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches', ['enable-logging','enable-automation'])
    driver = webdriver.Chrome(options=options)
    res_success_list = []
    try:
        driver.get("https://www.baidu.com")
        WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.ID, 'su')))
        cookies = [{'domain': 'www.amazon.com', 'expiry': 1632329890, 'httpOnly': False, 'name': 'csm-hit', 'path': '/', 'secure': False, 'value': 'tb:s-TW8A7SAQXE5512HEHN3F|1602089889292&t:1602089890223&adb:adblk_no'}, {'domain': '.amazon.com', 'expiry': 2082787202, 'httpOnly': False, 'name': 'lc-main', 'path': '/', 'secure': False, 'value': 'en_US'}, {'domain': '.amazon.com', 'expiry': 1633625853, 'httpOnly': False, 'name': 'session-token', 'path': '/', 'secure': True, 'value': '3QBwaC0p4MPUmPmkTggA/5KFuQV86y0YLrdo7ONa0Jj32bh7dV8URjqYgcRBuBz3ADk9Svq0h89qS1OuCpZy+uA1IYfO1TNpiYJaP6z6zHy2O/AO4FlwdTphm7+S2ahm1LBYNUTY+xDrwGQmgF8u6Dqx7nXqXJNSOkBCdVrQZ6a30LnhBpQgwinDvWxMFeKNsbK8LnDO+tARUPQiRm0va3zvb4gqiUAPSBe8RxIeunmQvASbwAR4Yc1WHotY6utU'}, {'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'ubid-main', 'path': '/', 'secure': True, 'value': '134-4542133-6572654'}, {'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id-time', 'path': '/', 'secure': False, 'value': '2082787201l'}, {'domain': '.amazon.com', 'expiry': 1633625846, 'httpOnly': False, 'name': 'i18n-prefs', 'path': '/', 'secure': False, 'value': 'USD'}, {'domain': '.amazon.com', 'expiry': 1633625894, 'httpOnly': False, 'name': 'session-id', 'path': '/', 'secure': True, 'value': '132-8928912-9834042'}]
        for cookie in cookies:
            driver.add_cookie(cookie_dict=cookie)
        sleep(1)
        for index,product in enumerate(product_list):
            all_log.logger.info("---开始跟踪%s(%s)---"%(product['keyword'],product['asin']))
            driver.get("https://www.amazon.com/s?k=" + product['keyword'] + "&ref=nb_sb_noss")
            pro_num = 0
            page_num = 1
            break_flag = False
            success_flag = True
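            # success_flag is cleared once the ASIN has been processed (or an
            # error aborts it); if it is still True after paging, the ASIN was
            # not found. break_flag ends paging for the current product.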
            while True:
                try:
                    WebDriverWait(driver, 15).until(
                        EC.visibility_of_element_located((By.XPATH, '//ul[@class="a-pagination"]')))
                except:
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.visibility_of_element_located((By.XPATH, '//h4[contains(text(),"characters you see")]')))
                        error_log.logger.error(
                            "***ip=%s,keyword=%s,asin=%s出现验证码,结束当前采集***" % (ip, product['keyword'], product['asin']))
                        driver.quit()
                        return res_success_list
                    except:
                        pass
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.visibility_of_element_located((By.XPATH, '//ul[@class="a-pagination"]')))
                    except:
                        error_log.logger.error(
                            "***ip=%s,keyword=%s,asin=%s页面采集错误,结束当前采集***" % (ip, product['keyword'], product['asin']))
                        driver.quit()
                        return res_success_list
                divs = driver.find_elements_by_xpath('//div[contains(@class,"s-main-slot")]/div')
                for div in divs:
                    pro_asin = div.get_attribute('data-asin')
                    if pro_asin:
                        pro_num += 1
                        if pro_asin in str(product['asin']):
                            try:
                                # Skip sponsored (ad) listings.
                                div.find_element_by_xpath('.//div[@class="a-row a-spacing-micro"]')
                                continue
                            except:
                                pass
                            try:
                                price = div.find_element_by_xpath('.//span[@data-a-color="base"]/span').get_attribute("innerText").replace(
                                    "$", "")
                            except:
                                price = None
                            try:
                                star = div.find_element_by_xpath('.//div[@class="a-row a-size-small"]/span').get_attribute('aria-label').replace(" out of 5 stars","")
                            except:
                                star = None
                            try:
                                review = div.find_element_by_xpath('.//div[@class="a-row a-size-small"]/span[2]').get_attribute(
                                    'aria-label').replace(",", "")
                            except:
                                review = "0"
                            try:
                                div.find_element_by_xpath('.//span[contains(text(),"by Amazon")]')
                                fba = "1"
                            except:
                                fba = "0"
                            pro_url = div.find_element_by_xpath('.//h2/a').get_attribute("href")
                            js = 'window.open("' + pro_url + '")'
                            driver.execute_script(js)
                            driver.switch_to.window(driver.window_handles[1])
                            try:
                                WebDriverWait(driver, 15).until(
                                    EC.visibility_of_element_located((By.ID, 'bylineInfo_feature_div')))
                            except:
                                try:
                                    WebDriverWait(driver, 5).until(
                                        EC.visibility_of_element_located((By.XPATH, '//span[contains(text(),"未连接到互联网")]')))
                                    error_log.logger.error("网络连接断开")
                                    return res_success_list
                                except:
                                    error_log.logger.error("-----%s(%s)采集出错-----" % (product['keyword'], product['asin']))
                                    driver.close()
                                    driver.switch_to.window(driver.window_handles[0])
                                    break_flag = True
                                    success_flag = False
                                    break
                            try:
                                brand = driver.find_element_by_xpath('//a[@id="bylineInfo"]').text.replace('Brand: ', '')
                            except:
                                brand = None
                            try:
                                qa = driver.find_element_by_xpath('//*[@id="askATFLink"]/span').get_attribute(
                                    'innerText').replace(" answered questions", "").replace(",", "").replace("+", "")
                            except:
                                qa = "0"
                            seller = None
                            try:
                                follow_up_text = driver.find_element_by_xpath(
                                    '//div[@class="olp-text-box"]/span').get_attribute('innerText')
                                # Build the seller count from the digits in the offer text;
                                # joining into a string avoids the TypeError that "None += str" raised.
                                seller = "".join(re.findall(r"\d", follow_up_text)) or None
                            except:
                                pass
                            br_error_num = 0
                            rank_type = 0
                            big_rank_txt = ""
                            big_rank = 0
                            mid_rank_txt = ""
                            mid_rank = 0
                            small_rank_txt = ""
                            small_rank = 0
                            for_break_flag = False
                            while big_rank_txt == "":
                                if rank_type == 1:
                                    try:
                                        big_rank_txt = driver.find_element_by_xpath(
                                            '//div[@id="detailBullets_feature_div"]/following-sibling::ul').get_attribute(
                                            'innerText')
                                        if big_rank_txt == "":
                                            br_error_num += 1
                                    except:
                                        br_error_num += 1
                                        sleep(1)
                                        big_rank_txt = ""
                                else:
                                    try:
                                        big_rank_txt = getRank(driver, 1)
                                    except:
                                        try:
                                            WebDriverWait(driver, 5).until(
                                                EC.visibility_of_element_located(
                                                    (By.ID, 'detailBulletsWrapper_feature_div')))
                                            rank_type = 1
                                            big_rank_txt = driver.find_element_by_xpath(
                                                '//div[@id="detailBullets_feature_div"]/following-sibling::ul').get_attribute(
                                                'innerText')
                                        except:
                                            br_error_num += 1
                                            sleep(1)
                                            big_rank_txt = ""
                                if br_error_num == 5:
                                    print("未采集到大类排名%s次,退出" % br_error_num)
                                    for_break_flag = True
                                    break_flag = True
                                    success_flag = False
                                    break
                            if for_break_flag:
                                break
                            if big_rank_txt != "":
                                if rank_type == 0:
                                    big_rank_txt = re.sub("\(.*", "", big_rank_txt).strip()
                                    big_rank_list = re.findall("\d", big_rank_txt)
                                    big_rank = ""
                                    for br in big_rank_list:
                                        big_rank += br
                                else:
                                    for br_i, br in enumerate(big_rank_txt.split("#")):
                                        rank_txt = "#" + br.strip()
                                        if br_i == 1:
                                            big_rank_txt = re.sub("\(.*", "", rank_txt).strip()
                                            big_rank_list = re.findall("\d", big_rank_txt)
                                            big_rank = ""
                                            for br_1 in big_rank_list:
                                                big_rank += br_1
                                        elif br_i == 2:
                                            mid_rank_txt = rank_txt
                                            mid_rank_list = re.findall("\d", mid_rank_txt)
                                            mid_rank = ""
                                            for mr in mid_rank_list:
                                                mid_rank += mr
                                        elif br_i == 3:
                                            small_rank_txt = rank_txt
                                            small_rank_list = re.findall("\d", small_rank_txt)
                                            small_rank = ""
                                            for sr in small_rank_list:
                                                small_rank += sr
                            else:
                                big_rank = 0
                            if rank_type == 0:
                                try:
                                    mid_rank_txt = getRank(driver, 2)
                                except:
                                    mid_rank_txt = ""
                                if mid_rank_txt != "":
                                    mid_rank_txt = re.sub("\(.*", "", mid_rank_txt).strip()
                                    mid_rank_list = re.findall("\d", mid_rank_txt)
                                    mid_rank = ""
                                    for mr in mid_rank_list:
                                        mid_rank += mr
                                else:
                                    mid_rank = 0
                                try:
                                    small_rank_txt = getRank(driver, 3)
                                except:
                                    small_rank_txt = ""
                                if small_rank_txt != "":
                                    small_rank_txt = re.sub("\(.*", "", small_rank_txt).strip()
                                    small_rank_list = re.findall("\d", small_rank_txt)
                                    small_rank = ""
                                    for sr in small_rank_list:
                                        small_rank += sr
                                else:
                                    small_rank = 0
                            rank = pro_num
                            sql = "insert into tb_amz_track_data(pro_id,rank,page_num,price,fba,star,review,brand,qa,seller,big_rank_txt,big_rank,mid_rank_txt,mid_rank,small_rank_txt,small_rank,add_time) " \
                                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())"
                            sql_param = [product['id'], rank, page_num,price, fba, star, review, brand, qa, seller,big_rank_txt,
                                         big_rank, mid_rank_txt, mid_rank, small_rank_txt, small_rank]
                            try:
                                mp = MysqlPool()
                            except:
                                try:
                                    mp = MysqlPool()
                                except:
                                    error_log.logger.error("-----数据库连接失败-----")
                                    success_flag = False
                                    break_flag = True
                                    break
                            try:
                                mp.insert(sql, sql_param)
                                all_log.logger.info("***%s(%s)入库成功***" % (product['asin'], product['keyword']))
                                success_flag = False
                            except Exception:
                                error_log.logger.error("入库异常%s"%sql_param)
                                success_flag = False
                                break_flag = True
                                break
                            driver.close()
                            driver.switch_to.window(driver.window_handles[0])
                            res_success_list.append(product)
                            break_flag = True
                            break
                if break_flag:
                    break
                if page_num == product['page_size']:
                    break
                try:
                    WebDriverWait(driver, 5).until(
                        EC.visibility_of_element_located(
                            (By.XPATH, './/li[@class="a-last"]')))
                    driver.find_element_by_class_name('a-last').click()
                    page_num += 1
                except TimeoutException:
                    print("已到最后一页第%s页"%page_num)
                    break
            if success_flag:
                error_log.logger.error("---%s在%s的%s页内未找到---"%(product['asin'],product['keyword'],page_num))
                res_success_list.append(product)
    except Exception as e:
        traceback.print_exc()
        error_log.logger.error(e)
    finally:
        all_log.logger.info("---end---ip=%s,product_list=%s---" % (ip, product_list))
        driver.quit()
        return res_success_list
Example #8
def getProData(ip, keyword):
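    # Search Amazon for the keyword (optionally through a proxy, with preset
    # session cookies), open each result that carries a data-asin and record
    # price, title, sponsored flag, rank, reviews, brand, seller and release
    # date into tb_amz_pro; returns (success, keyword).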
    all_log.logger.info("***start***ip=%s,keyword=%s***" % (ip, keyword))
    ua = UserAgent().chrome
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    if ip:
        options.add_argument(('--proxy-server=http://' + ip))
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("log-level=3")
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches',
                                    ['enable-logging', 'enable-automation'])
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://www.baidu.com")
        WebDriverWait(driver,
                      15).until(EC.visibility_of_element_located(
                          (By.ID, 'su')))
        cookies = [{
            'domain':
            'www.amazon.com',
            'expiry':
            1632329890,
            'httpOnly':
            False,
            'name':
            'csm-hit',
            'path':
            '/',
            'secure':
            False,
            'value':
            'tb:s-TW8A7SAQXE5512HEHN3F|1602089889292&t:1602089890223&adb:adblk_no'
        }, {
            'domain': '.amazon.com',
            'expiry': 2082787202,
            'httpOnly': False,
            'name': 'lc-main',
            'path': '/',
            'secure': False,
            'value': 'en_US'
        }, {
            'domain':
            '.amazon.com',
            'expiry':
            1633625853,
            'httpOnly':
            False,
            'name':
            'session-token',
            'path':
            '/',
            'secure':
            True,
            'value':
            '3QBwaC0p4MPUmPmkTggA/5KFuQV86y0YLrdo7ONa0Jj32bh7dV8URjqYgcRBuBz3ADk9Svq0h89qS1OuCpZy+uA1IYfO1TNpiYJaP6z6zHy2O/AO4FlwdTphm7+S2ahm1LBYNUTY+xDrwGQmgF8u6Dqx7nXqXJNSOkBCdVrQZ6a30LnhBpQgwinDvWxMFeKNsbK8LnDO+tARUPQiRm0va3zvb4gqiUAPSBe8RxIeunmQvASbwAR4Yc1WHotY6utU'
        }, {
            'domain': '.amazon.com',
            'expiry': 1633625894,
            'httpOnly': False,
            'name': 'ubid-main',
            'path': '/',
            'secure': True,
            'value': '134-4542133-6572654'
        }, {
            'domain': '.amazon.com',
            'expiry': 1633625894,
            'httpOnly': False,
            'name': 'session-id-time',
            'path': '/',
            'secure': False,
            'value': '2082787201l'
        }, {
            'domain': '.amazon.com',
            'expiry': 1633625846,
            'httpOnly': False,
            'name': 'i18n-prefs',
            'path': '/',
            'secure': False,
            'value': 'USD'
        }, {
            'domain': '.amazon.com',
            'expiry': 1633625894,
            'httpOnly': False,
            'name': 'session-id',
            'path': '/',
            'secure': True,
            'value': '132-8928912-9834042'
        }]
        for cookie in cookies:
            driver.add_cookie(cookie_dict=cookie)
        sleep(1)
        driver.get("https://www.amazon.com/s?k=" + keyword + "&ref=nb_sb_noss")
        try:
            WebDriverWait(driver, 20).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
        except:
            try:
                WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located(
                        (By.XPATH,
                         '//h4[contains(text(),"characters you see")]')))
                error_log.logger.error("***ip=%s,keyword=%s,出现验证码,结束当前采集***" %
                                       (ip, keyword))
                driver.quit()
                return False, keyword
            except:
                pass
            try:
                WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located(
                        (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
            except:
                error_log.logger.error("***ip=%s,keyword=%s,页面采集错误,结束当前采集***" %
                                       (ip, keyword))
                driver.quit()
                return False, keyword
        divs = driver.find_elements_by_xpath(
            '//div[contains(@class,"s-main-slot")]/div')
        success_num = 0
        error_num = 0
        for div in divs:
            if error_num > 2:
                error_log.logger.error("-----%s采集出错超过%s次,退出采集-----" %
                                       (keyword, error_num))
                all_log.logger.info("-----已采集%s条ASIN-----" % success_num)
                if success_num > 20:
                    return True, keyword
                else:
                    return False, keyword
            try:
                asin = div.get_attribute('data-asin')
            except:
                sleep(1)
                error_num += 1
                continue
            if asin:
                try:
                    div.find_element_by_xpath(
                        './/div[@class="a-row a-spacing-micro"]')
                    sponsored = "1"
                except:
                    sponsored = "0"
                try:
                    price = div.find_element_by_xpath(
                        './/span[@data-a-color="base"]/span').get_attribute(
                            "innerText").replace("$", "")
                except:
                    price = None
                try:
                    img = div.find_element_by_xpath('.//img').get_attribute(
                        'src')
                except:
                    img = None
                try:
                    title = div.find_element_by_xpath(
                        './/h2/a/span').get_attribute("innerText")
                except:
                    title = None
                try:
                    div.find_element_by_xpath(
                        './/span[contains(text(),"by Amazon")]')
                    fba = "1"
                except:
                    fba = "0"
                try:
                    star = div.find_element_by_xpath(
                        './/div[@class="a-row a-size-small"]/span'
                    ).get_attribute('aria-label').replace(
                        " out of 5 stars", "")
                except:
                    star = None
                try:
                    review = div.find_element_by_xpath(
                        './/div[@class="a-row a-size-small"]/span[2]'
                    ).get_attribute('aria-label').replace(",", "")
                except:
                    review = "0"
                pro_url = div.find_element_by_xpath('.//h2/a').get_attribute(
                    "href")
                js = 'window.open("' + pro_url + '")'
                driver.execute_script(js)
                driver.switch_to.window(driver.window_handles[1])
                try:
                    WebDriverWait(driver, 15).until(
                        EC.visibility_of_element_located(
                            (By.ID, 'bylineInfo_feature_div')))
                except:
                    try:
                        WebDriverWait(driver, 5).until(
                            EC.visibility_of_element_located(
                                (By.XPATH,
                                 '//span[contains(text(),"未连接到互联网")]')))
                        error_log.logger.error("网络连接断开")
                        all_log.logger.info("-----已采集%s条ASIN-----" %
                                            success_num)
                        if success_num > 20:
                            return True, keyword
                        else:
                            return False, keyword
                    except:
                        pass
                    error_log.logger.error("-----%s(%s)采集出错-----" %
                                           (keyword, asin))
                    error_num += 1
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                    continue
                try:
                    brand = driver.find_element_by_xpath(
                        '//a[@id="bylineInfo"]').text.replace('Brand: ', '')
                except:
                    brand = None
                try:
                    qa = driver.find_element_by_xpath(
                        '//*[@id="askATFLink"]/span').get_attribute(
                            'innerText').replace(" answered questions",
                                                 "").replace(",", "").replace(
                                                     "+", "")
                except:
                    qa = "0"
                try:
                    seller_id = driver.find_element_by_id(
                        'merchantID').get_attribute("value")
                except:
                    seller_id = None
                br_error_num = 0
                rank_type = 0
                big_rank_txt = ""
                big_rank = 0
                mid_rank_txt = ""
                mid_rank = 0
                small_rank_txt = ""
                small_rank = 0
                while big_rank_txt == "":
                    if rank_type == 1:
                        try:
                            big_rank_txt = driver.find_element_by_xpath(
                                '//div[@id="detailBullets_feature_div"]/following-sibling::ul'
                            ).get_attribute('innerText')
                            if big_rank_txt == "":
                                br_error_num += 1
                        except:
                            br_error_num += 1
                            sleep(1)
                            big_rank_txt = ""
                    else:
                        try:
                            big_rank_txt = getRank(driver, 1)
                        except:
                            try:
                                WebDriverWait(driver, 5).until(
                                    EC.visibility_of_element_located(
                                        (By.ID,
                                         'detailBulletsWrapper_feature_div')))
                                rank_type = 1
                                big_rank_txt = driver.find_element_by_xpath(
                                    '//div[@id="detailBullets_feature_div"]/following-sibling::ul'
                                ).get_attribute('innerText')
                            except:
                                br_error_num += 1
                                sleep(1)
                                big_rank_txt = ""
                    if br_error_num == 3:
                        print("未采集到大类排名%s次,跳过" % br_error_num)
                        break
                if big_rank_txt != "":
                    if rank_type == 0:
                        big_rank_txt = re.sub("\(.*", "", big_rank_txt).strip()
                        big_rank_list = re.findall("\d", big_rank_txt)
                        big_rank = ""
                        for br in big_rank_list:
                            big_rank += br
                    else:
                        for br_i, br in enumerate(big_rank_txt.split("#")):
                            rank_txt = "#" + br.strip()
                            if br_i == 1:
                                big_rank_txt = re.sub("\(.*", "",
                                                      rank_txt).strip()
                                big_rank_list = re.findall("\d", big_rank_txt)
                                big_rank = ""
                                for br_1 in big_rank_list:
                                    big_rank += br_1
                            elif br_i == 2:
                                mid_rank_txt = rank_txt
                                mid_rank_list = re.findall("\d", mid_rank_txt)
                                mid_rank = ""
                                for mr in mid_rank_list:
                                    mid_rank += mr
                            elif br_i == 3:
                                small_rank_txt = rank_txt
                                small_rank_list = re.findall(
                                    "\d", small_rank_txt)
                                small_rank = ""
                                for sr in small_rank_list:
                                    small_rank += sr
                else:
                    big_rank = 0
                if rank_type == 0:
                    try:
                        mid_rank_txt = getRank(driver, 2)
                    except:
                        mid_rank_txt = ""
                    if mid_rank_txt != "":
                        mid_rank_txt = re.sub("\(.*", "", mid_rank_txt).strip()
                        mid_rank_list = re.findall("\d", mid_rank_txt)
                        mid_rank = ""
                        for mr in mid_rank_list:
                            mid_rank += mr
                    else:
                        mid_rank = 0
                    try:
                        small_rank_txt = getRank(driver, 3)
                    except:
                        small_rank_txt = ""
                    if small_rank_txt != "":
                        small_rank_txt = re.sub("\(.*", "",
                                                small_rank_txt).strip()
                        small_rank_list = re.findall("\d", small_rank_txt)
                        small_rank = ""
                        for sr in small_rank_list:
                            small_rank += sr
                    else:
                        small_rank = 0
                try:
                    put_date = driver.find_element_by_xpath(
                        '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[4]/td'
                    ).get_attribute('innerText')
                    if put_date:
                        put_date = datetime.strptime(
                            put_date, '%B %d, %Y').strftime("%Y-%m-%d")
                except:
                    put_date = None
                sql = "insert into tb_amz_pro(keyword,asin,img,sponsored,price,title,fba,star,review,brand,seller_id,qa,big_rank_txt,big_rank,mid_rank_txt,mid_rank,small_rank_txt,small_rank,put_date,add_date) " \
                      "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())"
                sql_param = [
                    keyword, asin, img, sponsored, price, title, fba, star,
                    review, brand, seller_id, qa, big_rank_txt, big_rank,
                    mid_rank_txt, mid_rank, small_rank_txt, small_rank,
                    put_date
                ]
                try:
                    mp = MysqlPool()
                except:
                    try:
                        mp = MysqlPool()
                    except:
                        error_log.logger.error("-----数据库连接失败-----")
                        continue
                try:
                    mp.insert(sql, sql_param)
                    all_log.logger.info("***%s(%s)入库成功***" % (asin, keyword))
                except pymysql.err.IntegrityError:
                    print("重复入库")
                except Exception:
                    error_log.logger.error("入库异常%s" % sql_param)
                success_num += 1
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
        return True, keyword
    except Exception as e:
        traceback.print_exc()
        error_log.logger.error(e)
        return False, keyword
    finally:
        all_log.logger.info("---end---ip=%s,keyword=%s---" % (ip, keyword))
        driver.quit()
Example #9
# -*- coding: utf-8 -*-
# @Time : 2020/10/27 23:41
# @Author : Cj
# @File : test2.py.py
# @Software : PyCharm

from time import sleep
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from db import MysqlPool

if __name__ == "__main__":
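    # Backfill tb_task_asin: split the pipe-delimited asin column of every
    # tb_review_task row into individual ASIN records.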
    mp = MysqlPool()
    sql = "select * from tb_review_task"
    s_list = mp.fetch_all(sql, None)
    for s in s_list:
        asins = str(s['asin']).split("|")
        for asin in asins:
            in_sql = "insert into tb_task_asin(task_id,asin,status) values(%s,%s,1)"
            param = [s['id'], asin]
            mp.insert(in_sql, param)
Example #10
def getData():
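    # Use preset cookies to open the cashbackbase.com seller order list filtered
    # to refunded orders (switching the site country from UK to US if needed),
    # visit each order's detail page for the buyer's PayPal account and profile,
    # and store the results in tb_cbb_customer.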
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches',
                                    ['enable-logging', 'enable-automation'])
    driver = webdriver.Chrome(options=options)
    driver.get("https://www.baidu.com/")
    cookies = [{
        'domain': '.cashbackbase.com',
        'expiry': 1599363219,
        'httpOnly': False,
        'name': '_gat_gtag_UA_119767146_3',
        'path': '/',
        'secure': False,
        'value': '1'
    }, {
        'domain':
        '.cashbackbase.com',
        'expiry':
        1757043117,
        'httpOnly':
        True,
        'name':
        'cash-ab-test',
        'path':
        '/',
        'secure':
        False,
        'value':
        'eyJpdiI6InFRZHJVdDNYUHA0UXpPell2MFZFM2c9PSIsInZhbHVlIjoieXFZSjRManpHb00zR01vaGpiNlY5Zz09IiwibWFjIjoiMDU5ZGY3YjA3OGVlNjM2MWRhMjk1YmIxNDJiZTNhOTkxMzRkN2UzNGVhYzViYjM3NGIzMWViZTU2OGY3MGViMyJ9'
    }, {
        'domain': '.cashbackbase.com',
        'expiry': 1599363180,
        'httpOnly': False,
        'name': '_gat_gtag_UA_119767146_1',
        'path': '/',
        'secure': False,
        'value': '1'
    }, {
        'domain': '.cashbackbase.com',
        'expiry': 1599449559,
        'httpOnly': False,
        'name': '_gid',
        'path': '/',
        'secure': False,
        'value': 'GA1.2.731395315.1599363120'
    }, {
        'domain': 'www.cashbackbase.com',
        'httpOnly': False,
        'name': 'current-page',
        'path': '/',
        'secure': False,
        'value': 'https%3A%2F%2Fwww.cashbackbase.com%2Fseller-central'
    }, {
        'domain':
        '.cashbackbase.com',
        'httpOnly':
        True,
        'name':
        'cashbackbasev6_session',
        'path':
        '/',
        'secure':
        False,
        'value':
        'eyJpdiI6IlNveW42QlNCN29ndXdicE96RVVRTFE9PSIsInZhbHVlIjoidWQ0MmtvS2c5RUg4SVN4YlY4NzNjN2h2Wjl0MGFaOW5CK0FFbU5YOFBoMmJHYlZPQzdmUDFDWEtkU2xEaFppQyIsIm1hYyI6ImM3YTI4YWEyNzEzYTI2ZWIyZTMyOWU5YTc5MzNhMWI5ZTViNGZiZDgzZGYyNmJjNmUxYTUzY2MzZmIzNmUzYmQifQ%3D%3D'
    }, {
        'domain': '.cashbackbase.com',
        'expiry': 1662435159,
        'httpOnly': False,
        'name': '_ga',
        'path': '/',
        'secure': False,
        'value': 'GA1.2.606942347.1599363120'
    }]
    for cookie in cookies:
        driver.add_cookie(cookie_dict=cookie)
    driver.get(
        "https://www.cashbackbase.com/seller/order?key=amz_order_id&value=&status=refunded"
    )
    try:
        WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located((By.ID, 'msg-notify')))
        driver.find_element_by_xpath(
            '//*[@id="msg-notify"]//button[@class="close"]').click()
    except:
        pass
    sleep(0.5)
    country = driver.find_element_by_xpath(
        '//*[@id="navbar"]/ul[1]/li[9]/div/a/img').get_attribute('title')
    if country == "UK":
        driver.find_element_by_xpath(
            '//*[@id="navbar"]/ul[1]/li[9]/div/a').click()
        WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//*[@id="navbar"]/ul[1]/li[9]/div/ul')))
        driver.find_element_by_xpath(
            '//*[@id="navbar"]/ul[1]/li[9]/div/ul/li[1]').click()
        WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located((By.XPATH, '//img[@title="US"]')))
    while True:
        trs = driver.find_elements_by_xpath('//tbody/tr')
        for tr in trs:
            try:
                order_id = tr.find_element_by_xpath('./td').text
                asin = tr.find_element_by_xpath('.//strong').text
                img = tr.find_element_by_xpath('.//img').get_attribute("src")
                url = tr.find_element_by_xpath(
                    './td[last()]/p/a').get_attribute('href')
                js = 'window.open("' + url + '")'
                driver.execute_script(js)
                driver.switch_to.window(driver.window_handles[1])
                WebDriverWait(driver, 5).until(
                    EC.visibility_of_element_located(
                        (By.CLASS_NAME, 'line-1')))
                paypal = driver.find_element_by_class_name(
                    'line-1').text.replace("PayPal Account:", "").strip()
                customer_name = driver.find_element_by_xpath(
                    '//div[@class="deal-info-title"]/span').text
                try:
                    profile = driver.find_element_by_xpath(
                        '//div[@class="deal-info-title"]//a').get_attribute(
                            'href')
                except:
                    profile = ""
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                mp = MysqlPool()
                sql = "insert into tb_cbb_customer(order_id,asin,img,customer_name,paypal,profile,add_time) values(%s,%s,%s,%s,%s,%s,now())"
                param = [order_id, asin, img, customer_name, paypal, profile]
                try:
                    mp.insert(sql, param)
                    print("%s入库成功" % order_id)
                except:
                    print("%s已存在" % order_id)
                sleep(0.5)
            except:
                traceback.print_exc()
                print("-----采集出错-----")
        # Re-check the pagination control after each page: an empty class on
        # the last <li> means there is another page to click through.
        last_class = driver.find_element_by_xpath(
            '//ul[@class="pagination"]/li[last()]').get_attribute('class')
        if not last_class:
            driver.find_element_by_xpath(
                '//ul[@class="pagination"]/li[last()]/a').click()
            sleep(1)
        else:
            break
    driver.quit()
Example #11
def collectData():
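    # With preset asinseed.com login cookies, fill in the missing keyword trend
    # data for selected_products rows, then collect each ASIN's related keywords
    # and their search volumes into asin_searches.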

    ua = UserAgent().chrome
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_argument("--disable-gpu")
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches',
                                    ['enable-logging', 'enable-automation'])
    driver = webdriver.Chrome(options=options)
    driver.get("https://www.baidu.com")
    try:
        WebDriverWait(driver,
                      10).until(EC.visibility_of_element_located(
                          (By.ID, 'su')))
    except:
        all_log.logger.error("---打开百度失败---")
    cookies = [{
        'domain': 'www.asinseed.com',
        'httpOnly': True,
        'name': 'JSESSIONID',
        'path': '/',
        'secure': False,
        'value': 'B0141BDB986A2D91ADCE21BCD1ACA3D2'
    }, {
        'domain':
        'www.asinseed.com',
        'expiry':
        1609251926,
        'httpOnly':
        False,
        'name':
        'asinseed-login-user',
        'path':
        '/',
        'secure':
        False,
        'value':
        '4291529061IrZXNTSoIlHhPKyHGfg/7TMbw6xY7YpCjminsqgfQO1ekWtRZ9/kAs/qVnCI5AMe'
    }, {
        'domain': '.asinseed.com',
        'expiry': 1638195927,
        'httpOnly': False,
        'name': 'ecookie',
        'path': '/',
        'secure': False,
        'value': 'dWcWHqqTU5LL9saj_CN'
    }, {
        'domain': 'www.asinseed.com',
        'expiry': 1606660198,
        'httpOnly': False,
        'name': 'crisp-client%2Fsocket%2Fb43aa37b-4c35-4551-a9d4-ad983960d40c',
        'path': '/',
        'sameSite': 'Lax',
        'secure': False,
        'value': '0'
    }, {
        'domain': '.asinseed.com',
        'expiry': 1669731927,
        'httpOnly': False,
        'name': '_ga',
        'path': '/',
        'secure': False,
        'value': 'GA1.2.1615561945.1606659387'
    }, {
        'domain': '.asinseed.com',
        'expiry': 1622427931,
        'httpOnly': False,
        'name':
        'crisp-client%2Fsession%2Fb43aa37b-4c35-4551-a9d4-ad983960d40c',
        'path': '/',
        'sameSite': 'Lax',
        'secure': False,
        'value': 'session_f9e04788-6bf4-48fa-8a09-883989976e41'
    }, {
        'domain': '.asinseed.com',
        'expiry': 1606659960,
        'httpOnly': False,
        'name': '_gat_gtag_UA_125163434_1',
        'path': '/',
        'secure': False,
        'value': '1'
    }, {
        'domain': '.asinseed.com',
        'expiry': 1606746327,
        'httpOnly': False,
        'name': '_gid',
        'path': '/',
        'secure': False,
        'value': 'GA1.2.1043797262.1606659387'
    }, {
        'domain': '.asinseed.com',
        'expiry': 1922019384,
        'httpOnly': False,
        'name': 'w_guest',
        'path': '/',
        'secure': False,
        'value': 'NpicHiupaa1M_201129-223501'
    }]
    for cookie in cookies:
        driver.add_cookie(cookie_dict=cookie)
    sleep(0.5)
    mp = MysqlPool()
    trend_sql = "select t.* from selected_products t where t.trend_data is null or t.trend_data=''"
    trend_data_list = mp.fetch_all(trend_sql, None)
    for trend_data in trend_data_list:
        driver.get("https://www.asinseed.com/en/US?q=%s" %
                   trend_data['keyword'])
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//div[@class="morris-table-inline"]')))
        trs = driver.find_elements_by_xpath(
            '//div[@class="morris-table-inline"]/../..')
        searches = ''
        for tr in trs:
            if trend_data['keyword'] == tr.find_element_by_xpath(
                    './td[2]').text:
                searches = eval(
                    tr.find_element_by_xpath('./td[3]/div').get_attribute(
                        "data-y"))
        if searches == '':
            searches = eval(
                driver.find_element_by_xpath(
                    '//div[@class="morris-table-inline"]').get_attribute(
                        "data-y"))
        update_sql = "update selected_products set trend_data=%s where id=%s"
        update_param = [str(searches), trend_data['id']]
        mp.insert(update_sql, update_param)
        all_log.logger.info("---%s趋势采集成功---" % trend_data['asin'])
        sleep(1)

    asin_sql = "select t.* from selected_products t where t.id not in (select t2.main_id from asin_searches t2 where t2.main_id=t.id)"
    asin_data_list = mp.fetch_all(asin_sql, None)
    for asin_data in asin_data_list:
        driver.get("https://www.asinseed.com/en/US?q=%s" % asin_data['asin'])
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//td[@class="text-right"]')))
        trs = driver.find_elements_by_xpath('//td[@class="text-right"]/..')
        insert_sql = "insert into asin_searches(main_id,asin,keyword,searches,add_time) values"
        update_param = []
        for tr in trs:
            keyword = tr.find_element_by_xpath('./td').text
            searches = tr.find_element_by_xpath('./td[2]').text.replace(
                ",", "")
            if searches is None or searches == "":
                searches = 0
            insert_sql += "(%s,%s,%s,%s,now()),"
            update_param.append(asin_data['id'])
            update_param.append(asin_data['asin'])
            update_param.append(keyword)
            update_param.append(searches)
        if insert_sql.endswith(","):
            insert_sql = insert_sql[:-1]
        mp.insert(insert_sql, update_param)
        all_log.logger.info("---%s关联关键词成功---" % asin_data['asin'])
        sleep(1)
Example #12
def getProData():
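    # Batch crawl up to 2000 unprocessed keywords from amz123_keyword_left9:
    # restart Chrome with a random proxy-auth extension for each keyword, search
    # Amazon, skip sponsored listings and products with more than 70 reviews or
    # a price above $40, then open the remaining product pages for details.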
    mp = MysqlPool()
    data_sql = "select * from amz123_keyword_left9 where status is null or status=0 order by id limit 2000"
    data_list = mp.fetch_all(data_sql, None)
    for data in data_list:
        os.system("taskkill /f /im chrome.exe /t")
        proxy = "C:\\py_file\\proxyauth\\%s" % os.listdir(
            "C:\\py_file\\proxyauth")[random.randint(0, 4)]
        # proxy = 1
        all_log.logger.info("---ip=%s,keyword=%s开始采集---" %
                            (proxy, data['keyword']))
        ua = UserAgent().chrome
        options = webdriver.ChromeOptions()
        options.add_extension(proxy)
        options.add_argument("user-agent=" + ua)
        # options.add_argument("--start-maximized")
        # options.add_argument("--headless")
        options.add_argument('blink-settings=imagesEnabled=false')
        options.add_argument("--disable-gpu")
        options.add_argument("log-level=3")
        options.add_experimental_option('useAutomationExtension', False)
        options.add_experimental_option(
            'excludeSwitches', ['enable-logging', 'enable-automation'])
        driver = webdriver.Chrome(options=options)
        driver.set_window_size(600, 600)
        cookies = [{
            'domain':
            'www.amazon.com',
            'expiry':
            1632329890,
            'httpOnly':
            False,
            'name':
            'csm-hit',
            'path':
            '/',
            'secure':
            False,
            'value':
            'tb:s-TW8A7SAQXE5512HEHN3F|1602089889292&t:1602089890223&adb:adblk_no'
        }, {
            'domain': '.amazon.com',
            'expiry': 2082787202,
            'httpOnly': False,
            'name': 'lc-main',
            'path': '/',
            'secure': False,
            'value': 'en_US'
        }, {
            'domain':
            '.amazon.com',
            'expiry':
            1633625853,
            'httpOnly':
            False,
            'name':
            'session-token',
            'path':
            '/',
            'secure':
            True,
            'value':
            '3QBwaC0p4MPUmPmkTggA/5KFuQV86y0YLrdo7ONa0Jj32bh7dV8URjqYgcRBuBz3ADk9Svq0h89qS1OuCpZy+uA1IYfO1TNpiYJaP6z6zHy2O/AO4FlwdTphm7+S2ahm1LBYNUTY+xDrwGQmgF8u6Dqx7nXqXJNSOkBCdVrQZ6a30LnhBpQgwinDvWxMFeKNsbK8LnDO+tARUPQiRm0va3zvb4gqiUAPSBe8RxIeunmQvASbwAR4Yc1WHotY6utU'
        }, {
            'domain': '.amazon.com',
            'expiry': 1633625894,
            'httpOnly': False,
            'name': 'ubid-main',
            'path': '/',
            'secure': True,
            'value': '134-4542133-6572654'
        }, {
            'domain': '.amazon.com',
            'expiry': 1633625894,
            'httpOnly': False,
            'name': 'session-id-time',
            'path': '/',
            'secure': False,
            'value': '2082787201l'
        }, {
            'domain': '.amazon.com',
            'expiry': 1633625846,
            'httpOnly': False,
            'name': 'i18n-prefs',
            'path': '/',
            'secure': False,
            'value': 'USD'
        }, {
            'domain': '.amazon.com',
            'expiry': 1633625894,
            'httpOnly': False,
            'name': 'session-id',
            'path': '/',
            'secure': True,
            'value': '132-8928912-9834042'
        }]
        driver.get("https://www.baidu.com")
        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.ID, 'su')))
        except:
            error_log.logger.error("---%s打开百度失败---" % proxy)
            continue
        for cookie in cookies:
            driver.add_cookie(cookie_dict=cookie)
        sleep(0.5)
        driver.get("https://www.amazon.com/s?k=" + data['keyword'] +
                   "&ref=nb_sb_noss")
        try:
            WebDriverWait(driver, 15).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
        except:
            try:
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located(
                        (By.XPATH,
                         '//h4[contains(text(),"characters you see")]')))
                error_log.logger.error("***ip=%s,keyword=%s,出现验证码,结束当前采集***" %
                                       (proxy, data['keyword']))
                driver.quit()
                continue
            except:
                pass
            try:
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located(
                        (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
            except:
                error_log.logger.error("***ip=%s,keyword=%s,页面采集错误,结束当前采集***" %
                                       (proxy, data['keyword']))
                driver.quit()
                continue
        divs = driver.find_elements_by_xpath(
            '//div[contains(@class,"s-main-slot")]/div')
        try:
            success_num = 0
            update_sql = "update amz123_keyword_left9 set status=1 where id=%s"
            for div in divs:
                asin = div.get_attribute('data-asin')
                if asin and str(asin).startswith("B"):
                    try:
                        div.find_element_by_xpath(
                            './/div[@class="a-row a-spacing-micro"]')
                        sponsored = "1"
                    except:
                        sponsored = "0"
                    try:
                        price = div.find_element_by_xpath(
                            './/span[@data-a-color="base"]/span'
                        ).get_attribute("innerText").replace("$", "")
                    except:
                        price = None
                    try:
                        img1 = div.find_element_by_xpath(
                            './/img').get_attribute('src')
                    except:
                        img1 = None
                    try:
                        title = div.find_element_by_xpath(
                            './/h2/a/span').get_attribute("innerText")
                    except:
                        title = None
                    try:
                        div.find_element_by_xpath(
                            './/span[contains(text(),"by Amazon")]')
                        fba = "1"
                    except:
                        fba = "0"
                    try:
                        star = div.find_element_by_xpath(
                            './/div[@class="a-row a-size-small"]/span'
                        ).get_attribute('aria-label').replace(
                            " out of 5 stars", "")
                    except:
                        star = None
                    try:
                        review = div.find_element_by_xpath(
                            './/div[@class="a-row a-size-small"]/span[2]'
                        ).get_attribute('aria-label').replace(",", "")
                    except:
                        review = "0"
                    try:
                        if int(review) > 70:
                            all_log.logger.info("---%s评价数为%s,跳过---" %
                                                (asin, review))
                            continue
                        if float(price) > 40:
                            all_log.logger.info("---%s价格为%s,跳过---" %
                                                (asin, price))
                            continue
                        if sponsored == "1":
                            all_log.logger.info("---%s为广告,跳过---" % asin)
                            continue
                    except:
                        all_log.logger.info("---%s过滤报错,跳过---" % asin)
                        continue
                    pro_url = div.find_element_by_xpath(
                        './/h2/a').get_attribute("href")
                    js = 'window.open("' + pro_url + '")'
                    driver.execute_script(js)
                    driver.switch_to.window(driver.window_handles[1])
                    try:
                        WebDriverWait(driver, 15).until(
                            EC.visibility_of_element_located(
                                (By.ID, 'bylineInfo_feature_div')))
                        try:
                            brand = driver.find_element_by_xpath(
                                '//a[@id="bylineInfo"]').text.replace(
                                    'Brand: ',
                                    '').replace('Visit the ',
                                                '').replace('Store',
                                                            '').strip()
                        except:
                            brand = None
                        try:
                            store = filter_str(
                                driver.find_element_by_id(
                                    'sellerProfileTriggerId').text)
                        except:
                            store = None
                        try:
                            qa = driver.find_element_by_xpath(
                                '//*[@id="askATFLink"]/span').get_attribute(
                                    'innerText').replace(
                                        " answered questions", "")
                        except:
                            qa = "0"
                        try:
                            seller_id = driver.find_element_by_id(
                                'merchantID').get_attribute("value")
                        except:
                            seller_id = None
                        try:
                            seller_num = driver.find_element_by_xpath(
                                '//div[@id="olp-upd-new-freeshipping-threshold"]//a/span'
                            ).text
                            seller_num = re.findall("\((.*)\)", seller_num)[0]
                        except:
                            seller_num = 0
                        br_error_num = 0
                        rank_type = 0
                        big_rank_txt = ""
                        big_rank = 0
                        mid_rank_txt = ""
                        mid_rank = 0
                        small_rank_txt = ""
                        small_rank = 0
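                        # Retry reading the Best Sellers Rank up to 3 times; rank_type 0 uses the
                        # getRank helper, rank_type 1 falls back to the detail-bullets list layout.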
                        while big_rank_txt == "":
                            if rank_type == 1:
                                try:
                                    big_rank_txt = driver.find_element_by_xpath(
                                        '//div[@id="detailBullets_feature_div"]/following-sibling::ul'
                                    ).get_attribute('innerText')
                                    if big_rank_txt == "":
                                        br_error_num += 1
                                except:
                                    br_error_num += 1
                                    sleep(1)
                                    big_rank_txt = ""
                            else:
                                try:
                                    big_rank_txt = getRank(driver, 1)
                                except:
                                    try:
                                        WebDriverWait(driver, 5).until(
                                            EC.visibility_of_element_located((
                                                By.ID,
                                                'detailBulletsWrapper_feature_div'
                                            )))
                                        rank_type = 1
                                        big_rank_txt = driver.find_element_by_xpath(
                                            '//div[@id="detailBullets_feature_div"]/following-sibling::ul'
                                        ).get_attribute('innerText')
                                    except:
                                        br_error_num += 1
                                        sleep(1)
                                        big_rank_txt = ""
                            if br_error_num == 3:
                                all_log.logger.error("%s未采集到大类排名%s次" %
                                                     (asin, br_error_num))
                                big_rank_txt = ""
                                break
                        if big_rank_txt != "":
                            if rank_type == 0:
                                big_rank_txt = re.sub("\(.*", "",
                                                      big_rank_txt).strip()
                                big_rank_list = re.findall("\d", big_rank_txt)
                                big_rank = ""
                                for br in big_rank_list:
                                    big_rank += br
                            else:
                                for br_i, br in enumerate(
                                        big_rank_txt.split("#")):
                                    rank_txt = "#" + br.strip()
                                    if br_i == 1:
                                        big_rank_txt = re.sub(
                                            "\(.*", "", rank_txt).strip()
                                        big_rank_list = re.findall(
                                            "\d", big_rank_txt)
                                        big_rank = ""
                                        for br_1 in big_rank_list:
                                            big_rank += br_1
                                    elif br_i == 2:
                                        mid_rank_txt = rank_txt
                                        mid_rank_list = re.findall(
                                            "\d", mid_rank_txt)
                                        mid_rank = ""
                                        for mr in mid_rank_list:
                                            mid_rank += mr
                                    elif br_i == 3:
                                        small_rank_txt = rank_txt
                                        small_rank_list = re.findall(
                                            "\d", small_rank_txt)
                                        small_rank = ""
                                        for sr in small_rank_list:
                                            small_rank += sr
                        else:
                            big_rank = 0
                        if rank_type == 0:
                            try:
                                mid_rank_txt = getRank(driver, 2)
                            except:
                                mid_rank_txt = ""
                            if mid_rank_txt != "":
                                mid_rank_txt = re.sub("\(.*", "",
                                                      mid_rank_txt).strip()
                                mid_rank_list = re.findall("\d", mid_rank_txt)
                                mid_rank = ""
                                for mr in mid_rank_list:
                                    mid_rank += mr
                            else:
                                mid_rank = 0
                            try:
                                small_rank_txt = getRank(driver, 3)
                            except:
                                small_rank_txt = ""
                            if small_rank_txt != "":
                                small_rank_txt = re.sub(
                                    "\(.*", "", small_rank_txt).strip()
                                small_rank_list = re.findall(
                                    "\d", small_rank_txt)
                                small_rank = ""
                                for sr in small_rank_list:
                                    small_rank += sr
                            else:
                                small_rank = 0
                        try:
                            put_date = driver.find_element_by_xpath(
                                '//th[contains(text(),"Date First Available")]/following-sibling::td[1]'
                            ).get_attribute('innerText')
                            if put_date:
                                put_date = datetime.strptime(
                                    put_date, '%B %d, %Y').strftime("%Y-%m-%d")
                        except:
                            put_date = None
                        if big_rank == '' or int(
                                big_rank) == 0 or int(big_rank) > 15000:
                            all_log.logger.info("---%s大类排名为%s,跳过---" %
                                                (asin, big_rank))
                            driver.close()
                            driver.switch_to.window(driver.window_handles[0])
                            continue
                        img2 = ''
                        img3 = ''
                        img2_num = 0
                        img2_click_num = 0
                        img3_num = 0
                        img3_click_num = 0
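                        # Click through the thumbnail strip so the 2nd and 3rd gallery images
                        # load, bounded by separate click and wait retry counters.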
                        while img2 == '' and img2_click_num < 40 and img2_num < 5:
                            sleep(0.5)
                            try:
                                driver.find_element_by_xpath(
                                    '//div[@id="altImages"]/ul//li[@class="a-spacing-small template"]/following-sibling::li[2]'
                                ).click()
                            except:
                                img2_click_num += 1
                            try:
                                WebDriverWait(driver, 5).until(
                                    EC.visibility_of_element_located(
                                        (By.XPATH,
                                         '//li[contains(@class,"itemNo1")]')))
                                img2 = driver.find_element_by_xpath(
                                    '//li[contains(@class,"itemNo1")]//img'
                                ).get_attribute("src")
                            except:
                                img2_num += 1
                        while img3 == '' and img3_click_num < 40 and img3_num < 5:
                            sleep(0.5)
                            try:
                                driver.find_element_by_xpath(
                                    '//div[@id="altImages"]/ul//li[@class="a-spacing-small template"]/following-sibling::li[3]'
                                ).click()
                            except:
                                img3_click_num += 1
                            try:
                                WebDriverWait(driver, 5).until(
                                    EC.visibility_of_element_located(
                                        (By.XPATH,
                                         '//li[contains(@class,"itemNo2")]')))
                                img3 = driver.find_element_by_xpath(
                                    '//li[contains(@class,"itemNo2")]//img'
                                ).get_attribute("src")
                            except:
                                img3_num += 1
                        sql = "insert into tb_amz_pro_1129(keyword,asin,img1,img2,img3,sponsored,price,title,fba,star,review,brand,store,qa,seller_id,seller_num," \
                              "big_rank_txt,big_rank,mid_rank_txt,mid_rank,small_rank_txt,small_rank,put_date,add_date) " \
                              "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())"
                        sql_param = [
                            data['keyword'], asin, img1, img2, img3, sponsored,
                            price, title, fba, star, review, brand, store, qa,
                            seller_id, seller_num, big_rank_txt, big_rank,
                            mid_rank_txt, mid_rank, small_rank_txt, small_rank,
                            put_date
                        ]
                        try:
                            mp.insert(sql, sql_param)
                            all_log.logger.info("-----%s(%s)入库成功-----" %
                                                (asin, data['keyword']))
                            success_num += 1
                        except IntegrityError:
                            all_log.logger.info("-----%s(%s)已存在-----" %
                                                (asin, data['keyword']))
                            success_num += 1
                        except Exception as e:
                            error_log.logger.error("-----%s(%s)入库失败%s-----" %
                                                   (asin, data['keyword'], e))
                    except:
                        traceback.print_exc()
                        error_log.logger.error("-----%s---%s采集出错-----" %
                                               (data['keyword'], proxy))
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
            mp.update(update_sql, (data['id'], ))
        except:
            traceback.print_exc()
            error_log.logger.error("-----%s---%s出错-----" %
                                   (data['keyword'], proxy))
        finally:
            all_log.logger.info("---end---ip=%s,keyword=%s---" %
                                (proxy, data['keyword']))
            driver.quit()
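The getRank helper called above is not included in this snippet. A minimal sketch, assuming it reads the level-th Best Sellers Rank entry from the product-details table (the same XPath pattern used in Ejemplo n.º 15 below), could look like this:

def getRank(driver, level):
    # Hypothetical sketch of the missing helper: return the level-th
    # "#... in <category>" entry from the product-details table; a
    # NoSuchElementException here is what the caller's except branches absorb.
    return driver.find_element_by_xpath(
        '//table[@id="productDetails_detailBullets_sections1"]'
        '/tbody/tr[3]/td/span/span[%d]' % level).get_attribute('innerText')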
Ejemplo n.º 13
0
def collectionGroup(acc, keywords):
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + acc['user-agent'])
    prefs = {'profile.default_content_setting_values': {'notifications': 2}}
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--start-maximized")
    # options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get("https://www.baidu.com/")
    if isinstance(acc['cookies'], str):
        cookie_list = eval(acc['cookies'])  # cookies were stored as a stringified list of dicts
    else:
        cookie_list = acc['cookies']
    for cookie in cookie_list:
        driver.add_cookie(cookie_dict=cookie)
    for keyword in keywords:
        # public groups & closed groups (base64-encoded Facebook search filter payloads)
        group_types = [
            "&epa=FILTERS&filters=eyJncm91cHNfc2hvd19vbmx5Ijoie1wibmFtZVwiOlwicHVibGljX2dyb3Vwc1wiLFwiYXJnc1wiOlwiXCJ9In0%3D",
            "&epa=FILTERS&filters=eyJncm91cHNfc2hvd19vbmx5Ijoie1wibmFtZVwiOlwiY2xvc2VkX2dyb3Vwc1wiLFwiYXJnc1wiOlwiXCJ9In0%3D"
        ]
        for group_num, group in enumerate(group_types):
            driver.get("https://www.facebook.com/search/groups/?q=%s" %
                       keyword + group)
            WebDriverWait(driver, 5).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//div[@id="BrowseResultsContainer"]')))
            for i in range(15):
                ActionChains(driver).send_keys(Keys.END).perform()
                sleep(1.5)
                if i > 10:
                    try:
                        WebDriverWait(driver, 5).until(
                            EC.visibility_of_element_located(
                                (By.ID, 'browse_end_of_results_footer')))
                        break
                    except:
                        pass
            divs = driver.find_elements_by_xpath(
                '//div[@id="BrowseResultsContainer"]/../div')
            for j, div in enumerate(divs):
                if j == 0:
                    proDivs = div.find_elements_by_xpath('./div')
                elif j == 1:
                    proDivs = div.find_elements_by_xpath('./div/div/div')
                else:
                    proDivs = div.find_elements_by_xpath('./div/div')
                if len(proDivs) <= 1:
                    break
                for pro in proDivs:
                    pro_url = pro.find_element_by_tag_name('a').get_attribute(
                        "href")
                    if group_num == 0:
                        pro_url = re.sub("\?.*", "members/", pro_url)
                    js = 'window.open("' + pro_url + '")'
                    driver.execute_script(js)
                    driver.switch_to.window(driver.window_handles[1])
                    sleep(1)  # brief pause while the group tab starts loading
                    try:
                        if group_num == 0:
                            WebDriverWait(driver, 5).until(
                                EC.visibility_of_element_located(
                                    (By.XPATH,
                                     '//div[@id="groupsMemberBrowser"]')))
                        else:
                            WebDriverWait(driver, 5).until(
                                EC.visibility_of_element_located(
                                    (By.XPATH,
                                     '//div[@id="content_container"]')))
                        group_name = driver.find_element_by_xpath(
                            '//div[@data-testid="group_sidebar_nav"]//a').text
                        group_admin = ""
                        if group_num == 0:
                            nums = driver.find_element_by_xpath(
                                '//div[@id="groupsMemberBrowser"]//span'
                            ).text.replace(",", "")
                            admins_div = driver.find_elements_by_xpath(
                                '//div[@data-testid="GroupAdminGrid"]/ul/div')
                            for admin_div in admins_div:
                                group_admin += admin_div.find_element_by_tag_name(
                                    'img').get_attribute("aria-label") + "|"
                        else:
                            nums = driver.find_element_by_xpath(
                                '//div[@id="pagelet_group_about"]/div[2]//span'
                            ).text.replace("成员 · ", "").replace(",", "")
                            admins_div = driver.find_elements_by_xpath(
                                '//div[@id="pagelet_group_about"]/div[2]/div[2]/div[2]/a'
                            )
                            for admin_div in admins_div:
                                group_admin += admin_div.find_element_by_tag_name(
                                    'img').get_attribute("aria-label") + "|"
                        if int(nums) < 4000:
                            error_log.logger.error("-----%s人数为%s,跳过-----" %
                                                   (group_name, nums))
                            driver.close()
                            driver.switch_to.window(driver.window_handles[0])
                            continue
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
                        mp = MysqlPool()
                        sql = "insert into tb_group(name,nums,admins,type,url,add_time) values(%s,%s,%s,%s,%s,now())"
                        param = [
                            filter_str(group_name), nums,
                            filter_str(group_admin), group_num, pro_url
                        ]
                        try:
                            mp.insert(sql, param)
                            all_log.logger.info("-----%s入库成功-----" %
                                                group_name)
                        except:
                            error_log.logger.error("-----%s已入库,跳过-----" %
                                                   group_name)
                        sleep(1)
                    except:
                        traceback.print_exc()
                        error_log.logger.error("*****获取%s信息出错*****" % pro_url)
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
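collectionGroup is driven by an account record captured elsewhere; only the 'user-agent' and 'cookies' keys are actually read. A hedged sketch of the assumed shape of acc and a call (all field values are illustrative):

# Illustrative only: the real account record is loaded elsewhere (e.g. from MySQL),
# and the cookie dicts would be the ones exported from a logged-in Facebook session.
acc = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'cookies': [{'name': 'c_user', 'value': '100000000000000', 'domain': '.facebook.com'}],
}
collectionGroup(acc, ['garden tools', 'pet supplies'])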
Ejemplo n.º 14
0
def getSellerId(asin_list, process_name):
    ua = UserAgent(verify_ssl=False).chrome
    ua = re.sub("Chrome/\d{2}", "Chrome/" + str(random.randint(49, 85)), ua)
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    url = "http://ip.ipjldl.com/index.php/api/entry?method=proxyServer.tiqu_api_url&packid=0&fa=0&dt=0&groupid=0&fetch_key=&qty=1&time=1&port=1&format=json&ss=5&css=&dt=0&pro=&city=&usertype=6"
    options.add_argument("--start-maximized")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('excludeSwitches',
                                    ['enable-logging', 'enable-automation'])
    driver = None
    error_url = ""
    while True:
        asin_url = ""
        try:
            ip_data = urllib.request.urlopen(url).read()
            print(ip_data)
            json_list = list(json.loads(ip_data)['data'])
            ip = "%s:%s" % (json_list[0]['IP'], json_list[0]['Port'])
            options.add_argument(('--proxy-server=http://' + ip))
            driver = webdriver.Chrome(options=options)
            driver.get("https://www.baidu.com")
            WebDriverWait(driver, 15).until(
                EC.visibility_of_element_located((By.ID, 'su')))
            cookies = [{
                'domain': 'www.amazon.com',
                'expiry': 1633407103,
                'httpOnly': False,
                'name': 'csm-hit',
                'path': '/',
                'secure': False,
                'value': 'tb:s-Z135Q1Y24PZMTTNF8DDZ|1603167100870&t:1603167103192&adb:adblk_no'
            }, {
                'domain': '.amazon.com',
                'expiry': 2082787201,
                'httpOnly': False,
                'name': 'lc-main',
                'path': '/',
                'secure': False,
                'value': 'en_US'
            }, {
                'domain': '.amazon.com',
                'expiry': 1634703091,
                'httpOnly': False,
                'name': 'session-token',
                'path': '/',
                'secure': True,
                'value': 'fxzmNhMySgaV1gVga7nbDig972AmQGFxhFgyEZISkgU6//KEtZqCk54TxZV/ttWlmA+5gxnaUgZzFBKseUNhVdQgTHbVI7sDvNIFguqFFGDHATp9swCwfYcd3ViRzafe3d9YkzdIfga0G4kRm5SyB8MRExx3AnOc6jNxeMYPpYxuhaZX8Pe3viZFX6OK551eUxMz5vMEzje8b4ugkSCVV5OKFaJsgqL/iFHyHqnntlRSPPiPwK1eZ2gUicC09p3Q'
            }, {
                'domain': '.amazon.com',
                'expiry': 1634703109,
                'httpOnly': False,
                'name': 'session-id-time',
                'path': '/',
                'secure': False,
                'value': '2082787201l'
            }, {
                'domain': '.amazon.com',
                'httpOnly': False,
                'name': 'skin',
                'path': '/',
                'secure': False,
                'value': 'noskin'
            }, {
                'domain': '.amazon.com',
                'expiry': 1634703109,
                'httpOnly': False,
                'name': 'ubid-main',
                'path': '/',
                'secure': True,
                'value': '130-0463586-1564060'
            }, {
                'domain': '.amazon.com',
                'expiry': 1634703086,
                'httpOnly': False,
                'name': 'i18n-prefs',
                'path': '/',
                'secure': False,
                'value': 'USD'
            }, {
                'domain': '.amazon.com',
                'expiry': 1634703109,
                'httpOnly': False,
                'name': 'session-id',
                'path': '/',
                'secure': True,
                'value': '147-0153722-0121323'
            }]
            for cookie in cookies:
                driver.add_cookie(cookie_dict=cookie)
            while len(asin_list) > 1:
                print("---第%s个线程剩余asin数量%s---" %
                      (process_name + 1, len(asin_list)))
                sleep(1)
                asin_url = "https://www.amazon.com/dp/" + str(
                    asin_list[0]['asin'])
                driver.get(asin_url)
                try:
                    WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located(
                            (By.ID, 'bylineInfo_feature_div')))
                except:
                    WebDriverWait(driver, 5).until(
                        EC.title_contains('Page Not Found'))
                    error_log.logger.error("%s page not found" %
                                           asin_list[0]['asin'])
                    asin_list.pop(0)
                    continue
                sleep(0.5)
                seller_id = driver.find_element_by_id(
                    'merchantID').get_attribute("value")
                if seller_id:
                    insert_sql = "insert into tb_seller_id(seller_id,add_time) values(%s,now())"
                    insert_param = [seller_id]
                    try:
                        insert_mp = MysqlPool()
                        insert_mp.insert(insert_sql, insert_param)
                    except pymysql.err.IntegrityError:
                        pass
                asin_list.pop(0)
        except:
            error_log.logger.error("***第%s个线程%s报错***" %
                                   (process_name + 1, asin_url))
            if error_url == asin_url:
                asin_list.pop(0)
            else:
                error_url = asin_url
            if driver:
                driver.quit()
            continue
        break
    all_log.logger.info("---第%s个线程运行结束---" % (process_name + 1))
Ejemplo n.º 15
0
def getProData(ip, keyword):
    all_log.logger.info(
        "***start***ip=%s,keyword=%s,%s***" %
        (ip, keyword, datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
    ua = UserAgent().chrome
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=" + ua)
    if ip:
        options.add_argument(('--proxy-server=http://' + ip))
    options.add_argument("--start-maximized")
    # options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    cookies = [{
        'domain': 'www.amazon.com',
        'expiry': 1632329890,
        'httpOnly': False,
        'name': 'csm-hit',
        'path': '/',
        'secure': False,
        'value': 'tb:s-TW8A7SAQXE5512HEHN3F|1602089889292&t:1602089890223&adb:adblk_no'
    }, {
        'domain': '.amazon.com',
        'expiry': 2082787202,
        'httpOnly': False,
        'name': 'lc-main',
        'path': '/',
        'secure': False,
        'value': 'en_US'
    }, {
        'domain': '.amazon.com',
        'expiry': 1633625853,
        'httpOnly': False,
        'name': 'session-token',
        'path': '/',
        'secure': True,
        'value': '3QBwaC0p4MPUmPmkTggA/5KFuQV86y0YLrdo7ONa0Jj32bh7dV8URjqYgcRBuBz3ADk9Svq0h89qS1OuCpZy+uA1IYfO1TNpiYJaP6z6zHy2O/AO4FlwdTphm7+S2ahm1LBYNUTY+xDrwGQmgF8u6Dqx7nXqXJNSOkBCdVrQZ6a30LnhBpQgwinDvWxMFeKNsbK8LnDO+tARUPQiRm0va3zvb4gqiUAPSBe8RxIeunmQvASbwAR4Yc1WHotY6utU'
    }, {
        'domain': '.amazon.com',
        'expiry': 1633625894,
        'httpOnly': False,
        'name': 'ubid-main',
        'path': '/',
        'secure': True,
        'value': '134-4542133-6572654'
    }, {
        'domain': '.amazon.com',
        'expiry': 1633625894,
        'httpOnly': False,
        'name': 'session-id-time',
        'path': '/',
        'secure': False,
        'value': '2082787201l'
    }, {
        'domain': '.amazon.com',
        'expiry': 1633625846,
        'httpOnly': False,
        'name': 'i18n-prefs',
        'path': '/',
        'secure': False,
        'value': 'USD'
    }, {
        'domain': '.amazon.com',
        'expiry': 1633625894,
        'httpOnly': False,
        'name': 'session-id',
        'path': '/',
        'secure': True,
        'value': '132-8928912-9834042'
    }]
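    # Same cookie-seeding trick as above: load a lightweight page first, inject the
    # captured amazon.com cookies, then hit the Amazon search page.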
    driver.get("https://www.baidu.com")
    sleep(0.5)
    for cookie in cookies:
        driver.add_cookie(cookie_dict=cookie)
    driver.get("https://www.amazon.com/s?k=" + keyword + "&ref=nb_sb_noss")
    try:
        WebDriverWait(driver, 15).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
    except:
        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//h4[contains(text(),"characters you see")]')))
            error_log.logger.error("***ip=%s,keyword=%s,出现验证码,结束当前采集***" %
                                   (ip, keyword))
            driver.quit()
            return False, keyword
        except:
            pass
        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//div[contains(@class,"s-main-slot")]')))
        except:
            error_log.logger.error("***ip=%s,keyword=%s,页面采集错误,结束当前采集***" %
                                   (ip, keyword))
            driver.quit()
            return False, keyword
    divs = driver.find_elements_by_xpath(
        '//div[contains(@class,"s-main-slot")]/div')
    try:
        for div in divs:
            asin = div.get_attribute('data-asin')
            if asin:
                try:
                    div.find_element_by_xpath(
                        './/div[@class="a-row a-spacing-micro"]')
                    sponsored = "1"
                except:
                    sponsored = "0"
                try:
                    price = div.find_element_by_xpath(
                        './/span[@data-a-color="base"]/span').get_attribute(
                            "innerText").replace("$", "")
                except:
                    price = None
                try:
                    img = div.find_element_by_xpath('.//img').get_attribute(
                        'src')
                except:
                    img = None
                try:
                    title = div.find_element_by_xpath(
                        './/h2/a/span').get_attribute("innerText")
                except:
                    title = None
                try:
                    div.find_element_by_xpath(
                        './/span[contains(text(),"by Amazon")]')
                    fba = "1"
                except:
                    fba = "0"
                try:
                    star = div.find_element_by_xpath(
                        './/div[@class="a-row a-size-small"]/span'
                    ).get_attribute('aria-label').replace(
                        " out of 5 stars", "")
                except:
                    star = None
                try:
                    review = div.find_element_by_xpath(
                        './/div[@class="a-row a-size-small"]/span[2]'
                    ).get_attribute('aria-label').replace(",", "")
                except:
                    review = "0"
                pro_url = div.find_element_by_xpath('.//h2/a').get_attribute(
                    "href")
                js = 'window.open("' + pro_url + '")'
                driver.execute_script(js)
                driver.switch_to.window(driver.window_handles[1])
                try:
                    WebDriverWait(driver, 15).until(
                        EC.visibility_of_element_located(
                            (By.ID, 'bylineInfo_feature_div')))
                    brand = driver.find_element_by_xpath(
                        '//a[@id="bylineInfo"]').text.replace('Brand: ', '')
                    try:
                        qa = driver.find_element_by_xpath(
                            '//*[@id="askATFLink"]/span').get_attribute(
                                'innerText').replace(" answered questions", "")
                    except:
                        qa = "0"
                    try:
                        big_rank_txt = driver.find_element_by_xpath(
                            '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[3]/td/span/span[1]'
                        ).get_attribute('innerText')
                    except:
                        try:
                            big_rank_txt = driver.find_element_by_xpath(
                                '//*[@id="productDetails_detailBullets_sections1"]/tbody/tr[3]/td/span/span[1]'
                            ).get_attribute('innerText')
                        except:
                            big_rank_txt = ""
                    if big_rank_txt:
                        big_rank_txt = re.sub("\(.*", "", big_rank_txt).strip()
                        big_rank_list = re.findall("\d", big_rank_txt)
                        big_rank = ""
                        for br in big_rank_list:
                            big_rank += br
                    else:
                        big_rank = 0
                    try:
                        mid_rank_txt = driver.find_element_by_xpath(
                            '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[3]/td/span/span[2]'
                        ).get_attribute('innerText')
                    except:
                        mid_rank_txt = ""
                    if mid_rank_txt:
                        mid_rank_txt = re.sub("\(.*", "", mid_rank_txt).strip()
                        mid_rank_list = re.findall("\d", mid_rank_txt)
                        mid_rank = ""
                        for mr in mid_rank_list:
                            mid_rank += mr
                    else:
                        mid_rank = 0
                    try:
                        small_rank_txt = driver.find_element_by_xpath(
                            '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[3]/td/span/span[3]'
                        ).get_attribute('innerText')
                    except:
                        small_rank_txt = ""
                    if small_rank_txt:
                        small_rank_txt = re.sub("\(.*", "",
                                                small_rank_txt).strip()
                        small_rank_list = re.findall("\d", small_rank_txt)
                        small_rank = ""
                        for sr in small_rank_list:
                            small_rank += sr
                    else:
                        small_rank = 0
                    try:
                        put_date = driver.find_element_by_xpath(
                            '//table[@id="productDetails_detailBullets_sections1"]/tbody/tr[4]/td'
                        ).get_attribute('innerText')
                        if put_date:
                            put_date = datetime.strptime(
                                put_date, '%B %d, %Y').strftime("%Y-%m-%d")
                    except:
                        put_date = None
                    mp = MysqlPool()
                    sql = "insert into tb_amz_pro(keyword,asin,img,sponsored,price,title,fba,star,review,brand,qa,big_rank_txt,big_rank,mid_rank_txt,mid_rank,small_rank_txt,small_rank,put_date,add_date) " \
                          "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())"
                    sql_param = [
                        keyword, asin, img, sponsored, price, title, fba, star,
                        review, brand, qa, big_rank_txt, big_rank,
                        mid_rank_txt, mid_rank, small_rank_txt, small_rank,
                        put_date
                    ]
                    try:
                        mp.insert(sql, sql_param)
                        all_log.logger.info("-----%s(%s)入库成功-----" %
                                            (asin, keyword))
                    except:
                        traceback.print_exc()
                        all_log.logger.info("-----%s(%s)已存在-----" %
                                            (asin, keyword))
                except:
                    traceback.print_exc()
                    error_log.logger.error("-----%s---%s采集出错-----" %
                                           (keyword, asin))
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
        return True, keyword
    except:
        traceback.print_exc()
        return False, keyword
    finally:
        all_log.logger.info(
            "---end---ip=%s,keyword=%s,%s---" %
            (ip, keyword, datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        driver.quit()
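getProData returns a (success, keyword) pair, so a caller can collect failed keywords and retry them with a different proxy. A minimal hedged usage sketch (the keywords are illustrative; ip may be None to run without a proxy):

# Hypothetical driver code for getProData.
pending = ['wireless earbuds', 'yoga mat']  # illustrative keywords
failed = []
for kw in pending:
    ok, kw_done = getProData(None, kw)   # None = no proxy; or pass an "host:port" string
    if not ok:
        failed.append(kw_done)           # retry later, ideally with a fresh IP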