def get_account_info(driver):
    """
    Fetch the login email, the bound phone number and the identity
    verification status.

    Opens the page linked from the second entry of ``#J_MtMainNav`` and
    parses the rendered page source.

    :param driver: browser driver instance
    :return: tuple ``(email, binding_phone, authentication)``; email and phone
        fall back to "" and authentication falls back to "未完成" when the
        corresponding text is not found on the page
    """
    settings_url = chrome_api.get_element_href(
        driver, "#J_MtMainNav > li:nth-child(2) > a")
    driver.get(settings_url)
    refresh_page(driver, "dl.detail-info")

    page_text = driver.page_source.encode("utf-8").decode()
    tree = etree.HTML(page_text)

    # Login email: the second <span> next to the label span.
    email_nodes = tree.xpath(
        "//span[contains(text(),'登 录 邮 箱:')]/../span[2]/text()")
    email = email_nodes[0] if email_nodes else ""

    # Bound phone: same layout; the value is stripped of newlines, tabs and
    # spaces.
    phone_nodes = tree.xpath(
        "//span[contains(text(),'绑 定 手 机:')]/../span[2]/text()")
    binding_phone = phone_nodes[0] if phone_nodes else ""
    for junk in ("\n", "\t", " "):
        binding_phone = binding_phone.replace(junk, "")

    # Identity verification: any span containing the "completed" marker.
    auth_nodes = tree.xpath("//span[contains(text(),'已完成')]/text()")
    authentication = auth_nodes[0] if auth_nodes else "未完成"

    return email, binding_phone, authentication
def get_account_type(driver):
    """
    Read the Alipay account category from the current page.

    :param driver: browser driver instance
    :return: text of the first-row / second-column table cell under
        ``#main-content`` with spaces, newlines and tabs removed, or "" when
        the cell does not show up
    """
    cell_selector = (
        "#main-content table > tbody > tr:nth-child(1) > td:nth-child(2)")

    refresh_page(driver, cell_selector)
    if not wait_ele(driver, cell_selector):
        return ""

    cell = chrome_api.get_element(driver, cell_selector)
    if not cell:
        return ""

    return cell.text.replace(" ", "").replace("\n", "").replace("\t", "")
def get_integral(driver):
    """
    Fetch "my points" from the Tmall point-details page.

    Original author's note: page navigation can trigger a bug here.

    :param driver: browser driver instance
    :return: text of the span following the "available points" label, or ""
        when it is not present
    """
    driver.get("https://pages.tmall.com/wow/jifen/act/point-details")
    refresh_page(driver, "div#pointContent")
    wait_ele(driver, "div#pointContent")

    tree = etree.HTML(driver.page_source.encode("utf-8").decode())
    point_nodes = tree.xpath(
        "//span[contains(text(),'可用的积分')]/../span[2]/text()")
    return point_nodes[0] if point_nodes else ""
def get_index_data(driver, current_mouse):
    """
    Read the buyer's Taoqi score, membership level and member name from the
    main page.

    The mouse is first moved to a random point inside the user link of the
    site navigation (presumably so that the user-info panel is displayed --
    confirm), then the three texts are read from that panel.

    :param driver: browser driver instance
    :param current_mouse: last known mouse position
    :return: tuple ``(score, vip_level, login_name, current_mouse)``; the
        three text values are all "" when any step raises
    """
    user_link = (
        "#J_SiteNavLogin > div.site-nav-menu-hd > div.site-nav-user > a")
    refresh_page(driver, user_link)
    try:
        rect = chrome_api.get_element_rect(driver, user_link)
        # Random target inside the element, kept 2px away from its border.
        offset_x = int(random.uniform(2, rect['width'] - 2))
        offset_y = int(random.uniform(2, rect['height'] - 2))
        current_mouse = chrome_api.move_to_position(
            driver, current_mouse,
            [rect['left'] + offset_x, rect['top'] + offset_y])

        texts = []
        for selector in (
                "div.site-nav-user-info p:nth-child(2)",  # Taoqi score
                "div.site-nav-user-info p:nth-child(3)",  # membership level
                "a.site-nav-login-info-nick",  # member name
        ):
            if wait_ele(driver, selector):
                texts.append(chrome_api.get_element_text(driver, selector))
            else:
                texts.append("")
        score, vip_level, login_name = texts
    except Exception as e:
        # Best effort: the caller still gets empty values, but the failure is
        # no longer silent.
        logger.error("get_index_data failed: {}".format(e))
        score = ""
        vip_level = ""
        login_name = ""
    return score, vip_level, login_name, current_mouse
def get_personal_deal_info(driver):
    """
    Fetch the user's real name and personal address.

    Opens the page linked from ``#newAccountProfile > a`` and reads both
    values from the form under ``#main-content``.

    :param driver: browser driver instance
    :return: tuple ``(name, address)``; each is "" when it cannot be read
    """
    profile_url = chrome_api.get_element_href(driver, "#newAccountProfile > a")
    driver.get(profile_url)
    refresh_page(driver, "h2.h2-single")

    name_node = chrome_api.get_element(
        driver, "#main-content > form > h2 + ul > li:nth-child(1) > strong")
    name = "" if name_node is None else name_node.text

    # Any falsy value returned for the input is normalised to "".
    address = chrome_api.get_element_value(
        driver,
        "#main-content > form > h2 + ul > li:nth-child(6) > input") or ""

    return name, address
def get_refund_number(driver, current_mouse):
    """
    Count the refund entries on the refund list page.

    When the ``#refundList > a`` link is present it is entered, and the
    consecutive child divs of ``#bottomContainer_1`` at positions 2, 4, 6, ...
    are counted until one is missing.

    :param driver: browser driver instance
    :param current_mouse: last known mouse position
    :return: tuple ``(refund_number, current_mouse)``; the count is 0 when the
        link or the first entry is absent
    """
    if not wait_ele(driver, "#refundList > a"):
        return 0, current_mouse

    current_mouse = enter_next(driver, current_mouse, "#refundList > a")
    refresh_page(driver, "#topContainer_1")

    entry_selector = '#bottomContainer_1 > div:nth-child({})'
    refund_number = 0
    first_entry = chrome_api.get_element(
        driver, '#bottomContainer_1 > div:nth-child(2)')
    if first_entry is not None:
        position = 2
        while chrome_api.get_element(
                driver, entry_selector.format(position)) is not None:
            refund_number += 1
            position += 2

    return refund_number, current_mouse
def get_good_reputation(driver, user_id):
    """
    Fetch the buyer's cumulative credit, positive-rating ratio and the rating
    counts table from the rate-management page.

    :param driver: browser driver instance
    :param user_id: user identifier, used for logging only
    :return: tuple ``(cumulative_credit, rate_summary, one_week, one_month,
        list_bought_href)``. ``one_week`` / ``one_month`` map the keys
        "good", "medium", "bad" and "total" to the second / third text node
        of the matching table row. ``list_bought_href`` is the href of the
        ``#bought`` element, read before leaving the current page.
    """
    logger.info("user {}: start crawl rate_summary".format(user_id))
    rate_summary_href = chrome_api.get_element_href(driver, "#myRate > a")
    # Must be read now: the element is looked up on the page we are leaving.
    list_bought_href = chrome_api.get_element_href(driver, "#bought")
    driver.get(rate_summary_href)
    refresh_page(driver, "table.seller-rate-info")

    html_str = driver.page_source.encode("utf-8").decode()
    html = etree.HTML(html_str)

    cumulative_credit_list = html.xpath(
        "//h4[contains(text(),'买家累积信用')]/a/text()")
    cumulative_credit = (
        cumulative_credit_list[0] if cumulative_credit_list else "")

    rate_summary_list = html.xpath("//p[contains(text(),'好评率')]/strong/text()")
    rate_summary = rate_summary_list[0] if rate_summary_list else ""

    # Row label -> key used in the result dictionaries.
    label_to_key = {
        "好评": "good",
        "中评": "medium",
        "差评": "bad",
        "总计": "total",
    }
    one_week = {}
    one_month = {}
    rows = html.xpath(
        '//table[@class="tb-rate-table align-c thm-plain"]/tbody/tr')
    for tr in rows:
        cell_texts = tr.xpath('./td//text()')
        # Skip rows without label + two values instead of raising IndexError.
        if len(cell_texts) < 3:
            continue
        key = label_to_key.get(cell_texts[0])
        # Skip unknown labels: previously this either raised
        # UnboundLocalError or overwrote the previous row's entry.
        if key is None:
            continue
        one_week[key] = cell_texts[1]
        one_month[key] = cell_texts[2]

    return (cumulative_credit, rate_summary, one_week, one_month,
            list_bought_href)
def run(self, driver, taobao_total_data, size=(1366, 768)):
    """
    Crawl one user's Taobao / Alipay data and record the outcome.

    The crawl walks through the site pages in a fixed order and fills
    ``taobao_total_data`` in place. On success the status code is "2002" and
    the data is persisted through ``InfoManager``; on any exception during
    the crawl the status code is "5002" and the payload is sent to DingDing.
    In both cases the payload is sent to Kafka, the server counter is
    decremented and the driver is closed.

    :param driver: browser driver instance; it is quit before returning
    :param taobao_total_data: dict that receives the crawl result
    :param size: ``(width, height)`` used to pick the initial random mouse
        position within the top-left 75% of that area
    :return: ``taobao_total_data``
    """
    tb_user = {}
    tb_order = {}
    try:
        logger.info("user {}: start crawl login_name".format(self.user_id))
        current_mouse = [
            random.randint(1, int(size[0] * 0.75)),
            random.randint(2, int(size[1] * 0.75))
        ]
        ActionChains(driver).move_by_offset(
            xoffset=current_mouse[0], yoffset=current_mouse[1]).perform()
        score, vip_level, login_name, current_mouse = get_index_data(
            driver, current_mouse)
        tb_user["score"] = score
        tb_user["vip_level"] = vip_level
        tb_user["login_name"] = login_name

        # Enter the "My Taobao" page.
        current_mouse = enter_next(
            driver, current_mouse,
            '#J_SiteNavMytaobao > div.site-nav-menu-hd > a')
        refresh_page(driver, "header.mt-header")
        logistics_information = get_logistics_information(driver)

        # Enter rate management: cumulative buyer credit and rating ratio.
        (cumulative_credit, rate_summary, one_week, one_month,
         list_bought_href) = get_good_reputation(driver, self.user_id)
        tb_user["cumulative_credit"] = cumulative_credit
        tb_user["rate_summary"] = rate_summary
        tb_user["one_week"] = one_week
        tb_user["one_month"] = one_month

        # Enter the refund / rights-protection page.
        refresh_page(driver, "a.J_MtIndicator")
        current_mouse = enter_next(driver, current_mouse, "a.J_MtIndicator")
        time.sleep(random.uniform(1.5, 2.5))

        # Enter the refund management page.
        refund_number, current_mouse = get_refund_number(
            driver, current_mouse)

        # Enter the refund / rights-protection page again.
        refresh_page(driver, "a.J_MtIndicator")
        current_mouse = enter_next(driver, current_mouse, "a.J_MtIndicator")
        time.sleep(random.uniform(1.5, 2.5))

        # Enter the complaint management page.
        current_mouse = enter_next(driver, current_mouse, "#rulesManager > a")
        # Wait until the after-sales link has loaded.
        refresh_page(driver, "#rightManager > a")
        # Scroll down a little before clicking.
        v = abs(random.gauss(200, 20))
        time.sleep(random.uniform(0.2, 0.8))
        driver.execute_script("window.scrollBy(0,%d)" % v)

        # Enter the after-sales management page.
        current_mouse = enter_next(driver, current_mouse, "#rightManager > a")
        refresh_page(driver, "div.table-hd")
        after_number = get_after_number(driver)

        # Enter the account settings page.
        email, binding_phone, authentication = get_account_info(driver)
        tb_user["email"] = email
        tb_user["binding_phone"] = binding_phone
        tb_user["authentication"] = authentication

        # Enter the personal deal information page.
        logger.info("user {}: start crawl deal info".format(self.user_id))
        name, address = get_personal_deal_info(driver)
        tb_user["name"] = name
        tb_user["address"] = address
        tb_user["host_age"] = get_host_age(driver)

        # Enter the delivery address page.
        logger.info("user {}: start crawl address_list".format(
            self.user_id))
        address_list = get_personal_address(driver)
        taobao_total_data["tb_deliver_addrs"] = address_list

        logger.info("user {}: start crawl zhifubao info".format(
            self.user_id))
        tb_zhifubao_binding, current_mouse = get_alipay_data(
            driver, current_mouse)

        # Enter "my points" to get the Tmall points.
        tianmao_grade = get_integral(driver)
        tb_user["tianmao_grade"] = tianmao_grade
        taobao_total_data["tb_user"] = tb_user

        # Enter the bought-items list page.
        logger.info("user {}: start crawl goods list".format(self.user_id))
        driver.get(list_bought_href)
        # Wait until the "bought items" tab has loaded.
        refresh_page(driver, "#bought")
        html_0 = driver.page_source
        html_str0 = html_0.encode("utf-8").decode()
        logger.info("user {}: start crawl good detail".format(
            self.user_id))
        order_list = flip_over(driver, html_str0, address_list)
        tb_order["order_list"] = order_list
        tb_order["refund_number"] = refund_number
        tb_order["after_number"] = after_number
        tb_order["logistics_information"] = logistics_information
        taobao_total_data["tb_order"] = tb_order
        taobao_total_data["tb_order"]["tb_order_num"] = len(
            taobao_total_data["tb_order"]["order_list"])
        taobao_total_data["tb_zhifubao_binding"] = tb_zhifubao_binding
    except Exception as e:
        # Log first so the error is recorded even if the notification fails.
        logger.error('user {}:{}'.format(self.user_id, e))
        taobao_total_data["now_time"] = get_now_time()
        taobao_total_data["status_code"] = "5002"
        taobao_total_data[
            "data_status_message"] = "crawl server error so get data fail"
        taobao_total_data["user_id"] = self.user_id
        DingDing.send(json.dumps(taobao_total_data))
    else:
        taobao_total_data["now_time"] = get_now_time()
        taobao_total_data["status_code"] = "2002"
        taobao_total_data["data_status_message"] = "get data success"
        taobao_total_data["user_id"] = self.user_id
        InfoManager().save_userinfo(taobao_total_data)
        InfoManager().save_deliveraddrsinfo(taobao_total_data)
        InfoManager().save_orderinfo(taobao_total_data)
        InfoManager().save_productinfo(taobao_total_data)
        InfoManager().save_zhifubaoinfo(taobao_total_data)
        logger.info("user {}: crawl completed.".format(self.user_id))
    finally:
        # Nested so that a failure in an earlier cleanup step can no longer
        # skip the later ones (in particular leaking the browser process).
        try:
            save_to_kafka(taobao_total_data)
        finally:
            try:
                self.sr.decr(settings.SERVER_RANDOM_UUID)
            finally:
                driver.quit()
    # NOTE(review): the return sits after the try statement, so exceptions
    # raised in the except/else branches or during cleanup propagate to the
    # caller -- confirm this matches what callers expect.
    return taobao_total_data
def get_alipay_data(driver, current_mouse):
    """
    Collect the Alipay data bound to the account.

    Enters the Alipay guide page, reads the account type, follows the link to
    the Alipay home page, reveals the hidden amounts, scrapes the balances and
    credit limits, then opens the account page to read the bound phone and
    the verified name / ID number.

    :param driver: browser driver instance
    :param current_mouse: last known mouse position
    :return: tuple ``(tb_zhifubao_binding, current_mouse)``.
        ``tb_zhifubao_binding`` is a dict of the scraped values, or "" when
        scraping the Alipay pages raises.
    """
    # Enter the Alipay guide page.
    tb_zhifubao_binding = {}
    current_mouse = enter_next(driver, current_mouse,
                               '#newAccountManagement > a')
    tb_zhifubao_binding["account_type"] = get_account_type(driver)
    # Wait for the page to load.
    refresh_page(driver, "p.ui-tipbox-explain > a:nth-child(1)")
    # Enter the Alipay page. The returned mouse position is kept, as at every
    # other enter_next call site; it used to be discarded here, so a stale
    # position was handed back to the caller.
    current_mouse = enter_next(driver, current_mouse,
                               "p.ui-tipbox-explain > a:nth-child(1)")
    refresh_page(driver, "td.i-assets-balance")
    try:
        # Anything other than the personal home page (e.g. the merchant
        # portal) is redirected to the personal portal.
        host = driver.current_url.split("/")[2]
        if host != "my.alipay.com":
            driver.get("https://personalweb.alipay.com/portal/i.htm")

        # Reveal the amounts that Alipay hides behind "show amount" links.
        # NOTE(review): find_elements_by_xpath / find_element_by_css_selector
        # are the Selenium 3 API -- confirm the pinned Selenium version.
        hide_label_list = driver.find_elements_by_xpath('//a[text()="显示金额"]')
        if hide_label_list:
            # One flag per "show amount" link; cleared once the matching
            # <strong> no longer contains "*".
            still_hidden = [True, True, True]
            attempts = 0
            while any(still_hidden):
                attempts += 1
                if attempts >= 20:
                    break
                star_list = driver.find_elements_by_xpath("//strong")
                has_all = len(star_list) > 3
                for i, star in enumerate(star_list):
                    # <strong> index -> index of the link that reveals it.
                    if i == 0:
                        label_idx = 0
                    elif has_all and i == 3:
                        label_idx = 2
                    elif has_all and i in (1, 2):
                        label_idx = 1
                    else:
                        continue
                    try:
                        if "*" in star.text:
                            hide_label_list[label_idx].click()
                        else:
                            still_hidden[label_idx] = False
                    except Exception:
                        # Best effort (stale element, missing link, ...):
                        # retried on the next pass of the while loop.
                        continue

        html_str = driver.page_source.encode("utf-8").decode()
        alipay_html = etree.HTML(html_str)

        def joined_text(tree, expression):
            """Concatenate every text node matched by *expression*."""
            return ''.join(tree.xpath(expression))

        # Alipay account.
        tb_zhifubao_binding["zhifubao_account"] = joined_text(
            alipay_html, '//*[@id="J-userInfo-account-userEmail"]//text()')
        # Huabei total credit limit, with a fallback layout.
        tb_zhifubao_binding["huabei_total_credit_amount"] = joined_text(
            alipay_html, '//p[text()="总额度:"]//strong//text()')
        if tb_zhifubao_binding["huabei_total_credit_amount"] == "":
            tb_zhifubao_binding["huabei_total_credit_amount"] = joined_text(
                alipay_html, '//h3[text()="花呗"]/../..//strong[2]//text()')
        # Yu'e Bao accumulated historical income.
        tb_zhifubao_binding["total_profit"] = joined_text(
            alipay_html, '//*[@id="J-income-num"]//text()')
        # Alipay balance.
        tb_zhifubao_binding["balance"] = joined_text(
            alipay_html, '//a[text()="充 值"]/../../../div/strong//text()')
        # Yu'e Bao account balance, with a fallback layout.
        tb_zhifubao_binding["total_quotient"] = joined_text(
            alipay_html,
            '//tbody[1]//h3[text()="余额宝"]/../../div[2]/p[1]/strong//text()')
        if tb_zhifubao_binding["total_quotient"] == "":
            tb_zhifubao_binding["total_quotient"] = joined_text(
                alipay_html, '//a[text()="转出"][1]/../..//strong//text()')
        # Huabei available credit, with a fallback layout.
        tb_zhifubao_binding["huabei_credit_amount"] = joined_text(
            alipay_html, '//p[text()="可用额度"]//strong//text()')
        if tb_zhifubao_binding["huabei_credit_amount"] == "":
            tb_zhifubao_binding["huabei_credit_amount"] = joined_text(
                alipay_html, '//h3[text()="花呗"]/../..//strong[1]//text()')

        # Enter account management by clicking the user portrait.
        wait_ele(driver, ".userInfo-portrait")
        portrait = driver.find_element_by_css_selector(".userInfo-portrait")
        ActionChains(driver).move_to_element(portrait).click().perform()
        wait_ele(driver, ".table-list")
        html_str = driver.page_source.encode("utf-8").decode()
        basic_info_html = etree.HTML(html_str)

        # Bound phone number.
        tb_zhifubao_binding["binding_phone"] = joined_text(
            basic_info_html, '//tbody/tr[3]/td[1]/span/text()')
        # Alipay account type: overrides the value read from the guide page.
        tb_zhifubao_binding["account_type"] = "个人账户"
        # Real-name verified name.
        tb_zhifubao_binding["verified_name"] = joined_text(
            basic_info_html, '//*[@id="username"]/text()')
        # Real-name verified ID card number.
        tb_zhifubao_binding["verified_id_card"] = joined_text(
            basic_info_html, '//tbody/tr[1]/td[1]/span[3]/text()')
    except Exception as e:
        # NOTE(review): on failure the result becomes "" (a str, not a dict);
        # kept as-is because downstream consumers may rely on it -- confirm.
        tb_zhifubao_binding = ""
        logger.error(e)
    return tb_zhifubao_binding, current_mouse