def get_detail(content):
    """Build a price-table record from the HTML of an item detail page.

    ``content`` is parsed with ``pq``.  The item id, shop id, freight text,
    title, rating counter and sales counter are read from the page; every
    other key receives a fixed default.  A counter whose text is exactly
    "-" is stored as the integer 0, otherwise the raw text is kept.

    Returns the populated dict.
    """
    doc = pq(content)
    pine = doc.find("#J_Pine")

    def counter(selector):
        # The page shows "-" when there is no count yet.
        text = doc.find(selector).text()
        return 0 if text == "-" else text

    return {
        'link_id': pine.attr("data-itemid"),
        'shop_id': pine.attr("data-shopid"),
        'typeabbrev': "",
        'price_erp': 0,
        'currabrev': "CNY",
        'operator': "",
        'last_time': time_now(),
        'flag': "add",
        'freight': doc("#J_WlServiceTitle").text(),
        'ratio': 1,
        'promotionprice': 0,
        'package_number': 1,
        'SpiderDate': time_now(),
        'Checker': "",
        'CheckDate': time_now(),
        'description': doc.find(".tb-main-title").text(),
        'rates': counter("#J_RateCounter"),
        'sales': counter("#J_SellCounter"),
    }
def maintain(self, operation, **kwargs):
    """Apply a maintenance operation to ``prices_tb`` and report it.

    ``operation`` selects the action:

    * "更新" - update price, description, ratio and attribute of the row
      identified by stock id, link id and shop id;
    * "创建" - insert the assembled record as a new row;
    * anything else - no database write, the record is flagged "lookup".

    The assembled record is passed to ``self.report_in`` in every case.

    Keyword arguments read by key: ``goodsCode``, ``link_id``,
    ``fromStore``, ``unitPrice``, ``goodsAttribute``, ``tbName`` and, for
    the update operation only, ``ratio``.
    """
    record = dict(
        stockid=kwargs['goodsCode'],
        link_id=kwargs['link_id'],
        shop_id=self.shop_id(kwargs['fromStore']),
        price_tb=kwargs['unitPrice'],
        currabrev='CNY',
        operator='爬虫维护',
        SpiderDate=time_now(),
        attribute=kwargs['goodsAttribute'],
        flag=None,
        description=kwargs['tbName'],
        typeabbrev="",
        price_erp=0,
        last_time=time_now(),
        freight="",
        ratio=1,
        promotionprice=0,
        sales=0,
        rates=0,
        Checker="",
        package_number=1,
        CheckDate=time_now(),
    )

    flag_by_operation = {"更新": 'update', "创建": 'create'}
    record['flag'] = flag_by_operation.get(operation, 'lookup')

    if record['flag'] == 'update':
        new_values = {
            'SpiderDate': time_now(),
            'flag': 'update',
            'price_tb': kwargs['unitPrice'],
            'description': kwargs['tbName'],
            'ratio': kwargs['ratio'],
            'attribute': kwargs['goodsAttribute'],
        }
        row_filter = {
            'stockid': kwargs['goodsCode'],
            'link_id': kwargs['link_id'],
            'shop_id': self.shop_id(kwargs['fromStore']),
        }
        mysql.update_data(t="prices_tb", set=new_values, c=row_filter)
    elif record['flag'] == 'create':
        mysql.insert_data(t="prices_tb", d=record)

    self.report_in(**record)
async def verify(self, p):
    """Complete the phone verification dialog if it appears on page ``p``.

    Waits up to 10 s for the overlay mask; when it never shows up nothing
    else happens.  Otherwise a code is requested through the second frame
    of the page and obtained in one of two ways:

    * ``LINUX`` truthy - a row is inserted into the ``phone_verify`` table
      of the "test" database and polled every 5 s (120 polls per requested
      code) until its ``verify_code`` differs from "0"; the row is then
      deleted.  When a round of polls yields nothing the code is requested
      again after a 10 s pause.
    * otherwise - the code is read from standard input.

    The code is then typed into the dialog and submitted.
    """
    try:
        await p.waitForSelector("div.aq_overlay_mask", timeout=10000)
    except errors.TimeoutError:
        return

    logger.info("需要要手机验证码")
    if LINUX:
        test_server = ts.copy()
        test_server['db'] = "test"
        row_id = random.randint(0, 100)
        mysql.insert_data(db=test_server, t="phone_verify", d={"id": row_id})
        frames = p.frames
        net_check()
        verify_code = "0"
        while verify_code == "0":
            net_check()
            await frames[1].click(".J_SendCodeBtn")
            for _ in range(120):
                await asyncio.sleep(5)
                rows = mysql.get_data(
                    db=test_server,
                    cn=["verify_code"],
                    t="phone_verify",
                    c={"id": row_id},
                )
                verify_code = rows[0][0]
                if verify_code != "0":
                    mysql.delete_data(db=test_server,
                                      t="phone_verify",
                                      c={"id": row_id})
                    break
            else:
                # No code arrived during this round; pause before resending.
                await asyncio.sleep(10)
    else:
        frames = p.frames
        net_check()
        await frames[1].click(".J_SendCodeBtn")
        verify_code = input(time_now() + " | 请输入6位数字验证码:")

    typing_options = {'delay': self.input_time_random() - 50}
    await frames[1].type(".J_SafeCode", verify_code, typing_options)
    net_check()
    await frames[1].click("#J_FooterSubmitBtn")
async def verify(self, p):
    """Complete the phone verification dialog if it appears on page ``p``.

    Waits up to 10 s for the overlay mask.  When it is present, pauses
    10 s, requests a code through the second frame of the page, reads the
    code from standard input, types it into the dialog and submits it.
    When the mask never appears the method does nothing.
    """
    try:
        await p.waitForSelector("div.aq_overlay_mask", timeout=10000)
    except errors.TimeoutError:
        return

    logger.info("需要要手机验证码")
    await asyncio.sleep(10)
    frames = p.frames
    net_check()
    dialog = frames[1]
    await dialog.click(".J_SendCodeBtn")
    code = input(time_now() + " | 请输入6位数字验证码:")
    typing_options = {'delay': self.input_time_random() - 50}
    await dialog.type(".J_SafeCode", code, typing_options)
    net_check()
    await dialog.click("#J_FooterSubmitBtn")
async def run(self):
    """Open the query page, clear the slider captcha, then process links.

    Steps:

    1. Navigate to ``self.url`` and wait, without a timeout, for the
       item-id query box.
    2. If ``self.get_nc_frame`` returns a frame, drag the slider
       ``#nc_1_n1z`` repeatedly until that element is no longer found.
    3. When ``MODE == 2`` read an operator name from standard input; a
       non-empty answer replaces ``self.operator``.
    4. Call ``self.fix_data()`` until it returns 1.
    """
    net_check()
    await self.page.goto(self.url)
    await asyncio.sleep(2)
    await self.page.waitForSelector("input[name='queryItemId']", timeout=0)

    frame = await self.get_nc_frame(self.page.frames)
    if frame:
        logger.info("条形验证码")
        while True:
            await asyncio.sleep(1)
            await frame.hover("#nc_1_n1z")
            await self.page.mouse.down()
            await self.page.mouse.move(
                2000, 0, {'delay': random.randint(1000, 2000)})
            await self.page.mouse.up()
            try:
                # Must be awaited: otherwise the timeout is raised in a
                # detached task and the handler below never sees it.
                await frame.waitForSelector(".nc-lang-cnt a", timeout=10000)
                await asyncio.sleep(2)
                await frame.click(".nc-lang-cnt a")
            except (errors.TimeoutError, errors.PageError):
                # Link missing or not clickable: the captcha is done once
                # the slider element itself has disappeared.
                await asyncio.sleep(1)
                slider = await frame.J("#nc_1_n1z")
                if not slider:
                    break

    operator = ""
    if MODE == 2:
        operator = input(time_now() + " | 输入操作者名字:")
    if operator:
        self.operator = operator
    logger.info("当前操作者 :" + self.operator)

    while True:
        outcome = await self.fix_data()
        if outcome == 1:
            break
async def next_page(self, page_num=1):
    """Turn pages of the order list in an endless loop.

    Each pass of the outer loop:

    1. Chooses the pause factor ``n_p_time`` by comparing the current time
       with the three boundaries returned by
       ``time_zone(["08:00", "18:00", "23:00"])``.  After the last
       boundary, on non-Linux hosts, the machine is shut down and the
       process exits.
    2. Either asks for an order number on standard input and searches for
       it (when ``self.orderno`` is truthy) or types ``page_num`` into the
       pagination box and waits for that page to become active, retrying
       after 5 s on timeout.
    3. Polls ``self.complete`` every 3 s.  Value 1 runs
       ``self.order_page()`` (preceded at random by ``self.link_spider()``)
       and pauses; value 2 restarts from the first page and pauses.
    4. Resets ``self.complete`` to 0 and increments ``page_num``.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from syntax and should be checked against the source
    file.
    """
    temp = 0  # number of passes made inside the first time window
    while 1:
        t = time_zone(["08:00", "18:00", "23:00"])
        a = datetime.datetime.now()
        if a < t[0]:
            # NOTE(review): as written this assignment is a no-op; it may
            # have been meant to reset the counter - confirm intent.
            if not temp:
                temp = 0
            n_p_time = 600
        elif t[0] < a < t[1]:
            temp += 1
            if temp == 1:
                page_num = 1
            n_p_time = NEXT_PAGE_TIME
        elif a > t[2]:
            n_p_time = 60
            if not LINUX:
                subprocess.call("shutdown /s")
                exit("到点关机")
        else:
            n_p_time = 60

        await self.page.bringToFront()
        if self.orderno:
            # Select the current content of the order box and delete it.
            # NOTE(review): ShiftLeft is pressed twice and never released.
            await self.page.focus("#bizOrderId")
            await asyncio.sleep(1)
            await self.page.keyboard.down("ShiftLeft")
            await asyncio.sleep(1)
            await self.page.keyboard.press("Home")
            await asyncio.sleep(1)
            await self.page.keyboard.down("ShiftLeft")
            await asyncio.sleep(1)
            await self.page.keyboard.press("Delete")
            await asyncio.sleep(1)
            orderno = input(time_now() + " | 输入订单号:")
            await self.page.type("#bizOrderId", orderno)
            await self.page.setRequestInterception(True)
            self.page.on('request', self.intercept_request)
            self.page.on('response', self.intercept_response)
            net_check()
            await self.page.click(".button-mod__primary___17-Uv")
            await asyncio.sleep(10)
        else:
            active_item = (".pagination-item.pagination-item-"
                           + str(page_num) + ".pagination-item-active")
            while 1:
                try:
                    await self.page.waitForSelector(
                        ".pagination-options-go")
                    await self.page.focus(".pagination-options input")
                    # Clear whatever page number is in the box.
                    for key in ("Delete", "Delete", "Delete",
                                "Backspace", "Backspace", "Backspace"):
                        await self.page.keyboard.press(key)
                    await self.page.setRequestInterception(True)
                    self.page.on('request', self.intercept_request)
                    self.page.on('response', self.intercept_response)
                    net_check()
                    await self.page.type(".pagination-options input",
                                         str(page_num))
                    await self.page.keyboard.press("Enter")
                    # Awaited so that a timeout is raised here and
                    # triggers the retry below.
                    await self.page.waitForSelector(active_item,
                                                    timeout=10000)
                except errors.TimeoutError:
                    logger.info('翻页超时,5秒后重新翻页')
                    # Non-blocking pause keeps the event loop responsive.
                    await asyncio.sleep(5)
                else:
                    break

        # Wait for the response handler to report the page's outcome.
        while 1:
            if self.complete == 1:
                s = random.random()
                if s > 0.5:
                    await self.link_spider()
                await self.order_page()
                logger.info(str(int(s * n_p_time)) + " 秒后开始下一页爬取")
                await asyncio.sleep(int(s * n_p_time))
                break
            elif self.complete == 2:
                # Becomes 1 after the increment at the end of the pass.
                page_num = 0
                s = random.random()
                if s > 0.9:
                    mysql.update_data(t="tb_order_spider",
                                      set={"isDetaildown": 0},
                                      c={
                                          "isDetaildown": 2,
                                          "fromStore": self.fromStore
                                      })
                await asyncio.sleep(int(s * n_p_time))
                break
            else:
                await asyncio.sleep(3)

        self.complete = 0
        page_num += 1
async def parse(self, mainOrders, pageNum):
    """Parse one page of crawled orders and store them.

    For every entry of ``mainOrders`` an order record and one record per
    sub-order are built and handed to ``self.save_in_sql``.  Sub-orders
    carrying a 't0' operation with text '已取消' are not saved; an already
    stored row for them is deleted and the following rows renumbered.

    Processing stops early when ``self.orderno`` is truthy (after the
    first order) or when an order's ``createTime`` is older than ``eoc``
    days.  On exit ``self.complete`` is 2 if that age limit was reached
    and 1 otherwise; the early return for ``self.orderno`` leaves it
    untouched.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from syntax and should be checked against the source
    file.
    """
    # Pick the look-back window in days from the current time of day.
    t = time_zone(["08:00", "18:00", "23:59"])
    a = datetime.datetime.now()
    if a < t[0]:
        eoc = EARLIEST_ORDER_CREATETIME
    elif t[0] < a < t[1]:
        eoc = 2
    else:
        eoc = 20
    start_time = datetime.datetime.now()
    logger.info("开始第 " + str(pageNum) + " 页订单爬取")
    logger.info(store_trans(self.fromStore))
    if pageNum == 1:
        # The first page marks the start of a crawl round.
        self._loop_start_time = datetime.datetime.now()
    loop_control = 0
    for i in range(len(mainOrders)):
        order = {}  # holds the order-level details
        order['orderNo'] = mainOrders[i]["id"]
        order['createTime'] = mainOrders[i]['orderInfo']['createTime']
        order['buyerName'] = mainOrders[i]['buyer']['nick']
        flag = mainOrders[i]['extra']['sellerFlag']
        order['actualFee'] = mainOrders[i]['payInfo']['actualFee']
        # Raises AttributeError when postType does not match the pattern.
        order['deliverFee'] = re.search(
            "\(含快递:¥(\d+\.\d+)\)",
            mainOrders[i]['payInfo']['postType']).group(1)
        order['datailURL'] = "https:" + mainOrders[i]['statusInfo'][
            'operations'][0]['url']
        order['orderStatus'] = mainOrders[i]['statusInfo']['text']
        order['fromStore'] = self.fromStore
        order['updateTime'] = time_now()
        if flag == 1:
            # The seller note text is fetched from a separate URL.
            data_url = self.base_url + mainOrders[i]['operations'][0][
                'dataUrl']
            order['sellerFlag'] = await self.get_flag_text(data_url)
        try:
            order['isPhoneOrder'] = mainOrders[i]['payInfo']['icons'][0][
                'linkTitle']
        except KeyError:
            pass
        items = mainOrders[i]['subOrders']
        line_no = 0  # row number among the sub-orders that are kept
        for j in range(len(items)):
            continue_code = 0
            item = {}  # holds the details of one sold product
            item['orderNo'] = mainOrders[i]["id"]
            item['itemNo'] = line_no
            try:
                item['goodsCode'] = items[j]['itemInfo']['extra'][0][
                    'value']
            except KeyError:
                item['goodsCode'] = 'error'
                logger.error(time_now() + " 订单:" + item['orderNo'])
            # NOTE(review): several replacements below map a string to
            # itself as written; presumably the originals decoded HTML
            # entities - confirm against the source file.
            item['tbName'] = items[j]['itemInfo']['title'].strip() \
                .replace("±", "±").replace("Φ", "Φ").replace("Ω", "Ω") \
                .replace("—", "—").replace("°", "°").replace("×", "×") \
                .replace("μ", "μ").replace(" ", "").replace("(", "(").replace(")", ")")
            item['unitPrice'] = items[j]['priceInfo']['realTotal']
            item['sellNum'] = items[j]['quantity']
            item['orderStatus'] = order['orderStatus']
            if self.orderno:
                logger.info(item['orderStatus'])
            item['refundStatus'] = None
            item['isRefund'] = 0
            item['goodsAttribute'] = ""
            item['url'] = "https:" + items[j]['itemInfo']['itemUrl']
            try:
                goodsAttributes = items[j]['itemInfo']['skuText']
            except KeyError:
                pass
            else:
                temp = []
                for k in range(len(goodsAttributes)):
                    try:
                        goodsAttributes[k]['name']
                    except KeyError:
                        # An entry without 'name' is appended to the
                        # previous value rather than starting a new one.
                        n = len(temp)
                        temp[n - 1] += goodsAttributes[k]['value'].replace(
                            "&Omega", "Ω").replace("·", "·")
                    else:
                        temp.append(goodsAttributes[k]['value'].replace(
                            "&Omega", "Ω").replace("·", "·"))
                temp_ga = "-".join(temp)
                item['goodsAttribute'] = temp_ga.replace("(", "(").replace(
                    ")", ")")
            try:
                operations = items[j]['operations']
            except KeyError:
                pass
            else:
                for x in range(len(operations)):
                    t = operations[x]['style']
                    if t in ['t12', 't16'
                             ] and operations[x]['text'] != "退运保险":
                        # Refund-related operation on this sub-order.
                        item['refundStatus'] = operations[x]['text']
                        item['isRefund'] = "1"
                    elif t == 't0' and operations[x]['text'] == '已取消':
                        # Cancelled sub-order: skip it, and if it was
                        # stored before, delete it and close the gap in
                        # the row numbering.
                        continue_code = 1
                        delete_item = {
                            'orderNo': item['orderNo'],
                            'itemNo': item['itemNo'],
                            'goodsCode': item['goodsCode']
                        }
                        is_exist = mysql.get_data(
                            t="tb_order_detail_spider", l=1, c=delete_item)
                        if is_exist:
                            mysql.delete_data(t="tb_order_detail_spider",
                                              c=delete_item)
                            # NOTE(review): SQL is assembled by string
                            # formatting from crawled values.
                            sql = """ UPDATE tb_order_detail_spider SET itemNo=itemNo-1 WHERE OrderNo='%s' and itemNo>'%s' """ % (item['orderNo'], item['itemNo'])
                            mysql.update_data(sql=sql)
                        pass
            if continue_code:
                continue
            else:
                line_no += 1
            self.save_in_sql(item=item, tableName='tb_order_detail_spider')
        self.save_in_sql(item=order, tableName='tb_order_spider')
        if self.orderno:
            # Targeted crawl of a single order: stop after the first one.
            logger.info("定向爬取订单完成")
            return
        date = datetime.date.today()
        date_limit = (
            date - datetime.timedelta(eoc)).strftime("%Y-%m-%d %H:%M:%S")
        # Timestamps are compared as strings.
        if order['createTime'] < date_limit:
            logger.info("完成本轮爬取,共翻 " + str(pageNum) + " 页。")
            loop_control = 1
            break
    end_time = datetime.datetime.now()
    spend_time = end_time - start_time
    logger.info(
        str(spend_time.seconds) + " 秒完成第 " + str(pageNum)
        + " 页订单爬取")
    if loop_control:
        self._loop_end_time = datetime.datetime.now()
        loop_spend_time = round(
            (self._loop_end_time - self._loop_start_time).seconds / 60, 0)
        logger.info(str(loop_spend_time) + " 分钟完成本轮订单爬取")
        self.complete = 2
    else:
        self.complete = 1
async def parse(self, data):
    """Store the listing rows returned for the queried link id.

    ``data`` is either the string "q" (nothing to do; ``self.complete``
    becomes 1) or a sequence of row dicts.  For each row a record is built
    on top of ``self.common`` and then handled in one of two ways:

    * stock id starting with "MUT", or the link marked in
      ``prices_tb_fix.spe_link``: the per-SKU dialog is opened with request
      interception enabled, so that the response handlers take over;
    * otherwise: the record is written to ``prices_tb`` (update when the
      stock/link/shop combination exists, insert with a report entry when
      it does not), other rows of the same link id with a different stock
      id are flagged "del" and reported, and ``self.complete`` becomes 1.

    When no promotion price is known for a plain row, a mail is sent,
    ``self.complete`` becomes 2 and processing stops.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from syntax and should be checked against the source
    file.
    """
    if data == "q":
        self.complete = 1
        return

    for entry in data:
        self.item = self.common.copy()
        description_parts = entry['itemDesc']['desc']
        # Raises AttributeError when the text carries no "编码:" marker.
        self.item['stockid'] = re.search(
            r"编码:(.*)", description_parts[1]['text']).group(1).upper()
        self.item['link_id'] = entry['itemId']
        self.item['attribute'] = ""
        self.item['flag'] = "update"
        self.item['typeabbrev'] = self.fromStore
        self.item['shop_id'] = self.shop_id(self.fromStore)
        self.item['SpiderDate'] = time_now()
        temp_des = description_parts[0]['text']
        self.item['description'] = temp_des.replace("(", "(").replace(
            ")", ")")
        self.item['price_tb'] = re.findall(
            r"(\d+.?\d*)", entry["managerPrice"]['currentPrice'])[0]
        self.item['promotionprice'] = self.promo_price.get(
            self.item['link_id'])

        # NOTE(review): SQL is assembled by string formatting; switch to
        # bound parameters if the mysql helper supports them.
        sql = "select spe_link from prices_tb_fix where link_id='%s' and server='%s'" % (
            self.item['link_id'], self.sn)
        spe_link_id = mysql.get_data(db=self.db_test,
                                     sql=sql,
                                     return_one=True)
        is_mut = re.search(r"^MUT\D*", self.item['stockid'])

        if is_mut or spe_link_id:
            # Open and close the SKU dialog; the intercepted response is
            # processed by the registered handlers.
            await self.page.setRequestInterception(True)
            self.page.on('request', self.intercept_request)
            self.page.on('response', self.intercept_response)
            await asyncio.sleep(1)
            net_check()
            await self.page.click(
                ".next-table-row td:nth-child(2) div.product-desc-hasImg span:nth-child(2) i"
            )
            await asyncio.sleep(1)
            await self.page.keyboard.press('Escape')
            continue

        if self.item['promotionprice'] is None:
            mail("price_tb_error",
                 self.fromStore + ":" + self.item['link_id'],
                 ["*****@*****.**"])
            logger.error("error:" + self.fromStore + " : "
                         + self.item['link_id'] + " and "
                         + mysql.concat(self.promo_price, "="))
            self.complete = 2
            break

        condition = {
            "stockid": self.item['stockid'],
            "link_id": self.item['link_id'],
            "shop_id": self.item['shop_id'],
        }
        # The ratio compares the new price with the stored one, so the
        # stored price column is selected, as parse_2 does.
        res = mysql.get_data(t="prices_tb",
                             l=1,
                             cn=["price_tb"],
                             c=condition,
                             db=self.target_server)
        if res:
            previous_price = float(res[0][0] or 0)
            if previous_price:
                self.item['ratio'] = round(
                    float(self.item['price_tb']) / previous_price, 2)
            else:
                # No usable stored price: avoid dividing by zero.
                self.item['ratio'] = 1
            print(self.item)
            mysql.update_data(t="prices_tb",
                              set=self.item,
                              c=condition,
                              db=self.target_server)
        else:
            insert_item = self.item.copy()
            insert_item["currabrev"] = "CNY"
            insert_item["price_erp"] = 0
            insert_item["operator"] = self.operator
            insert_item["last_time"] = time_now()
            if self.operator == "爬虫维护":
                insert_item["flag"] = "create"
            else:
                insert_item['flag'] = "add"
            insert_item["ratio"] = 1
            insert_item["package_number"] = 1
            insert_item["Checker"] = ""
            insert_item["CheckDate"] = "0000-00-00 00:00:00"
            print(insert_item)
            with open("reports/report_" + self.fromStore + "_insert.txt",
                      "a") as file:
                file.write("物料编码:" + insert_item['stockid']
                           + " 与 商品ID:" + insert_item['link_id']
                           + " 为最新匹配,添加至ERP系统。\n"
                           + self.item_url + insert_item['link_id'] + "\n"
                           + self.item_erp_url + insert_item['link_id']
                           + "\n\n")
            mysql.insert_data(t="prices_tb",
                              d=insert_item,
                              db=self.target_server)

        # Flag rows that pair this link id with a different stock id.
        result = mysql.get_data(t="prices_tb",
                                cn=["*"],
                                c={"link_id": self.item['link_id']},
                                db=self.target_server,
                                dict_result=True)
        if len(result) > 1:
            for r in result:
                if r['stockid'] != self.item['stockid'] and r[
                        'flag'] != "del":
                    with open(
                            "reports/report_" + self.fromStore
                            + "_delete.txt", "a") as file:
                        file.write("物料编码:" + r['stockid']
                                   + " 与 商品ID:" + self.item['link_id']
                                   + " 不匹配,已被爬虫从ERP系统中删除。\n"
                                   + self.item_url + self.item['link_id']
                                   + "\n" + self.item_erp_url
                                   + self.item['link_id'] + "\n\n")
                    mysql.update_data(t="prices_tb",
                                      c={"id": r['id']},
                                      db=self.target_server,
                                      set={"flag": "del"})
        self.complete = 1
async def fix_data(self, link_id=None):
    """Refresh the stored price data of one Taobao item.

    The link id comes from the argument or, when that is falsy, from a
    source chosen by ``MODE``: 1 uses a fixed id, 2 prompts on standard
    input until 10-20 digits are entered, 3 reads the next pending row of
    ``prices_tb_fix``.

    The public item page is opened in a new tab to collect counters,
    per-SKU prices and promotion prices; the id is then submitted in the
    query box of ``self.page`` and the method waits until the response
    handlers set ``self.complete``.

    Returns 1 when MODE 3 finds no pending row, otherwise None (including
    the early return for a delisted item).

    NOTE(review): the original indentation was lost; the nesting below,
    in particular the position of ``control = 0``, is reconstructed and
    should be checked against the source file.
    """
    self.complete = 0
    self.prices = {}
    self.promo_price = {}
    await asyncio.sleep(2)
    # Select and delete the previous content of the query box.
    # NOTE(review): ShiftLeft is pressed twice and never released.
    await self.page.focus("input[name='queryItemId']")
    await self.page.keyboard.down("ShiftLeft")
    await self.page.keyboard.press("Home")
    await self.page.keyboard.down("ShiftLeft")
    await self.page.keyboard.press("Delete")
    server_name = 'production_server'
    self.sn = server_name
    if not link_id:
        if MODE == 1:
            link_id = "585308692855"
        elif MODE == 2:
            while True:
                link_id = input(time_now() + " | 输入link_id:")
                isMatch = re.match("^\d{10,20}$", link_id)
                if isMatch:
                    break
        elif MODE == 3:
            sql = """ SELECT link_id,updateTime,server,operator FROM prices_tb_fix WHERE fromStore='%s' and isComplete=0 ORDER BY flag LIMIT 1 """ % (self.fromStore)
            res = mysql.get_data(sql=sql, db=self.db_test)
            if res:
                # The pending row also names the target server and the
                # operator to record.
                self.target_server = self.server[res[0][2]]
                link_id = res[0][0]
                updateTime = res[0][1]
                server_name = res[0][2]
                self.sn = server_name
                self.operator = res[0][3]
            else:
                return 1
    logger.info(link_id)
    page = await self.browser.newPage()
    await page.setViewport({'width': 1600, 'height': 900})
    net_check()
    await page.goto("https://item.taobao.com/item.htm?id=" + link_id,
                    timeout=0)
    await asyncio.sleep(3)
    # Either element means the item is not on sale.
    error_page = await page.J(".error-notice-hd")
    offline = await page.J("#J_detail_offline")
    if error_page or offline:
        logger.info("商品已下架")
        mysql.update_data(t="prices_tb",
                          set={
                              "flag": "XiaJia",
                              "typeabbrev": self.fromStore
                          },
                          c={"link_id": link_id},
                          db=self.target_server)
        mysql.update_data(db=self.db_test,
                          t="prices_tb_fix",
                          set={
                              "isComplete": "2",
                              "updateTime": time_now()
                          },
                          c={
                              "link_id": link_id,
                              "server": server_name
                          })
        await page.close()
        return
    else:
        # Re-read the page until both counters start with a digit.
        while True:
            content = await page.content()
            doc = pq(content)
            self.common['rates'] = doc.find("#J_RateCounter").text()
            self.common['sales'] = doc.find("#J_SellCounter").text()
            self.common['freight'] = doc.find("#J_WlServiceTitle").text()
            mat1 = re.match("\d+", self.common['sales'])
            mat2 = re.match("\d+", self.common['rates'])
            if mat1 and mat2:
                break
        # A match means the item has several attribute combinations; each
        # tuple is (";"-joined data values, price, id used as dict key).
        res = re.findall('";(.*?);".*?e":"(\d+\.\d+).*?d":"(\d+)"',
                         content)
        if res:
            control = 1  # 1 while a promotion price is still being probed
            benefit_price = 0
            for r in res:
                data_values = r[0].split(";")
                prop = []
                for data in data_values:
                    prop.append(
                        doc.find("li[data-value='" + data
                                 + "'] span").text())
                if control:
                    # Click this combination and read the promotion price
                    # the page then displays.
                    for data in data_values:
                        try:
                            await page.click('li[data-value="' + data
                                             + '"]')
                        except errors.PageError:
                            pass
                    content_p = await page.content()
                    promo_price = re.findall(
                        '<em id="J_PromoPriceNum".*?>(\d+\.?\d*)</em>',
                        content_p)  # is there a promotion?
                    if len(promo_price) == 1:
                        benefit_price = float(r[1]) - float(promo_price[0])
                        control = 0
                self.prices[r[2]] = r[1]
                prop.reverse()
                self.prop[r[2]] = "-".join(prop)
            # The discount found above is applied to every combination.
            for r in res:
                if benefit_price:
                    self.promo_price[r[2]] = round(
                        float(r[1]) - benefit_price, 2)
        else:
            promo_price = re.findall(
                '<em id="J_PromoPriceNum".*?>(\d+.*\d*)</em>',
                content)  # is there a promotion?
            if promo_price:
                self.promo_price[link_id] = promo_price[0]
            else:
                self.promo_price[link_id] = 0
        await page.close()
    # Submit the id in the query box; the response handlers registered
    # here are expected to set self.complete.
    await self.page.type("input[name='queryItemId']", link_id)
    await self.page.setRequestInterception(True)
    self.page.on('request', self.intercept_request)
    self.page.on('response', self.intercept_response)
    await asyncio.sleep(1)
    net_check()
    await self.page.click(".filter-footer button:first-child")
    while True:
        await asyncio.sleep(1)
        if self.complete == 1:
            res = mysql.get_data(db=self.db_test,
                                 t="prices_tb_fix",
                                 c={
                                     "link_id": link_id,
                                     "server": server_name
                                 })
            if res:
                mysql.update_data(db=self.db_test,
                                  t="prices_tb_fix",
                                  set={
                                      "isComplete": "1",
                                      "updateTime": time_now()
                                  },
                                  c={
                                      "link_id": link_id,
                                      "server": server_name
                                  })
            break
        elif self.complete == 2:
            mysql.update_data(db=self.db_test,
                              t="prices_tb_fix",
                              set={"spe_link": "1"},
                              c={
                                  "link_id": link_id,
                                  "server": server_name
                              })
            break
async def parse_2(self, data):
    """Store the per-SKU rows of a multi-attribute item.

    Iterates ``data['skuOuterIdTable']['dataSource']``.  Rows with an
    empty ``skuOuterId`` are skipped.  For every other row the attribute
    text, price and promotion price collected earlier (``self.prop``,
    ``self.prices``, ``self.promo_price``, keyed by SKU id) are merged
    into ``self.item`` and written to ``prices_tb``: an update when the
    stock/link/shop combination exists, otherwise an insert plus a report
    entry.

    After the loop, report files are appended for repeated stock ids and
    for items with no stock id at all, and rows of this link id whose
    stock id was not seen are flagged "del".  Finally ``self.complete``
    is set to 1.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from syntax and should be checked against the source
    file.
    """
    seen_codes = set()
    repeat_list = []  # repeated stock ids, in order of first repetition
    for row in data['skuOuterIdTable']['dataSource']:
        self.item['stockid'] = row['skuOuterId']
        logger.info(self.item['stockid'])
        if not self.item['stockid']:
            continue
        if self.item['stockid'] not in seen_codes:
            seen_codes.add(self.item['stockid'])
        elif self.item['stockid'] not in repeat_list:
            repeat_list.append(self.item['stockid'])

        sku_id = str(row['skuId'])
        # A SKU without collected attribute text is treated as having no
        # attribute instead of failing on None.
        temp_attr = self.prop.get(sku_id) or ""
        self.item['attribute'] = temp_attr.replace("(", "(").replace(")", ")")
        if not self.item['attribute']:
            self.item.pop('attribute')
        self.item['price_tb'] = self.prices.get(sku_id)
        if self.promo_price:
            self.item["promotionprice"] = self.promo_price.get(sku_id)
        else:
            self.item["promotionprice"] = 0

        condition = {
            "stockid": self.item['stockid'],
            "link_id": self.item['link_id'],
            "shop_id": self.item['shop_id'],
        }
        res = mysql.get_data(t="prices_tb",
                             l=1,
                             cn=["price_tb"],
                             c=condition,
                             db=self.target_server)
        if res:
            previous_price = res[0][0]
            if not previous_price:
                # Stored price is zero or NULL: no meaningful ratio.
                self.item['ratio'] = 1
            else:
                self.item['ratio'] = round(
                    float(self.item['price_tb']) / float(previous_price),
                    2)
            print(self.item)
            mysql.update_data(t="prices_tb",
                              set=self.item,
                              c=condition,
                              db=self.target_server)
        else:
            insert_item = self.item.copy()
            insert_item["currabrev"] = "CNY"
            insert_item["price_erp"] = 0
            insert_item["operator"] = self.operator
            insert_item["last_time"] = time_now()
            if self.operator == "爬虫维护":
                insert_item["flag"] = "create"
            else:
                insert_item['flag'] = "add"
            insert_item["ratio"] = 1
            insert_item["package_number"] = 1
            insert_item["Checker"] = ""
            insert_item["CheckDate"] = "0000-00-00 00:00:00"
            print(insert_item)
            with open("reports/report_" + self.fromStore + "_insert.txt",
                      "a") as file:
                file.write("物料编码:" + insert_item['stockid']
                           + " 与商品ID:" + insert_item['link_id']
                           + " 为最新匹配,添加至ERP系统。\n"
                           + self.item_url + insert_item['link_id'] + "\n"
                           + self.item_erp_url + insert_item['link_id']
                           + "\n\n")
            mysql.insert_data(t="prices_tb",
                              d=insert_item,
                              db=self.target_server)

    if repeat_list:
        with open("reports/report_" + self.fromStore + "_repeat.txt",
                  "a") as file:
            file.write("店铺:" + store_trans(self.fromStore) + ",商品id:"
                       + self.item['link_id'] + " 重复编码\n" + "重复编码:"
                       + ",".join(repeat_list) + "\n" + self.item_url
                       + self.item['link_id'] + "\n\n")
    if not seen_codes:
        with open("reports/report_" + self.fromStore + "_empty.txt",
                  "a") as file:
            file.write("店铺:" + store_trans(self.fromStore) + ",商品id:"
                       + self.item['link_id'] + " 空编码\n" + self.item_url
                       + self.item['link_id'] + "\n\n")

    # Flag stored rows of this link whose stock id no longer occurs.
    # NOTE(review): SQL is assembled by string formatting; switch to bound
    # parameters if the mysql helper supports them.
    sql = """ select id,stockid from prices_tb where link_id='%s' and flag not in('del','XiaJia') """ % (self.item['link_id'])
    res_verify = mysql.get_data(sql=sql, db=self.target_server)
    for rv in res_verify:
        if rv[1] not in seen_codes:
            with open("reports/report_" + self.fromStore + "_delete.txt",
                      "a") as file:
                file.write("物料编码:" + rv[1] + " 与 商品ID:"
                           + self.item['link_id']
                           + " 不匹配,已被爬虫从ERP系统中删除。\n"
                           + self.item_url + self.item['link_id'] + "\n"
                           + self.item_erp_url + self.item['link_id']
                           + "\n\n")
            mysql.update_data(t="prices_tb",
                              c={"id": rv[0]},
                              db=self.target_server,
                              set={
                                  "flag": "del",
                                  "operator": self.operator,
                                  "last_time": time_now()
                              })
    self.complete = 1