def shop_service_jiexi(shop_id, content, update_time):
    """Parse a shop's "service" page and persist its two service promises.

    Extracts the "N天无理由退货" (N-day no-questions-asked refund) and
    "N小时发货" (ships within N hours) numbers from the service-item list
    and writes them onto the matching seller_info row.

    Args:
        shop_id: shop identifier used in the WHERE clause.
        content: raw HTML of the service page.
        update_time: crawl-date string identifying the row to update.

    Returns:
        "ok" on success, False on any parse/DB failure.
    """
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        tree = etree.HTML(content)
        items = tree.xpath(".//li[@class='service-item']//h3[@class='name']/text()")
        # -1 means "not found yet"; only the first matching item wins.
        refund_day_for_no_reason = delivery_hour_after_payment = -1
        for item in items:
            if refund_day_for_no_reason == -1:
                refund_day_for_no_reason = first_or_zero(re.findall(u"(\d+)天无理由退货", item))
            if delivery_hour_after_payment == -1:
                delivery_hour_after_payment = first_or_zero(re.findall(u"(\d+)小时发货", item))
        # NOTE(review): string-built SQL; acceptable only while all inputs are
        # numeric/crawler-generated — a parameterized query would be safer.
        sql = """ update seller_info set refund_day_for_no_reason = '%s', delivery_hour_after_payment = '%s' where shop_id = '%s' and update_time = '%s' """ % (
            refund_day_for_no_reason, delivery_hour_after_payment, shop_id, update_time)
        try:
            x.ExecNonQuery(sql)
        except Exception as ex:
            # Best-effort insert: log and keep going, as elsewhere in this file.
            print(ex)
            print(sql)
            print("=============================")
        return "ok"
    except Exception as ex:
        print(shop_id, "shop_service_jiexi", "error:", ex)
        return False
    finally:
        # Fix: the original called x.EndSql() in the except path, which raised
        # NameError whenever Mymysql()/_GetConnect() itself failed. Guard and
        # always release the connection exactly once.
        if x is not None:
            x.EndSql()
def extend_information_jiexi(product_id, content_watch, content_buy, content_linija, update_time):
    """Parse the three "recommendation" JSONP payloads for a product.

    Decodes the also-viewed (watch), also-bought (buy) and neighbour-goods
    (linija) responses and hands each to the project-level ``help`` routine
    (types 1/2/3) for persistence.

    Args:
        product_id: product the recommendations belong to.
        content_watch: raw JSONP text of the also-viewed response.
        content_buy: raw JSONP text of the also-bought response.
        content_linija: raw JSONP text of the neighbour-goods response; may
            be empty, in which case None is forwarded.
        update_time: crawl-date string stored with each row.

    Returns:
        "ok" when all three helpers succeed, False otherwise.
    """
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        content_watch = return_to_json_text(content_watch)
        # NOTE(review): json.loads(encoding=...) is ignored on Python 3 and
        # removed in 3.9 — confirm the interpreter version before upgrading.
        content_watch = json.loads(content_watch, encoding='gbk')['result']
        content_buy = return_to_json_text(content_buy)
        content_buy = json.loads(content_buy, encoding='gbk')['result']
        content_linija = return_to_json_text(content_linija)
        if content_linija:
            content_linija = json.loads(content_linija, encoding='gbk')['result']
        else:
            content_linija = None
        # ``help`` here is a project function (shadows the builtin at call site).
        re_1 = help(product_id, content_watch, x, 1, update_time)
        re_2 = help(product_id, content_buy, x, 2, update_time)
        re_3 = help(product_id, content_linija, x, 3, update_time)
        if re_1 == re_2 == re_3 == 'ok':
            return "ok"
        raise Exception("help error")
    except Exception as ex:
        print(product_id, "extend_information_jiexi", "error:", ex)
        return False
    finally:
        # Fix: the original unconditionally ran x.EndSql() in finally, which
        # raised NameError when Mymysql()/_GetConnect() failed before x bound.
        if x is not None:
            x.EndSql()
def descContent_jiexi(product_id, content, update_time, item_dict, thread_name):
    """Parse the item description blob: store its images and note text.

    Strips the ``var desc='...'`` JS wrapper, inserts every .jpg image
    (upscaled from 50x50 to 400x400) into `image`, and writes the flattened
    description text into item_dict['note'].

    Args:
        product_id: product the description belongs to.
        content: raw ``var desc='...';`` JavaScript payload.
        update_time: crawl-date string stored with each image row.
        item_dict: accumulator dict; gains a 'note' key (side effect).
        thread_name: caller's thread name (unused here; kept for interface).

    Returns:
        "ok" on success, False on failure (raw content dumped for re-parsing).
    """
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        # Strip the JavaScript assignment wrapper around the HTML.
        content = content.replace("var desc='", "")
        content = content.replace("';", "")
        tree = etree.HTML(content)
        count = 1
        for img in tree.xpath(".//img"):
            src = img.get("src")
            if not src or not src.endswith("jpg"):
                continue
            # Request the larger thumbnail variant.
            src = src.replace("50x50", "400x400")
            from_who = 1
            position = 2
            sequence = count
            pic_md5 = get_pic_md5(src, product_id)
            sql = """ insert into image(product_id,img_src,position,sequence,update_time,from_who,pic_md5) values('%s','%s','%s','%s','%s','%s','%s') """ % (
                product_id, src, position, sequence, update_time, from_who, pic_md5)
            try:
                x.ExecNonQuery(sql)
                count += 1
            except Exception as ex:
                print(ex)
                print(sql)
                print("=============================")
                continue
        # Flatten all text nodes into one whitespace-free string for 'note'.
        text = tree.xpath(".//text()")
        text = [item.strip() for item in text]
        text = " ".join(text)
        text = re.sub("\s+", "", text)
        text = text.replace("\\", "")
        item_dict['note'] = text.replace("'", "")
        return "ok"
    except Exception as ex:
        print(product_id, "descContent_jiexi", "error:", ex)
        # Dump the raw payload so the failure can be re-parsed offline.
        file_save = os.path.join("jiexi", product_id + "_" + "descContent" + ".html")
        with codecs.open(file_save, "a+", encoding="utf-8") as f:
            f.write(content)
        return False
    finally:
        # Fix: the original finally raised NameError when the DB connection
        # itself failed to open; guard the cleanup.
        if x is not None:
            x.EndSql()
def rate_jiexi(product_id, content, update_time, item_dict):
    """Parse the rate-summary JSONP: impressions and per-kind review counts.

    Inserts one product_impress row per "impression" tag and fills the
    review-count fields of item_dict (side effect).

    Args:
        product_id: product the summary belongs to.
        content: raw JSONP response text.
        update_time: crawl-date string stored with each impression row.
        item_dict: accumulator dict; gains the six *_comment_num keys.

    Returns:
        ("ok", comment_with_picture_num) on success, (False, -1) on failure.
    """
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        # Strip the JSONP wrapper: keep what lies between the outermost parens.
        begin_index = content.index("(")
        end_index = content.rindex(")")
        content = content[begin_index + 1:end_index]
        all_info = json.loads(content, encoding="gbk")
        data = all_info['data']
        for impress in data['impress']:
            impress_type = impress['title']
            impress_count = impress['count']
            impress_id = uuid.uuid1()
            sql = """ insert into product_impress(impress_id,product_id, impress_type,impress_count,update_time) values ('%s','%s','%s','%s','%s') """ % (
                impress_id, product_id, impress_type, impress_count, update_time)
            try:
                x.ExecNonQuery(sql)
            except Exception as ex:
                print(ex)
                print(sql)
                print("=============================")
                continue
        # Per-kind review counts ("bar info" on the page).
        comment_with_picture_num = data['count']['pic']
        append_comment_num = data['count']['additional']
        moderate_comment_num = data['count']['normal']
        negative_comment_num = data['count']['bad']
        refund_comment_num = all_info['sellerRefundCount']
        item_dict['comment_with_picture_num'] = comment_with_picture_num
        item_dict['append_comment_num'] = append_comment_num
        item_dict['moderate_comment_num'] = moderate_comment_num
        item_dict['negative_comment_num'] = negative_comment_num
        item_dict['refund_comment_num'] = refund_comment_num
        item_dict['positive_comment_num'] = data['count']['good']
        return "ok", comment_with_picture_num
    except Exception as ex:
        print(product_id, "rate_jiexi", "error:", ex)
        return False, -1
    finally:
        # Fix: guard cleanup — the original raised NameError here whenever
        # Mymysql()/_GetConnect() failed before x was bound.
        if x is not None:
            x.EndSql()
class Producer(threading.Thread):
    """Feeds pending image rows from the `image` table into the shared
    ``img_list`` queue for the Customer download threads."""

    def __init__(self):
        threading.Thread.__init__(self)
        self.x = Mymysql()
        self.x._GetConnect()
        # Up to 20000 images not yet downloaded (isSaved_Picture = '0').
        sql = "SELECT pic_md5,img_src,id FROM `image` where isSaved_Picture='0' limit 20000"  # where id >
        self.cur = self.x.ExecQueryGetcur(sql)

    def run(self):
        # Stream (pic_md5, img_src, id) rows into the queue until exhausted.
        while True:
            item = self.cur.fetchone()
            # Fix: identity comparison with None (was `item == None`).
            if item is None:
                break
            img_list.put(item)
        print("put down~")

    def __del__(self):
        # NOTE(review): __del__-based cleanup is best-effort only — it may
        # never run on interpreter shutdown; an explicit close would be safer.
        self.x.EndSql()
class Customer(threading.Thread):
    """Worker thread: pulls (pic_md5, url, id) jobs from ``img_list``,
    downloads each image through a rotating proxy, saves it to disk and
    marks the row as saved in the database."""

    def __init__(self):
        threading.Thread.__init__(self)
        self.y = Mymysql()
        self.y._GetConnectY()
        self.header = {
            'Host': 'img.alicdn.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        }
        self.set_ip()
        print(self.name, "初始化完毕:", self.ip)

    def set_ip(self):
        """Acquire a fresh proxy and bump the global rotation counter."""
        global ip_count
        self.ip = get_ip()
        self.proxies = {'http': self.ip, 'https': self.ip}
        ip_count += 1

    def get_image(self, url, id):
        """Download one image with up to 3 attempts (proxy rotated on error).

        Returns the response body (bytes) on success, False after 3 failures.
        """
        error_time = 0
        while True:
            if error_time == 3:
                return False
            try:
                response = requests.get(url, headers=self.header, timeout=3, proxies=self.proxies)
                if response.status_code != 200:
                    if error_time == 0:
                        # First bad status: wait and retry on the same proxy.
                        time.sleep(3)
                        error_time = 1
                        continue
                    # Fix: the original only raised when error_time == 1, so a
                    # non-200 response at error_time >= 2 fell through and was
                    # returned as if it were valid image content.
                    raise Exception("status_code error:" + str(response.status_code))
                return response.content
            except Exception as ex:
                self.set_ip()
                with lock:
                    print("%s encouter error %s: id=%s" % (self.name, str(ex), id))
                error_time += 1

    def run(self):
        global lock
        while True:
            # Fix: url is referenced by the except-logger below; pre-bind it so
            # a failure before assignment cannot raise NameError in the handler.
            url = ""
            try:
                pic_md5, url, id = img_list.get()
                save_path = r"D:\image\%s.jpg" % (pic_md5)
                url = url.strip()
                if not url.startswith("http"):
                    url = "https:" + url
                # Claim this md5; duplicate-key (1062) means another thread
                # already saved an identical image — just mark the row done.
                try:
                    sql = "insert into img_unique(img_unique) values ('%s')" % (pic_md5)
                    self.y.ExecNonQuery(sql)
                except Exception as ex:
                    if '1062' in str(ex):
                        sql = "update image set isSaved_Picture='1' where id='%s'" % (id)
                        self.y.ExecNonQuery(sql)
                        now = datetime.datetime.now()
                        now = now.strftime('%Y-%m-%d %H:%M:%S')
                        with lock:
                            print("1062 %s:%s:%s done at %s" % (self.name, pic_md5, str(id), now))
                    else:
                        print(ex, url)
                    continue
                content = self.get_image(url, id)
                if not content:
                    raise Exception("no content")
                with open(save_path, "wb") as f:
                    f.write(content)
                sql = "update image set isSaved_Picture='1' where id='%s'" % (id)
                self.y.ExecNonQuery(sql)
                now = datetime.datetime.now()
                now = now.strftime('%Y-%m-%d %H:%M:%S')
                with lock:
                    print("%s:%s:%s done at %s" % (self.name, pic_md5, str(id), now))
            except Exception as ex:
                # Append the failure to this thread's log file and keep going.
                log_path = os.path.join("log", self.name + ".log")
                with codecs.open(log_path, "a+", encoding="utf-8") as f:
                    f.write("%s:%s\n" % (url, str(ex)))
                    f.write("======================\n")

    def __del__(self):
        # NOTE(review): __del__-based cleanup is best-effort only.
        self.y.EndSql()
def pic_jiexi(product_id, pic_content_original, update_time, thread_name):
    """Parse one page of picture-review JSONP for a product and persist it.

    Inserts every review photo (and every first-append photo) into `image`,
    then one row per review into `product_comment`, carrying at most the
    first appended comment.

    Args:
        product_id: product the reviews belong to.
        pic_content_original: raw JSONP response text.
        update_time: crawl-date string stored with each row.
        thread_name: caller's thread name (unused here; kept for interface).

    Returns:
        (max_page, "ok") on success, (-1, False) on any failure.
    """
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        # Strip the JSONP wrapper: keep what lies between the outermost parens.
        pic_content = pic_content_original.strip()
        begin_index = pic_content.index("(")
        end_index = pic_content.rindex(")")
        pic_content = pic_content[begin_index + 1:end_index]
        pic_content = json.loads(pic_content, encoding="gbk")
        max_page = pic_content['maxPage']
        for item in pic_content['comments']:
            customer_name = item['user']['nick']
            rate = item['user']['displayRatePic'].split('.')[0]
            review_id = item['rateId']
            review_type = '3'
            content = item['content']
            if item['date']:
                review_info = datetime.datetime.strptime(item['date'].strip(), u'%Y年%m月%d日 %H:%M')
                review_date = review_info.date()
                review_time = review_info.time()
            else:
                # No date on the review: store zero placeholders.
                review_date = '0000-00-00'
                review_time = '00:00:00'
            count_num = item['useful']
            refund_time = ""
            Brief_information = item['auction']['sku']
            # Insert the review photos (sequence starts at 1).
            count = 1
            for photo in item['photos']:
                src = photo['url']
                from_who = '2'
                position = '4'
                sequence = count
                pic_md5 = get_pic_md5(src, product_id)
                sql = """ insert into image(pic_md5,product_id,img_src,position,sequence,update_time,from_who,review_id) values('%s','%s','%s','%s','%s','%s','%s','%s') """ % (
                    pic_md5, product_id, src, position, sequence, update_time, from_who, review_id)
                try:
                    x.ExecNonQuery(sql)
                    count += 1
                except Exception as ex:
                    print(ex)
                    print(sql)
                    print("=============================")
                    continue
            # Only the FIRST appended comment is kept (note the break below).
            back_comment = ""
            back_comment_day = ""
            for item_append in item['appendList']:
                count = 0
                for item_append_photos in item_append['photos']:
                    src = item_append_photos['thumbnail']
                    from_who = '2'
                    position = '4'
                    sequence = count
                    pic_md5 = get_pic_md5(src, product_id)
                    sql = """ insert into image(pic_md5,product_id,img_src,position,sequence,update_time,from_who,review_id) values('%s','%s','%s','%s','%s','%s','%s','%s') """ % (
                        pic_md5, product_id, src, position, sequence, update_time, from_who, review_id)
                    try:
                        x.ExecNonQuery(sql)
                        count += 1
                    except Exception as ex:
                        print(ex)
                        print(sql)
                        print("=============================")
                        continue
                back_comment = item_append['content']
                back_comment_day = item_append['dayAfterConfirm']
                break
            # Crude escaping for the hand-built SQL below (quotes, backslashes,
            # '?' placeholders are dropped from the free text).
            content = content.replace("'", "")
            content = content.replace("\\", "")
            content = content.replace("?", "")
            back_comment = back_comment.replace("'", "")
            back_comment = back_comment.replace("\\", "")
            back_comment = back_comment.replace("?", "")
            sql = """ insert into product_comment(product_id,review_id,customer_name,rate, review_type,content,review_date,review_time,Brief_information,back_comment, back_comment_day,count_num,refund_time,update_time) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % (
                product_id, review_id, customer_name, rate, review_type, content,
                review_date, review_time, Brief_information, back_comment,
                back_comment_day, count_num, refund_time, update_time)
            try:
                x.ExecNonQuery(sql)
            except Exception as ex:
                print(ex)
                print(sql)
                print("=============================")
                continue
        return max_page, "ok"
    except Exception as ex:
        print(product_id, "pic_jiexi", "error:", ex)
        return -1, False
    finally:
        # Fix: the original `finally: x.EndSql()` raised NameError (masking the
        # return value) whenever the DB connection itself failed to open.
        if x is not None:
            x.EndSql()
def index_jiexi(shop_ID, html, shop_url, update_time):
    """Parse a shop's homepage: record its products and Alipay-auth date.

    Inserts one shop_homepage row per product link found on the page.

    Args:
        shop_ID: crawler-side shop identifier (used for logging/dump names).
        html: raw homepage HTML.
        shop_url: homepage URL (only used in the low-product-count warning).
        update_time: crawl-date string stored with each row.

    Returns:
        True when the page is a "爱逛街" redirect or a no-item error page,
        the Alipay authentication date string ('0000-00-00' when absent) on
        success, or False on failure (raw HTML dumped for re-parsing).
    """
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        tree = etree.HTML(html)
        # Redirected into "爱逛街"? Nothing to parse.
        guang = tree.xpath(".//a[@class='guang-logo']//text()")
        if len(guang) > 0:
            print(shop_ID, guang[0])
            return True
        # Error/no-item page.
        error = tree.xpath(".//*[@id='error-notice']/div[2]")
        if len(error) > 0:
            print(shop_ID, "no_item")
            return True
        products = tree.xpath(".//*[starts-with(@href,'//item.taobao.com')]")
        count = 1
        # Shop's own numeric id, embedded as "shopId"/"shopID" in the page JS.
        shop_id = re.findall("\"shopI[dD]\":(.*?),", html)[0].strip()[1:-1]
        for item in products:
            url = item.get("href")
            item_id = re.findall("id=(\d+)", url)
            if len(item_id) == 0:
                continue
            item_id = item_id[0]
            # Product name: collapse whitespace, strip quotes, cap at 100 chars.
            name = item.xpath("string(.)")
            name = re.sub("[\r\n\t ]*", "", name)
            name = name.replace("\xc2\xa0", "")
            name = name.replace("'", "")
            name = name.strip()[:100]
            sequence = count
            sql = "insert ignore into shop_homepage(shop_id,product_id,product_name,sequence,update_time) values ('%s','%s','%s',%d,'%s')" % (
                shop_id, item_id, name, sequence, update_time)
            try:
                x.ExecNonQuery(sql)
                count += 1
            except Exception as ex:
                print(ex)
                print(sql)
                print("=============================")
                continue
        if count < 10:
            # Suspiciously few homepage products — worth a manual look.
            print(shop_ID, ":", shop_id, ":", u"主页产品", ":", count, shop_url)
        alipay_Authentication = tree.xpath(
            ".//span[@class='id-time J_id_time']/text()")
        if not alipay_Authentication:
            alipay_Authentication = '0000-00-00'
        else:
            alipay_Authentication = alipay_Authentication[0]
        return alipay_Authentication
    except Exception as ex:
        print(ex)
        # Dump the raw HTML so the failure can be re-parsed offline.
        save_path = os.path.join("re_jiexi", "%s_index" % (shop_ID))
        with codecs.open(save_path, "w+", encoding="utf-8") as f:
            f.write(html)
        return False
    finally:
        # Fix: the original leaked the connection on the early `return True`
        # paths and raised NameError in the except path when the connection
        # itself failed to open. One guarded cleanup covers every exit.
        if x is not None:
            x.EndSql()
    def main_method(self, product_id, total_sales_volume, shop_id):
        """Crawl and persist everything about one product.

        Fetches the item main page, price/stock details, description,
        cumulative review count, three recommendation feeds, rating summary,
        collect count and (when present) picture reviews, delegating each
        parse to the corresponding *_jiexi helper, then inserts one row into
        product_information. Every step is logged to self.f_log.

        Returns:
            "ok" on success (or when the item is uncrawlable / off-shelf),
            False on any error.
        """
        try:
            update_time = datetime.datetime.now().strftime("%Y-%m-%d")
            main_url = "https://item.taobao.com/item.htm?id=%s" % (product_id)
            # Fetch the item's main page.
            mainPageConent, status_code = self.get_page(main_url, host="item.taobao.com")
            if status_code == 'no':
                # Marked uncrawlable by get_page: treat as done.
                self.f_log.write("%s can not crawl\n" % (main_url))
                return "ok"
            self.header['Referer'] = main_url
            if not mainPageConent:
                self.f_log.write(" get mainPageConent fail status_code:%s\n" % status_code)
                raise Exception("mainPageConent error")
            else:
                self.f_log.write("get mainPageConent succeed\n")
            # '下架' == item has been taken off the shelves; nothing to do.
            if '下架' in mainPageConent:
                return 'ok'
            # Extract categroy_id, sellerId and descUrl from the page source.
            categroy_id = re.findall('data-catid="(\d+)"', mainPageConent)
            if not categroy_id:
                raise Exception("categroy_id out of index")
            else:
                categroy_id = categroy_id[0]
            sellerId = re.findall("sellerId\s*:\s*'(\d+)',", mainPageConent)
            if not sellerId:
                raise Exception("sellerId out of index")
            else:
                sellerId = sellerId[0]
            descUrl = re.findall("location.protocol===(.*),", mainPageConent)
            if not descUrl:
                raise Exception("descUrl out of index")
            else:
                descUrl = descUrl[0]
            # Accumulator dict the *_jiexi helpers fill in by side effect.
            product_information_Item_dict = {}
            product_information_Item_dict['shop_id'] = shop_id
            product_information_Item_dict['product_id'] = product_id
            product_information_Item_dict['update_time'] = update_time
            product_information_Item_dict['total_sales_volume'] = total_sales_volume
            self.f_log.write("categroy_id,sellerId,descUrl succeed\n")
            # Parse the main page.
            main_page_jiexi_re = main_page_jiexi(product_id, mainPageConent, update_time,
                                                 product_information_Item_dict, self.name)
            if main_page_jiexi_re != 'ok':
                self.f_log.write("main_page_jiexi_fail\n")
                raise Exception("main_page_jiexi_re error")
            else:
                self.f_log.write("main_page_jiexi_succeed\n")
            # Fetch the detail (price/stock/promotion) page.
            detail_url = "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=%s&sellerId=%s&modules=dynStock,qrcode,viewer,price,contract,duty,xmpPromotion,delivery,upp,activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,tradeContract&callback=onSibRequestSuccess" % (
                product_id, sellerId)
            # NOTE(review): `satus_code` typo is in the original; harmless,
            # the value is only logged below.
            detail_content, satus_code = self.get_page(detail_url, host="detailskip.taobao.com")
            if not detail_content:
                self.f_log.write("get detail_content fail %s\n" % str(satus_code))
                self.f_log.write(detail_url + "\n")
                raise Exception("detail_content error")
            else:
                self.f_log.write("get detail_content succeed\n")
            # Parse the detail page.
            detail_content_jiexi_re = detail_content_jiexi(product_id, detail_content,
                                                           update_time, product_information_Item_dict)
            if detail_content_jiexi_re != 'ok':
                self.f_log.write("detail_content_jiexi_re fail\n")
                raise Exception("detail_content_jiexi_re error")
            else:
                self.f_log.write("detail_content_jiexi_re succeed\n")
            # Fetch the description content (descUrl holds a JS ternary; take
            # the https branch and strip the surrounding quotes).
            descUrl = "https:" + descUrl.split(":")[2].strip()[1:-1]
            descContent, status_code = self.get_page(descUrl, host="desc.alicdn.com")
            if not descContent:
                self.f_log.write("get descContent fail,status_code:%s\n" % status_code)
                raise Exception("descContent Error")
            else:
                self.f_log.write("get descContent succeed\n")
            # Parse the description content.
            descContent_jiexi_re = descContent_jiexi(product_id, descContent, update_time,
                                                     product_information_Item_dict, self.name)
            if descContent_jiexi_re != 'ok':
                self.f_log.write("get descContent_jiexi fail\n")
                raise Exception("descContent_jiexi_re error")
            else:
                self.f_log.write("get descContent_jiexi succeed\n")
            # Cumulative review count.
            detailCounturl = "https://rate.taobao.com/detailCount.do?itemId=%s" % (product_id)
            detailCountContent, status_code = self.get_page(detailCounturl, host="rate.taobao.com")
            if not detailCountContent:
                self.f_log.write("get detailCountContent fail,status_code:%s\n" % status_code)
                raise Exception("detailCount_Content Error")
            else:
                self.f_log.write("get detailCountContent succeed\n")
            detailCount = re.findall("\d+", detailCountContent)
            if not detailCount:
                self.f_log.write("detailCount index out of range\n")
                raise Exception("detailCount index out of range")
            else:
                self.f_log.write("detailCount succeed\n")
            product_information_Item_dict['cumulative_review'] = detailCount[0]
            # "Also viewed" recommendations.
            recommend_one_url = "https://tui.taobao.com/recommend?&callback=detail_recommend_viewed&appid=9&count=12&sellerid=%s&itemid=%s&categoryid=%s" % (
                sellerId, product_id, categroy_id)
            recommend_oneContent, status_code = self.get_page(recommend_one_url, host="tui.taobao.com")
            if not recommend_oneContent:
                self.f_log.write("get recommend_oneContent fail,status_code:%s\n" % status_code)
                raise Exception("recommend_oneContent Error")
            # "Also bought" recommendations.
            recommend_two_url = "https://tui.taobao.com/recommend?callback=detail_recommend_bought&appid=11&" + "count=12&sellerid=%s&itemid=%s&categoryid=%s" % (
                sellerId, product_id, categroy_id)
            recommend_twoContent, status_code = self.get_page(recommend_two_url, host="tui.taobao.com")
            if not recommend_twoContent:
                self.f_log.write("get recommend_twoContent fail, status_code:%s\n" % status_code)
                raise Exception("recommend_twoContent Error")
            # "Neighbour goods" recommendations.
            recommend_third_url = "https://tui.taobao.com/recommend?itemid=%s&sellerid=%s&callback=jsonp1524&appid=3066" % (
                product_id, sellerId)
            recommend_thirdContent, status_code = self.get_page(recommend_third_url, host="tui.taobao.com")
            if not recommend_thirdContent:
                self.f_log.write("get recommend_thirdContent fail,status_code:%s\n" % status_code)
                raise Exception("recommend_thirdContent Error")
            # Parse the three recommendation feeds.
            extend_information_jiexi_re = extend_information_jiexi(product_id, recommend_oneContent,
                                                                   recommend_twoContent,
                                                                   recommend_thirdContent, update_time)
            if not extend_information_jiexi_re:
                self.f_log.write("extend_information_jiexi_re fail\n")
                raise Exception("extend_information_jiexi error")
            else:
                self.f_log.write("extend_information_jiexi succeed\n")
            # Rating summary ("impressions" and per-kind review counts).
            rate_url = "https://rate.taobao.com/detailCommon.htm?auctionNumId=%s&userNumId=%s&callback=json_tbc_rate_summary" % (
                product_id, sellerId)
            rate_content, status_code = self.get_page(rate_url, host='rate.taobao.com')
            if not rate_content:
                self.f_log.write("get rate_content fail,status_code:%s\n" % (status_code))
                raise Exception("rate_content error")
            else:
                self.f_log.write("get rate_content succeed\n")
            rate_jiexi_re, comment_with_picture_num = rate_jiexi(product_id, rate_content,
                                                                 update_time,
                                                                 product_information_Item_dict)
            if not rate_jiexi_re:
                self.f_log.write("get rate_jiexi_re error\n")
                raise Exception("rate_jiexi_re error")
            else:
                self.f_log.write("rate_jiexi succeed\n")
            # Collect (favorite) count.
            collectcount_URL = "https://count.taobao.com/counter3?callback=jsonp87&keys=ICCP_1_%s" % (product_id)
            collectcount_Content, status_code = self.get_page(collectcount_URL, host='count.taobao.com')
            if not collectcount_Content:
                self.f_log.write("get collectcount_Content fail,status_code:%s\n" % (status_code))
                raise Exception("collectcount_Content error")
            else:
                self.f_log.write("get collectcount_Content succeed\n")
            # Parse the collect count out of the JSONP payload.
            product_information_Item_dict['collection_number'] = re.findall("\d+", collectcount_Content.split(":")[1])[
                0]
            # Picture reviews (only when the rate summary reported any).
            if str(comment_with_picture_num) != '0':
                get_picture_comment_re = self.get_picture_comment(product_id, sellerId, update_time)
                if get_picture_comment_re == 'ok':
                    self.f_log.write("get_picture_comment succeed\n")
                else:
                    # NOTE(review): the `% ()` below is a no-op on a string
                    # with no placeholders — kept as-is.
                    raise Exception("get_picture_comment_re ERROR" % ())
            else:
                self.f_log.write("not exist picture\n")
            # Insert the assembled product row.
            sql = """ insert into product_information( shop_id, total_sales_volume, product_name, product_profile, cumulative_review, transaction_volume, price, taobao_price, place_of_delivery, express_fee, amount_of_inventory, promise, payment_method, collection_number , note, update_time, product_id,comment_with_picture_num, append_comment_num,moderate_comment_num,negative_comment_num, refund_comment_num,positive_comment_num ) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % (
                shop_id, total_sales_volume,
                product_information_Item_dict['product_name'],
                product_information_Item_dict['product_profile'],
                product_information_Item_dict['cumulative_review'],
                product_information_Item_dict['transaction_volume'],
                product_information_Item_dict['price'],
                product_information_Item_dict['taobao_price'],
                product_information_Item_dict['place_of_delivery'],
                product_information_Item_dict['express_fee'],
                product_information_Item_dict['amount_of_inventory'],
                product_information_Item_dict['promise'],
                product_information_Item_dict['payment_method'],
                product_information_Item_dict['collection_number'],
                product_information_Item_dict['note'], update_time, product_id,
                product_information_Item_dict['comment_with_picture_num'],
                product_information_Item_dict['append_comment_num'],
                product_information_Item_dict['moderate_comment_num'],
                product_information_Item_dict['negative_comment_num'],
                product_information_Item_dict['refund_comment_num'],
                product_information_Item_dict['positive_comment_num']
            )
            x = Mymysql()
            x._GetConnect()
            try:
                # NOTE(review): bare `except:` is original behavior — it also
                # catches KeyboardInterrupt/SystemExit here.
                x.ExecNonQuery(sql)
            except:
                print(sql)
                raise Exception("product information insert error")
            finally:
                x.EndSql()
            return "ok"
        except Exception as ex:
            self.f_log.write("main method error:%s:%s\n" % (str(ex), self.name))
            return False
        # NOTE(review): these two closes appear to be the tail of a method
        # whose `def` line is outside this view (thread teardown?) — confirm
        # against the full file before relying on their placement.
        self.f_log.close()
        self.session.close()


import threading

# Shared synchronization primitives and proxy-history bookkeeping.
lock = threading.Lock()
get_lock = threading.Lock()
his_ip = []
# Script start timestamp (for elapsed-time reporting).
a = time.time()

# Load the list of product IDs still to be crawled.
x = Mymysql()
x._GetConnect()
sql = "SELECT product_id,total_sales_volume,shop_id FROM `product_list`; "
target_ids = x.ExecQuery(sql)
x.EndSql()
print("loading...end")
# Alternative file-based source of target_ids, kept disabled:
#f = codecs.open("thisTurn.txt", "r+", encoding="utf-8")
#target_ids = f.readlines()
#target_ids = [item[:-1].strip().split(":") for item in target_ids]
#target_ids = list(filter(lambda x: len(x) == 3, target_ids))
#f.close()
print("商品数:", len(target_ids))
# has crawl: ensure the progress file exists, then read already-crawled IDs.
if not os.path.isfile("has_craw.txt"):
    f = open("has_craw.txt", "w+")
    f.close()
f = codecs.open("has_craw.txt", "r+", encoding="utf-8")
has_crawl_IDS = f.readlines()
def rating_jiexi(shop_ID, html, update_time, alipay_Authentication_main_page): try: x = Mymysql() x._GetConnect() values = [] tree = etree.HTML(html) # 1 店铺ID shop_id %s shop_id = re.findall("\"shopID\":(.*?),", html)[0].strip()[1:-1] #print shopID values.append(shop_id) # 2 .支付宝认证时间: alipay_Authentication alipay_Authentication = tree.xpath(".//span[@class='id-time']/text()") if len(alipay_Authentication) == 0: alipay_Authentication = alipay_Authentication_main_page else: alipay_Authentication = alipay_Authentication[0] values.append(alipay_Authentication) # 3. 主营: main_products main_products = tree.xpath(".//*[@id='chart-name']/text()")[0].strip() #print main_products values.append(main_products) # 4. 所在地区 Location location_text = "".join( tree.xpath( ".//div[@class='info-block info-block-first']//ul/li[2]/text()" )[0]) #print location_text Location = location_text.split(":")[1].strip() #print Location values.append(Location) # 5. 卖家信用 seller_credit seller_credit = tree.xpath( ".//ul[@class='sep']/li[1]/text()")[0].split(":")[1].strip() values.append(seller_credit) # 6. 
买家信用 buyer_credit buyer_credit = tree.xpath(".//ul[@class='sep']/li[2]/text()")[0].split( ":")[1].strip() values.append(buyer_credit) # 7.保证金余额: seller_bond seller_bond = tree.xpath(".//div[@class='charge']/span/text()") if len(seller_bond) == 0: seller_bond = 0 else: seller_bond = seller_bond[0][1:].split(".")[0].replace(",", "") values.append(seller_bond) # 8-10 评分 rating = tree.xpath(".//div[@class='item-scrib']") commodity_score = -1 seller_attitude_score = -1 logistics_score = -1 commodity_score_compare = -1 seller_attitude_score_compare = -1 logistics_score_compare = -1 if len(rating) == 3: # 8.宝贝与描述相符分数 commodity_score commodity_score = rating[0].xpath( ".//em[@class='count']/text()")[0] # 9.卖家的服务态度 seller_attitude_score seller_attitude_score = rating[1].xpath( ".//em[@class='count']/text()")[0] # 10.物流服务质量 logistics_score logistics_score = rating[2].xpath( ".//em[@class='count']/text()")[0] # 11-13 同行业 # 11 宝贝与描述相符分数比同行业平均水平 commodity _score_compare rating_baby_Same_industry = rating[0].xpath(".//strong")[0] strong_class = rating_baby_Same_industry.get("class") rating_baby_industry_score = rating_baby_Same_industry.xpath( ".//text()")[0] if "--" in rating_baby_industry_score: commodity_score_compare = 0 else: commodity_score_compare = float( rating_baby_Same_industry.xpath(".//text()")[0] [:-1]) * 1.0 / 100 if "over" not in strong_class: commodity_score_compare *= -1 #print rating_baby_industry_score # 12 卖家的服务态度分数比同行业平均水平 seller_attitude_score_compare rating_seller_Same_industry = rating[1].xpath(".//strong")[0] strong_class = rating_seller_Same_industry.get("class") rating_seller_industry_score = rating_seller_Same_industry.xpath( ".//text()")[0] if "--" in rating_seller_industry_score: seller_attitude_score_compare = 0 else: seller_attitude_score_compare = float( rating_seller_Same_industry.xpath(".//text()")[0] [:-1]) * 1.0 / 100 if "over" not in strong_class: seller_attitude_score_compare *= -1 # 13 物流服务的质量分数比同行业平均水平 logistics_score_compare 
rating_logistics_Same_industry = rating[2].xpath(".//strong")[0] strong_class = rating_logistics_Same_industry.get("class") rating_logistics_industry_score = rating_logistics_Same_industry.xpath( ".//text()")[0] if "--" in rating_logistics_industry_score: logistics_score_compare = 0 else: logistics_score_compare = float( rating_logistics_Same_industry.xpath(".//text()")[0] [:-1]) * 1.0 / 100 if "over" not in strong_class: logistics_score_compare *= -1 values.append(commodity_score) values.append(seller_attitude_score) values.append(logistics_score) values.append(commodity_score_compare) values.append(seller_attitude_score_compare) values.append(logistics_score_compare) # 14最近一周 week = tree.xpath(".//div[@id='J_show_list']//li[1]//text()") week = filter(lambda x: len(re.findall("\d+", x)) >= 1, week) week = [re.findall("\d+", item)[0] for item in week] # 最近一周好评总数 positive_comment_week = week[0] values.append(positive_comment_week) # 最近一周中评总数 moderate_comment_week = week[1] values.append(moderate_comment_week) # 最近一周差评总数 negative_comment_week = week[2] values.append(negative_comment_week) # 最近一周所属类别好评总数 core_positive_comment_week = week[3] values.append(core_positive_comment_week) # 最近一周所属类别中评总数 core_moderate_comment_week = week[4] values.append(core_moderate_comment_week) # 最近一周所属类别差评总数 core_negative_comment_week = week[5] values.append(core_negative_comment_week) # 最近一周非主营行业好评总数 non_core_positive_comment_week = week[6] values.append(non_core_positive_comment_week) # 最近一周非主营行业中评总数 non_core_moderate_comment_week = week[7] values.append(non_core_moderate_comment_week) # 最近一周非主营行业差评总数 non_core_negative_comment_week = week[8] values.append(non_core_negative_comment_week) # 15 最近一月 month = tree.xpath(".//div[@id='J_show_list']//li[2]//text()") month = filter(lambda x: len(re.findall("\d+", x)) >= 1, month) month = [re.findall("\d+", item)[0] for item in month] # 最近一月好评总数 positive_comment_month = month[0] values.append(positive_comment_month) # 最近一月中评总数 
moderate_comment_month = month[1] values.append(moderate_comment_month) # 最近一月差评总数 negative_comment_month = month[2] values.append(negative_comment_month) # 最近一月所属类别好评总数 core_positive_comment_month = month[3] values.append(core_positive_comment_month) # 最近一月所属类别中评总数 core_moderate_comment_month = month[4] values.append(core_moderate_comment_month) # 最近一月所属类别差评总数 core_negative_comment_month = month[5] values.append(core_negative_comment_month) # 最近一月非主营行业好评总数 non_core_positive_comment_month = month[6] values.append(non_core_positive_comment_month) # 最近一月非主营行业中评总数 non_core_moderate_comment_month = month[7] values.append(non_core_moderate_comment_month) # 最近一月非主营行业差评总数 non_core_negative_comment_month = month[8] values.append(non_core_negative_comment_month) # 16 最近半年 half_year = tree.xpath(".//div[@id='J_show_list']//li[3]//text()") half_year = filter(lambda x: len(re.findall("\d+", x)) >= 1, half_year) half_year = [re.findall("\d+", item)[0] for item in half_year] # 最近半年好评总数 positive_comment_half_year = half_year[0] values.append(positive_comment_half_year) # 最近半年中评总数 moderate_comment_half_year = half_year[1] values.append(moderate_comment_half_year) # 最近半年差评总数 negative_comment_half_year = half_year[2] values.append(negative_comment_half_year) # 最近半年所属类别好评总数 core_positive_comment_half_year = half_year[3] values.append(core_positive_comment_half_year) # 最近半年所属类别中评总数 core_moderate_comment_half_year = half_year[4] values.append(core_moderate_comment_half_year) # 最近半年所属类别差评总数 core_negative_comment_half_year = half_year[5] values.append(core_negative_comment_half_year) # 最近半年非主营行业好评总数 non_core_positive_comment_half_year = half_year[6] values.append(non_core_positive_comment_half_year) # 最近半年非主营行业中评总数 non_core_moderate_comment_half_year = half_year[7] values.append(non_core_moderate_comment_half_year) # 最近半年非主营行业差评总数 non_core_negative_comment_half_year = half_year[8] values.append(non_core_negative_comment_half_year) # 17 半年以前 before_half = 
tree.xpath(".//div[@id='J_show_list']//li[4]//text()") before_half = filter(lambda x: len(re.findall("\d+", x)) >= 1, before_half) before_half = [re.findall("\d+", item)[0] for item in before_half] #print before_half #半年以前好评总数 positive_comment_before_half_year = before_half[0] values.append(positive_comment_before_half_year) #半年以前中评总数 moderate_comment_before_half_year = before_half[1] values.append(moderate_comment_before_half_year) #半年以前差评总数 negative_comment_before_half_year = before_half[2] values.append(negative_comment_before_half_year) # 18-25 30天服务情况 table = tree.xpath(".//table[@class='tb-rate-table']/tbody/tr") total_penalty = -1 after_sales_speed_nearly_30 = -1 after_sales_speed_nearly_30_compare = -1 after_sale_rate_nearly_30 = -1 after_sale_rate_nearly_30_compare = -1 dispute_rate_nearly_30 = -1 dispute_rate_nearly_30_compare = -1 penalty_number_nearly_30 = -1 penalty_number_nearly_30_compare = -1 penalty_number_fake_good = -1 penalty_number_false_transaction = -1 penalty_number_breach_promise = -1 penalty_number_bad_desc = -1 penalty_number_malicious_harassment = -1 if len(table) != 0: # 18 售后速度 # 19 售后速度行业值 tds = table[0].xpath(".//td/text()")[1:] aftermarket_Speed = float(tds[0][:-1]) aftermarket_Industry_Speed = float(tds[2][:-1]) if tds[1] == "小于": aftermarket_Industry_Speed *= -1 #本店近30天售后速度 after_sales_speed_nearly_30 = aftermarket_Speed #本店近30天售后速度比行业均值 after_sales_speed_nearly_30_compare = aftermarket_Industry_Speed # 20 售后率 # 21 售后率行业值 tds = table[1].xpath(".//td/text()")[1:] after_sale_rate = float(tds[0][:-1].replace(",", "")) * 0.01 after_sale_Industry_rate = float(tds[2][:-1]) * 0.01 if tds[1] == "小于": after_sale_Industry_rate *= -1 # 本店近30天售后率 after_sale_rate_nearly_30 = after_sale_rate # 本店近30天售后率比行业均值 after_sale_rate_nearly_30_compare = after_sale_Industry_rate # 22 纠纷率 # 23 纠纷率行业值 tds = table[2].xpath(".//td/text()")[1:] dispute_rate = float(tds[0][:-1]) * 0.01 dispute_Industry_rate = float(tds[2][:-1]) * 0.01 if tds[1] == "小于": 
dispute_Industry_rate *= -1 # 本店近30天纠纷率 dispute_rate_nearly_30 = dispute_rate # 本店近30天纠纷率比行业均值 dispute_rate_nearly_30_compare = dispute_Industry_rate # 24 处罚数 # 25 处罚数行业值 tds = table[3].xpath(".//td/text()")[1:] penalty_number = float(tds[0][:-1]) penalty_Industry_number = float(tds[2][:-1]) if tds[1] == "小于": penalty_Industry_number *= -1 # 本店近30天处罚数 penalty_number_nearly_30 = penalty_number # 本店近30天处罚数比行业均值 penalty_number_nearly_30_compare = penalty_Industry_number # 26-31 虚假信息 tds = tree.xpath( ".//div[@class='J_TBR_MonthInfo_Detail detail']/div[4]")[0] info = tds.xpath("string(.)") content = info.replace('\n', '').replace(' ', '') #print content fake_info = re.findall("\d+", content) fake_info = [int(item) for item in fake_info] #print fake_info #26 本店近30天被处罚总次数 total_penalty = fake_info[1] #print total_penalty #27 因出售假冒商品,被处罚次数 penalty_number_fake_good = fake_info[2] #28 因虚假交易,被处罚次数 penalty_number_false_transaction = fake_info[3] #29 因违背承诺,被处罚次数 penalty_number_breach_promise = fake_info[4] #30 因描述不符,被处罚次数 penalty_number_bad_desc = fake_info[5] #31 因恶意骚扰,被处罚次数 penalty_number_malicious_harassment = fake_info[6] """ after_sales_speed_nearly_30, after_sales_speed_nearly_30_compare, after_sale_rate_nearly_30, after_sale_rate_nearly_30_compare, dispute_rate_nearly_30, dispute_rate_nearly_30_compare, penalty_number_nearly_30, penalty_number_nearly_30_compare, penalty_number_fake_good, penalty_number_false_transaction, penalty_number_breach_promise, penalty_number_bad_desc, penalty_number_malicious_harassment, """ values.append(total_penalty) values.append(after_sales_speed_nearly_30) values.append(after_sales_speed_nearly_30_compare) values.append(after_sale_rate_nearly_30) values.append(after_sale_rate_nearly_30_compare) values.append(dispute_rate_nearly_30) values.append(dispute_rate_nearly_30_compare) values.append(penalty_number_nearly_30) values.append(penalty_number_nearly_30_compare) values.append(penalty_number_fake_good) 
values.append(penalty_number_false_transaction) values.append(penalty_number_breach_promise) values.append(penalty_number_bad_desc) values.append(penalty_number_malicious_harassment) #32-38 评分 tbs = tree.xpath(".//div[@class='box-wrap']") average_score_for_commodity = '-1' count_of_judger_for_commodity = '-1' five_score_rate_for_commodity = '-1' four_score_rate_for_commodity = '-1' three_score_rate_for_commodity = '-1' two_score_rate_for_commodity = '-1' one_score_rate_for_commodity = '-1' average_score_for_seller = '-1' count_of_judger_for_seller = '-1' five_score_rate_for_seller = '-1' four_score_rate_for_seller = '-1' three_score_rate_for_seller = '-1' two_score_rate_for_seller = '-1' one_score_rate_for_seller = '-1' average_score_for_logistics = '-1' count_of_judger_for_logistics = '-1' five_score_rate_for_logistics = '-1' four_score_rate_for_logistics = '-1' three_score_rate_for_logistics = '-1' two_score_rate_for_logistics = '-1' one_score_rate_for_logistics = '-1' if len(tbs) == 3: # 宝贝评分打星 baby = tbs[0] # 宝贝评分均分 average_score_for_commodity = exist_or_0( baby.xpath(".//div[@class='total']/em[@class='h']/text()")) # 评价总人数 count_of_judger_for_commodity = exist_or_0( baby.xpath(".//div[@class='total']/span/text()")) # 五分好评人数占比 five_score_rate_for_commodity = exist_or_0( baby.xpath(".//div[@class='count count5']/em/text()")) # 四分好评人数占比 four_score_rate_for_commodity = exist_or_0( baby.xpath(".//div[@class='count count4']/em/text()")) # 三分好评人数占比 three_score_rate_for_commodity = exist_or_0( baby.xpath(".//div[@class='count count3']/em/text()")) # 二分好评人数占比 two_score_rate_for_commodity = exist_or_0( baby.xpath(".//div[@class='count count2']/em/text()")) # 一分好评人数占比 one_score_rate_for_commodity = exist_or_0( baby.xpath(".//div[@class='count count1']/em/text()")) # 服务态度评分打星 attitude = tbs[1] # 服务态度均分 average_score_for_seller = exist_or_0( attitude.xpath(".//div[@class='total']/em[@class='h']/text()")) # 评价总人数 count_of_judger_for_seller = exist_or_0( 
attitude.xpath(".//div[@class='total']/span/text()")) # 五分好评人数占比 five_score_rate_for_seller = exist_or_0( attitude.xpath(".//div[@class='count count5']/em/text()")) # 四分好评人数占比 four_score_rate_for_seller = exist_or_0( attitude.xpath(".//div[@class='count count4']/em/text()")) # 三分好评人数占比 three_score_rate_for_seller = exist_or_0( attitude.xpath(".//div[@class='count count3']/em/text()")) # 二分好评人数占比 two_score_rate_for_seller = exist_or_0( attitude.xpath(".//div[@class='count count2']/em/text()")) # 一分好评人数占比 one_score_rate_for_seller = exist_or_0( attitude.xpath(".//div[@class='count count1']/em/text()")) # 物流评分打星 logistic = tbs[2] # 物流评分均分 average_score_for_logistics = exist_or_0( logistic.xpath(".//div[@class='total']/em[@class='h']/text()")) # 评价总人数 count_of_judger_for_logistics = exist_or_0( logistic.xpath(".//div[@class='total']/span/text()")) # 五分好评人数占比 five_score_rate_for_logistics = exist_or_0( logistic.xpath(".//div[@class='count count5']/em/text()")) # 四分好评人数占比 four_score_rate_for_logistics = exist_or_0( logistic.xpath(".//div[@class='count count4']/em/text()")) # 三分好评人数占比 three_score_rate_for_logistics = exist_or_0( logistic.xpath(".//div[@class='count count3']/em/text()")) # 二分好评人数占比 two_score_rate_for_logistics = exist_or_0( logistic.xpath(".//div[@class='count count2']/em/text()")) # 一分好评人数占比 one_score_rate_for_logistics = exist_or_0( logistic.xpath(".//div[@class='count count1']/em/text()")) #print "count_of_judger_for_commodity:",count_of_judger_for_commodity values.append(average_score_for_commodity) values.append(count_of_judger_for_commodity) values.append(five_score_rate_for_commodity) values.append(four_score_rate_for_commodity) values.append(three_score_rate_for_commodity) values.append(two_score_rate_for_commodity) values.append(one_score_rate_for_commodity) values.append(average_score_for_seller) values.append(count_of_judger_for_seller) values.append(five_score_rate_for_seller) values.append(four_score_rate_for_seller) 
values.append(three_score_rate_for_seller) values.append(two_score_rate_for_seller) values.append(one_score_rate_for_seller) values.append(average_score_for_logistics) values.append(count_of_judger_for_logistics) values.append(five_score_rate_for_logistics) values.append(four_score_rate_for_logistics) values.append(three_score_rate_for_logistics) values.append(two_score_rate_for_logistics) values.append(one_score_rate_for_logistics) #print len(values) values.append(update_time) sql = """ insert ignore into seller_info \ ( \ shop_id,alipay_Authentication,main_products,Location,\ seller_credit,buyer_credit,seller_bond,commodity_score,\ seller_attitude_score,logistics_score,commodity_score_compare,seller_attitude_score_compare,\ logistics_score_compare,positive_comment_week,moderate_comment_week,negative_comment_week,\ core_positive_comment_week,core_moderate_comment_week,core_negative_comment_week,\ non_core_positive_comment_week,non_core_moderate_comment_week,non_core_negative_comment_week,\ positive_comment_month,moderate_comment_month,negative_comment_month,\ core_positive_comment_month,core_moderate_comment_month,core_negative_comment_month,\ non_core_positive_comment_month,non_core_moderate_comment_month,non_core_negative_comment_month,\ positive_comment_half_year,moderate_comment_half_year,negative_comment_half_year,\ core_positive_comment_half_year,core_moderate_comment_half_year,core_negative_comment_half_year,\ non_core_positive_comment_half_year,non_core_moderate_comment_half_year,non_core_negative_comment_half_year,\ positive_comment_before_half_year,moderate_comment_before_half_year,negative_comment_before_half_year,\ total_penalty,after_sales_speed_nearly_30,after_sales_speed_nearly_30_compare,\ after_sale_rate_nearly_30,after_sale_rate_nearly_30_compare,\ dispute_rate_nearly_30,dispute_rate_nearly_30_compare,\ penalty_number_nearly_30,penalty_number_nearly_30_compare,\ 
penalty_number_fake_good,penalty_number_false_transaction,penalty_number_breach_promise,penalty_number_bad_desc,penalty_number_malicious_harassment,\ average_score_for_commodity,count_of_judger_for_commodity,\ five_score_rate_for_commodity,four_score_rate_for_commodity,three_score_rate_for_commodity,two_score_rate_for_commodity,one_score_rate_for_commodity,\ average_score_for_seller,count_of_judger_for_seller,\ five_score_rate_for_seller,four_score_rate_for_seller,three_score_rate_for_seller,two_score_rate_for_seller,one_score_rate_for_seller,\ average_score_for_logistics,count_of_judger_for_logistics,\ five_score_rate_for_logistics,four_score_rate_for_logistics,three_score_rate_for_logistics,two_score_rate_for_logistics,one_score_rate_for_logistics,\ update_time \ )\ values\ (\ '%s','%s','%s','%s',\ '%s','%s','%s','%s',\ '%s','%s','%s','%s',\ '%s','%s','%s','%s',\ '%s','%s',%s,'%s',\ '%s','%s','%s',\ '%s','%s','%s',\ '%s','%s','%s',\ '%s','%s','%s',\ '%s','%s','%s',\ '%s','%s','%s',\ '%s','%s','%s',\ '%s','%s','%s',\ '%s','%s',\ '%s','%s',\ '%s','%s',\ '%s','%s',\ '%s','%s','%s','%s','%s',\ '%s','%s',\ '%s','%s','%s','%s','%s',\ '%s','%s',\ '%s','%s','%s','%s','%s',\ '%s','%s',\ '%s','%s','%s','%s','%s',\ '%s'\ ) """ % tuple(values) try: x.ExecNonQuery(sql) except Exception as ex: print(ex) print(sql) return False x.EndSql() return "ok" except Exception as ex: print(ex) save_path = os.path.join("re_jiexi", "%s_rating" % (shop_id)) f = codecs.open(save_path, "w+", encoding="utf-8") f.write(html) f.close() x.EndSql() return False
def good_jiexi(shop_ID, shop_id, content, page, update_time, thread_name):
    """Parse one page of a shop's product listing and record every product.

    Handles both listing layouts seen in the wild: ``<dl class="...item...">``
    entries and ``<div class="item">`` entries.  Each product id found in a
    detail link is inserted (``insert ignore``) into ``product_list`` with
    its sales count.

    shop_ID     -- external shop identifier; only used to name the dump file
    shop_id     -- internal shop id stored with each product row
    content     -- raw HTML of the listing page
    page        -- page number; only used to name the dump file
    update_time -- crawl timestamp stored with each row
    thread_name -- unused; kept for interface compatibility with callers

    Returns "ok" on success.  On failure the raw HTML is dumped to
    re_jiexi/<shop_ID>_<page>.txt for post-mortem and False is returned.
    """
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        tree = etree.HTML(content)

        def _product_id(anchor):
            """Extract the numeric product id from a detail link, or None."""
            found = re.findall(r"id=(\d+)", anchor.get("href"))
            return found[0] if found else None

        def _insert(product_id, sale_num):
            """Best-effort insert of one product row; failures are logged."""
            # NOTE(review): values are interpolated straight into SQL; safe
            # only because ids/sale counts come from our own regex extraction.
            # A parameterized ExecNonQuery would be more robust.
            sql = ("insert ignore into product_list"
                   "(shop_id,product_id,total_sales_volume,update_time)"
                   " values('%s','%s','%s','%s')"
                   % (shop_id, product_id, sale_num, update_time))
            try:
                x.ExecNonQuery(sql)
            except Exception as ex:
                print(ex)
                print(sql)
                print("=" * 20)

        # Layout 1: <dl class="...item..."> entries.
        for item in tree.xpath(".//dl[contains(@class,'item')]"):
            detail_a = item.xpath(".//dd[@class='detail']/a")
            if not detail_a:
                continue
            pid = _product_id(detail_a[0])
            if pid is None:
                continue
            sale_num = item.xpath(".//*[@class='sale-num']/text()")
            sale_num = sale_num[0] if sale_num else -1
            _insert(pid, sale_num)

        # Layout 2: <div class="item"> entries.
        for item in tree.xpath(".//div[@class='item']"):
            sale_texts = item.xpath(
                ".//*[@class='sale-num']//text()"
                "|.//*[@class='sales-amount']//text()")
            # BUGFIX: the original ran re.findall on the int sentinel -1 when
            # no sale-num node existed, raising TypeError and aborting the
            # whole page.  Only regex-extract when we actually got text.
            if sale_texts:
                digits = re.findall(r"\d+", sale_texts[0])
                sale_num = digits[0] if digits else -1
            else:
                sale_num = -1
            detail_a = item.xpath(".//div[@class='desc']/a")
            if not detail_a:
                continue
            pid = _product_id(detail_a[0])
            if pid is None:
                continue
            _insert(pid, sale_num)
        return "ok"
    except Exception as ex:
        print(ex)
        save_path = os.path.join("re_jiexi", "%s_%s.txt" % (shop_ID, page))
        f = codecs.open(save_path, "w+", encoding="utf-8")
        f.write(content)
        f.close()
        return False
    finally:
        # x stays None when Mymysql() itself failed; guard so cleanup does
        # not mask the original error with a NameError.
        if x is not None:
            x.EndSql()
def _parse_compare_cell(cell, worse):
    """Parse one "vs. industry average" rating-table cell into a signed ratio.

    cell  -- raw cell text (e.g. a percentage plus a better/worse marker).
    worse -- callable(cell) -> True when the shop performs worse than the
             industry average, in which case the ratio is negated.

    Returns 0 when the cell says on-par (u"\u6301\u5e73"), -1 when no number
    can be extracted, otherwise the percentage converted to a ratio
    (e.g. "12.5" -> 0.125, possibly negated).
    """
    if u"持平" in cell:
        return 0
    nums = re.findall(r"\d+.?\d+", cell)
    if not nums:
        # BUGFIX: keep the -1 "not found" sentinel intact; the original
        # scaled it by 0.01, storing -0.01 in the DB instead of -1.
        return -1
    ratio = float(nums[0]) * 0.01
    return -ratio if worse(cell) else ratio


def refund_jiexi(shop_id, content, update_time):
    """Parse a shop's after-sales ("refund") rating page into seller_info.

    Extracts 30-day after-sales speed/rate, dispute rate, refund speeds,
    complaint counts and 180-day attitude/speed scores from the rating
    summary table and its hover tooltips, then updates the matching
    seller_info row.

    Returns "ok" on success; on any parse/DB failure the raw HTML is dumped
    to jiexi/<shop_id>_refund_jiexi.txt and False is returned.
    """
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        tree = etree.HTML(content)
        trs = tree.xpath(".//tr[@class='J_KgRate_RefundSummary_TR']")

        # Row 0: after-sales handling speed over the last 30 days.
        tds = trs[0].xpath(".//td/text()")
        nums = re.findall(r"\d+.?\d+", tds[1])
        after_sales_speed_nearly_30 = nums[0] if nums else -1
        # Negative unless the shop is explicitly faster (u"快") than peers.
        after_sales_speed_nearly_30_compare = _parse_compare_cell(
            tds[2], lambda c: u"快" not in c)

        # Hover tooltip: refund-only vs. return-and-refund speed.
        tmp = tree.xpath(
            ".//div[@data-kg-rate-gl-hover='refundfeedback.3.6']/ul/li/text()")
        refund_speed_nearly_30 = -1
        full_refund_speed_nearly_30 = -1
        if tmp:
            nums = re.findall(u"仅退款速度 (\d+.?\d+)", tmp[0])
            if nums:
                refund_speed_nearly_30 = nums[0]
            nums = re.findall(u"退货退款速度 (\d+.?\d+)", tmp[-1])
            if nums:
                full_refund_speed_nearly_30 = nums[0]

        # Row 1: dispute rate over the last 30 days (percent -> ratio).
        tds = trs[1].xpath(".//td/text()")
        nums = re.findall(r"\d+.?\d+", tds[1])
        dispute_rate_nearly_30 = float(nums[0]) * 0.01 if nums else -1
        # Lower (u"低") dispute rate than peers is recorded as negative.
        dispute_rate_nearly_30_compare = _parse_compare_cell(
            tds[2], lambda c: u"低" in c)

        # Row 2: after-sales rate over the last 30 days (percent -> ratio).
        tds = trs[2].xpath(".//td/text()")
        nums = re.findall(r"\d+.?\d+", tds[1])
        # BUGFIX: the original scaled the -1 sentinel to -0.01 here as well.
        after_sale_rate_nearly_30 = float(nums[0]) * 0.01 if nums else -1
        after_sale_rate_nearly_30_compare = _parse_compare_cell(
            tds[2], lambda c: u"低" in c)

        # Hover tooltip: breakdown of after-sales complaint counts.
        hover = tree.xpath(
            ".//div[@data-kg-rate-gl-hover='refundfeedback.3.8']//text()")
        hover = [t for t in hover if t.strip()]
        after_sales_count_nearly_30 = re.findall(r"\d+", hover[0])[1]
        bad_goods_count_nearly_30 = re.findall(r"\d+", hover[1])[0]
        buyer_dislike_count_nearly_30 = re.findall(r"\d+", hover[2])[0]
        bad_seller_attitude_nearly_30 = re.findall(r"\d+", hover[3])[0]

        # Row 3: after-sales attitude score over the last 180 days.
        tds = trs[3].xpath(".//td/text()")
        nums = re.findall(r"\d+.?\d+", tds[1])
        aftersale_attitude_score_nearly_180 = nums[0] if nums else -1
        aftersale_attitude_score_nearly_180_compare = _parse_compare_cell(
            tds[2], lambda c: u"低" in c)

        # Row 4: after-sales speed score over the last 180 days.
        tds = trs[4].xpath(".//td/text()")
        nums = re.findall(r"\d+.?\d+", tds[1])
        after_sale_rate_nearly_180 = nums[0] if nums else -1
        after_sale_rate_nearly_180_compare = _parse_compare_cell(
            tds[2], lambda c: u"低" in c)

        # NOTE(review): values are interpolated straight into SQL; they all
        # come from our own regex extraction, but a parameterized
        # ExecNonQuery would be safer.
        sql = """ update seller_info set after_sales_speed_nearly_30='%s', after_sales_speed_nearly_30_compare='%s', refund_speed_nearly_30='%s', full_refund_speed_nearly_30='%s', dispute_rate_nearly_30='%s', dispute_rate_nearly_30_compare='%s', after_sale_rate_nearly_30='%s', after_sale_rate_nearly_30_compare='%s', after_sales_count_nearly_30='%s', bad_goods_count_nearly_30='%s', buyer_dislike_count_nearly_30='%s', bad_seller_attitude_nearly_30='%s', aftersale_attitude_score_nearly_180='%s', aftersale_attitude_score_nearly_180_compare='%s', after_sale_rate_nearly_180='%s', after_sale_rate_nearly_180_compare='%s' where shop_id = '%s' and update_time='%s' """ % (
            after_sales_speed_nearly_30, after_sales_speed_nearly_30_compare,
            refund_speed_nearly_30, full_refund_speed_nearly_30,
            dispute_rate_nearly_30, dispute_rate_nearly_30_compare,
            after_sale_rate_nearly_30, after_sale_rate_nearly_30_compare,
            after_sales_count_nearly_30, bad_goods_count_nearly_30,
            buyer_dislike_count_nearly_30, bad_seller_attitude_nearly_30,
            aftersale_attitude_score_nearly_180,
            aftersale_attitude_score_nearly_180_compare,
            after_sale_rate_nearly_180, after_sale_rate_nearly_180_compare,
            shop_id, update_time)
        x.ExecNonQuery(sql)
        # BUGFIX: every sibling *_jiexi helper returns "ok" on success; this
        # one fell through and returned None, which callers read as failure.
        return "ok"
    except Exception as ex:
        print(shop_id, "refund_jiexi", "error:", ex)
        log_path = os.path.join("jiexi", shop_id + "_" + "refund_jiexi.txt")
        f = codecs.open(log_path, "w+", encoding="utf-8")
        f.write(content)
        f.close()
        return False
    finally:
        # x stays None when Mymysql() itself failed; guard so cleanup does
        # not mask the original error with a NameError.
        if x is not None:
            x.EndSql()