def getShop(self, keyword):
    """Classify every product found on the search page for ``keyword``.

    The search results page is fetched, product ids are scraped from it with
    two regular expressions, and each product's detail page is then inspected:
    a product is flagged as the goal category (phone) only when its
    breadcrumb title passes ``self.isExistErrorKey`` and its page passes
    ``self.isExistGoalKey``. Everything else is reported as an accessory.

    NOTE(review): the original docstring said "first 30 products"; no
    explicit limit is applied here, so the count is whatever the fetched
    page contains.

    Args:
        keyword: Raw search term; it is URL-quoted before being appended to
            the configured search base URL.

    Returns:
        A tuple ``(flagList, checkResult)``. ``flagList`` holds one 0/1 flag
        per processed product (1 = goal category). ``checkResult`` maps the
        second component of each product id to ``"手机"`` or ``"配件"`` and
        is used by the report.
    """
    goalKey = "手机"
    configutil = ConfigUtil()
    URL = configutil.getSearch_SuningPrd() + "/" + urllib.parse.quote(keyword)
    response = SpiderUtil.getHtml(URL + "/")
    vonder_shops = (
        re.findall(r'<li docType="1".*id="(.*?)"', response)
        + re.findall(r'<div.*product-box basic.*id="(.*?)"', response)
    )
    flagList = []
    checkResult = {}  # consumed by the report
    for vonder_shop in vonder_shops:
        number = vonder_shop.split("-")
        if len(number) < 2:
            # A scraped id without the "<vendor>-<part>" shape cannot be
            # turned into a product URL; skip it instead of crashing the run.
            logger.warning("Skipping malformed product id: %s", vonder_shop)
            continue
        url = "https://product.suning.com/" + number[0] + "/" + number[1] + ".html"
        soup = SpiderUtil.getSoupContent(url)
        logger.info("测试商品: %s", vonder_shop)

        goalFlag = 0
        # getSoupContent yields "" when no page content is available; such
        # products stay flagged as accessories.
        if soup != "":
            breadcrumb_title = ""
            breadcrumb_title_tmp = SpiderUtil.findContentbyTagClass(
                soup, "span", "breadcrumb-title"
            )
            if breadcrumb_title_tmp:
                breadcrumb_title = breadcrumb_title_tmp[0]["title"]
            if self.isExistErrorKey(breadcrumb_title) and self.isExistGoalKey(soup):
                goalFlag = 1  # this product is a phone

        flagList.append(goalFlag)
        checkResult[number[1]] = goalKey if goalFlag == 1 else "配件"
    return flagList, checkResult
def verify_PrecisionRate(self, keywords):
    """Measure how well the top search results match each query's segments.

    For every keyword the search page is fetched, the query is segmented via
    ``self.deal_QuerySeg`` and up to five recalled products are scored with
    ``self.count_HitRat`` against their title and selling-point text (falling
    back to the text returned by ``self.check_HitBjOtherTxt`` when both score
    0). The per-product rates are averaged and turned into a status:

    * ``0 <= average <= 0.6``  -> ``"fail"`` (unless the query is a product
      code that every recalled product id matches, which counts as a pass)
    * ``0.6 < average <= 1``   -> ``"pass"``
    * anything else (``-1``)   -> ``"warn"`` (no page, no recall, or nothing
      could be scored)

    One ``SearchResult`` per keyword is appended to ``self.searchresults3``.

    Args:
        keywords: Iterable of query strings.

    Returns:
        A tuple ``(count, errorKey)``: the number of failed keywords and the
        list of those keywords (in their processed form).
    """
    # Store-name fragments that carry no meaning when matching a query.
    redundancy_seg = ("官方", "旗舰", "店", "苏宁", "自营")
    count = 0
    errorKey = []
    for keyword in keywords:
        logger.info("测试分词:%s", keyword)
        # quote() leaves '-' untouched, so it is substituted up front.
        # NOTE(review): quote() will also escape the '%' inserted here;
        # confirm the search endpoint expects that double encoding.
        keyword = keyword.replace("-", '%252d')
        RateSum = []
        # Reset per keyword so the report never reads an unbound or stale
        # segment list when the page could not be fetched.
        qurey_segList = []
        result = SearchResult()
        config = ConfigUtil()
        url = config.getSearch_SuningPrd()
        search_url = url + "/" + urllib.parse.quote(keyword) + "/"
        soup = SpiderUtil.getSoupContent(search_url)
        vonder_shops = self.get_partnumber(url, keyword)
        logger.info(search_url)

        # ---------- query segmentation and scoring ----------
        if soup != '':
            div = soup.find("div", class_="no-result-tips")
            # A recommendation tip carries two classes
            # [no-result-tips no-result-proposal]; a single class means the
            # query was rewritten, so match against the rewritten value.
            if div is not None and len(div["class"]) == 1 and div.strong is not None:
                rewritten = re.findall(r'我们为您提供"(.*?)".*的搜索结果', div.strong.text)
                if rewritten:
                    keyword = rewritten[0]

            qurey_segList = self.deal_QuerySeg(keyword)  # segments (with synonyms) of the query
            allShopNum = int(SpiderUtil.getTotalCount(soup))
            if allShopNum != 0:
                # Score the first five recalled products, or all of them
                # when fewer were recalled.
                getShopNum = min(allShopNum, 5)
                titles = SpiderUtil.getTitles(soup, getShopNum)
                auxdescriptions = SpiderUtil.getAuxdescription(soup, getShopNum)
                storenames = SpiderUtil.getStoreName(soup, getShopNum)
                for k, title in enumerate(titles):
                    storename_seg = self.get_Seg("query", storenames[k])
                    # Store name with the generic fragments removed.
                    storename = ''.join(i for i in storename_seg if i not in redundancy_seg)
                    if storename in keyword:
                        # The query hit the store name: drop it before segmenting.
                        # NOTE(review): this replaces the segment list for the
                        # remaining products and for the report as well;
                        # confirm the carry-over is intended.
                        qurey_segList = self.deal_QuerySeg(keyword.replace(storename, ""))

                    if qurey_segList == [] or keyword in storename:
                        # The query is exactly (part of) the store name.
                        hitrate = 1.0
                    else:
                        hitrate = max(
                            self.count_HitRat(qurey_segList, title),
                            self.count_HitRat(qurey_segList, auxdescriptions[k]),
                        )

                    # Neither title nor selling point matched: fall back to
                    # the extra product text, if an id exists for this slot.
                    if hitrate == 0 and k < len(vonder_shops):
                        BjOtherTxt = self.check_HitBjOtherTxt(vonder_shops[k])
                        hitrate = self.count_HitRat(qurey_segList, ''.join(BjOtherTxt))
                    RateSum.append(hitrate)

                if RateSum:
                    AverageHitRate = float("%.2f" % (sum(RateSum) / len(RateSum)))
                else:
                    # Nothing could be scored; report it like "no result"
                    # instead of dividing by zero.
                    AverageHitRate = -1
            else:
                titles = ["非常抱歉!没有找到与' *** ' 相关的商品。"]
                RateSum = []
                AverageHitRate = -1
        else:
            titles = ["根据相关法律法规和政策,无法显示相关的商品"]
            RateSum = []
            AverageHitRate = -1

        # ---------- report ----------
        if 0 <= AverageHitRate <= 0.6:
            wordMatch = "fail"
            if keyword.isdigit():
                # The query may be a product code: pass when every recalled
                # product id carries exactly that code.
                partnumber = [
                    shop_id.split("-")[1] if "-" in shop_id else ""
                    for shop_id in vonder_shops
                ]
                if partnumber and partnumber.count(keyword) == len(partnumber):
                    wordMatch = "pass"
                    AverageHitRate = 1
                    RateSum = [1.0] * len(partnumber)
            if wordMatch == "fail":
                count += 1
                errorKey.append(keyword)
        elif 0.6 < AverageHitRate <= 1:
            wordMatch = "pass"
        else:
            wordMatch = "warn"

        newTiles = ["<br>" + title for title in titles]  # line breaks for the report
        result.setKeyword(keyword)
        result.setWords(qurey_segList)
        result.setTitle(newTiles)
        result.setHitRate(RateSum)
        result.setAverageHitRate(AverageHitRate)
        result.setMatchStatus(wordMatch)
        self.searchresults3.append(result)
    return count, errorKey