def findsource(self):
    """Fetch the page and record the text of its navigation container.

    ``self.gethtml()`` is indexed twice: element 0 is parsed as HTML and
    element 1 is used as the request URL. The ``breadcrumbs`` div is looked
    up first; when it is absent the ``subpage_menu`` div is used instead.

    On success the line ``<url>\\t<text>`` is handed to ``saveoutputlog``,
    a "succeed" message is logged and the result flag is set to True.
    When neither div exists a "failed" message is logged and the result
    flag is set to False.
    """
    fetched = self.gethtml()
    soup = BeautifulSoup(fetched[0], "html.parser")
    container = soup.find("div", {'class': 'breadcrumbs'})
    # Request URL.
    url = fetched[1]

    if not container:
        # No breadcrumb trail: fall back to the sub-page menu.
        container = soup.find("div", {'class': 'subpage_menu'})

    if not container:
        self.setshowlog(url + ' failed')
        self.setresult(False)
        return

    text = container.get_text()
    # NOTE(review): pattern and replacement are identical as written here;
    # the pattern may originally have been an HTML entity -- confirm.
    line = url + '\t' + fetch_util.replace_some_string(text, '>', '>')
    # Save the path of the successful request.
    self.saveoutputlog(line)
    self.setshowlog(url + ' succeed')
    self.setresult(True)
def gettag(self):
    """Return the text of the ``side-tags clearfix`` div found in ``self.info``.

    The div text is passed through ``fetch_util.replace_some_string`` with
    ``"\\n"`` and ``'+'`` and then through ``fetch_util.remove_last_char``
    with ``'+'`` (presumably: newlines become '+' and a trailing '+' is
    dropped -- confirm against fetch_util).

    Returns None when ``self.info`` is falsy or the div is not found.
    """
    if not self.info:
        return None
    tag_box = self.info.find('div', {'class': 'side-tags clearfix'})
    if not tag_box:
        return None
    joined = fetch_util.replace_some_string(tag_box.get_text(), "\n", '+')
    return fetch_util.remove_last_char(joined, '+')
def getcategory(self):
    """Return the text of the ``dd.tag-box`` element found in ``self.info``.

    The element text is passed through ``fetch_util.replace_some_string``
    with ``"\\n"`` and ``'+'`` and then through
    ``fetch_util.remove_last_char`` with ``'+'`` (presumably: newlines
    become '+' and a trailing '+' is dropped -- confirm against fetch_util).

    Returns None when ``self.info`` is falsy or the element is not found.
    """
    if not self.info:
        return None
    category_box = self.info.find('dd', {'class': 'tag-box'})
    if not category_box:
        return None
    joined = fetch_util.replace_some_string(category_box.get_text(), "\n", '+')
    return fetch_util.remove_last_char(joined, '+')
def parser_type_3(html, source):
    """Build a path string from ``source`` plus the ``sku-name`` div of ``html``.

    The text of ``source`` is passed through
    ``fetch_util.replace_some_string`` with ``'\\n'`` and ``''``. If ``html``
    contains a ``div`` with class ``sku-name`` whose text is non-empty,
    ``'>'`` and that text are appended.

    Returns the resulting string.
    """
    path = fetch_util.replace_some_string(source.get_text(), '\n', '')
    sku_tag = html.find("div", {'class': 'sku-name'})
    if not sku_tag:
        return path
    sku_text = sku_tag.get_text()
    if sku_text:
        path += '>' + sku_text
    return path
def getcategory(self):
    """Return the text of the ``crumb`` div in ``self.html``, cleaned.

    The div text (or ``''`` when the div does not exist) is passed through
    ``fetch_util.replace_some_string`` with ``'\\n'`` and ``''`` and the
    result of that call is returned.
    """
    crumb = self.html.find('div', {'class': 'crumb'})
    # find() yields None when nothing matches. Check for that explicitly
    # instead of wrapping get_text() in a bare ``except:``, which would also
    # swallow unrelated errors (including KeyboardInterrupt).
    text = crumb.get_text() if crumb is not None else ''
    return fetch_util.replace_some_string(text, '\n', '')
def parser_type_2(html, source):
    """Return the item name from the ``<h1>`` inside ``div#name`` of ``source``.

    The first child of the ``<h1>`` is passed through
    ``fetch_util.replace_some_string`` with ``'>>'`` and ``'>'`` and the
    result is returned.

    Returns ``''`` when the ``div#name`` is missing, when it has no
    ``<h1>``, when the ``<h1>`` has no children, or when its first child is
    falsy. ``html`` is accepted for signature parity with the other parsers
    and is not used.
    """
    name_div = source.find("div", {'id': 'name'})
    if not name_div:
        return ''

    heading = name_div.find('h1')
    # find() returns None when there is no <h1>, and contents is an empty
    # list for an empty tag; either case used to raise instead of yielding
    # the empty result.
    if heading is None or not heading.contents:
        return ''

    item_name = heading.contents[0]
    if not item_name:
        return ''
    # Replace content in the result.
    return fetch_util.replace_some_string(item_name, '>>', '>')
def parser_type_1(html, source):
    """Build a path string from ``source`` plus the item name in ``html``.

    The text of ``source`` is passed through
    ``fetch_util.replace_some_string``. If ``html`` contains ``div#name``
    with an ``<h1>`` whose first child is truthy, ``'>'`` and that child are
    appended.

    Returns the resulting string. A missing or empty ``<h1>`` leaves the
    path without an item name instead of raising.
    """
    # NOTE(review): pattern and replacement are identical as written here;
    # the pattern may originally have been an HTML entity -- confirm.
    result = fetch_util.replace_some_string(source.get_text(), '>', '>')

    name_div = html.find("div", {'id': 'name'})
    if not name_div:
        return result

    heading = name_div.find('h1')
    # find() returns None when there is no <h1>, and contents is an empty
    # list for an empty tag; guard both before indexing.
    if heading is None or not heading.contents:
        return result

    item_name = heading.contents[0]
    if item_name:
        result += '>' + item_name
    return result
def findsource(self):
    """Fetch the page and record either its sub-page menu or its title cells.

    ``self.gethtml()`` is indexed twice: element 0 is parsed as HTML and
    element 1 is used as the request URL.

    * If a ``subpage_menu`` div exists, its text (after
      ``fetch_util.replace_some_string``) is recorded.
    * Otherwise, if a ``content_text`` div exists, the stripped text of its
      first two ``<td>`` cells is concatenated and recorded.

    On success the line ``<url>\\t<text>`` is handed to ``saveoutputlog``,
    a "succeed" message is logged and the result flag is set to True.
    When nothing usable is found (no div, no cells, or an empty title) a
    "failed" message is logged and the result flag is set to False.
    """
    fetched = self.gethtml()
    soup = BeautifulSoup(fetched[0], "html.parser")
    menu = soup.find("div", {'class': 'subpage_menu'})
    # Request URL.
    requesturl = fetched[1]

    if menu:
        # NOTE(review): pattern and replacement are identical as written
        # here; the pattern may originally have been an HTML entity.
        text = fetch_util.replace_some_string(menu.get_text(), '>', '>')
    else:
        content = soup.find("div", {'class': 'content_text'})
        cells = content.findAll('td') if content else []
        # Previously guarded by ``len(tds) >= 1`` while indexing tds[1],
        # which raised IndexError for a single cell. Joining the first
        # up-to-two cells gives the same title for two or more cells and
        # handles the one-cell page without crashing.
        text = ''.join(cell.get_text().strip() for cell in cells[:2])
        if not text:
            self.setshowlog(requesturl + ' failed')
            self.setresult(False)
            return

    # Save the path of the successful request.
    self.saveoutputlog(requesturl + '\t' + text)
    self.setshowlog(requesturl + ' succeed')
    self.setresult(True)
def match_type2(self, requesturl, sources):
    """Record the item name found in the ``<h1>`` of ``div#name``.

    The first child of the ``<h1>`` is passed through
    ``fetch_util.replace_some_string`` with ``'>>'`` and ``'>'``; the
    value returned by ``get_price(requesturl)`` is appended and the result
    is handed to ``save_result``. A "succeed" message is then logged and
    the result flag set to True.

    Nothing is recorded (and no flag is set) when the ``div#name`` is
    missing, when it has no ``<h1>``, when the ``<h1>`` has no children, or
    when its first child is falsy.
    """
    name_div = sources.find("div", {'id': 'name'})
    if not name_div:
        return

    heading = name_div.find('h1')
    # find() returns None when there is no <h1>, and contents is an empty
    # list for an empty tag; both used to raise here. Treat them like the
    # missing-div case.
    if heading is None or not heading.contents:
        return

    itemname = heading.contents[0]
    if not itemname:
        return

    # Replace content in the result.
    currentresult = fetch_util.replace_some_string(itemname, '>>', '>')
    price = get_price(requesturl)
    # Save the path of the successful request.
    self.save_result(currentresult + price)
    self.set_print_log(requesturl + ' succeed')
    self.set_result(True)
def match_type3(self, html, requesturl, sources):
    """Record ``<url>\\t<path>[><sku name]<price>`` for a type-3 page.

    The path is the text of ``sources`` passed through
    ``fetch_util.replace_some_string`` with ``'\\n'`` and ``''``. If
    ``html`` has a ``div`` with class ``sku-name`` and non-empty text,
    ``'>'`` and that text are appended. The value returned by
    ``get_price(requesturl)`` is appended last and the line is handed to
    ``save_result``; a "succeed" message is logged and the result flag is
    set to True.
    """
    raw_path = sources.get_text()
    line = requesturl + '\t' + fetch_util.replace_some_string(raw_path, '\n', '')

    sku_tag = html.find("div", {'class': 'sku-name'})
    if sku_tag:
        sku_text = sku_tag.get_text()
        if sku_text:
            line += '>' + sku_text

    price = get_price(requesturl)
    # Save the path of the successful request.
    self.save_result(line + price)
    self.set_print_log(requesturl + ' succeed')
    self.set_result(True)
def match_type1(self, html, requesturl, sources):
    """Record ``<url>\\t<path>[><item name]<price>`` for a type-1 page.

    The path is the text of ``sources`` passed through
    ``fetch_util.replace_some_string``. If ``html`` contains ``div#name``
    with an ``<h1>`` whose first child is truthy, ``'>'`` and that child are
    appended. The value returned by ``get_price(requesturl)`` is appended
    last; the line is handed to ``save_result`` and to
    ``fetch_util.print_log``, a "succeed" message is logged and the result
    flag is set to True.

    A missing or empty ``<h1>`` leaves the line without an item name
    instead of raising.
    """
    # NOTE(review): pattern and replacement are identical as written here;
    # the pattern may originally have been an HTML entity -- confirm.
    currentresult = requesturl + '\t' + fetch_util.replace_some_string(
        sources.get_text(), '>', '>')

    name_div = html.find("div", {'id': 'name'})
    if name_div:
        heading = name_div.find('h1')
        # find() returns None when there is no <h1>, and contents is an
        # empty list for an empty tag; guard both before indexing.
        if heading is not None and heading.contents:
            itemname = heading.contents[0]
            if itemname:
                currentresult += '>' + itemname

    # Save the path of the successful request.
    price = get_price(requesturl)
    record = currentresult + price
    self.save_result(record)
    fetch_util.print_log(record)
    self.set_print_log(requesturl + ' succeed')
    self.set_result(True)