def parse_item(self, response: Response, selector: Selector):
    try:
        param_names = selector.xpath('.//dt/text()').extract()
        param_values = selector.xpath('.//dd/text()').extract()
        # pair up <dt>/<dd> texts, skipping values that are just layout newlines
        params = {n.strip(': '): v.strip(' ')
                  for n, v in zip(param_names, param_values) if '\n' not in v}
        loader = ProductLoader(item=ProductItem(), response=response, selector=selector)
        loader.add_xpath('name', './/*[@itemprop="name"]/text()')
        loader.add_xpath('category', '//*[@id="main"]/h1/text()')
        loader.add_xpath('link', './/*[@itemprop="name"]/../@href')
        loader.add_xpath('price', './/*[@class="price" or @class="price action_special"]/text()')
        loader.add_xpath('price_old', './/*[@class="price-old"]/text()')
        loader.add_xpath('rating', './/*[@class="oh-rating"]/text()')
        loader.add_value('params', params)
        loader.add_value('where_found', response.request.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('parse_datetime', datetime.datetime.now())
        return loader.load_item()
    except Exception as e:
        # log instead of silently printing, so failures show up in the crawl log
        self.logger.error('parse_item failed: %s', e)
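# parse_item above assumes ProductItem and ProductLoader definitions that are
# not part of this corpus. A minimal sketch of what they might look like
# (fields inferred from the loader calls; the processors are illustrative
# guesses, not the original code):
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose  # itemloaders.processors on newer Scrapy

class ProductItem(scrapy.Item):
    name = scrapy.Field()
    category = scrapy.Field()
    link = scrapy.Field()
    price = scrapy.Field()
    price_old = scrapy.Field()
    rating = scrapy.Field()
    params = scrapy.Field()
    where_found = scrapy.Field()
    project = scrapy.Field()
    spider = scrapy.Field()
    server = scrapy.Field()
    parse_datetime = scrapy.Field()

class ProductLoader(ItemLoader):
    default_output_processor = TakeFirst()
    price_in = MapCompose(str.strip)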
def parse_showdesk_members_treat(self, resp):
    hxs = Selector(resp)
    next_page_nodes = hxs.xpath('//a[@class="next_page"]')
    meta = resp.meta
    if next_page_nodes and meta['page'] == 1:
        next_page_node = next_page_nodes[0]
        total_page = next_page_node.xpath('./parent::li/preceding-sibling::li')[-1].xpath('a/child::text()').extract()[0].strip()
        for i in xrange(2, int(total_page) + 1):
            new_meta = dict(meta)
            new_meta['page'] = i
            self.log('%s yield member list page %d' % (self.name, i))
            yield FormRequest(url="http://vip6.sentree.com.cn/shair/timesItem!initTreat.action",
                              formdata={'page.currNum': str(i),
                                        'page.rpp': '30',
                                        'r': str(meta['r']),
                                        'set': 'manage'},
                              callback=self.parse_showdesk_members_treat,
                              meta=new_meta)
    treat_info_tabs = hxs.xpath('//div[@class="page_main"]//div[@class="table-responsive"]/table')
    if not treat_info_tabs:
        yield None
        return
    treat_info_tab = treat_info_tabs[0]
    ths = str_list_strip_replace(treat_info_tab.xpath('./thead/tr/th/child::text()').extract(),
                                 [' ', '\t', '\n', ' '])
    info_nodes = treat_info_tab.xpath('./tbody/tr')
    for i_n in info_nodes:
        infos = []
        info_tds = i_n.xpath('./td')
        for i_t in info_tds:
            info = ''.join(str_list_strip_replace(i_t.xpath('.//child::text()').extract(),
                                                  [' ', '\t', '\n', ' ']))
            infos.append(info)
        item = SentreeMemberTreatItem()
        item['hs'] = ths
        item['vals'] = infos
        yield item
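# None of the Sentree snippets in this corpus define the str_list_strip /
# str_list_strip_replace helpers they call. A minimal sketch consistent with
# how they are used below (an assumption, not the original implementation):
def str_list_strip(strs):
    # strip each string and drop entries that are empty afterwards
    return [s.strip() for s in strs if s and s.strip()]

def str_list_strip_replace(strs, removes):
    # delete every substring in `removes` from each string, dropping empties
    out = []
    for s in strs:
        for r in removes:
            s = s.replace(r, '')
        if s:
            out.append(s)
    return out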
def parse_showdesk_membercards(self, resp):
    hxs = Selector(resp)
    headers = hxs.xpath('//form[@id="cardTypeForm"]//table/thead/tr/th/child::text()').extract()
    if not headers:
        self.log('%s can not find table headers.' % self.name, level=log.ERROR)
        yield None
        return
    employee_nodes = hxs.xpath('//form[@id="cardTypeForm"]//table/tbody/tr')
    if not employee_nodes:
        self.log('%s can not find member card info' % self.name, level=log.ERROR)
        yield None
        return
    for e_n in employee_nodes:
        info_nodes = e_n.xpath('td')
        info = OrderedDict()
        for idx, i_n in enumerate(info_nodes):
            if idx == 0 or idx == len(info_nodes) - 2:
                continue
            if idx == len(info_nodes) - 1:
                info[headers[idx]] = ' | '.join(
                    str_list_strip_replace(i_n.xpath('./child::text()').extract(),
                                           [' ', '\t', '\n', ' ']))
                continue
            sep = ' | '
            if idx == 3:
                sep = ''
            info[headers[idx]] = sep.join(
                str_list_strip_replace(str_list_strip(i_n.xpath('descendant::text()').extract()),
                                       [' ', '\t', '\n', ' ']))
        item = SentreeMemberCardItem()
        item['info'] = info
        # items.append(info)
        yield item
def parse(self, response):
    sel = Selector(response)
    locations = Locations()
    locations["restaurantIDs"] = sel.xpath('//a/@data-id').extract()
    locations["coordinates"] = {}
    locations["coordinates"]["longitude"] = self.coordinatesURLTranslator.getLongitude(response.url)
    locations["coordinates"]["latitude"] = self.coordinatesURLTranslator.getLatitude(response.url)
    return locations
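# coordinatesURLTranslator is used here and in the snippets below but never
# defined in this corpus. One plausible shape, assuming the coordinates ride
# along as URL query parameters (the 'lat'/'lon' parameter names are guesses):
from urllib.parse import urlparse, parse_qs  # urlparse module on Python 2

class CoordinatesURLTranslator(object):
    def getLatitude(self, url):
        return parse_qs(urlparse(url).query).get('lat', [''])[0]

    def getLongitude(self, url):
        return parse_qs(urlparse(url).query).get('lon', [''])[0]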
def parse(self, response):
    sel = Selector(response)
    restaurants = sel.xpath('//a[contains(@id, "establecimiento")]')
    for restaurant in restaurants:
        locationCsv = LocationCsv()
        locationCsv["id_restaurante"] = restaurant.css("a::attr(data-id)").extract()
        locationCsv["nombre_restaurante"] = restaurant.css("a .result-info h4::text").extract()
        locationCsv["latitud"] = self.coordinatesURLTranslator.getLatitude(response.url)
        locationCsv["longitud"] = self.coordinatesURLTranslator.getLongitude(response.url)
        yield locationCsv
def parse_member_overdraft(self, resp):
    hxs = Selector(resp)
    mem_item = resp.meta['item']
    overdraft_click_nodes = hxs.xpath('//ul[@class="tab-nav"]//a[@href="#tab7"]/@onclick')
    if not overdraft_click_nodes:
        mem_item['overdraft'] = '0.0'
        yield mem_item
    else:
        click_str = overdraft_click_nodes.extract()[0]
        ids = re.findall(r'\d+', click_str)
        yield FormRequest(url='http://vip6.sentree.com.cn/shair/memberArchives!debtlist.action',
                          formdata={'id': ids[0], 'shopid': ids[1]},
                          callback=self.parse_member_overdraft2,
                          meta=resp.meta)
def parse_showdesk_members2(self, resp):
    hxs = Selector(resp)
    next_page_nodes = hxs.xpath('//a[@class="next_page"]')
    meta = resp.meta
    if next_page_nodes and meta['page'] == 1:
        next_page_node = next_page_nodes[0]
        total_page = next_page_node.xpath('./parent::li/preceding-sibling::li')[-1].xpath('a/child::text()').extract()[0].strip()
        for i in xrange(2, int(total_page) + 1):
            new_meta = dict(meta)
            new_meta['page'] = i
            self.log('%s yield member list page %d' % (self.name, i))
            yield FormRequest(url="http://vip6.sentree.com.cn/shair/memberInfo!memberlist.action",
                              formdata={'page.currNum': str(i),
                                        'page.rpp': '30',
                                        'r': str(meta['r']),
                                        'set': 'manage'},
                              callback=self.parse_showdesk_members2,
                              meta=new_meta)
    member_nodes = hxs.xpath('//form[@id="delForm"]//table/tbody/tr')
    if member_nodes:
        for m_n in member_nodes:
            member_tds = m_n.xpath('td')
            info_query_str = None
            try:
                phone = member_tds[1].xpath('a/child::text()').extract()[0].replace(' ', '').strip()
                name = member_tds[2].xpath('span/child::text()').extract()[0].replace(' ', '').strip()
                card_no = member_tds[6].xpath('table/tr/td[1]/a/child::text()').extract()[0].replace(' ', '').strip()
                info_query_str = member_tds[6].xpath('table/tr/td[1]/a/@onclick').extract()[0]
                # keep only the query string between '?' and the closing quote
                info_query_str = info_query_str[info_query_str.find('?') + 1:]
                info_query_str = info_query_str[:info_query_str.find("'")]
                card_name = member_tds[6].xpath('table/tr/td[2]/child::text()').extract()[0].replace(' ', '').strip()
                card_type = member_tds[6].xpath('table/tr/td[3]//child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
                discont = member_tds[6].xpath('table/tr/td[4]/child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
                timeout = member_tds[6].xpath('table/tr/td[9]/child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
                overage = str_list_strip_replace(member_tds[6].xpath('table/tr/td[7]//child::text()').extract(),
                                                 [' ', ' ', '\t', '\n'])
            except Exception:
                self.log(traceback.format_exc())
                continue
            mem_item = SentreeMembersSimpleItem()
            mem_item[u'phone'] = phone
            mem_item[u'name'] = name
            mem_item[u'card_no'] = card_no
            mem_item[u'card_name'] = card_name
            mem_item[u'card_type'] = card_type
            mem_item[u'discont'] = discont
            mem_item[u'timeout'] = timeout
            mem_item[u'overage'] = overage
            if info_query_str:
                new_meta = dict(meta)
                new_meta['item'] = mem_item
                # the appended timestamp acts as a cache-buster
                yield Request(url='http://vip6.sentree.com.cn/shair/memberArchives!editMember.action?%s%d' % (info_query_str, time.time()),
                              callback=self.parse_member_overdraft,
                              meta=new_meta)
            else:
                mem_item['overdraft'] = '0.0'
                yield mem_item
def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath('//ul[@class="directory-url"]/li')
    items = []
    for site in sites:
        item = Dmozitem()
        item['title'] = site.xpath('a/text()').extract()
        item['link'] = site.xpath('a/@href').extract()
        item['desc'] = site.xpath('text()').extract()
        items.append(item)
    return items
def parse(self, response):
    topic_id = response.meta['topic_id']
    sel = Selector(text=response.body, type="html")
    topic_lists = sel.xpath('//div[re:test(@class,"result.*?")]')
    for topic in topic_lists:
        topic_item = Topic_Item()
        temp_sel = Selector(text=topic.extract())
        title = temp_sel.xpath('//h3[re:test(@class,"subheader-3")]/a/text()')[0].extract()
        print title
        topic_item['topic_title'] = title
        board = temp_sel.xpath('//div[re:test(@class,"meta")]/a/text()')[0].extract()
        print board
        poster = temp_sel.xpath('//div[re:test(@class,"meta")]/a/text()')[1].extract().strip()
        print poster
        topic_item['topic_author'] = poster
        main_con = temp_sel.xpath('//div[re:test(@class,"meta")]')[0].extract().strip()
        post_time_ = re.findall(self.post_time_pa, main_con)[0]
        post_time_str = '20' + post_time_[0] + ' ' + post_time_[2] + ':00'
        post_time = time.strptime(post_time_str, '%Y-%m-%d %H:%M:%S')
        # print post_time
        # the AM/PM marker is detected via the unicode escapes of u'\u4e0b\u5348'
        # (PM) / u'\u4e0a\u5348' (AM) in the repr of the matched text
        if '4e0b' in post_time_[1].__repr__():
            print u'下午'  # PM: shift the parsed time forward by 12 hours
            post_time = self.time_to_datetime(post_time) + datetime.timedelta(hours=12)
        elif '4e0a' in post_time_[1].__repr__():
            print u'上午'  # AM: keep the formatted date-time string as-is
            post_time = post_time_str
        print post_time
        topic_item['topic_post_time'] = post_time
        content = temp_sel.xpath('//div[re:test(@class,"content")]/text()')[0].extract().strip()
        print content
        topic_item['topic_content'] = content
        url = temp_sel.xpath('//h3[re:test(@class,"subheader-3")]/a/@href').extract()[0]
        url = 'http://www.battlenet.com.cn' + url
        print url
        topic_item['topic_url'] = url
        reply_num = temp_sel.xpath('//h3[re:test(@class,"subheader-3")]/span[re:test(@class,"small")]/text()').extract()
        reply_num = reply_num[-1]
        reply_num = re.findall(self.dig_pattern, reply_num)[0]
        print reply_num
        topic_item['topic_reply'] = reply_num
        print '+++++++++++++++++++++++++++++++++++++'
        yield scrapy.Request(url, callback=self.parse_torrent, meta={'topic_item': topic_item})
def parse(self, response):
    topic_id = response.meta['topic_id']
    sel = Selector(text=response.body, type="html")
    topic_lists = sel.xpath('//div[re:test(@class,"result f s3")]')
    for topic in topic_lists:
        topic_item = Topic_Item()
        temp_sel = Selector(text=topic.extract())
        title = temp_sel.xpath('//h3[re:test(@class,"c-title")]/a/text()').extract()[0].strip()
        # print title
        topic_item['topic_title'] = title
        content = temp_sel.xpath('//div[re:test(@class,"c-abstract")]/text()').extract()[0].strip()
        print content
        topic_item['topic_content'] = content
        post_time = temp_sel.xpath('//div[re:test(@class,"c-summary-1")]/span')[2].extract()
        post_time = re.findall(self.post_pa, post_time)[0] + ' 00:00:00'
        print post_time
        topic_item['topic_post_time'] = post_time
        author = temp_sel.xpath('//div[re:test(@class,"c-summary-1")]/span/text()')[1].extract()
        print author
        topic_item['topic_author'] = author
        url = temp_sel.xpath('//h3[re:test(@class,"c-title")]/a/@href').extract()[0]
        print url
        topic_item['topic_url'] = url
        topic_item['topic_reply'] = 0
        print '+++++++++++++++++++++++++++++++++++++'
        yield scrapy.Request(url, callback=self.parse_torrent, meta={'topic_item': topic_item})
def parse_item(self, response):
    item = BuscapeItem()
    sel = Selector(response)
    title = sel.xpath('//h1[@class="name"]/text()').extract()[0]
    item["title"] = title
    item["url"] = response.url
    attributes = []
    pares = sel.xpath('//*[@class="product-details"]/ul/li')
    for par in pares:
        key = par.xpath('span[@class="name"]/text()').extract()
        value = par.xpath('span[@class="value"]/text()').extract()
        attributes.append({"key": key[0], "value": value})
    item["attributes"] = attributes
    return item
def __check_detail_title_valid(detail_title: Selector) -> bool:
    if not isinstance(detail_title, Selector):
        raise TypeError("detail_title must be a Selector")
    loc_title = detail_title.extract()
    # custom validation rules would go here
    return True
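# A hypothetical call, using a standalone parsel Selector:
from parsel import Selector
title_sel = Selector(text='<h1>Widget</h1>').xpath('//h1')[0]
assert __check_detail_title_valid(title_sel) is True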
def extract_text(row_sel: Selector, query: str) -> List[str]:
    # strip surrounding whitespace from each matched string; a comprehension
    # over an empty list is already a no-op, so no length check is needed
    return [v.strip() for v in row_sel.css(query).getall()]
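# Example with a standalone parsel Selector (works outside a running spider):
from parsel import Selector
row = Selector(text='<table><tr><td> a </td><td>b</td></tr></table>').css('tr')[0]
print(extract_text(row, 'td::text'))  # ['a', 'b']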
def parse(self, response):
    topic_kws = response.meta['topic_kws']
    all_content = BeautifulSoup(response.body, 'html5lib')
    topic_lists = all_content.find_all('li', class_="pbw")
    for topic in topic_lists:
        topic_item = Topic_Item()
        topic_item['topic_db_message'] = topic_kws
        # re-parse the BeautifulSoup node with parsel so XPath can be used on it
        temp_sel = Selector(text=topic.prettify(), type="html")
        title = topic.find_all("a")[0].get_text()
        # print title
        topic_item['topic_title'] = title
        url = topic.find_all("a")[0].get('href')
        print url
        topic_item['topic_url'] = url
        topic_content = topic.find_all("p")[1].get_text()
        print topic_content
        topic_item['topic_content'] = topic_content
        post_time = temp_sel.xpath('//p/span/text()')[0].extract().strip()
        print post_time
        topic_item['topic_post_time'] = post_time
        author = temp_sel.xpath('//p/span/a/text()')[0].extract().strip()
        # print author
        topic_item['topic_author'] = author
        reply_msg = topic.find_all('p', class_='xg1')[0]
        msg = re.findall(self.reply_pattern, reply_msg.get_text())[0]
        print msg
        reply_num = msg[0]
        read_num = msg[1]
        topic_item['topic_reply'] = reply_num
        homepage = temp_sel.xpath('//p/span/a/@href').extract()[0]
        user_id = re.findall(self.userid_pa, homepage)[0]
        print user_id
        topic_item['poster_id'] = user_id
        topic_item['homepage'] = homepage
        print '+++++++++++++++++++++++++++++++++++++++++'
        yield scrapy.Request(url, callback=self.parse_torrent, meta={'topic_item': topic_item})
def parse_quick_facts(self, selector: Selector, quest: Quest):
    """
    Parses the quick facts section on a Wowhead quest page.

    :param selector: selector of the quick facts section
    :param quest: quest item to store gathered info in
    """
    result = selector.re(r"Start:\s(.*</a>)")
    if result:
        element = Selector(text=result[0])
        quest["npc"] = element.xpath("//a/text()").get()
        quest["npc_link"] = self.base_url + element.xpath("//a/@href").get()
    else:
        quest["npc"] = "Unknown"
        quest["npc_link"] = "Unknown"
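# Selector.re() returns plain strings, which is why the match above is wrapped
# in a fresh Selector before being queried with XPath. Standalone illustration:
from parsel import Selector
html = 'Start: <a href="/npc=123">Some NPC</a>'
frag = Selector(text=html).re(r"Start:\s(.*</a>)")[0]
print(Selector(text=frag).xpath("//a/text()").get())  # Some NPC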
def parse_item(self, response):
    index = response.meta['index']
    if index == 1:
        index_count = response.selector.xpath('//*[@id="m-page"]/span/text()').extract()
        index_count = [x.strip() for x in index_count if x.strip()]
        index, count = [int(x) for x in index_count[0].split('/')]
        for i in range(index + 1, count + 1):
            yield Request(url=self.get_gn_url(i), headers=TONGHUASHUN_GN_HEADER,
                          meta={'index': i}, callback=self.parse_item)
    trs = response.xpath('/html/body/table/tbody//tr').extract()
    try:
        for tr in trs:
            start_date = Selector(text=tr).xpath('//td[1]/text()').extract_first()
            name = Selector(text=tr).xpath('//td[2]/a/text()').extract_first()
            link = Selector(text=tr).xpath('//td[2]/a/@href').extract_first()
            news_title = Selector(text=tr).xpath('//td[3]/a/text()').extract_first()
            news_link = Selector(text=tr).xpath('//td[3]/a/@href').extract_first()
            leadings = [x.rsplit('/')[-2] for x in Selector(text=tr).xpath('//td[4]/a/@href').extract()]
            count = Selector(text=tr).xpath('//td[5]/text()').extract()
            yield SectorItem(id='{}_{}_{}'.format('10jqka', 'gn', name),
                             start_date=start_date, name=name, link=link,
                             news_title=news_title, news_link=news_link,
                             leadings=leadings, count=count,
                             producer='10jqka', type='gn')
    except Exception as e:
        self.logger.error('error parse 10jqka gainian sector url:{} {}'.format(response.url, e))
class RestaurantIDsGetter(object):

    def __init__(self, response):
        self.sel = Selector(response)

    def getID(self, url):
        xpathQuery = '//a[contains(@href, "' + url + '")]/@data-id'
        queryResults = self.sel.xpath(xpathQuery).extract()
        if len(queryResults) == 0:
            return "NoID"
        return queryResults[0]
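# Hypothetical usage against a fetched results page (the path is made up):
getter = RestaurantIDsGetter(response)
restaurant_id = getter.getID('/restaurante/some-place')  # "NoID" when absent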
def detail_parse(self, response):
    page = response.meta['page']
    # fetch a fresh token, then page through the listing API with it
    token = json.loads(requests.post(self.token_url, headers=self.header).text, strict=False).get('d', '')
    data = copy.deepcopy(self.data)
    data.update({'Token': token, 'PageIndex': str(page)})
    list_content = json.loads(requests.post(self.list_url, headers=self.header, json=data).text,
                              strict=False).get('d', '')
    cont_list = json.loads(list_content).get('Table', [])
    for cont in cont_list:
        result_dict = {}
        info_id = cont.get('InfoID', '')
        post_data = {
            "Token": json.loads(requests.post(self.token_url, headers=self.header).text,
                                strict=False).get('d', ''),
            "PageIndex": "1",
            "PageSize": "1",
            "InfoID": info_id,
        }
        detail_content = json.loads(requests.post(self.detail_url, headers=self.header, json=post_data).text,
                                    strict=False).get('d', '')
        detail = json.loads(detail_content, strict=False).get('Table', [])[0]
        result_dict['punish_code'] = detail.get('name1', '')
        result_dict['case_name'] = detail.get('name2', '')
        result_dict['punish_category_one'] = detail.get('name3', '')
        result_dict['punish_category_two'] = detail.get('name4', '')
        result_dict['punish_type'] = detail.get('name5', '')
        result_dict['punish_basis'] = detail.get('name6', '')
        result_dict['company_name'] = detail.get('name7', '')
        result_dict['credit_code'] = detail.get('name8', '')
        result_dict['organization_code'] = detail.get('name9', '')
        result_dict['regno'] = detail.get('name10', '')
        result_dict['tax_code'] = detail.get('name11', '')
        result_dict['id_number'] = detail.get('name12', '')
        result_dict['frname'] = detail.get('name13', '')
        result_dict['punish_content'] = detail.get('name14', '')
        result_dict['public_date'] = detail.get('name15', '')
        result_dict['punish_org'] = detail.get('name16', '')
        result_dict['update'] = detail.get('infodate', '')
        # fields that arrive as HTML fragments are flattened to plain text
        for key, value in result_dict.items():
            result_dict[key] = (''.join(Selector(text=value).xpath('//p//text()').extract()).strip()
                                if '<p style' in value else value)
        yield self.handle_result(response, result_dict, info_id)
def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath('//div[@class="mainleft"]')
    itemlist = []
    for site in sites:
        item = CnkispiderItem()
        title = site.xpath('//*[@id="chTitle"]/text()').extract()
        # fill the extracted values into the corresponding item fields
        item['title'] = [t.encode('utf-8') for t in title]
        author = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[1]/a/text()').extract()
        if not author:
            # fall back to the alternative page layout
            author = site.xpath('//*[@id="content"]/div[1]/div[2]/p[1]/a/text()').extract()
        item['author'] = [a.encode('utf-8') for a in author]
        institution = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[3]/a/text()').extract()
        item['institution'] = [i.encode('utf-8') for i in institution]
        abstract = site.xpath('//*[@id="ChDivSummary"]/text()').extract()
        item['abstract'] = [a.encode('utf-8') for a in abstract]
        keyWord = site.xpath('//*[@id="ChDivKeyWord"]/a/text()').extract()
        item['keyWord'] = [k.encode('utf-8') for k in keyWord]
        # 【下载频次】 is the "download count" label on the page
        downloadFreq = site.xpath('//*[@id="content"]/div[1]/div[5]/ul/li/text()').re(u'\s*【下载频次】(.*)')
        item['downloadFreq'] = [d.encode('utf-8') for d in downloadFreq]
        quoteFreq = site.xpath('//*[@id="rc3"]/text()').re('\W(\d+)\W')
        item['quoteFreq'] = [q.encode('utf-8') for q in quoteFreq]
        itemlist.append(item)
        # log the append at INFO level
        log.msg("Appending item...", level=log.INFO)
        log.msg("Append done.", level=log.INFO)
    return itemlist

# if __name__ == "__main__":
#     sys.path.append('F:\Pythonworkspace\cnkiSpider_master\cnkiSpider\cnkiSpider')
#     cnki = CNKI_Spiders()
#     # print os.getcwd()
#     print cnki
def parse_member_overdraft2(self, resp):
    mem_item = resp.meta['item']
    hxs = Selector(resp)
    total_overdraft_nodes = hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[3]/child::text()')
    if not total_overdraft_nodes:
        overdraft = '0.0'
    else:
        overdrafts = str_list_strip_replace(total_overdraft_nodes.extract(), [' ', ' ', '\t', '\n'])
        overdraft_statuss = str_list_strip_replace(
            hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[5]/font/child::text()').extract(),
            [' ', ' ', '\t', '\n'])
        overdraft = float(0)
        for i, s_overdraft in enumerate(overdrafts):
            f_overdraft = float(s_overdraft)
            if u'已还清' in overdraft_statuss[i]:  # "paid off": subtract
                overdraft = overdraft - f_overdraft
                continue
            if u'未还清' in overdraft_statuss[i]:  # "outstanding": add
                overdraft = overdraft + f_overdraft
        if overdraft < 0:
            overdraft = float(0)
        overdraft = '%.1f' % overdraft
    mem_item['overdraft'] = overdraft
    yield mem_item
def parse_store(self, response, js):
    props = {}
    props["addr_full"] = Selector(text=js["address"]).xpath("//p/text()").get()
    props["ref"] = js["url_title"]
    props["lat"] = js["coordinates"][0]
    props["lon"] = js["coordinates"][1]
    props["city"] = js["city"]
    props["state"] = js["state"]
    props["postcode"] = js["zip"]
    props["phone"] = js["phone_number"]
    hours = response.css(".hours p:not(:empty)").xpath("text()").get()
    props["opening_hours"] = hours
    return GeojsonPointItem(**props)
def parse_showdesk_services(self, resp):
    hxs = Selector(resp)
    headers = hxs.xpath('//table[@id="itemset"]/thead/tr/th/child::text()').extract()
    if not headers:
        self.log('%s can not find table headers.' % self.name, level=log.ERROR)
        yield None
        return
    service_nodes = hxs.xpath('//table[@id="itemset"]/tbody/tr')
    if not service_nodes:
        self.log('%s can not find services info' % self.name, level=log.ERROR)
        yield None
        return
    for s_n in service_nodes:
        info_nodes = s_n.xpath('td')
        info = OrderedDict()
        no = None
        for idx, i_n in enumerate(info_nodes):
            if idx == 0 or idx == len(info_nodes) - 1:
                continue
            if idx == 8:
                info[headers[idx]] = str_list_strip_replace(
                    str_list_strip(hxs.xpath('//span[@id="pricespan%s"]' % no).xpath('child::text()').extract()),
                    [' ', '\t', '\n'])
                continue
            if idx == 9:
                discount_nodes = i_n.xpath('.//div[starts-with(@id, "icddiv")]')
                discounts = []
                if discount_nodes:
                    for d_n in discount_nodes:
                        discounts.append(' | '.join(
                            str_list_strip_replace(str_list_strip(d_n.xpath('./child::text()').extract()),
                                                   [' ', '\t', '\n'])))
                info[headers[idx]] = ' ||| '.join(discounts)
                continue
            info[headers[idx]] = ' | '.join(
                str_list_strip_replace(str_list_strip(i_n.xpath('descendant::text()').extract()),
                                       [' ', '\t', '\n']))
            if idx == 1:
                # column 1 holds the service number, used to locate its price span
                no = info[headers[idx]]
        item = SentreeServiceItem()
        item['info'] = info
        # items.append(info)
        yield item
def _validate_response(self, response: Union[Response, str]) -> bool:
    """
    Returns True unless the page's <meta> tags include a ROBOTS directive.

    :param response: a Response, or raw HTML as a string
    :rtype: bool
    """
    if isinstance(response, str):
        response: Selector = Selector(text=response)
    response: Union[Response, Selector]
    names_in_meta: List[str] = response.xpath("/html/head/meta").xpath("@name").extract()
    return "ROBOTS" not in names_in_meta
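# Both input forms are accepted; raw HTML carrying a ROBOTS meta tag fails the
# check (assuming `spider` is an instance of the owning class):
blocked = '<html><head><meta name="ROBOTS" content="NOINDEX"></head></html>'
assert spider._validate_response(blocked) is False
assert spider._validate_response('<html><head></head></html>') is True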
def parse(self, response):
    topic_id = response.meta['topic_id']
    sel = Selector(text=response.body, type="html")
    print 'starting'
    topic_lists = sel.xpath('//ul[re:test(@id,"results")]/li')
    for topic in topic_lists:
        topic_item = Topic_Item()
        temp_sel = Selector(text=topic.extract())
        topic_item['topic_id'] = topic_id
        title = temp_sel.xpath('//h3/a')[0].extract()
        title = self.parse_html_content(title)
        print title
        topic_item['topic_title'] = title
        content = temp_sel.xpath('//p')[0].extract()
        content = self.parse_html_content(content).strip()
        print type(content)
        print content.encode('gbk', 'ignore')
        topic_item['topic_content'] = content
        ttime = temp_sel.xpath('//span[re:test(@class,"green stat")]/text()').extract()[0]
        tt = ttime.split()[1].__repr__()
        print tt
        now = datetime.datetime.now()
        # the timestamp format is classified via unicode escapes in the repr:
        # u'\u5e74' (year) = absolute date, u'\u5206' (minute) and u'\u5c0f'
        # (hour) = relative "N minutes/hours ago"; note new_time stays unbound
        # if none of the three markers match
        if '5e74' in tt:
            time_pa = re.findall(self.time_1_pa, ttime.split()[1])[0]
            new_time = str(time_pa[0]) + '-' + str(time_pa[1]) + '-' + str(time_pa[2]) + ' ' + '00:00:00'
            print time_pa
        elif '5206' in tt:
            time_pa = re.findall(self.time_2_pa, ttime.split()[1])[0]
            new_time = now - datetime.timedelta(minutes=int(time_pa))
            print time_pa
        elif '5c0f' in tt:
            time_pa = re.findall(self.time_2_pa, ttime.split()[1])[0]
            new_time = now - datetime.timedelta(hours=int(time_pa))
            print time_pa
        print new_time
        topic_item['topic_post_time'] = new_time
        poster = ttime.split()[0]
        topic_item['topic_author'] = poster
        url = temp_sel.xpath('//h3/a/@href').extract()[0]
        print url
        topic_item['topic_url'] = url
        yield scrapy.Request(url, callback=self.parse_torrent, meta={'topic_item': topic_item})
        print '++++++++++++++++++++++++++++++'
def parse_XML(self, response):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator == 'xml':
        selector = Selector(response, type='xml')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    elif self.iterator == 'html':
        selector = Selector(response, type='html')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)
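# parse_XML mirrors the shape of Scrapy's built-in XMLFeedSpider._parse(). A
# spider wired this way would look roughly like the sketch below (the name and
# feed URL are illustrative):
from scrapy.spiders import XMLFeedSpider

class ExampleFeedSpider(XMLFeedSpider):
    name = 'example_feed'
    start_urls = ['http://example.com/feed.xml']
    iterator = 'xml'  # or 'iternodes' / 'html', the branches handled above
    itertag = 'item'

    def parse_node(self, response, node):
        yield {'title': node.xpath('title/text()').get()}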
schema = FakeContainer(descriptors['#default'])
validate = schema._validate_and_adapt_item
_names_map = {'daft_ie': 'daft', 'patchofland': 'pol'}
ibl_extractors = {}
ibl_pages = {}
selector_pages = {}
for template_name in ('daft_ie', 'hn', 'patchofland'):
    with open('%s/data/templates/%s.html' % (_PATH, template_name)) as f:
        html_page = HtmlPage(body=f.read().decode('utf-8'))
    name = _names_map.get(template_name, template_name)
    ibl_pages[name] = html_page
    ibl_extractors[name] = SlybotIBLExtractor([(html_page, descriptors, '0.13.0')])
    selector_pages[name] = Selector(text=html_page.body)


class TestExtractionSpeed(TestCase):

    def test_parsel_parse_and_extract(self):
        for i in range(ITERATIONS):
            for name, page in ibl_pages.items():
                s = Selector(text=page.body)
                extract(parsel_extractors[name], s)

    def test_slybot_parse_and_extract(self):
        for i in range(ITERATIONS):
            for name, page in ibl_pages.items():
                extraction_page = HtmlPage(body=page.body)
                ibl_extractors[name].extract(extraction_page)
def parse_consumer_bill_stream_validate(self, resp):
    hxs = Selector(resp)
    # breadcrumb of the source page: business records / bill records / bill review
    menu = [u'营业记录', u'水单记录', u'水单审查']
    bill_headers = []
    head_nodes = hxs.xpath('//tbody[@id="billBody"]/parent::table/thead/tr/th')
    if not head_nodes:
        self.log('in %s.parse_consumer_bill_stream_validate, can not get table headers.' % self.name,
                 level=log.ERROR)
        yield None
        return
    for idx, hd in enumerate(head_nodes):
        if idx == len(head_nodes) - 1:
            break
        txts = hd.xpath('child::text()').extract()
        bill_headers.append('/'.join(txts))
    bill_nodes = hxs.xpath('//tbody[@id="billBody"]/tr')
    if bill_nodes:
        for bn in bill_nodes:
            item = SentreeShuiDanShenChaItem()
            item['menu'] = menu
            headers = []
            item['data'] = OrderedDict()
            data_nodes = bn.xpath('td')
            for idx, dn in enumerate(data_nodes):
                if idx == 6:
                    break
                h = bill_headers[idx]
                if idx == 0 or idx == 4:
                    headers.append(h)
                    item['data'][h] = [str_list_strip(dn.xpath('descendant::text()').extract())[0], True]
                    continue
                if idx == 1 or idx == 2 or idx == 3:
                    headers.append(h)
                    item['data'][h] = [str_list_strip(dn.xpath('descendant::text()').extract()), True]
                    continue
                if idx == 5:
                    detail = []
                    subtrs = dn.xpath('table/tr')
                    recoded_headers = False
                    for tr in subtrs:
                        empperfors = []
                        subdetail = OrderedDict()
                        subtds = tr.xpath('td')
                        h = bill_headers[idx + 0]
                        if not recoded_headers:
                            headers.append(h)
                        subdetail[h] = [str_list_strip(subtds[0].xpath('descendant::text()').extract()), True]
                        h = bill_headers[idx + 1]
                        if not recoded_headers:
                            headers.append(h)
                        subdetail[h] = [str_list_strip(subtds[1].xpath('descendant::text()').extract())[0], True]
                        subtrs2 = subtds[2].xpath('table/tr')
                        for kdx, tr2 in enumerate(subtrs2):
                            if kdx == len(subtrs2) - 1:
                                break
                            empperfor = OrderedDict()
                            subtds2 = tr2.xpath('td')
                            h = bill_headers[idx + 2 + 0]
                            if not recoded_headers:
                                headers.append(h)
                            if h not in empperfor:
                                empperfor[h] = []
                            empperfor[h].append([str_list_strip(subtds2[0].xpath('descendant::text()').extract()), True])
                            h = bill_headers[idx + 2 + 1]
                            if not recoded_headers:
                                headers.append(h)
                            if h not in empperfor:
                                empperfor[h] = []
                            empperfor[h].append([str_list_strip(subtds2[1].xpath('descendant::text()').extract())[0], True])
                            h = bill_headers[idx + 2 + 2]
                            h = u'员工' + h  # prefix the header with "employee"
                            if not recoded_headers:
                                headers.append(h)
                            if h not in empperfor:
                                empperfor[h] = []
                            empperfor[h].append([str_list_strip(subtds2[2].xpath('descendant::text()').extract())[0], True])
                            empperfors.append(empperfor)
                        recoded_headers = True
                        subdetail[u'员工业绩'] = [empperfors, False]  # "employee performance"
                        detail.append([subdetail, False])
                        recoded_headers = True
                    item['headers'] = headers
                    item['data'][u'详情'] = [detail, False]  # "details"
            # items.append(item)
            yield item
def __init__(self, response):
    self.sel = Selector(response)
if __name__ == '__main__':
    # ad-hoc local test harness: load a saved page and run the parser on it
    f = open('e:\\1.html')
    html = ""
    for l in f:
        html += l
    f.close()
    resp = TextResponse(url="", body=html)
    if 1:  # debug toggle: exercise the overdraft extraction and exit
        hxs = Selector(resp)
        total_overdraft_nodes = hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[3]/child::text()')
        if not total_overdraft_nodes:
            overdraft = '0'
        else:
            overdraft = str_list_strip_replace(total_overdraft_nodes.extract(), [' ', ' ', '\t', '\n'])[0]
        print overdraft
        sys.exit(0)
    s = SentreeSpider()
    try:
        s.parse_showdesk_services(resp)
    except Exception:
        print traceback.format_exc()
def extractData(self, body, xpath):
    # accept either raw HTML (str) or a Response object
    if isinstance(body, str):
        return Selector(text=body).xpath(xpath).extract()
    return Selector(response=body).xpath(xpath).extract()
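# Either input form works; with a raw HTML string (assuming `extractor` is an
# instance of the owning class):
print(extractor.extractData('<p>hi</p>', '//p/text()'))  # ['hi']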
def parse_page(self, response):
    item = CnbetaItem()
    sel = Selector(response)
    item["title"] = sel.xpath('//title/text()').extract()
    item['url'] = response.url
    return item
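# A minimal CnbetaItem covering the two fields used above; the real definition
# is not included in this corpus, so this shape is assumed:
import scrapy

class CnbetaItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()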