def parse(self, response):
    """Handle a salon listing page.

    Yields one request per salon link for which ``SalonEs.check_by_url``
    returns a falsy value (callback ``parse_salon``), followed by a request
    for the next listing page when a next-page URL is found (callback
    ``parse``). Links that are skipped increment ``self.count_skip``.
    """
    sel = Selector(response)
    salon_links = sel.xpath(SalonSelectors.LIST_SALONS).extract()
    next_url = self.get_property(sel, SalonSelectors.NEXT_URL)
    print(u'links: %s, %s' % (len(salon_links), response.url))

    if SalonSelectors.is_first_page(sel):
        # Yahoo search cannot paginate beyond 1000 items, so an overflowing
        # listing means the crawler has to be run over smaller areas.
        if SalonSelectors.get_list_total(sel) > 999:
            self.log_message(u'Pagination overflow: %s' % response.url)

    for salon_link in salon_links:
        # The lookup is done on the URL with its query string stripped.
        bare_url = salon_link.partition('?')[0]
        if not SalonEs.check_by_url(bare_url):
            salon_request = Request(salon_link,
                                    callback=self.parse_salon,
                                    errback=self.parse_err)
            salon_request.meta['page_kind'] = 'salon'
            yield salon_request
        else:
            # presumably the salon is already stored -- verify against SalonEs
            self.count_skip += 1
            print(u'%s: skipped: %s' % (self.count_skip, salon_link))

    # NOTE(review): the original layout was lost; pagination is treated here
    # as independent of whether any salon links were found -- confirm.
    if next_url:
        list_request = Request(next_url,
                               callback=self.parse,
                               errback=self.parse_err)
        list_request.meta['page_kind'] = 'list'
        yield list_request
def parse_salon(self, response):
    """Build a ``SalonItem`` from a salon detail page.

    Every field is extracted through ``SalonSelectors`` (or
    ``self.get_property``) from a ``Selector`` over ``response``. The page
    URL falls back to ``response.url`` when no canonical URL is extracted.
    Increments ``self.count``, prints a progress line and returns the item.
    """
    sel = Selector(response)
    cards_comment, cards = SalonSelectors.get_credit_cards(sel)
    prefecture_name, area_name = SalonSelectors.get_prefecture_area(sel)
    canonical_url = self.get_property(sel, SalonSelectors.CANONICAL_URL)

    # Ordered (field, value) pairs; keys are reproduced exactly as the item
    # expects them (including the 'holydays' spelling).
    field_values = (
        ('page_url', canonical_url or response.url),
        ('name', self.get_property(sel, SalonSelectors.NAME)),
        ('name_kata', self.get_property(sel, SalonSelectors.NAME_KATA)),
        ('address', self.get_property(sel, SalonSelectors.ADDRESS, clean=True)),
        ('routes', SalonSelectors.get_routes(sel)),
        ('phone', SalonSelectors.get_phone(sel)),
        ('working_hours', SalonSelectors.get_working_hours(sel)),
        ('holydays', SalonSelectors.get_holidays(sel)),
        ('shop_url', SalonSelectors.get_shop_url(sel)),
        ('credit_cards_comment', cards_comment),
        ('credit_cards', cards),
        ('seats', SalonSelectors.get_seats(sel)),
        ('stylist', SalonSelectors.get_stylist(sel)),
        ('parking', SalonSelectors.get_parking(sel)),
        ('cut_price', SalonSelectors.get_cut_price(sel)),
        ('prefecture', prefecture_name),
        ('area', area_name),
        ('page_body', SalonSelectors.get_body(sel)),
    )

    item = SalonItem()
    for field_name, field_value in field_values:
        item[field_name] = field_value

    self.count += 1
    print(u'%s: %s > %s -> %s' % (
        self.count, item['prefecture'], item['area'], item['name']))
    return item