def parse_salon(self, response):
    """Scrape one hotel detail page into a ``LocationHotelItem``.

    ``response`` is wrapped in a ``Selector``; every field is read through
    ``self.get_property`` or a ``LocationHotelSelectors`` helper. Bumps
    ``self.count``, prints a one-line progress message and returns the item.
    """
    sel = Selector(response)
    selectors = LocationHotelSelectors
    item = LocationHotelItem()

    # Prefer the canonical URL found on the page; fall back to the URL
    # the response was fetched from.
    canonical_url = self.get_property(sel, selectors.CANONICAL_URL)
    item['page_url'] = canonical_url or response.url

    item['name'] = self.get_property(sel, selectors.NAME)
    item['name_kata'] = self.get_property(sel, selectors.NAME_KATA)
    item['address'] = self.get_property(sel, selectors.ADDRESS, clean=True)
    item['routes'] = selectors.get_routes(sel)
    item['phone'] = selectors.get_phone(sel)
    item['shop_url'] = selectors.get_shop_url(sel)

    # These helpers return pairs; unpack them straight into the item.
    item['credit_cards_comment'], item['credit_cards'] = (
        selectors.get_credit_cards(sel))
    item['prefecture'], item['area'] = selectors.get_prefecture_area(sel)
    item['genre'] = selectors.get_genre(sel)
    item['checkin'], item['checkout'] = selectors.get_working_time(sel)

    item['kind'] = selectors.get_hotel_type(sel)
    item['votes'] = selectors.get_votes(sel)
    item['page_body'] = selectors.get_body(sel)

    self.count += 1
    progress_line = u'%s: %s > %s -> %s' % (
        self.count, item['prefecture'], item['area'], item['name'])
    print(progress_line)
    return item
def parse(self, response):
    """Handle a listing page: count it, fan out, and queue detail pages.

    Steps, in order:

    1. Extract the detail links and the next-page link from the listing.
    2. On a first page, add the listing total to ``self.total``. When the
       total exceeds 999 and the listing is the ``01`` category (or has no
       category), yield one listing request per ``GOURMET_CATEGORY`` entry;
       for any other category only log the overflow.
    3. Log the running total when this response is the last start URL.
    4. In scan mode stop here; otherwise yield a ``parse_salon`` request for
       every link not already known to ``LocationRestaurantEs``, followed by
       a request for the next listing page when there is one.

    Yields:
        ``Request`` objects tagged with ``meta['page_kind']`` set to
        ``'list'`` or ``'salon'``.
    """
    sel = Selector(response)
    links = sel.xpath(LocationHotelSelectors.LIST_SALONS).extract()
    next_page = self.get_property(sel, LocationHotelSelectors.NEXT_URL)

    summary = u'links: %s, %s' % (len(links), response.url)
    print(summary)
    if not links:
        self.log_message(summary)

    if LocationHotelSelectors.is_first_page(sel):
        total = LocationHotelSelectors.get_list_total(sel)
        self.total += total
        if total > 999:
            # Original author's note: yahoo search can not paginate beyond
            # 1000 items, so the crawler has to run on smaller areas or
            # categories.
            page_cat = LocationHotelSelectors.get_category(sel)
            if page_cat and page_cat != "01":
                self.log_message(u'Pagination overflow: %s' % response.url)
            else:
                for category in GOURMET_CATEGORY:
                    # Use a dedicated name here: reusing ``next_page`` would
                    # discard the pagination link read above and make the
                    # final block re-request the last category URL instead.
                    category_url = response.url.replace(
                        'genrecd=01', 'genrecd=%s' % category)
                    print(u'new links --> %s' % category_url)
                    request = Request(category_url, callback=self.parse,
                                      errback=self.parse_err)
                    request.meta['page_kind'] = 'list'
                    yield request

    if self.start_urls[-1] == response.url:
        self.log_message(u'Counted this many places: %s' % self.total)

    if self.scan_mode:
        # Scan mode only counts; no detail pages, no pagination.
        return

    for link in links:
        canonical = link.split('?')[0]
        if LocationRestaurantEs.check_by_url(canonical):
            # check_by_url was truthy for this canonical URL; skip it.
            continue
        request = Request(link, callback=self.parse_salon,
                          errback=self.parse_err)
        request.meta['page_kind'] = 'salon'
        yield request

    if next_page:
        request = Request(next_page, callback=self.parse,
                          errback=self.parse_err)
        request.meta['page_kind'] = 'list'
        yield request
def parse_salon(self, response):
    """Scrape one restaurant detail page into a ``LocationRestaurantItem``.

    ``response`` is wrapped in a ``Selector``; fields come from
    ``self.get_property`` and ``LocationHotelSelectors`` helpers. Bumps
    ``self.count``, prints a one-line progress message and returns the item.
    """
    sel = Selector(response)
    selectors = LocationHotelSelectors
    item = LocationRestaurantItem()

    # Prefer the canonical URL found on the page; fall back to the URL
    # the response was fetched from.
    canonical_url = self.get_property(sel, selectors.CANONICAL_URL)
    item['page_url'] = canonical_url or response.url

    item['name'] = self.get_property(sel, selectors.NAME)
    item['name_kata'] = self.get_property(sel, selectors.NAME_KATA)
    item['address'] = self.get_property(sel, selectors.ADDRESS, clean=True)
    item['phone'] = selectors.get_phone(sel)
    item['prefecture'], item['area'] = selectors.get_prefecture_area(sel)

    # ``kind`` is derived from the same genre value stored on the item.
    restaurant_genre = selectors.get_restaurant_genre(sel)
    item['genre'] = restaurant_genre
    item['kind'] = selectors.convert_latte_kind(restaurant_genre)
    item['page_body'] = selectors.get_body(sel, is_restaurant=True)

    self.count += 1
    progress_line = u'%s: %s > %s -> %s' % (
        self.count, item['prefecture'], item['area'], item['name'])
    print(progress_line)
    return item
def _split_area_and_genre(entries):
    """Split stored genre entries into an area label and genre labels.

    Each entry is searched for ``genrecd=<digits>`` and for a label matching
    ``>(.*)</`` (presumably an HTML anchor snippet -- verify against the
    crawler's ``get_body``).

    Returns:
        ``(area, genre)``. ``area`` is the label of the first entry carrying
        ``genrecd=01`` that has a label, or ``None`` when there is none.
        ``genre`` lists the non-empty labels of all other entries.
    """
    import re

    area = None
    genre = []
    for entry in entries:
        labels = re.findall(r'>(.*)</', entry)
        if u'genrecd=01' in re.findall(r'genrecd=\d+', entry):
            # Only the first labelled genrecd=01 entry names the area; an
            # entry without a label is skipped instead of raising IndexError.
            if area is None and labels:
                area = labels[0]
        elif labels and labels[0]:
            genre.append(labels[0])
    return area, genre


def update_location():
    """Recompute area, area_ascii, genre and kind for every restaurant.

    Pages through all ``LocationRestaurantEs`` documents 100 at a time,
    sorted by ``name.untouched``. For each document the stored
    ``page_body`` is re-read: the area comes from the ``genrecd=01`` genre
    entry, else from the last breadcrumb when there is more than one, else
    it is blanked. One bulk update is sent per page and a progress bar is
    shown on stdout.
    """
    from ghost_spider.elastic import LocationRestaurantEs
    from ghost_spider.helper import LocationHotelSelectors
    from ghost_spider import progressbar

    size = 100
    page = 0
    progress = None
    total = 0
    query = {"query": {"match_all": {}}, "size": size, "from": 0}
    query["sort"] = [{"name.untouched": "asc"}]

    while True:
        query["from"] = page * size
        results = LocationRestaurantEs.search(query)
        if progress is None:
            # First page: learn the total and create the bar exactly once.
            total = results["hits"]["total"]
            print("total %s" % total)
            progress = progressbar.AnimatedProgressBar(end=total, width=100)
        hits = results["hits"]["hits"]
        if not hits:
            break
        page += 1

        actions = []
        for hit in hits:
            location = hit["_source"]
            page_body = location["page_body"]

            area, genre = _split_area_and_genre(page_body["genre"])
            if area is None:
                # No usable genrecd=01 entry: fall back to the last
                # breadcrumb, but only when there is more than one.
                breadcrumbs = page_body["breadcrumbs"]
                area = breadcrumbs[-1] if len(breadcrumbs) > 1 else ''

            location['area'] = area
            # Keep area_ascii in step with area; an empty area must not
            # leave a stale romanised value behind.
            if area:
                location['area_ascii'] = LocationRestaurantEs.analyze(
                    area, 'romaji_ascii_normal_analyzer')
            else:
                location['area_ascii'] = ''

            location['genre'] = genre
            location['kind'] = LocationHotelSelectors.convert_latte_kind(genre)
            actions.append(LocationRestaurantEs.bulk_data(
                location, action="update", data_id=hit["_id"]))

        # NOTE(review): the bar is presumably advanced in place by its
        # ``__add__`` -- confirm in ghost_spider.progressbar.
        progress + size
        progress.show_progress()
        LocationRestaurantEs.send("".join(actions))

    if progress is not None:
        progress + total
        progress.show_progress()
        print(" ")