def parse_salon(self, response):
    """Populate a ``LocationRestaurantItem`` from *response* and return it.

    Fields are read through ``self.get_property`` and the
    ``LocationHotelSelectors`` helpers.  ``page_url`` falls back to
    ``response.url`` when the canonical-URL lookup yields nothing truthy.
    Each call increments ``self.count`` and prints one progress line.
    """
    selector = Selector(response)
    item = LocationRestaurantItem()

    # Prefer the canonical URL; use the response URL when it is missing.
    canonical_url = self.get_property(selector, LocationHotelSelectors.CANONICAL_URL)
    if not canonical_url:
        canonical_url = response.url
    item['page_url'] = canonical_url

    item['name'] = self.get_property(selector, LocationHotelSelectors.NAME)
    item['name_kata'] = self.get_property(selector, LocationHotelSelectors.NAME_KATA)
    item['address'] = self.get_property(
        selector, LocationHotelSelectors.ADDRESS, clean=True)
    item['phone'] = LocationHotelSelectors.get_phone(selector)

    item['prefecture'], item['area'] = (
        LocationHotelSelectors.get_prefecture_area(selector))

    # ``kind`` is derived from the genre extracted just before it.
    restaurant_genre = LocationHotelSelectors.get_restaurant_genre(selector)
    item['genre'] = restaurant_genre
    item['kind'] = LocationHotelSelectors.convert_latte_kind(restaurant_genre)

    item['page_body'] = LocationHotelSelectors.get_body(selector, is_restaurant=True)

    self.count += 1
    progress_line = u'%s: %s > %s -> %s' % (
        self.count, item['prefecture'], item['area'], item['name'])
    print(progress_line)
    return item
def update_location():
    """Recompute area, genre and kind for every stored restaurant location.

    Pages through the documents returned by ``LocationRestaurantEs.search``
    (``size`` hits per request, sorted by ``name.untouched``) and re-derives
    these fields from each document's stored ``page_body``:

    * ``area`` / ``area_ascii`` -- text of the first ``page_body["genre"]``
      entry that contains ``genrecd=01`` and has non-empty text.  When no
      such entry exists, the last breadcrumb is used if there is more than
      one breadcrumb; otherwise both fields are set to ``''``.
    * ``genre`` -- non-empty texts of the ``page_body["genre"]`` entries
      that do not contain ``genrecd=01``.
    * ``kind`` -- ``LocationHotelSelectors.convert_latte_kind(genre)``.

    One bulk "update" request is sent per page of results.
    """
    from ghost_spider.elastic import LocationRestaurantEs
    from ghost_spider.helper import LocationHotelSelectors
    from ghost_spider import progressbar
    import re

    # Compiled once: both patterns are applied to every genre entry.
    genre_code_re = re.compile(r'genrecd=\d+')
    entry_text_re = re.compile(r'>(.*)</')
    area_analyzer = 'romaji_ascii_normal_analyzer'

    def first_text(markup):
        """Return the first ``>...</`` capture in *markup*, or '' if none."""
        captures = entry_text_re.findall(markup)
        return captures[0] if captures else ''

    def set_area(location, area):
        """Store *area* and keep ``area_ascii`` consistent with it."""
        location['area'] = area
        if area:
            location['area_ascii'] = LocationRestaurantEs.analyze(area, area_analyzer)
        else:
            # Never leave a stale romanisation next to an empty area.
            location['area_ascii'] = ''

    size = 100
    page = 0
    progress = None
    total = 0
    # NOTE(review): plain from/size paging -- Elasticsearch limits how deep
    # from + size may go (index.max_result_window); confirm the index is
    # small enough or switch to a scroll-style API.
    query = {
        "query": {"match_all": {}},
        "size": size,
        "from": 0,
        "sort": [{"name.untouched": "asc"}],
    }

    while True:
        query["from"] = page * size
        results = LocationRestaurantEs.search(query)
        hits = results["hits"]["hits"]

        if progress is None:
            # First page: learn the total and create the progress bar once.
            total = results["hits"]["total"]
            print("total %s" % total)
            progress = progressbar.AnimatedProgressBar(end=total, width=100)

        if not hits:
            break
        page += 1

        actions = []
        for hit in hits:
            location = hit["_source"]
            data_id = hit["_id"]
            genre = []
            area_found = False

            for entry in location["page_body"]["genre"]:
                if u'genrecd=01' in genre_code_re.findall(entry):
                    # Area entry: only the first one with usable text counts;
                    # any later genrecd=01 entries are ignored.
                    if not area_found:
                        area = first_text(entry)
                        if area:
                            set_area(location, area)
                            area_found = True
                else:
                    text = first_text(entry)
                    if text:
                        genre.append(text)

            if not area_found:
                breadcrumbs = location["page_body"]["breadcrumbs"]
                if len(breadcrumbs) > 1:
                    set_area(location, breadcrumbs[-1])
                else:
                    set_area(location, '')

            location['genre'] = genre
            location['kind'] = LocationHotelSelectors.convert_latte_kind(genre)
            actions.append(
                LocationRestaurantEs.bulk_data(location, action="update", data_id=data_id))

        # The bar is advanced through its ``+`` operator; the result is
        # discarded, so this presumably updates the bar in place -- TODO confirm.
        progress + size
        progress.show_progress()
        LocationRestaurantEs.send("".join(actions))

    if progress is not None:
        # Push the bar to its end once every page has been processed.
        progress + total
        progress.show_progress()
        print(" ")