def replicate_type():
    """Copy every restaurant document into the "restaurants_back" type.

    Pages through the whole index 100 hits at a time, re-indexing each
    document through a bulk payload while animating a progress bar.
    Side effects only (Elasticsearch writes + console output); returns None.
    """
    from ghost_spider.elastic import LocationRestaurantEs
    from ghost_spider import progressbar
    size = 100
    page = 0
    progress = None
    total = 0
    while True:
        start_from = page * size
        results = LocationRestaurantEs.search(
            {"query": {"match_all": {}}, "size": size, "from": start_from})
        if not progress:
            # First page: the total hit count sizes the progress bar.
            total = results["hits"]["total"]
            progress = progressbar.AnimatedProgressBar(end=total, width=100)
        if not results["hits"]["hits"]:
            break
        page += 1
        # Accumulate bulk actions in a list and join once — avoids the
        # quadratic cost of repeated string concatenation.
        chunks = []
        for result in results["hits"]["hits"]:
            data = result["_source"]
            chunks.append(
                LocationRestaurantEs.bulk_data(data, type_name="restaurants_back"))
        bulk = "".join(chunks)
        # AnimatedProgressBar overloads __add__ to advance in place; the
        # return value is deliberately discarded.
        progress + size
        progress.show_progress()
        LocationRestaurantEs.send(bulk)
    if progress:
        progress + total  # force the bar to 100% before the final print
        progress.show_progress()
    print("total %s" % total)
def dump_restaurant(cls, name, action=None):
    """Dump all restaurants for prefecture *name* to numbered output files.

    :param name: prefecture name (ascii) used to filter documents.
    :param action: when ``'production'``, only version-10 docs whose genre
        is allowed are dumped, via ``cls.save_for_production``; otherwise
        everything is written with ``cls.save_to_csv``.

    Files roll over every 10000 rows via ``cls.get_filename_by_name``.
    Side effects only (file writes + console output); returns None.
    """
    from ghost_spider.elastic import LocationRestaurantEs, LatteRestaurantEs
    from ghost_spider.data import URL_TARGET_URLS
    from ghost_spider import progressbar
    from ghost_spider.data import RST_KINDS_LATE_NOT_ALLOWED
    query = {"query": {"bool": {"must": [{"term": {"prefecture_ascii": name}}], "must_not": []}}}
    if action == 'production':
        # Production dumps are restricted to finished (version 10) docs
        # and exclude the disallowed genres.
        query["query"]["bool"]["must"].append({"term": {"version": 10}})
        query["query"]["bool"]["must_not"].append(
            {"terms": {"genre.untouched": RST_KINDS_LATE_NOT_ALLOWED.keys()}})
    progress = None
    total = 0
    page = 1
    limit = 100
    sort = [{"area.untouched": "asc"}]
    save_data_to_file = cls.save_for_production if action == u'production' else cls.save_to_csv
    print("=" * 100)
    count_lines = 0
    while True:
        places, total = LocationRestaurantEs.pager(query=query, page=page, size=limit, sort=sort)
        page += 1
        if not places:  # empty page → we've consumed every hit
            break
        if not progress:
            print("Dumping data for %s (%s)" % (name, total))
            progress = progressbar.AnimatedProgressBar(end=total, width=100)
        progress + limit  # __add__ advances the bar in place
        progress.show_progress()
        for place in places:
            # Cross-reference the latte index to attach a rewritten URL.
            result = LatteRestaurantEs.get_place_by_name(place.get('name'))
            if result["hits"]["total"] > 0:
                place["latte_url"] = result["hits"]["hits"][0]["_source"]["url"]
                place["latte_url"] = place["latte_url"].replace(URL_TARGET_URLS[0], URL_TARGET_URLS[1])
            place['kind'] = u'|'.join(place['kind'])
            if count_lines % 10000 == 0:
                # Floor division keeps `count` an int under Python 3 as well;
                # plain `/` would silently produce a float filename suffix.
                count = (count_lines // 10000) + 1
                filename = cls.get_filename_by_name(name, count=count, remove_file=True)
            count_lines += 1
            save_data_to_file(filename, place)
    print(" ")
def parse(self, response):
    """Parse one Yahoo listing page: yield detail-page requests for each
    salon link plus a request for the next listing page.

    Scrapy callback (generator of Request objects).  Expects
    ``request.meta['page_kind']`` to tag every outgoing request as either
    'list' or 'salon'.
    """
    sel = Selector(response)
    links = sel.xpath(LocationHotelSelectors.LIST_SALONS).extract()
    next_page = self.get_property(sel, LocationHotelSelectors.NEXT_URL)
    print u'links: %s, %s' % (len(links), response.url)
    if len(links) <= 0:
        # An empty listing page is suspicious — record it for later review.
        self.log_message(u'links: %s, %s' % (len(links), response.url))
    if LocationHotelSelectors.is_first_page(sel):
        total = LocationHotelSelectors.get_list_total(sel)
        self.total += total
        if total > 999:
            # yahoo search can not paginate beyond 1000 items
            # so need to run crawler for smaller areas or categories
            page_cat = LocationHotelSelectors.get_category(sel)
            if page_cat and page_cat != "01":
                # Already inside a sub-category and still overflowing:
                # nothing finer to split by, just log it.
                self.log_message(u'Pagination overflow: %s' % response.url)
            else:
                # Fan out one request per gourmet sub-category.
                # NOTE(review): this loop reassigns `next_page`, so the
                # trailing `if next_page:` below will also re-yield the
                # LAST category URL — looks unintentional; confirm.
                for category in GOURMET_CATEGORY:
                    next_page = response.url.replace('genrecd=01', 'genrecd=%s' % category)
                    print u'new links --> %s' % next_page
                    request = Request(next_page, callback=self.parse, errback=self.parse_err)
                    request.meta['page_kind'] = 'list'
                    yield request
    if self.start_urls[-1] == response.url:
        # Reached the last seed URL — report the running total.
        self.log_message(u'Counted this many places: %s' % self.total)
    if self.scan_mode:
        # Scan mode only counts results; no detail pages are fetched.
        return
    if links:
        for link in links:
            # Strip the query string so dedup matches the canonical URL.
            canonical = link.split('?')[0]
            if LocationRestaurantEs.check_by_url(canonical):
                # Already indexed — skip re-crawling this salon.
                continue
            request = Request(link, callback=self.parse_salon, errback=self.parse_err)
            request.meta['page_kind'] = 'salon'
            yield request
    if next_page:
        # Follow pagination within the current listing.
        request = Request(next_page, callback=self.parse, errback=self.parse_err)
        request.meta['page_kind'] = 'list'
        yield request
def update_location():
    """Backfill ``area``/``area_ascii``/``genre``/``kind`` on every restaurant.

    Scans the whole index sorted by name, re-derives those fields from the
    stored ``page_body`` anchors and breadcrumbs, and writes the changes
    back via bulk update requests.  Side effects only; returns None.
    """
    from ghost_spider.elastic import LocationRestaurantEs
    from ghost_spider.helper import LocationHotelSelectors
    from ghost_spider import progressbar
    import re
    size = 100
    page = 0
    progress = None
    total = 0
    query = {"query": {"match_all": {}}, "size": size, "from": 0}
    query["sort"] = [{"name.untouched": "asc"}]
    while True:
        query["from"] = page * size
        results = LocationRestaurantEs.search(query)
        if not progress:
            total = results["hits"]["total"]
            print("total %s" % total)
            progress = progressbar.AnimatedProgressBar(end=total, width=100)
        if not results["hits"]["hits"]:
            break
        page += 1
        # Collect bulk actions in a list and join once instead of the
        # original quadratic `bulk += ...` concatenation.
        chunks = []
        for hit in results["hits"]["hits"]:
            # Renamed from `result`: the original shadowed the outer loop
            # variable with the re.findall() result inside the anchor loop.
            location = hit["_source"]
            data_id = hit["_id"]
            genre = []
            area_found = False
            for anchor in location["page_body"]["genre"]:
                codes = re.findall(r'genrecd=\d+', anchor)
                if u'genrecd=01' in codes:
                    # Category 01 anchors carry the area name; only the
                    # first one is used.
                    if not area_found:
                        text = re.findall(r'>(.*)</', anchor)
                        location['area'] = text[0]
                        if location['area']:
                            location['area_ascii'] = LocationRestaurantEs.analyze(
                                location['area'], 'romaji_ascii_normal_analyzer')
                        area_found = True
                else:
                    # Non-01 anchors contribute genre labels.
                    text = re.findall(r'>(.*)</', anchor)
                    if text and text[0]:
                        genre.append(text[0])
            if not area_found and len(location["page_body"]["breadcrumbs"]) > 1:
                # Fall back to the deepest breadcrumb as the area.
                location['area'] = location["page_body"]["breadcrumbs"][-1]
                if location['area']:
                    location['area_ascii'] = LocationRestaurantEs.analyze(
                        location['area'], 'romaji_ascii_normal_analyzer')
            elif not area_found:
                location['area'] = ''
                location['area_ascii'] = ''
            kind = LocationHotelSelectors.convert_latte_kind(genre)
            location['genre'] = genre
            location['kind'] = kind
            chunks.append(LocationRestaurantEs.bulk_data(
                location, action="update", data_id=data_id))
        bulk = "".join(chunks)
        progress + size  # __add__ advances the bar in place
        progress.show_progress()
        LocationRestaurantEs.send(bulk)
    if progress:
        progress + total  # force the bar to 100%
        progress.show_progress()
    print(" ")