def parse_salon(self, response):
    """Build a LocationRestaurantItem from a scraped detail page.

    The page is wrapped in a Selector and every field is read through
    ``self.get_property`` or the LocationHotelSelectors helpers. As a side
    effect ``self.count`` is incremented and a one-line trace
    (count, prefecture, area, name) is printed.

    Args:
        response: the page response; ``response.url`` is used as the page
            URL when no canonical URL can be extracted.

    Returns:
        The populated LocationRestaurantItem.
    """
    selector = Selector(response)
    item = LocationRestaurantItem()
    helpers = LocationHotelSelectors

    # Prefer the canonical URL; fall back to the URL that was fetched.
    canonical_url = self.get_property(selector, helpers.CANONICAL_URL)
    item['page_url'] = canonical_url if canonical_url else response.url

    item['name'] = self.get_property(selector, helpers.NAME)
    item['name_kata'] = self.get_property(selector, helpers.NAME_KATA)
    item['address'] = self.get_property(selector, helpers.ADDRESS, clean=True)
    item['phone'] = helpers.get_phone(selector)

    item['prefecture'], item['area'] = helpers.get_prefecture_area(selector)

    # The kind is derived from the genre, so both are stored together.
    restaurant_genre = helpers.get_restaurant_genre(selector)
    item['genre'] = restaurant_genre
    item['kind'] = helpers.convert_latte_kind(restaurant_genre)

    item['page_body'] = helpers.get_body(selector, is_restaurant=True)

    self.count += 1
    trace = (self.count, item['prefecture'], item['area'], item['name'])
    print(u'%s: %s > %s -> %s' % trace)

    return item
Example #2
0
def _split_area_and_genres(genre_links):
  """Separate the area label from the genre labels in stored genre links.

  Each entry is expected to be a markup string; its label is the text
  captured between ``>`` and ``</``. An entry whose ``genrecd=<digits>``
  codes include ``genrecd=01`` is treated as the area entry, every other
  entry as a genre.

  Args:
    genre_links: iterable of markup strings (``page_body["genre"]``).

  Returns:
    A ``(area, genres)`` tuple. ``area`` is the label of the first
    ``genrecd=01`` entry ('' when that entry has no extractable label) or
    None when there is no such entry. ``genres`` holds the non-empty labels
    of all remaining entries, in their original order.
  """
  import re
  code_pattern = re.compile(r'genrecd=\d+')
  label_pattern = re.compile(r'>(.*)</')
  area = None
  genres = []
  for link in genre_links:
    labels = label_pattern.findall(link)
    # An entry without a ">...</" span has no label; treat it as empty
    # rather than indexing into an empty match list.
    label = labels[0] if labels else ''
    if u'genrecd=01' in code_pattern.findall(link):
      # Only the first area entry counts; later ones are ignored.
      if area is None:
        area = label
    elif label:
      genres.append(label)
  return area, genres


def update_location():
  """Re-derive area, genre and kind for every stored restaurant location.

  Pages through all LocationRestaurantEs documents (100 per request, sorted
  by ``name.untouched``), recomputes ``area``/``area_ascii``/``genre``/
  ``kind`` from each document's stored ``page_body`` and sends one bulk
  update per page. The area comes from the ``genrecd=01`` genre link, else
  from the last breadcrumb when there is more than one, else it is cleared.
  Progress is shown on an animated progress bar.

  NOTE(review): paging uses from/size; on large indices this may hit the
  Elasticsearch result-window limit -- confirm against the cluster settings.
  """
  from ghost_spider.elastic import LocationRestaurantEs
  from ghost_spider.helper import LocationHotelSelectors
  from ghost_spider import progressbar
  size = 100
  page = 0
  progress = None
  total = 0
  query = {
    "query": {"match_all": {}},
    "size": size,
    "from": 0,
    "sort": [{"name.untouched": "asc"}],
  }
  while True:
    query["from"] = page * size
    results = LocationRestaurantEs.search(query)
    if progress is None:
      # The first response tells us how many documents there are in total.
      total = results["hits"]["total"]
      print("total %s" % total)
      progress = progressbar.AnimatedProgressBar(end=total, width=100)
    hits = results["hits"]["hits"]
    if not hits:
      break

    page += 1
    actions = []
    for hit in hits:
      location = hit["_source"]
      page_body = location["page_body"]
      area, genres = _split_area_and_genres(page_body["genre"])
      if area is None:
        # No area link: fall back to the last breadcrumb, if there are
        # at least two of them.
        breadcrumbs = page_body["breadcrumbs"]
        area = breadcrumbs[-1] if len(breadcrumbs) > 1 else ''
      location['area'] = area
      # Keep area_ascii in step with area so an emptied area never leaves
      # a stale romanised value behind.
      if area:
        location['area_ascii'] = LocationRestaurantEs.analyze(area, 'romaji_ascii_normal_analyzer')
      else:
        location['area_ascii'] = ''
      location['genre'] = genres
      location['kind'] = LocationHotelSelectors.convert_latte_kind(genres)
      actions.append(LocationRestaurantEs.bulk_data(location, action="update", data_id=hit["_id"]))
    # The bar is advanced with `+` used as a statement, as the original code
    # did (presumably __add__ updates the bar in place -- confirm).
    progress + len(hits)
    progress.show_progress()
    LocationRestaurantEs.send("".join(actions))
  if progress is not None:
    # Push the bar to its end once every page has been processed.
    progress + total
    progress.show_progress()
  print(" ")