Beispiel #1
0
def replicate_type(type_name="restaurants_back", size=100):
  """Copy every LocationRestaurantEs document into another type.

  Pages through all documents with a ``match_all`` query, ``size`` hits at a
  time, and re-sends the ``_source`` of each hit in one bulk request per page.
  Prints the total number of documents reported by the first search.

  Args:
    type_name: value passed to ``LocationRestaurantEs.bulk_data`` as
      ``type_name``. Defaults to the previously hard-coded
      "restaurants_back".
    size: number of documents fetched (and sent) per page.
  """
  from ghost_spider.elastic import LocationRestaurantEs
  from ghost_spider import progressbar
  page = 0
  progress = None
  total = 0
  while True:
    # NOTE(review): the query has no explicit sort, so from/size paging
    # relies on the backend returning hits in a stable order -- confirm.
    results = LocationRestaurantEs.search(
      {"query": {"match_all": {}}, "size": size, "from": page * size})
    hits = results["hits"]["hits"]
    if progress is None:
      # The first response tells us how many documents there are in total.
      total = results["hits"]["total"]
      progress = progressbar.AnimatedProgressBar(end=total, width=100)

    if not hits:
      break

    page += 1
    # Build the whole page payload in one pass instead of repeated "+=".
    bulk = "".join(
      LocationRestaurantEs.bulk_data(hit["_source"], type_name=type_name)
      for hit in hits)
    # The result of "+" is discarded on purpose; presumably the progress bar
    # advances itself in place inside __add__ -- verify against progressbar.
    progress + size
    progress.show_progress()
    LocationRestaurantEs.send(bulk)
  # The bar always exists here: it is created before the loop can break.
  # Push it by the full total so the last (possibly partial) page is covered.
  progress + total
  progress.show_progress()
  # Single-argument print(...) prints the same text on Python 2 and 3.
  print("total %s" % total)
Beispiel #2
0
def _extract_area_and_genres(anchors):
  """Split genre snippets into an area name and a list of genre names.

  Each snippet is searched for ``genrecd=<digits>`` codes and for the text
  between ``>`` and ``</``. The first snippet carrying the exact code
  ``genrecd=01`` supplies the area; later ``genrecd=01`` snippets are
  ignored. Every other snippet contributes its non-empty text as a genre.

  Args:
    anchors: iterable of strings (the ``page_body.genre`` entries).

  Returns:
    ``(area, genres)`` where ``area`` is ``None`` when no ``genrecd=01``
    snippet with extractable text was seen, and ``genres`` is a list.
  """
  import re
  code_re = re.compile(r'genrecd=\d+')
  text_re = re.compile(r'>(.*)</')
  area = None
  genres = []
  for anchor in anchors:
    texts = text_re.findall(anchor)
    if u'genrecd=01' in code_re.findall(anchor):
      # A genrecd=01 snippet without any text used to raise IndexError;
      # it is now skipped so a later snippet or the breadcrumbs can be used.
      if area is None and texts:
        area = texts[0]
    elif texts and texts[0]:
      genres.append(texts[0])
  return area, genres


def update_location():
  """Recompute area, area_ascii, genre and kind for every restaurant.

  Pages through all LocationRestaurantEs documents (sorted by
  ``name.untouched`` ascending, 100 per page). For each document:

  * ``area`` comes from the ``page_body.genre`` entry tagged ``genrecd=01``;
    if there is none, from the last ``page_body.breadcrumbs`` entry when
    there is more than one breadcrumb; otherwise it is set to ''.
  * ``area_ascii`` is the area run through the
    ``romaji_ascii_normal_analyzer`` analyzer, or '' when the area is empty
    (previously a stale value could be left in place in that case).
  * ``genre`` is the list of texts of the remaining genre entries.
  * ``kind`` is ``LocationHotelSelectors.convert_latte_kind(genre)``.

  The modified documents are sent back as one bulk "update" request per page.
  """
  from ghost_spider.elastic import LocationRestaurantEs
  from ghost_spider.helper import LocationHotelSelectors
  from ghost_spider import progressbar
  size = 100
  page = 0
  progress = None
  total = 0
  query = {
    "query": {"match_all": {}},
    "size": size,
    "from": 0,
    "sort": [{"name.untouched": "asc"}],
  }
  while True:
    query["from"] = page * size
    results = LocationRestaurantEs.search(query)
    hits = results["hits"]["hits"]
    if progress is None:
      # The first response tells us how many documents there are in total.
      total = results["hits"]["total"]
      print("total %s" % total)
      progress = progressbar.AnimatedProgressBar(end=total, width=100)
    if not hits:
      break

    page += 1
    actions = []
    for hit in hits:
      location = hit["_source"]
      page_body = location["page_body"]
      area, genre = _extract_area_and_genres(page_body["genre"])
      if area is None:
        # No usable genrecd=01 entry: fall back to the last breadcrumb, but
        # only when there is more than one breadcrumb.
        breadcrumbs = page_body["breadcrumbs"]
        area = breadcrumbs[-1] if len(breadcrumbs) > 1 else ''
      location['area'] = area
      if area:
        location['area_ascii'] = LocationRestaurantEs.analyze(
          area, 'romaji_ascii_normal_analyzer')
      else:
        location['area_ascii'] = ''
      location['genre'] = genre
      location['kind'] = LocationHotelSelectors.convert_latte_kind(genre)
      actions.append(LocationRestaurantEs.bulk_data(
        location, action="update", data_id=hit["_id"]))
    # The result of "+" is discarded on purpose; presumably the progress bar
    # advances itself in place inside __add__ -- verify against progressbar.
    progress + size
    progress.show_progress()
    LocationRestaurantEs.send("".join(actions))
  # The bar always exists here: it is created before the loop can break.
  # Push it by the full total so the last (possibly partial) page is covered.
  progress + total
  progress.show_progress()
  # Single-argument print(...) prints the same text on Python 2 and 3.
  print(" ")