Example #1
0
  def dump_restaurant(cls, name, action=None):
    from ghost_spider.elastic import LocationRestaurantEs, LatteRestaurantEs
    from ghost_spider.data import URL_TARGET_URLS
    from ghost_spider import progressbar
    from ghost_spider.data import RST_KINDS_LATE_NOT_ALLOWED

    query = {"query": {"bool": {"must": [{"term": {"prefecture_ascii": name}}], "must_not": []}}}
    if action == 'production':
      query["query"]["bool"]["must"].append({"term": {"version": 10}})

    query["query"]["bool"]["must_not"].append({"terms": {"genre.untouched": RST_KINDS_LATE_NOT_ALLOWED.keys()}})

    progress = None
    total = 0
    page = 1
    limit = 100
    sort = [{"area.untouched": "asc"}]

    save_data_to_file = cls.save_for_production if action == u'production' else cls.save_to_csv

    print "=" * 100
    count_lines = 0

    while True:
      places, total = LocationRestaurantEs.pager(query=query, page=page, size=limit, sort=sort)
      page += 1
      if not places or not len(places):
        break
      if not progress:
        print "Dumping data for %s (%s)" % (name, total)
        progress = progressbar.AnimatedProgressBar(end=total, width=100)
      progress + limit
      progress.show_progress()
      for place in places:
        result = LatteRestaurantEs.get_place_by_name(place.get('name'))
        if result["hits"]["total"] > 0:
          place["latte_url"] = result["hits"]["hits"][0]["_source"]["url"]
          place["latte_url"] = place["latte_url"].replace(URL_TARGET_URLS[0], URL_TARGET_URLS[1])

        place['kind'] = u'|'.join(place['kind'])
        if count_lines % 10000 == 0:
          count = (count_lines / 10000) + 1
          filename = cls.get_filename_by_name(name, count=count, remove_file=True)

        count_lines += 1
        save_data_to_file(filename, place)
    print " "