def parse_salon(self, response):
    """Build a ``LocationHotelItem`` from a hotel detail page.

    Every field is read from ``response`` through ``self.get_property`` or a
    ``LocationHotelSelectors`` helper. ``self.count`` is incremented and a
    one-line trace (count, prefecture, area, name) is printed.

    Returns:
        The populated ``LocationHotelItem``.
    """
    page = Selector(response)
    helpers = LocationHotelSelectors
    hotel = LocationHotelItem()

    # Use the canonical URL found on the page, or the fetched URL when the
    # lookup yields nothing truthy.
    canonical_url = self.get_property(page, helpers.CANONICAL_URL)
    hotel['page_url'] = canonical_url or response.url
    hotel['name'] = self.get_property(page, helpers.NAME)
    hotel['name_kata'] = self.get_property(page, helpers.NAME_KATA)
    hotel['address'] = self.get_property(page, helpers.ADDRESS, clean=True)
    hotel['routes'] = helpers.get_routes(page)
    hotel['phone'] = helpers.get_phone(page)
    hotel['shop_url'] = helpers.get_shop_url(page)

    # These helpers return pairs; unpack them straight into the item.
    hotel['credit_cards_comment'], hotel['credit_cards'] = helpers.get_credit_cards(page)
    hotel['prefecture'], hotel['area'] = helpers.get_prefecture_area(page)
    hotel['genre'] = helpers.get_genre(page)
    hotel['checkin'], hotel['checkout'] = helpers.get_working_time(page)

    hotel['kind'] = helpers.get_hotel_type(page)
    hotel['votes'] = helpers.get_votes(page)
    hotel['page_body'] = helpers.get_body(page)

    self.count += 1
    trace = u'%s: %s > %s -> %s' % (self.count, hotel['prefecture'], hotel['area'], hotel['name'])
    print(trace)
    return hotel
  def parse(self, response):
    """Handle a listing page and yield the follow-up requests.

    Steps, in order:

    * Extract the detail-page links and the next-page URL; log when the page
      has no links.
    * On a first page, add the listing total to ``self.total``. When the total
      exceeds 999, either log a pagination overflow (page category other than
      ``"01"``) or yield one listing request per ``GOURMET_CATEGORY`` entry,
      built by replacing ``genrecd=01`` in the current URL.
    * When this is the last start URL, log the accumulated total.
    * In ``self.scan_mode`` stop here (counting only).
    * Otherwise yield a detail request for every link whose query-stripped
      URL is not accepted by ``LocationRestaurantEs.check_by_url``
      (presumably "already stored" -- confirm against that helper), then a
      request for the next listing page.

    Yields:
        ``Request`` objects tagged with ``meta['page_kind']`` of ``'list'``
        or ``'salon'``.
    """
    sel = Selector(response)
    links = sel.xpath(LocationHotelSelectors.LIST_SALONS).extract()
    next_page = self.get_property(sel, LocationHotelSelectors.NEXT_URL)
    print(u'links: %s, %s' % (len(links), response.url))

    if not links:
      self.log_message(u'links: %s, %s' % (len(links), response.url))

    if LocationHotelSelectors.is_first_page(sel):
      total = LocationHotelSelectors.get_list_total(sel)
      self.total += total
      if total > 999:
        # yahoo search can not paginate beyond 1000 items,
        # so the crawler needs to run for smaller areas or categories
        page_cat = LocationHotelSelectors.get_category(sel)
        if page_cat and page_cat != "01":
          self.log_message(u'Pagination overflow: %s' % response.url)
        else:
          for category in GOURMET_CATEGORY:
            # Use a dedicated name: the original loop reused ``next_page``,
            # so the last category URL was requested a second time below.
            category_url = response.url.replace('genrecd=01', 'genrecd=%s' % category)
            print(u'new links --> %s' % category_url)
            # The per-category listings take over from this listing's own
            # pagination (the original never followed it either, because
            # ``next_page`` had been overwritten).
            next_page = None
            request = Request(category_url, callback=self.parse, errback=self.parse_err)
            request.meta['page_kind'] = 'list'
            yield request

      if self.start_urls[-1] == response.url:
        self.log_message(u'Counted this many places: %s' % self.total)

    if self.scan_mode:
      return

    for link in links:
      canonical = link.split('?')[0]
      if LocationRestaurantEs.check_by_url(canonical):
        continue
      request = Request(link, callback=self.parse_salon, errback=self.parse_err)
      request.meta['page_kind'] = 'salon'
      yield request

    if next_page:
      request = Request(next_page, callback=self.parse, errback=self.parse_err)
      request.meta['page_kind'] = 'list'
      yield request
  def parse_salon(self, response):
    """Build a ``LocationRestaurantItem`` from a restaurant detail page.

    Fields are read from ``response`` through ``self.get_property`` or a
    ``LocationHotelSelectors`` helper; ``kind`` is derived from the extracted
    genre. ``self.count`` is incremented and a one-line trace (count,
    prefecture, area, name) is printed.

    Returns:
        The populated ``LocationRestaurantItem``.
    """
    page = Selector(response)
    helpers = LocationHotelSelectors
    restaurant = LocationRestaurantItem()

    # Use the canonical URL found on the page, or the fetched URL when the
    # lookup yields nothing truthy.
    canonical_url = self.get_property(page, helpers.CANONICAL_URL)
    restaurant['page_url'] = canonical_url or response.url
    restaurant['name'] = self.get_property(page, helpers.NAME)
    restaurant['name_kata'] = self.get_property(page, helpers.NAME_KATA)
    restaurant['address'] = self.get_property(page, helpers.ADDRESS, clean=True)
    restaurant['phone'] = helpers.get_phone(page)

    restaurant['prefecture'], restaurant['area'] = helpers.get_prefecture_area(page)

    # The same genre value feeds both the ``genre`` and ``kind`` fields.
    restaurant_genre = helpers.get_restaurant_genre(page)
    restaurant['genre'] = restaurant_genre
    restaurant['kind'] = helpers.convert_latte_kind(restaurant_genre)

    restaurant['page_body'] = helpers.get_body(page, is_restaurant=True)

    self.count += 1
    trace = u'%s: %s > %s -> %s' % (
      self.count, restaurant['prefecture'], restaurant['area'], restaurant['name'])
    print(trace)

    return restaurant
# Example #4
# 0
def _split_area_and_genres(genre_links):
  """Separate the area label from the genre labels of one restaurant.

  Each entry is searched for ``genrecd=<digits>`` markers and for the text
  captured by ``>(.*)</`` (entries are presumably HTML anchor snippets --
  confirm against the crawler that fills ``page_body``).  The first entry
  carrying exactly ``genrecd=01`` supplies the area; every other entry with
  non-empty captured text is collected as a genre.

  Returns:
    ``(area, genres)``; ``area`` is ``None`` when no ``genrecd=01`` entry
    with capturable text exists.
  """
  import re
  area = None
  genres = []
  for link in genre_links:
    labels = re.findall(r'>(.*)</', link)
    if u'genrecd=01' in re.findall(r'genrecd=\d+', link):
      # Only the first area entry counts.  An entry without capturable text
      # is skipped rather than raising IndexError on ``labels[0]``.
      if area is None and labels:
        area = labels[0]
    elif labels and labels[0]:
      genres.append(labels[0])
  return area, genres


def update_location():
  """Recompute area, genre and kind for every stored restaurant document.

  Pages through ``LocationRestaurantEs`` 100 documents at a time, sorted by
  ``name.untouched``.  For each document the ``area``, ``area_ascii``,
  ``genre`` and ``kind`` fields are re-derived from ``page_body`` and the
  changes are sent back as one bulk update per page.  The area comes from the
  ``genrecd=01`` genre entry, falling back to the last breadcrumb when there
  is more than one breadcrumb, and to an empty string otherwise.  Progress is
  printed while the function runs.
  """
  from ghost_spider.elastic import LocationRestaurantEs
  from ghost_spider.helper import LocationHotelSelectors
  from ghost_spider import progressbar
  size = 100
  page = 0
  progress = None
  total = 0
  query = {
    "query": {"match_all": {}},
    "size": size,
    "from": 0,
    "sort": [{"name.untouched": "asc"}],
  }
  while True:
    query["from"] = page * size
    results = LocationRestaurantEs.search(query)
    hits = results["hits"]["hits"]
    if not progress:
      # First page: learn the total and create the progress bar.
      total = results["hits"]["total"]
      print("total %s" % total)
      progress = progressbar.AnimatedProgressBar(end=total, width=100)
    if not hits:
      break

    page += 1
    actions = []
    for hit in hits:
      location = hit["_source"]
      page_body = location["page_body"]
      area, genre = _split_area_and_genres(page_body["genre"])
      if area is None and len(page_body["breadcrumbs"]) > 1:
        area = page_body["breadcrumbs"][-1]
      elif area is None:
        area = ''
      location['area'] = area
      # Keep the romanised form in step with the area: an empty area clears
      # it instead of leaving the previous value behind.
      if area:
        location['area_ascii'] = LocationRestaurantEs.analyze(area, 'romaji_ascii_normal_analyzer')
      else:
        location['area_ascii'] = ''
      location['genre'] = genre
      location['kind'] = LocationHotelSelectors.convert_latte_kind(genre)
      actions.append(LocationRestaurantEs.bulk_data(location, action="update", data_id=hit["_id"]))
    # The bar is advanced through its ``+`` operator and the result discarded;
    # presumably ``__add__`` updates the bar in place -- confirm in progressbar.
    progress + size
    progress.show_progress()
    LocationRestaurantEs.send("".join(actions))
  if progress:
    progress + total
    progress.show_progress()
  print(" ")