Example #1
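# Assumed module-level context, not shown in this excerpt:
#   import os
#   from scrapy.selector import Selector
#   from scrapy.http import Request
#   from ghost_spider.elastic import SalonEs   # also needed by parse() below
#   SalonSelectors is a project-specific helper holding the XPath selectors.
# Both methods belong to a Scrapy spider class (Python 2).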
  def dump(self, action=None):
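    """Dump every salon indexed for this spider's prefecture into a CSV file.

    With action='recover', only records flagged as recovered are dumped,
    into a separate salons_recover.csv.
    """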
    from ghost_spider.elastic import SalonEs
    from ghost_spider import progressbar

    filename = self.get_filename_by_name(self.name, u'salons.csv')
    query = {"query": {"bool": {"must": [{"term": {"prefecture_ascii": self.name}}]}}}

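    # In recovery mode, dump only records flagged as recovered, to a separate file.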
    if action == 'recover':
      query["query"]["bool"]["must"].append({"term": {"recovered": "1"}})
      filename = self.get_filename_by_name(self.name, u'salons_recover.csv')

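    # Drop any previous dump so this run starts from an empty file.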
    if os.path.exists(filename):
      os.remove(filename)

    progress = None
    total = 0
    page = 1
    limit = 100
    sort = [{"area.untouched": "asc"}]
    print "=" * 100
    print "dumping data for %s" % self.name
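    # Page through the Elasticsearch results, 100 at a time, until the pager runs dry.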
    while True:
      salons, total = SalonEs.pager(query=query, page=page, size=limit, sort=sort)
      page += 1
      if not salons:
        break
      if not progress:
        print u'Total: %s' % total
        progress = progressbar.AnimatedProgressBar(end=total, width=100)
      progress + limit  # AnimatedProgressBar overloads "+" to advance the bar
      progress.show_progress()
      for salon in salons:
        self.save_to_csv(filename, salon)
    print " "  # finish the progress bar's in-place line

  def parse(self, response):
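    """Extract salon detail links from a listing page, skipping URLs already
    indexed, and follow the next listing page when one exists."""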
    sel = Selector(response)
    links = sel.xpath(SalonSelectors.LIST_SALONS).extract()
    next_page = self.get_property(sel, SalonSelectors.NEXT_URL)
    print u'links: %s, %s' % (len(links), response.url)
    if SalonSelectors.is_first_page(sel):
      total = SalonSelectors.get_list_total(sel)
      if total > 999:
        # Yahoo search cannot paginate beyond 1,000 items,
        # so the crawler must be re-run on smaller areas.
        self.log_message(u'Pagination overflow: %s' % response.url)
    if links:
      for link in links:
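        # Compare against the index by canonical URL (query string stripped).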
        canonical = link.split('?')[0]
        if SalonEs.check_by_url(canonical):
          self.count_skip += 1
          print u'%s: skipped: %s' % (self.count_skip, link)
          continue
        request = Request(link, callback=self.parse_salon, errback=self.parse_err)
        request.meta['page_kind'] = 'salon'
        yield request

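    # Follow pagination by re-entering parse() on the next listing page.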
    if next_page:
      request = Request(next_page, callback=self.parse, errback=self.parse_err)
      request.meta['page_kind'] = 'list'
      yield request