Example #1
0
  def dump(self):
    from ghost_spider.elastic import LocationEs
    from ghost_spider import progressbar
    page = 1
    limit = 100

    parent_place = "United States"
    sort = self.ES_SORT
    directory = 'upload/crawler/hotels/United States'
    if self.region == self.UE_REGION:
      parent_place = "Europe"
      sort = self.ES_SORT_STATE
      directory = 'upload/crawler/hotels/Europe'
    query = {"query": {"bool": {"must": [{"term": {"place.area1.untouched": parent_place}}]}}}

    if os.path.exists(directory):
      shutil.rmtree(directory)

    progress = None
    total = 0
    while True:
      places, total = LocationEs.pager(query=query, page=page, size=limit, sort=sort)
      page += 1
      if not places or not len(places):
        break
      if not progress:
        progress = progressbar.AnimatedProgressBar(end=total, width=100)
      progress + limit
      progress.show_progress()
      for p in places:
        self.save_to_csv(p)
    print " "
    print "Finito!"
    print "*." * 50
    print "count %s" % total
  def parse(self, response):
    """Go through the sitemap and fetch hotels/restaurant/spot pages.

    Walks the site's area hierarchy recursively: intermediate area pages
    yield further Requests back into this method (area_level incremented
    via request.meta), while leaf listing pages yield Requests to
    self.parse_place for each hotel link not already crawled.
    """
    count = 0
    download_list = None
    # area_level tracks recursion depth through the area hierarchy;
    # defaults to 1 for the entry page (no meta set yet).
    current_level = long(response.meta.get('area_level') or 1)
    sel = Selector(response)
    links = sel.xpath(helper.SEL_LIST_PLACES).extract()

    # Get the list of countries that needs to be scrapped
    if current_level == 1:
      download_list = sel.xpath(helper.SEL_ALLOW_PLACES).extract()
      if download_list:
        # NOTE(review): entries are compared against area_name.lower()
        # below — assumes the allow-list is already lowercase; confirm.
        download_list = download_list[0].split(u',')
    if links:
      # Intermediate area page: recurse one level deeper per sub-area.
      for link in links:
        area_name = helper.place_sel_name.findall(link)[0]
        # skip country if is not in the list
        if download_list and area_name.lower() not in download_list:
          continue
        area_link = self.target_base_url + helper.place_sel_link.findall(link)[0]
        count += 1
        request = Request(area_link, callback=self.parse, errback=self.parse_err)
        request.meta['area_name'] = area_name
        request.meta['area_level'] = current_level + 1
        yield request
    else:
      # possible last level
      links = sel.xpath(helper.SEL_LIST_PLACES_LAST).extract()
      if links:
        if not response.meta.get('is_more'):
          # load additional list of places
          # is_more guards against re-queuing pagination links when this
          # method is re-entered from one of those "more" pages.
          links_more = sel.xpath(helper.SEL_LIST_MORE).extract()
          for l in links_more:
            count += 1
            area_name = "More Links"
            area_link = self.target_base_url + helper.place_sel_link.findall(l)[0]
            request = Request(area_link, callback=self.parse, errback=self.parse_err)
            request.meta['area_name'] = area_name
            request.meta['is_more'] = True
            # Same level: pagination pages are siblings, not children.
            request.meta['area_level'] = current_level
            self.log.msg('Loading more pages, %s' % area_link, level=self.log.INFO)
            yield request
        for link in links:
          area_name = helper.place_sel_name_last.findall(link)[0]
          area_link = self.target_base_url + helper.place_sel_link_last.findall(link)[0]
          # don't scrap the page if it was crawled
          # if the link is not hotel don't fetch it!!
          if not helper.FIND_HOTEL_LINK.findall(area_link):
            self.log.msg(u'ignored %s' % area_link, level=self.log.INFO)
            continue
          # Skip pages already stored in Elasticsearch (dedupe by URL).
          if LocationEs.check_by_url(area_link):
            self.log.msg(u'ignored %s' % area_link, level=self.log.INFO)
            continue
          request = Request(area_link, callback=self.parse_place, errback=self.parse_err)
          request.meta['area_name'] = area_name
          request.meta['area_level'] = current_level + 1
          yield request
          count += 1
        self.total_count += count
        print u'found = %s' % self.total_count
    # Progress trace: indent by recursion depth for readability.
    if response.meta.get('area_name'):
      message = u'%s> %s found(%s) | total(%s)' % ('-----' * current_level, response.meta['area_name'], count, self.total_count)
      print message
      self.log.msg(message, level=self.log.INFO)