def dump(self):
    """Page through the stored places of the selected region and export them.

    The "United States" settings are used unless ``self.region`` equals
    ``self.UE_REGION``, in which case the "Europe" settings apply.  An
    existing output directory for the region is removed first; afterwards
    every place returned by ``LocationEs.pager`` is passed to
    ``self.save_to_csv`` while a progress bar is displayed.  A short summary
    is printed once the pager returns an empty page.
    """
    from ghost_spider.elastic import LocationEs
    from ghost_spider import progressbar

    page_size = 100

    # Region specific settings: area filter, ES sort order, output folder.
    if self.region == self.UE_REGION:
        parent_place = "Europe"
        sort = self.ES_SORT_STATE
        directory = 'upload/crawler/hotels/Europe'
    else:
        parent_place = "United States"
        sort = self.ES_SORT
        directory = 'upload/crawler/hotels/United States'

    area_filter = {"term": {"place.area1.untouched": parent_place}}
    query = {"query": {"bool": {"must": [area_filter]}}}

    # Start from a clean slate (presumably the folder save_to_csv writes
    # into -- confirm against save_to_csv).
    if os.path.exists(directory):
        shutil.rmtree(directory)

    bar = None
    total = 0
    page_number = 0
    while True:
        page_number += 1
        places, total = LocationEs.pager(
            query=query, page=page_number, size=page_size, sort=sort)
        if not places or len(places) == 0:
            break

        # The bar is created lazily because the total is only known after
        # the first pager call.
        if not bar:
            bar = progressbar.AnimatedProgressBar(end=total, width=100)
        # Bare expression on purpose: the result is discarded, so any
        # advance happens inside the bar's ``+`` operator (defined elsewhere).
        bar + page_size
        bar.show_progress()

        for place in places:
            self.save_to_csv(place)

    print(" ")
    print("Finito!")
    print("*." * 50)
    # ``total`` is whatever the last pager call reported.
    print("count %s" % total)
def parse(self, response):
    """Walk one sitemap page and yield requests for the pages it links to.

    The page is first searched for area links (``helper.SEL_LIST_PLACES``);
    each one is followed with this same callback, one level deeper.  On the
    first level the areas can be restricted by the comma separated list
    extracted with ``helper.SEL_ALLOW_PLACES``.

    When the page has no area links it is treated as a possible last level:
    "more" pages are followed with this callback (once, guarded by the
    ``is_more`` meta flag) and every place link that matches
    ``helper.FIND_HOTEL_LINK`` and is not already known to ``LocationEs`` is
    requested with ``self.parse_place``.
    """
    meta = response.meta
    count = 0
    allowed_places = None
    depth = long(meta.get('area_level') or 1)
    selector = Selector(response)
    area_links = selector.xpath(helper.SEL_LIST_PLACES).extract()

    def build_request(url, callback, name, area_level, is_more=False):
        # Create a request carrying the crawl bookkeeping in its meta.
        request = Request(url, callback=callback, errback=self.parse_err)
        request.meta['area_name'] = name
        if is_more:
            request.meta['is_more'] = True
        request.meta['area_level'] = area_level
        return request

    # Get the list of countries that need to be scraped.
    if depth == 1:
        allowed_places = selector.xpath(helper.SEL_ALLOW_PLACES).extract()
        if allowed_places:
            allowed_places = allowed_places[0].split(u',')

    if area_links:
        for area_html in area_links:
            area_name = helper.place_sel_name.findall(area_html)[0]
            # Skip the country when it is not in the allowed list.
            if allowed_places and area_name.lower() not in allowed_places:
                continue
            area_link = self.target_base_url + helper.place_sel_link.findall(area_html)[0]
            count += 1
            yield build_request(area_link, self.parse, area_name, depth + 1)
    else:
        # Possibly the last level: the page lists places instead of areas.
        place_links = selector.xpath(helper.SEL_LIST_PLACES_LAST).extract()
        if place_links:
            if not meta.get('is_more'):
                # Load the additional lists of places.
                for more_html in selector.xpath(helper.SEL_LIST_MORE).extract():
                    count += 1
                    area_link = self.target_base_url + helper.place_sel_link.findall(more_html)[0]
                    more_request = build_request(
                        area_link, self.parse, "More Links", depth, is_more=True)
                    self.log.msg('Loading more pages, %s' % area_link, level=self.log.INFO)
                    yield more_request

            for place_html in place_links:
                area_name = helper.place_sel_name_last.findall(place_html)[0]
                area_link = self.target_base_url + helper.place_sel_link_last.findall(place_html)[0]
                # Ignore links that are not hotel pages, and pages that
                # were already crawled (checked only for hotel links).
                if (not helper.FIND_HOTEL_LINK.findall(area_link)
                        or LocationEs.check_by_url(area_link)):
                    self.log.msg(u'ignored %s' % area_link, level=self.log.INFO)
                    continue
                yield build_request(area_link, self.parse_place, area_name, depth + 1)
                count += 1

            self.total_count += count
            print(u'found = %s' % self.total_count)

    if meta.get('area_name'):
        message = u'%s> %s found(%s) | total(%s)' % (
            '-----' * depth, meta['area_name'], count, self.total_count)
        print(message)
        self.log.msg(message, level=self.log.INFO)