def close_spider(self, spider):
    '''
    Invoked when the spider finishes. Exports the whole scraped database to
    a single bulk JSON file, if both output paths are configured.
    :param spider: The spider being closed; ignored unless it is a
        TripAdvisorHotelSpider.
    '''
    if not isinstance(spider, TripAdvisorHotelSpider):
        return
    config = GlobalConfig()
    db_path = config.get_path('OUTPUT_SQLITE')
    file_path = config.get_path('OUTPUT_BULK_JSON')
    if db_path is not None and file_path is not None:
        try:
            with TripAdvisorDB(db_path) as db:
                data = db.get_everything()
            with open(file_path, 'w') as fh:
                # Stream straight into the file instead of building the
                # whole JSON string in memory first.
                json.dump(data, fh)
        except Exception:
            # Best-effort export: a missing/corrupt DB or an unwritable
            # output path must not break spider shutdown.
            pass
def start_requests(self):
    '''
    Invoked when the spider starts issuing requests.
    :return: A generator yielding scrapy.Request instances.
    '''
    config = GlobalConfig()
    if config.is_set('SEARCH_BY_TERMS'):
        # Term-based search takes precedence when configured.
        initial_request = TripAdvisorRequests.search_hotels_by_terms(
            terms=config.get_value('SEARCH_BY_TERMS'),
            callback=self.parse_hotel_search_by_terms)
    else:
        initial_request = TripAdvisorRequests.search_hotels_by_place(
            place=config.get_value('SEARCH_BY_LOCATION'),
            callback=self.parse_hotel_search_by_place)
    yield initial_request
def parse_hotel(self, response):
    '''
    Parses a request to a TripAdvisor hotel detail page; extracts its data
    and, depending on configuration, its deals and reviews.
    :param response: Response for the hotel page.
    :return: A single iterator chaining the results of every enabled parser.
    '''
    config = GlobalConfig()
    # Hotel info is always scraped; deals and reviews are opt-in.
    parsers = [self.parse_hotel_info(response)]
    if config.is_true('SCRAP_DEALS'):
        parsers.append(self.parse_hotel_deals(response))
    if config.is_true('SCRAP_REVIEWS'):
        parsers.append(self.parse_hotel_reviews(response))
    return chain.from_iterable(parsers)
def __init__(self, file_path):
    '''
    Creates a logger. When debugging is enabled and a file path is given,
    records are also written to that file (truncated on each run).
    :param file_path: Path of the log file, or None to skip file logging.
    '''
    self.log = logging.getLogger(str(self))
    self.log.setLevel(logging.DEBUG)
    config = GlobalConfig()
    debug_enabled = config.is_true('ENABLE_DEBUG')
    # Propagate to ancestor handlers (stdout) only when both flags allow it.
    self.log.propagate = (debug_enabled
                          and config.is_true('OUTPUT_DEBUG_INFO_TO_STDOUT'))
    if debug_enabled and file_path is not None:
        try:
            # mode='w' truncates the previous log in one step, replacing
            # the old open(file_path, 'wb')-then-append-handler two-step.
            self.log.addHandler(logging.FileHandler(file_path, mode='w'))
        except OSError:
            # File logging is optional; ignore unwritable/invalid paths.
            pass
def get_data():
    '''
    A GET request to this route ("/data") returns information about the
    hotels scraped from TripAdvisor.
    :return: A JSON string with per-hotel geodata, scraping counters and a
        flag telling whether the scraper has finished.
    '''
    try:
        with sqlite.connect(GlobalConfig().get_path('OUTPUT_SQLITE')) as db:
            cursor = db.cursor()
            cursor.execute(
                """
                SELECT id, name, address, latitude, longitude
                FROM hotel_geo AS geo
                INNER JOIN hotel_info as info ON geo.hotel_id = info.id
                """)
            # Build row dicts via zip to avoid shadowing the builtin `id`.
            columns = ('id', 'name', 'address', 'latitude', 'longitude')
            hotel_data = [dict(zip(columns, row)) for row in cursor.fetchall()]
            counts = {}
            for key, table in (('num_hotels', 'hotel_info'),
                               ('num_deals', 'hotel_deal'),
                               ('num_reviews', 'hotel_review'),
                               ('num_geolocalized_hotels', 'hotel_geo')):
                cursor.execute('SELECT COUNT(*) FROM {}'.format(table))
                counts[key], = cursor.fetchone()
    except Exception:
        # Best-effort: if the DB does not exist yet, report empty results.
        hotel_data = []
        counts = {'num_hotels': 0, 'num_deals': 0, 'num_reviews': 0,
                  'num_geolocalized_hotels': 0}
    return json.dumps({
        'hotel-data': hotel_data,
        'meta': {
            'num_hotels': counts['num_hotels'],
            'num_geolocalized_hotels': counts['num_geolocalized_hotels'],
            'num_deals': counts['num_deals'],
            'num_reviews': counts['num_reviews']
        },
        'scraper_finished': not TripAdvisorScraper().is_running()
    })
def open_spider(self, spider):
    '''
    Invoked when the spider opens. Connects to the sqlite database and
    resets it; on any failure the pipeline is disabled (self.db = None).
    :param spider: The spider being opened; ignored unless it is a
        TripAdvisorHotelSpider.
    '''
    if not isinstance(spider, TripAdvisorHotelSpider):
        return
    db_path = GlobalConfig().get_path('OUTPUT_SQLITE')
    try:
        if db_path is None:
            raise ValueError('OUTPUT_SQLITE path is not configured')
        self.db = TripAdvisorDB(db_path)
        self.db.reset()
    except Exception:
        # Disable this pipeline instead of aborting the whole crawl.
        self.db = None
def search_place(cls, address, callback):
    '''
    Builds a request to the Google Maps API to look up a location or place.
    :param address: A place name or address string.
    :param callback: Callable invoked with the response.
    :return: A Request for the geocoding endpoint.
    '''
    query_string = urlencode({
        'address': address,
        'key': GlobalConfig().get_value('GOOGLE_MAPS_API_KEY'),
    })
    full_url = '{}?{}'.format(cls.get_root_url(), query_string)
    return Request(url=full_url, callback=callback)
def __init__(self):
    '''
    Maps each scraped item type to the JSON output file path configured
    for it.
    '''
    config = GlobalConfig()
    # item type -> name of the config key holding its output path
    output_keys = (
        ('TripAdvisorHotelReview', 'OUTPUT_REVIEWS_JSON'),
        ('TripAdvisorHotelInfo', 'OUTPUT_HOTEL_INFO_JSON'),
        ('TripAdvisorHotelDeals', 'OUTPUT_DEALS_JSON'),
        ('TripAdvisorHotelGeolocation', 'OUTPUT_GEO_JSON'),
    )
    self.files = {item_type: config.get_path(key)
                  for item_type, key in output_keys}
def get_json():
    '''
    The "/get-json-data" route returns the scraped data in JSON format, as
    a downloadable attachment.
    :return: A response carrying the JSON payload.
    '''
    try:
        with TripAdvisorDB(db_path=GlobalConfig().get_path('OUTPUT_SQLITE')) as db:
            data = db.get_everything()
    except Exception:
        # Best-effort: serve an empty payload if the DB is unavailable.
        data = []
    response = make_response(json.dumps(data))
    response.headers['Content-Disposition'] = 'attachment; filename=tripadvisor.json'
    return response
def get_sqlite():
    '''
    The "/get-sqlite-data" route returns the sqlite database file with the
    data scraped so far, as a downloadable attachment.
    :return: A response with the raw DB bytes, or an error message string.
    '''
    try:
        with open(GlobalConfig().get_path('OUTPUT_SQLITE'), 'rb') as fh:
            data = fh.read()
    except Exception:
        # Covers a missing/unreadable file and a None path when
        # OUTPUT_SQLITE is not configured (open(None) raises TypeError).
        return 'Error fetching sqlite file database'
    response = make_response(data)
    response.headers['Content-Disposition'] = 'attachment; filename=tripadvisor.db'
    return response
def __init__(self, db_path):
    '''
    Initializes the instance. The database connection is opened in the
    constructor.
    :param db_path: Path to the sqlite database file.
    '''
    Logger.__init__(self, GlobalConfig().get_path('OUTPUT_SQLITE_LOG'))
    self.log.debug('Connecting to TripAdvisor sqlite database...')
    self.db = sqlite3.connect(db_path)
    # Route each scraped item type to its backing table; `table=table`
    # binds the value at definition time (avoids late-binding closures).
    table_for_item = {
        'TripAdvisorHotelReview': 'hotel_review',
        'TripAdvisorHotelInfo': 'hotel_info',
        'TripAdvisorHotelDeals': 'hotel_deal',
        'TripAdvisorHotelGeolocation': 'hotel_geo'
    }
    self.item_handlers = {
        item_type: (lambda item, table=table: self.insert_item(item, table))
        for item_type, table in table_for_item.items()
    }
def parse_hotel_info(self, response):
    '''
    Parses the information of a hotel on TripAdvisor.
    :param response: Response for the hotel's detail page.
    :return: Generator yielding the hotel info item and, if geolocation
        scraping is enabled, a Google Maps request for the hotel's address.
    '''
    loader = ItemLoader(item=TripAdvisorHotelInfo(), response=response)
    loader.add_css('name', '#HEADING::text')
    loader.add_css('phone_number', 'div.phone span:not(.ui_icon)::text')
    # The `re` patterns strip surrounding spaces from each extracted value
    # (and the trailing comma from the locality segment).
    loader.add_css(
        'amenities',
        'div.amenitiesColumn div.detailsMid div.highlightedAmenity::text',
        re='^[ ]*(.+)[ ]*$')
    loader.add_css('address', 'div.address span.street-address::text', re='^[ ]*(.+)[ ]*$')
    loader.add_css('address', 'div.address span.locality::text', re='^[ ]*(.+),[ ]*$')
    loader.add_css('address', 'div.address span.country-name::text', re='^[ ]*(.+)[ ]*$')
    # The SHA-256 digest of the page URL serves as a stable hotel id.
    hasher = sha256()
    hasher.update(response.url.encode())
    loader.add_value('id', hasher.hexdigest())
    item = loader.load_item()
    self.log.debug('Succesfully info extracted from "{}" hotel'.format(
        loader.item['name']))
    yield item
    # The geolocation request carries the hotel id in its meta so the geo
    # item can be linked back to this hotel by the callback.
    geo_request = GMapRequests.search_place(
        address=item.get('address'),
        callback=self.parse_hotel_geolocation)
    geo_request.meta['hotel_id'] = item.get('id')
    if GlobalConfig().is_true('SCRAP_GEO'):
        yield geo_request
def __init__(self, **kwargs):
    '''
    Initializes this instance.
    :param terms: Optional; search terms used to find hotels on
        TripAdvisor.
    :param locations: Optional; a location used to find hotels on
        TripAdvisor, e.g. "Olite, Navarra" or "Spain".
    If terms is not None, hotels found through a term search are scraped;
    otherwise hotels found through a location search are scraped.
    '''
    Spider.__init__(self)
    global_config = GlobalConfig()
    self.log = Logger(global_config.get_path('OUTPUT_SCRAP_LOG'))
    # Spider keyword arguments override the global configuration, which is
    # then validated before scraping starts.
    global_config.override(Config(kwargs))
    global_config.check()
def init(self):
    '''
    Loads the default configuration file (scraper.conf.py, located next to
    this module) into the global configuration.
    '''
    config_file = join(dirname(__file__), 'scraper.conf.py')
    GlobalConfig().override(Config.load_from_file(config_file))
def from_crawler(cls, crawler, *args, **kwargs):
    '''
    Scrapy factory hook: merges the crawler settings into the global
    configuration, then builds the spider instance.
    :return: A new spider created with the remaining arguments.
    '''
    settings_config = Config(crawler.settings)
    GlobalConfig().override(settings_config)
    return cls(*args, **kwargs)