class SeleniumAccess(object):
    """Own the remote Selenium WebDriver session used for scraping.

    The driver is created by :meth:`open_selenium` and released by
    :meth:`close_selenium`. ``driver`` is ``None`` whenever no session has
    been opened or after it has been closed.
    """

    logger = None
    config = None
    driver = None

    def __init__(self, config, level_log):
        """Store the configuration and build the class logger.

        Args:
            config: object with a ``get`` method; ``config.get("urlSelenium")``
                is used as the command executor URL of the remote driver.
            level_log: log level handed to the project ``Logger``.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.config = config

    def open_selenium(self):
        """Open a remote Chrome driver and keep it in ``self.driver``."""
        self.logger.debug("Open Selenium")
        self.driver = webdriver.Remote(
            command_executor=self.config.get("urlSelenium"),
            desired_capabilities=DesiredCapabilities.CHROME)
        self.logger.debug("IS selenium open %r", self.driver is not None)

    def close_selenium(self):
        """End the driver session, if one is open, and forget the driver.

        Calling this method when no driver is open is a no-op.
        """
        if self.driver is None:
            return
        try:
            # quit() ends the whole WebDriver session. close() only closes
            # the current window, which may leave the session allocated on
            # the remote Selenium server.
            self.driver.quit()
        finally:
            # Drop the reference even when quitting fails, so a later
            # open_selenium() starts from a clean state.
            self.driver = None
class FindFlights(object):
    """Scrape flight result pages and store one flight record per URL.

    For every URL description the page is loaded in a Selenium driver, the
    first matching result elements are read by CSS class name, and the
    resulting record is inserted into the ``vuelos`` collection. The
    processed URL is then removed from the ``urls`` collection.
    """

    seleniumaccess = None
    mongodbaccess = None
    logger = None
    holidays = None

    def __init__(self, config, mongo_db_access, level_log):
        """Wire up the logger, database access, Selenium access and holidays.

        Args:
            config: configuration object passed on to ``SeleniumAccess``.
            mongo_db_access: object providing ``insert(collection, element)``
                and ``delete_one(collection, element)``.
            level_log: log level handed to the project ``Logger``,
                ``SeleniumAccess`` and ``Holidays``.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.mongodbaccess = mongo_db_access
        self.seleniumaccess = SeleniumAccess(config, level_log)
        self.holidays = Holidays(level_log)
        self.logger.info("Inicio: %s", datetime.datetime.now())

    def get_flights(self, urls):
        """Process every URL description and return the outcome counters.

        Args:
            urls: iterable of URL descriptions (see :meth:`url_to_flight`).

        Returns:
            dict with the keys ``save``, ``warn`` and ``error``. The per-URL
            results are merged into it by ``accumulate_dic`` (defined
            elsewhere; presumably it adds the counters - verify).
        """
        self.logger.info("Process each url")
        result = {"save": 0, "warn": 0, "error": 0}
        self.seleniumaccess.open_selenium()
        try:
            driver = self.seleniumaccess.driver
            time.sleep(1)
            driver.get("http://www.google.com")
            time.sleep(1)
            for url in urls:
                accumulate_dic(result, self.url_to_flight(url, driver))
        finally:
            # Always release the Selenium session, even when a URL raises an
            # exception that url_to_flight does not handle.
            self.seleniumaccess.close_selenium()
        return result

    def url_to_flight(self, url, driver):
        """Scrape one URL, store the flight and report the outcome.

        Args:
            url: object with a ``get`` method (dict-like) read for the keys
                ``url``, ``type``, ``from``, ``to``, ``dateDirect`` and
                ``dateReturn``. The two dates must offer ``strftime``.
            driver: Selenium driver used to load the page and find elements.

        Returns:
            A dict with the keys ``save``, ``warn`` and ``error`` where
            exactly one of them is 1: ``save`` on success, ``warn`` when an
            element is missing (``NoSuchElementException``) and ``error`` on
            ``StaleElementReferenceException`` or ``TimeoutException``.
        """
        try:
            # Loading the page happens inside the try block so that a
            # timeout while loading is counted as an error as well.
            driver.get(url.get("url", "http://google.es"))
            url_insert = self._scrape_flight(url, driver)
            self.logger.debug("Insert url elemento: %s", url_insert)
            self.mongodbaccess.insert("vuelos", url_insert)
            self.mongodbaccess.delete_one("urls", {"url": url.get("url", "")})
            # Single-argument print(...) behaves the same as the Python 2
            # print statement.
            print("from: {0}, to: {1}, dateDirect: {2}, dateReturn: {3}, price: {4}".format(
                url_insert["from"], url_insert["to"],
                url_insert["dateDirect"].strftime("%Y-%m-%d"),
                url_insert["dateReturn"].strftime("%Y-%m-%d"),
                url_insert["precio"]))
        except StaleElementReferenceException as error_ref:
            self._report_failure(url, error_ref)
            time.sleep(1)
            return {"save": 0, "warn": 0, "error": 1}
        except NoSuchElementException as error_no_such:
            self._report_failure(url, error_no_such)
            time.sleep(1)
            return {"save": 0, "warn": 1, "error": 0}
        except TimeoutException as error_time_out:
            print("-- ERROR -- TimeOut *****************")
            self._report_failure(url, error_time_out)
            return {"save": 0, "warn": 0, "error": 1}
        return {"save": 1, "warn": 0, "error": 0}

    def _scrape_flight(self, url, driver):
        """Read the result elements of the loaded page into a flight record."""
        def text_of(class_name):
            return driver.find_element_by_class_name(class_name).text

        precio_string = text_of("gws-flights-results__price")
        # The flight type label lives in a different element for URLs whose
        # "type" is "o".
        if url.get("type", "") == "o":
            type_flight = text_of("gws-flights-form__menu-label")
        else:
            type_flight = text_of("gws-flights-results__price-annotation")
        return {
            "dBusqueda": datetime.datetime.now(),
            "precio": self._parse_price(precio_string),
            "type": type_flight,
            "horaS": text_of("gws-flights-results__times"),
            "horaLl": "",
            "company": text_of("gws-flights-results__carriers"),
            "duracion": text_of("gws-flights-results__duration"),
            "escalas": text_of("gws-flights-results__itinerary-stops"),
            "from": url.get("from", "XXX"),
            "to": url.get("to", "XXX"),
            "dateDirect": url.get("dateDirect", "XXX"),
            "dateReturn": url.get("dateReturn", "YYY"),
            "holidays": self.holidays.get_number_holidays(
                url.get("dateDirect", "XXX"), url.get("dateReturn", "YYY")),
        }

    @staticmethod
    def _parse_price(price_text):
        """Convert a price text such as ``X1.234,56`` into ``1234.56``.

        The first character is dropped (presumably the currency symbol),
        dots are treated as thousands separators and the comma as the
        decimal separator.

        Raises:
            ValueError: when the remaining text is not a number.
        """
        amount = price_text[1:]
        return float(amount.replace(".", "").replace(",", "."))

    @staticmethod
    def _report_failure(url, error):
        """Print the separator line, the URL description and the error."""
        print("****************************")
        print(url)
        print(error)
class MongoDBAccess(object):
    """Small wrapper around a MongoDB client and one of its databases.

    Every data method first checks the connection through :meth:`status`.
    When the check fails, the method logs an error naming the operation and
    returns ``None`` instead of raising.
    """

    db_access = None
    _client = None

    def __init__(self, config, levelLog):
        """Create the client and select the database.

        Args:
            config: object with a ``get`` method; ``url`` is the connection
                string and ``nameDB`` the database name.
            levelLog: log level handed to the project ``Logger``.

        Connection problems are logged, not raised; in that case the
        instance may be left without a usable client or database.
        """
        self.logger = Logger(self.__class__.__name__, levelLog).get()
        # NOTE(review): this forces the level to INFO whatever levelLog
        # says, so the debug calls below are likely filtered - confirm
        # this is intended.
        self.logger.setLevel('INFO')
        try:
            self.logger.debug(config.get("url", ""))
            self._client = MongoClient(config.get("url", ""))
            self.db_access = self._client[config.get("nameDB")]
            self.logger.info("-- INFO -- DATA BASE CONECT OK")
        except ConfigurationError:
            self.logger.error("ConfigurationErr")
        except ConnectionFailure:
            self.logger.error("ConnectionFailure")
        except OperationFailure:
            self.logger.error("Authentication failure")

    def status(self):
        """Return True when the server answers ``server_info()``.

        Returns False when no client exists or when the call fails with
        ``ConnectionFailure`` or ``OperationFailure`` (both are logged).
        """
        if self._client is None:
            return False
        try:
            self.logger.debug(self._client.server_info())
            return True
        except ConnectionFailure:
            self.logger.error("ConnectionFailure")
            return False
        except OperationFailure:
            self.logger.error("Authentication failure")
            return False

    def _ready(self, operation):
        """Return True when connected; otherwise log the failed operation."""
        if self.status():
            return True
        self.logger.error("Database Not INIT %s", operation)
        return False

    @staticmethod
    def _sort_pairs(sort):
        """Turn a ``{key: direction}`` mapping into a list of pairs.

        ``None`` is passed through. The explicit ``list`` keeps the value a
        real list even where ``items()`` returns a view.
        """
        return None if sort is None else list(sort.items())

    def find_one(self, collection, query, sort=None):
        """Return the first document matching ``query`` or None.

        Args:
            collection: name of the collection.
            query: filter document.
            sort: optional ``{key: direction}`` mapping.
        """
        if not self._ready("Find_one"):
            return None
        self.logger.info("Access to collection: %s, query %s", collection, query)
        return self.db_access[collection].find_one(
            query, sort=self._sort_pairs(sort))

    def find(self, collection, query, sort=None, limite=None):
        """Return a cursor over the documents matching ``query``.

        Args:
            collection: name of the collection.
            query: filter document.
            sort: optional ``{key: direction}`` mapping.
            limite: maximum number of documents; ``None`` is sent as 0.

        Returns:
            The cursor, or None when the database is not available.
        """
        if not self._ready("Find"):
            return None
        self.logger.info(
            "Access to collection Multi: %s, query: %s, sort: %s, limit: %s",
            collection, query, sort, limite)
        limite = 0 if limite is None else limite
        return self.db_access[collection].find(
            query, sort=self._sort_pairs(sort), limit=limite)

    def update_one(self, collection, query, change, is_set="set"):
        """Update one document and return the result of ``update_one``.

        Args:
            collection: name of the collection.
            query: filter document.
            change: document applied with the update operator.
            is_set: operator name without the leading ``$``.
        """
        if not self._ready("Update_one"):
            return None
        self.logger.info(
            "Modify collection: %s, query: %s, modify: %s, set: %s",
            collection, query, change, is_set)
        return self.db_access[collection].update_one(
            query, {"$" + is_set: change})

    def update_many(self, collection, query, change, is_set="set"):
        """Update all matching documents; return the ``update_many`` result.

        Takes the same arguments as :meth:`update_one`.
        """
        if not self._ready("Update_many"):
            return None
        self.logger.info(
            "Modify Many collection: %s, query: %s, modify: %s, set: %s",
            collection, query, change, is_set)
        return self.db_access[collection].update_many(
            query, {"$" + is_set: change})

    def insert(self, collection, element):
        """Insert ``element`` and return what the collection returns.

        Returns None when the database is not available or when the insert
        fails with ``DuplicateKeyError``.
        """
        if not self._ready("Insert"):
            return None
        self.logger.debug("Insert collection: %s, data: %s", collection, element)
        try:
            # NOTE(review): Collection.insert is deprecated in PyMongo 3 and
            # absent from PyMongo 4; insert_one returns a different object,
            # so switching must be coordinated with the callers.
            return self.db_access[collection].insert(element)
        except DuplicateKeyError:
            # Duplicates are deliberately ignored.
            return None

    def delete_one(self, collection, element):
        """Delete one document matching ``element``; return the result."""
        if not self._ready("Delete_one"):
            return None
        self.logger.info("Remove collection: %s, data: %s", collection, element)
        return self.db_access[collection].delete_one(element)

    def delete_many(self, collection, element):
        """Delete every document matching ``element``; return the result."""
        if not self._ready("Delete_many"):
            return None
        self.logger.info("Remove collection: %s, data: %s", collection, element)
        return self.db_access[collection].delete_many(element)

    def aggregate(self, collection, element):
        """Run the aggregation pipeline ``element``; return its result."""
        if not self._ready("Aggregate"):
            return None
        self.logger.info("Aggregate collection: %s, data: %s", collection, element)
        return self.db_access[collection].aggregate(element)

    def drop(self, collection):
        """Drop ``collection`` and return the result of ``drop()``."""
        if not self._ready("Drop"):
            return None
        self.logger.info("Drop collection: %s", collection)
        return self.db_access[collection].drop()