Exemple #1
0
class SeleniumAccess(object):
    """Hold the remote Selenium WebDriver session used for scraping.

    The class-level attributes are ``None`` placeholders; ``__init__`` and
    ``open_selenium`` replace them per instance.
    """

    logger = None
    config = None
    driver = None

    def __init__(self, config, level_log):
        """Store the configuration and create a logger named after the class.

        config: mapping; ``open_selenium`` reads its ``"urlSelenium"`` entry.
        level_log: log level handed to ``Logger``.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.config = config

    def open_selenium(self):
        """Open a remote Chrome driver and keep it in ``self.driver``."""
        self.logger.debug("Open Selenium")
        self.driver = webdriver.Remote(
            command_executor=self.config.get("urlSelenium"),
            desired_capabilities=DesiredCapabilities.CHROME)
        self.logger.debug("IS selenium open %r", self.driver is not None)

    def close_selenium(self):
        """Close the driver if one is open; do nothing otherwise.

        Any exception raised while closing still propagates to the caller,
        but the reference is always dropped so that a later call does not
        try to close the same broken driver again.
        """
        if self.driver is None:
            return
        try:
            self.driver.stop_client()
            self.driver.close()
        finally:
            self.driver = None
Exemple #2
0
 def __init__(self, file_config, level_log):
     """Read the JSON configuration and create the MongoDB access object.

     file_config: path of the JSON configuration file.
     level_log: log level forwarded to ``Logger`` and ``MongoDBAccess``.

     When an ``IOError`` is raised the error is logged and an empty
     configuration is used instead.
     """
     self.level_log = level_log
     self.logger = Logger(self.__class__.__name__, level_log).get()
     try:
         # The context manager closes the file even if reading or parsing
         # fails, instead of leaving the handle to the garbage collector.
         with open(file_config, "r") as config_file:
             self.config = json.loads(config_file.read())
         self.mongodbaccess = MongoDBAccess(self.config, level_log)
     except IOError:
         self.logger.error("File Error: %s", file_config)
         self.config = {}
         self.mongodbaccess = MongoDBAccess({}, level_log)
     self.logger.info("Inicio: %s", datetime.datetime.now())
Exemple #3
0
def test_logger_test():
    """Check that an error logged through Logger ends up in log/test.log."""
    sys.path.insert(0, "../test")
    # Start from a clean log file so the assertion only sees this run.
    try:
        os.remove("log/test.log")
    except OSError:
        # A single parenthesised argument prints the same text on
        # Python 2 and Python 3.
        print("file don't exist")

    logger = Logger("test", "DEBUG").get()
    logger.error("Error")
    # Close the handle deterministically instead of leaking it.
    with open("log/test.log", "r") as log_file:
        data = log_file.read()

    assert " ERROR:log_namespace.test Error" in data
Exemple #4
0
    def __init__(self, config, levelLog):
        """Create the logger and try to open the MongoDB connection.

        config: mapping read through ``config.get`` for ``"url"`` (empty
            string when missing) and ``"nameDB"``.
        levelLog: log level handed to ``Logger``.

        Connection problems are logged and not re-raised.
        """
        self.logger = Logger(self.__class__.__name__, levelLog).get()
        # NOTE(review): this sets the level to INFO regardless of levelLog;
        # confirm that overriding the requested level is intended.
        self.logger.setLevel('INFO')

        try:
            mongo_url = config.get("url", "")
            self.logger.debug(mongo_url)
            client = MongoClient(mongo_url)
            self._client = client
            database_name = config.get("nameDB")
            self.db_access = client[database_name]
            self.logger.info("-- INFO -- DATA BASE CONECT OK")
        except ConfigurationError:
            self.logger.error("ConfigurationErr")
        except ConnectionFailure:
            self.logger.error("ConnectionFailure")
        except OperationFailure:
            self.logger.error("Authentication failure")
Exemple #5
0
    def __init__(self, level_log):
        """Load the bank holidays for the rest of 2018.

        level_log: log level handed to ``Logger``.

        Each holiday is stored as a ``[year, month, day]`` list in
        ``self.bank_holidays``.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()

        # ``bank_holidays`` is not assigned anywhere in this method, so it
        # is presumably a class-level list shared by every instance (verify
        # against the class body). Appending to it in place would add the
        # same dates again on each construction, so work on a per-instance
        # copy and only add the dates that are not there yet.
        holidays = list(getattr(self, "bank_holidays", None) or [])
        for holiday in ([2018, 5, 7],
                        [2018, 6, 4],
                        [2018, 8, 6],
                        [2018, 10, 29],
                        [2018, 12, 25],
                        [2018, 12, 26],
                        [2018, 12, 27]):
            if holiday not in holidays:
                holidays.append(holiday)
        self.bank_holidays = holidays
Exemple #6
0
class CleanFlights(object):
    """Apply clean-up rules to every document of the ``vuelos`` collection."""

    mongodbaccess = None
    logger = None

    def __init__(self, mongo_db_access, level_log):
        """Keep the database access object and create the class logger.

        mongo_db_access: object offering ``find``, ``insert`` and
            ``delete_one`` as used by the methods below.
        level_log: log level handed to ``Logger``.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.mongodbaccess = mongo_db_access
        self.logger.info("Inicio: %s", datetime.datetime.now())

    def clean(self):
        """Run every rule on every flight.

        Returns a dict of counters; ``"total"`` is the number of flights
        visited. When ``find`` yields ``None`` the counters are returned
        untouched instead of failing on iteration.
        """
        self.logger.info("++INFO-- CLEAN FASE I")
        result = {"total": 0}
        vuelos = self.mongodbaccess.find("vuelos", {})
        if vuelos is None:
            return result
        for vuelo in vuelos:
            result = self.analize_each_flight(result, vuelo)
        return result

    def analize_each_flight(self, result, vuelo):
        """Apply every rule to one flight and fold the counters into result.

        result: dict of running counters; ``"total"`` is incremented here.
        vuelo: the flight document handed to each rule.
        Returns the same ``result`` object.
        """
        # A plain loop calls each rule separately. ``apply(func, rules)``
        # unpacked the whole rule list into a single call, which only
        # worked while there was exactly one rule, and ``apply`` does not
        # exist on Python 3.
        for rule in self.create_all_rules():
            accumulate_dic(result, rule(vuelo))
        result["total"] += 1
        return result

    def create_all_rules(self):
        """Return the list of rule callables to run on each flight."""
        return [self.rule_older_than_15days]

    def rule_older_than_15days(self, elemento):
        """Move a flight whose ``dateDirect`` is older than 15 days.

        The flight is copied to ``vuelosOld`` and removed from ``vuelos``.
        It is removed even when the copy was not inserted; that case is
        logged as an error.

        Returns ``{"deleted": 0|1, "inserted_old": 0|1}``.
        """
        outcome = {"deleted": 0, "inserted_old": 0}
        date15 = datetime.datetime.now() - datetime.timedelta(days=15)
        date_direct = elemento.get("dateDirect")
        # Leave flights without a usable datetime untouched: comparing a
        # placeholder value against a datetime has no meaningful answer.
        if not isinstance(date_direct, datetime.datetime):
            return outcome
        if not date_direct < date15:
            return outcome

        if self.mongodbaccess.insert("vuelosOld", elemento) is None:
            # Only report the problem when the backup really was not
            # inserted; the original logged it on the success path.
            self.logger.error("Error vuelo not insert backup but delete %s", elemento)
        else:
            outcome["inserted_old"] = 1
        self.mongodbaccess.delete_one("vuelos", {"_id": elemento.get("_id")})
        outcome["deleted"] = 1
        return outcome
Exemple #7
0
 def __init__(self, mongo_db_access, level_log):
     """Create the class logger, keep the database access and log the start.

     mongo_db_access: database access object stored as ``mongodbaccess``.
     level_log: log level handed to ``Logger``.
     """
     class_logger = Logger(self.__class__.__name__, level_log).get()
     self.logger = class_logger
     self.mongodbaccess = mongo_db_access
     started_at = datetime.datetime.now()
     class_logger.info("Inicio: %s", started_at)
Exemple #8
0
 def __init__(self, config, level_log):
     """Create a logger named after the class and keep the configuration.

     config: configuration object stored as ``self.config``.
     level_log: log level handed to ``Logger``.
     """
     logger_name = self.__class__.__name__
     logger_source = Logger(logger_name, level_log)
     self.logger = logger_source.get()
     self.config = config
Exemple #9
0
 def __init__(self, config, mongo_db_access, level_log):
     """Build the collaborators of the instance and log the start time.

     config: configuration handed to ``SeleniumAccess``.
     mongo_db_access: database access object stored as ``mongodbaccess``.
     level_log: log level shared by the logger and the helper objects.
     """
     own_name = self.__class__.__name__
     self.logger = Logger(own_name, level_log).get()
     self.mongodbaccess = mongo_db_access
     selenium = SeleniumAccess(config, level_log)
     self.seleniumaccess = selenium
     calendar = Holidays(level_log)
     self.holidays = calendar
     self.logger.info("Inicio: %s", datetime.datetime.now())
Exemple #10
0
class FindFlights(object):
    """find Flight"""

    seleniumaccess = None
    mongodbaccess = None
    logger = None
    holidays = None

    def __init__(self, config, mongo_db_access, level_log):
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.mongodbaccess = mongo_db_access
        self.seleniumaccess = SeleniumAccess(config, level_log)
        self.holidays = Holidays(level_log)
        self.logger.info("Inicio: %s", datetime.datetime.now())

    def get_flights(self, urls):
        """ doc to explain """
        self.logger.info("Process each url")
        result = {"save": 0, "warn": 0, "error": 0}

        self.seleniumaccess.open_selenium()
        driver = self.seleniumaccess.driver
        time.sleep(1)
        driver.get("http://www.google.com")
        time.sleep(1)

        for url in urls:
            accumulate_dic(result, self.url_to_flight(url, driver))

        self.seleniumaccess.close_selenium()
        return result

    def url_to_flight(self, url, driver):
        """process each url"""
        driver.get(url.get("url", "http://google.es"))
        try:
            precio_string = driver.find_element_by_class_name(
                "gws-flights-results__price").text
            #navigate
            #driver.find_element_by_class_name("gws-flights-results__more").click()
            #driver.find_element_by_xpath("//*[contains(text(), 'SELECT FLIGHT')]").click()
            if url.get("type", "") == "o":
                type_flight = driver\
                  .find_element_by_class_name("gws-flights-form__menu-label").text
            else:
                type_flight = driver\
                  .find_element_by_class_name("gws-flights-results__price-annotation").text

            url_insert = \
              {"dBusqueda":datetime.datetime.now(),  \
               "precio":float(precio_string[1:].replace(".", "").replace(", ", ".")), \
               "type": type_flight,\
               "horaS":driver.find_element_by_class_name("gws-flights-results__times").text,\
               "horaLl":"",\
               "company":driver.find_element_by_class_name("gws-flights-results__carriers").text,\
               "duracion":driver.find_element_by_class_name("gws-flights-results__duration").text, \
               "escalas":driver \
                .find_element_by_class_name("gws-flights-results__itinerary-stops").text, \
               "from":url.get("from", "XXX"), \
               "to":url.get("to", "XXX"), \
               "dateDirect":url.get("dateDirect", "XXX"), \
               "dateReturn":url.get("dateReturn", "YYY"), \
               "holidays": \
                 self.holidays.get_number_holidays(url.get("dateDirect", "XXX"), \
                                                   url.get("dateReturn", "YYY"))}
            self.logger.debug("Insert url elemento: %s", url_insert)
            self.mongodbaccess.insert("vuelos", url_insert)
            self.mongodbaccess.delete_one("urls", {"url": url.get("url", "")})
            print "from: {0}, to: {1}, dateDirect: {2}, dateReturn: {3}, price: {4}".format(\
                   url_insert["from"], url_insert["to"], \
                   url_insert["dateDirect"].strftime("%Y-%m-%d"), \
                   url_insert["dateReturn"].strftime("%Y-%m-%d"), \
                   url_insert["precio"])
        except StaleElementReferenceException as error_ref:
            print "****************************"
            print url
            print error_ref
            time.sleep(1)
            return {"save": 0, "warn": 0, "error": 1}
        except NoSuchElementException as error_no_such:
            print "****************************"
            print url
            print error_no_such
            time.sleep(1)
            return {"save": 0, "warn": 1, "error": 0}
        except TimeoutException as error_time_out:
            print "-- ERROR -- TimeOut *****************"
            print "****************************"
            print url
            print error_time_out
            return {"save": 0, "warn": 0, "error": 1}
        return {"save": 1, "warn": 0, "error": 0}
Exemple #11
0
class MongoDBAccess(object):
    """Wrapper around a MongoDB client that checks the connection first.

    Every operation calls ``status()`` before touching the database; when
    the connection is not usable the operation logs an error and returns
    ``None`` instead of raising.
    """

    db_access = None
    _client = None

    def __init__(self, config, levelLog):
        """Create the logger and try to open the MongoDB connection.

        config: mapping read through ``config.get`` for ``"url"`` (empty
            string when missing) and ``"nameDB"``.
        levelLog: log level handed to ``Logger``.

        Connection problems are logged and not re-raised.
        """
        self.logger = Logger(self.__class__.__name__, levelLog).get()
        # NOTE(review): this sets the level to INFO regardless of levelLog;
        # confirm that overriding the requested level is intended.
        self.logger.setLevel('INFO')

        try:
            self.logger.debug(config.get("url", ""))
            self._client = MongoClient(config.get("url", ""))
            self.db_access = self._client[config.get("nameDB")]
            self.logger.info("-- INFO -- DATA BASE CONECT OK")
        except ConfigurationError:
            self.logger.error("ConfigurationErr")
        except ConnectionFailure:
            self.logger.error("ConnectionFailure")
        except OperationFailure:
            self.logger.error("Authentication failure")

    def status(self):
        """Return True when the server answers, False otherwise."""
        if self._client is None:
            return False
        try:
            self.logger.debug(self._client.server_info())
            return True
        except ConnectionFailure:
            self.logger.error("ConnectionFailure")
            return False
        except OperationFailure:
            self.logger.error("Authentication failure")
            return False

    @staticmethod
    def _sort_spec(sort):
        """Turn a ``{field: direction}`` mapping into a list of pairs.

        ``None`` stays ``None``. A value without ``items`` (for example a
        list of pairs) is passed through unchanged. The explicit ``list``
        matters on Python 3, where ``dict.items()`` is a view and not a
        list.
        """
        if sort is None:
            return None
        if hasattr(sort, "items"):
            return list(sort.items())
        return sort

    def _collection(self, collection, operation):
        """Return the collection handle, or None when the database is down.

        operation: name used in the error message so that the log tells
            which call failed.
        """
        if self.status():
            return self.db_access[collection]
        self.logger.error("Database Not INIT %s", operation)
        return None

    def find_one(self, collection, query, sort=None):
        """Return the first document matching query, or None.

        sort: optional ``{field: direction}`` mapping.
        """
        target = self._collection(collection, "Find_one")
        if target is None:
            return None
        self.logger.info("Access to collection: %s, query %s", collection,
                         query)
        return target.find_one(query, sort=self._sort_spec(sort))

    def find(self, collection, query, sort=None, limite=None):
        """Return a cursor over the documents matching query, or None.

        sort: optional ``{field: direction}`` mapping.
        limite: maximum number of documents; ``None`` is passed as 0.
        """
        target = self._collection(collection, "Find")
        if target is None:
            return None
        self.logger.info("Access to collection Multi: %s, query: %s, sort: %s, limit: %s",
                         collection, query, sort, limite)
        return target.find(query,
                           sort=self._sort_spec(sort),
                           limit=0 if limite is None else limite)

    def update_one(self, collection, query, change, is_set="set"):
        """Apply ``{"$<is_set>": change}`` to one document matching query.

        Returns whatever the collection's ``update_one`` returns, or None
        when the database is not available.
        """
        target = self._collection(collection, "Update_one")
        if target is None:
            return None
        self.logger.info("Modify collection: %s, query: %s, modify: %s, set: %s",
                         collection, query, change, is_set)
        return target.update_one(query, {"$" + is_set: change})

    def update_many(self, collection, query, change, is_set="set"):
        """Apply ``{"$<is_set>": change}`` to every document matching query.

        Returns whatever the collection's ``update_many`` returns, or None
        when the database is not available.
        """
        target = self._collection(collection, "Update_many")
        if target is None:
            return None
        self.logger.info("Modify Many collection: %s, query: %s, modify: %s, set: %s",
                         collection, query, change, is_set)
        return target.update_many(query, {"$" + is_set: change})

    def insert(self, collection, element):
        """Insert element and return the result of the collection's insert.

        Returns None when the database is not available or when the insert
        raises ``DuplicateKeyError``.
        """
        target = self._collection(collection, "Insert")
        if target is None:
            return None
        self.logger.debug("Insert collection: %s, data: %s", collection,
                          element)
        # NOTE(review): ``Collection.insert`` is deprecated in pymongo 3 and
        # removed in pymongo 4; kept so the return value callers test
        # against does not change.
        try:
            return target.insert(element)
        except DuplicateKeyError:
            return None

    def delete_one(self, collection, element):
        """Delete one document matching element; None when unavailable."""
        target = self._collection(collection, "Delete_one")
        if target is None:
            return None
        self.logger.info("Remove collection: %s, data: %s", collection,
                         element)
        return target.delete_one(element)

    def delete_many(self, collection, element):
        """Delete every document matching element; None when unavailable."""
        target = self._collection(collection, "Delete_many")
        if target is None:
            return None
        self.logger.info("Remove collection: %s, data: %s", collection,
                         element)
        return target.delete_many(element)

    def aggregate(self, collection, element):
        """Run the aggregation given in element; None when unavailable."""
        target = self._collection(collection, "Aggregate")
        if target is None:
            return None
        self.logger.info("Aggregate collection: %s, data: %s", collection,
                         element)
        return target.aggregate(element)

    def drop(self, collection):
        """Drop the collection; None when the database is unavailable."""
        target = self._collection(collection, "Drop")
        if target is None:
            return None
        self.logger.info("Drop collection: %s", collection)
        return target.drop()
Exemple #12
0
class Vuelos(object):
    """Main process: decide which urls to scrape today and scrape them."""

    level_log = None
    config = None
    mongodbaccess = None
    logger = None

    def __init__(self, file_config, level_log):
        """Read the JSON configuration and create the MongoDB access object.

        file_config: path of the JSON configuration file.
        level_log: log level forwarded to the logger and the helpers.

        When an ``IOError`` is raised the error is logged and an empty
        configuration is used instead.
        """
        self.level_log = level_log
        self.logger = Logger(self.__class__.__name__, level_log).get()
        try:
            # The context manager closes the file even when parsing fails.
            with open(file_config, "r") as config_file:
                self.config = json.loads(config_file.read())
            self.mongodbaccess = MongoDBAccess(self.config, level_log)
        except IOError:
            self.logger.error("File Error: %s", file_config)
            self.config = {}
            self.mongodbaccess = MongoDBAccess({}, level_log)
        self.logger.info("Inicio: %s", datetime.datetime.now())

    def _rebuild_urls(self):
        """Regenerate the url queue and print how many urls were created."""
        urls = BuildUrls(self.mongodbaccess, self.level_log).build_urls()
        print("-- INFO -- construir urls -- numero de URLS: {0}".format(urls))
        return urls

    def _pending_urls(self):
        """Return the number of queued urls (0 when the query yields None)."""
        urls = self.return_urls()
        # NOTE(review): ``count()`` on the cursor is kept from the original;
        # it is deprecated in recent pymongo releases.
        return 0 if urls is None else urls.count()

    def ejecutar(self, nivel):
        """Run the load process.

        nivel: ``"1"`` empties today's data and rebuilds the url queue;
            any other value only rebuilds the queue when the last stored
            flight is older than ``today()``.

        Afterwards the queued urls are scraped and the totals printed.
        """
        # Parenthesised single-argument prints behave the same on
        # Python 2 and Python 3.
        print("++ INFO ++ MODULO PRINCIPAL MODO DE EJECUCION: {0}".format(nivel))
        if nivel == "1":
            print("-- INFO -- MODO 1 duro ejecuta y limpia los datos del dia")
            # Hard mode: drop today's data and start over.
            print("++ INFO ++ Vaciamos informacion del dia")
            print("-- INFO -- dia: {0}".format(today()))
            borrados = self.vaciar_dia()
            borrados_count = 0 if borrados is None else borrados.deleted_count
            print("-- INFO -- vaciamos informacion -- Vuelos borrados del dia: {0}"
                  .format(borrados_count))
            self._rebuild_urls()
        else:
            print("-- INFO -- MODO 0 suave solo si hay datos que ejecutar")
            # Soft mode: only work when something is pending or the day
            # has not been processed yet.
            pendientes = self._pending_urls()
            dia_sin_procesar = self.find_last_day() < today()
            if pendientes == 0:
                if dia_sin_procesar:
                    # Nothing queued and nothing stored today: first run.
                    print("++ WARN ++  1.1 PRIMERA VEZ DEL DIA creamos las URLS y seguimos")
                    self._rebuild_urls()
                else:
                    # Nothing queued and today already stored: all done.
                    print("++ WARN ++  1.2 SE HA PROCESADO TODO Y NO HAY NADA QUE HACER")
            elif dia_sin_procesar:
                # Leftovers from a previous day: restart from scratch.
                print("** ERROR **  2.1 AYER NO SE EJECUTARON TODOS LOS VUELOS")
                print("** ERROR **  vuelos pendientes {0}".format(pendientes))
                self.logger.error("AYER no se ejecutaron todos los vuelos")
                self._rebuild_urls()
            else:
                # Today's run was interrupted: continue with what is left.
                print("++ WARN ++  2.2 HA HABIDO UNA CANCELACION y el "
                      "SISTEMA SIGUE DESDE ESE PUNTO")
                print("++ WARN ++  vuelos pendientes {0}".format(pendientes))
                self.logger.error(
                    "Ha habido una cancelacion y se sigue desde ese punto")
        finder = FindFlights(self.config, self.mongodbaccess, self.level_log)
        result = finder.get_flights(self.return_urls())
        print("++ INFO -- TOTAL PROCESO, Save: {0}".format(
            result.get("save", 0)))
        print("++ INFO -- TOTAL PROCESO, errores sin Informacion: {0}".format(
            result.get("warn", 0)))
        print("++ INFO -- TOTAL PROCESO, errores NO ENCONTRADO: {0}".format(
            result.get("error", 0)))

    def vaciar_dia(self):
        """Delete the flights whose ``dBusqueda`` is later than ``today()``."""
        return self.mongodbaccess.delete_many("vuelos",
                                              {"dBusqueda": {"$gt": today()}})

    def return_urls(self):
        """Return every document of the ``urls`` collection."""
        return self.mongodbaccess.find("urls", {})

    def find_last_day(self):
        """Return the most recent ``dBusqueda`` stored in ``vuelos``.

        Falls back to 2000-01-01 when there is no flight or the newest
        document has no ``dBusqueda``, so the result can always be
        compared with a datetime.
        """
        print("++ INFO ++ find_last_day")
        fallback = datetime.datetime(2000, 1, 1)
        # One query is enough; the original ran the same query twice.
        ultimo = self.mongodbaccess.find_one("vuelos", {},
                                             sort={"dBusqueda": -1})
        if ultimo is None:
            return fallback
        return ultimo.get("dBusqueda", fallback)
Exemple #13
0
 def __init__(self, mongodbaccess, level_log):
     """Prepare the logger, the holiday helper and the database access.

     mongodbaccess: database access object stored on the instance.
     level_log: log level handed to ``Logger`` and ``Holidays``.
     """
     self.mongodbaccess = mongodbaccess
     class_logger = Logger(self.__class__.__name__, level_log).get()
     holiday_helper = Holidays(level_log)
     self.logger = class_logger
     self.holidays = holiday_helper
Exemple #14
0
class BuildUrls(object):
    """Generate the urls to scrape from the active ``busquedas`` documents."""

    logger = None
    holidays = None
    mongodbaccess = None

    def __init__(self, mongodbaccess, level_log):
        """Prepare the logger, the holiday helper and the database access.

        mongodbaccess: object offering ``find``, ``find_one``, ``insert``,
            ``update_one`` and ``delete_many`` as used below.
        level_log: log level handed to ``Logger`` and ``Holidays``.
        """
        self.logger = Logger(self.__class__.__name__, level_log).get()
        self.holidays = Holidays(level_log)
        self.mongodbaccess = mongodbaccess

    def build_urls(self):
        """Empty the ``urls`` collection and rebuild it from the searches.

        Returns the number of urls saved. The database wrapper may answer
        ``None``; that is treated as "nothing deleted" / "no searches"
        instead of raising.
        """
        deleted = self.mongodbaccess.delete_many("urls", {})
        deleted_count = 0 if deleted is None else deleted.deleted_count
        # ``warning`` replaces the deprecated ``warn`` alias.
        self.logger.warning("-- INFO -- URLS deleted: %d", deleted_count)
        searches = self.find_elements_search()
        if searches is None:
            return 0
        return sum(self.build_urls_one_search(search) for search in searches)

    def find_elements_search(self):
        """Return the ``busquedas`` documents whose ``activa`` flag is True."""
        return self.mongodbaccess.find("busquedas", {"activa": True})

    def build_urls_one_search(self, search):
        """Create the urls for every outbound date of one search.

        The outbound date runs day by day from ``fromDateInit`` (default:
        tomorrow) to ``fromDateEnd`` (default: now). A search that
        produces no url at all is deactivated.

        Returns the number of urls saved for this search.
        """
        self.logger.warning("new element %s", search)
        one_day = datetime.timedelta(days=1)
        date_direct = search.get("fromDateInit",
                                 datetime.datetime.now() + one_day)
        # The end of the range does not change while iterating.
        date_end = search.get("fromDateEnd", datetime.datetime.now())
        one_way = search.get("type", "o") == "o"

        sum_per_search = 0
        while date_direct <= date_end:
            if one_way:
                sum_per_search += self.review_save_url_onetrip(search,
                                                               date_direct)
            else:
                sum_per_search += self.process_date_return(search,
                                                           date_direct)
            date_direct = date_direct + one_day

        if sum_per_search == 0:
            self.logger.warning(
                "-- INFO -- desactivate search, no generate urls: %s", search)
            self.mongodbaccess.update_one("busquedas", {"_id": search["_id"]},
                                          {"activa": False})
        return sum_per_search

    def process_date_return(self, search, date_direct):
        """Try every return date for a fixed outbound date.

        The return date runs day by day from ``toDateInit`` (default:
        tomorrow) to ``toDateEnd`` (default: now).

        Returns the number of urls saved.
        """
        one_day = datetime.timedelta(days=1)
        date_return = search.get("toDateInit",
                                 datetime.datetime.now() + one_day)
        date_end = search.get("toDateEnd", datetime.datetime.now())
        suma = 0
        while date_return <= date_end:
            suma += self.review_save_url_return(search, date_direct,
                                                date_return)
            date_return = date_return + one_day
        return suma

    def review_save_url_onetrip(self, search, date_direct):
        """Save the url of a one-way trip when the date qualifies.

        The date must be in the future and contain at least
        ``minHolidays`` holidays. Returns the result of ``save_url`` or 0.
        """
        if date_direct <= datetime.datetime.now():
            return 0
        holidays = self.holidays.get_number_holidays(date_direct, date_direct)
        if holidays < search.get("minHolidays", 0):
            return 0
        return self.save_url(create_url(search, date_direct, date_direct))

    def review_save_url_return(self, search, date_direct, date_return):
        """Save the url of a return trip when the dates qualify.

        The outbound date must be in the future and not after the return
        date, the trip length (both days included) must lie between
        ``minDays`` and ``maxDays``, and the trip must contain at least
        ``minHolidays`` holidays. Returns the result of ``save_url`` or 0.
        """
        if date_direct > date_return:
            return 0
        if date_direct <= datetime.datetime.now():
            return 0
        dif = (date_return - date_direct) + datetime.timedelta(days=1)
        if dif > datetime.timedelta(days=search.get("maxDays", 0)):
            return 0
        if dif < datetime.timedelta(days=search.get("minDays", 0)):
            return 0
        holidays = self.holidays.get_number_holidays(date_direct, date_return)
        if holidays < search.get("minHolidays", 0):
            return 0
        return self.save_url(create_url(search, date_direct, date_return))

    def save_url(self, urls):
        """Insert the url document unless the same url is already queued.

        Returns 1 when the document was handed to ``insert``, 0 otherwise.
        """
        already_queued = self.mongodbaccess.find_one(
            "urls", {"url": urls.get("url", "ERROR")})
        if already_queued is not None:
            return 0
        self.mongodbaccess.insert("urls", urls)
        return 1