Example #1
	def run(self):
		logger = "restServer"
		log = logging.getLogger(logger)
		log.info("REST Server started")
		self.mysql_conn = None
		self.mysql_cursor = None
		self.configDBSession = None
		self.configDBEngine = None
		self.debugLogLevel = False

		if logging.root.level == 10:        # DEBUG
			self.debugLogLevel = True

		restAddress = configuration.get("Server", "restServer_address")
		restPort = configuration.get("Server", "restServer_port")

		if restAddress.strip() != "" and restPort.strip() != "":
			app = Flask("restServer")
			api = Api(app)
			api.add_resource(restRoot, '/')
			api.add_resource(restStatus, '/status')
			api.add_resource(restJdbcConnections, '/jdbc_connections')

			log.info("Starting RESTserver on %s:%s"%(restAddress, restPort))
			serve(
				TransLogger(app, setup_console_handler=False, logger=logging.getLogger("restServerAccess")), 
				host=restAddress, 
				port=restPort, 
				ident="DBImport REST Server", 
				url_scheme='https',
				_quiet=True)
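
The pattern above can be reduced to a standalone sketch: a Flask app with flask_restful resources, wrapped in TransLogger so per-request access logs go to a dedicated logger, and served through waitress (where serve(), ident and _quiet come from). The Ping resource and the host/port values below are illustrative assumptions, not part of the original:

import logging
from flask import Flask
from flask_restful import Api, Resource
from paste.translogger import TransLogger
from waitress import serve

class Ping(Resource):                 # hypothetical resource, stands in for restRoot etc.
    def get(self):
        return {"status": "ok"}

app = Flask("restServer")
api = Api(app)
api.add_resource(Ping, '/ping')

# Wrap the WSGI app so access logs go to a named logger instead of stdout
wrapped = TransLogger(app, setup_console_handler=False,
                      logger=logging.getLogger("restServerAccess"))
serve(wrapped, host="127.0.0.1", port=8080, ident="DBImport REST Server")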
Example #2
	def __init__(self, Hive_DB=None, Hive_Table=None):
		logging.debug("Executing common_operation.__init__()")

		self.Hive_DB = Hive_DB
		self.Hive_Table = Hive_Table
#		self.mysql_conn = None
#		self.mysql_cursor = None
		self.hive_conn = None
		self.hive_cursor = None
		self.debugLogLevel = False

		if logging.root.level == 10:		# DEBUG
			self.debugLogLevel = True

		# Fetch and initialize the Kerberos configuration
		self.kerberosPrincipal = configuration.get("Kerberos", "principal")
		self.webHCatAuth = HTTPKerberosAuth(force_preemptive=True, principal=self.kerberosPrincipal)

		self.common_config = common_config.config()

		# Fetch configuration details about Hive LLAP
#		self.hive_hostname = configuration.get("Hive", "hostname")
#		self.hive_port = configuration.get("Hive", "port")
		self.hive_servers = configuration.get("Hive", "servers")
		self.hive_kerberos_service_name = configuration.get("Hive", "kerberos_service_name")
		self.hive_kerberos_realm = configuration.get("Hive", "kerberos_realm")
		self.hive_print_messages = self.common_config.getConfigValue(key = "hive_print_messages")
		if configuration.get("Hive", "use_ssl").lower() == "true":
			self.hive_use_ssl = True
		else:
			self.hive_use_ssl = False

		self.hive_min_buckets = int(configuration.get("Hive", "min_buckets"))
		self.hive_max_buckets = int(configuration.get("Hive", "max_buckets"))

		# HDFS Settings
		self.hdfs_address = self.common_config.getConfigValue(key = "hdfs_address")
		self.hdfs_basedir = self.common_config.getConfigValue(key = "hdfs_basedir")
		self.hdfs_blocksize = self.common_config.getConfigValue(key = "hdfs_blocksize")

		self.hiveConnectStr = configuration.get("Hive", "hive_metastore_alchemy_conn")

		try:
			self.hiveMetaDB = sa.create_engine(self.hiveConnectStr, echo = self.debugLogLevel)
			self.hiveMetaDB.connect()
			self.hiveMetaSession = sessionmaker(bind=self.hiveMetaDB)
		except sa.exc.OperationalError as err:
			logging.error("%s" % err)
			self.common_config.remove_temporary_files()
			sys.exit(1)
		except Exception:
			print("Unexpected error: ")
			print(sys.exc_info())
			self.common_config.remove_temporary_files()
			sys.exit(1)

		logging.debug("Executing common_operations.__init__() - Finished")
Example #3
    def __init__(self):
        logging.debug("Executing rest.__init__()")

        self.headers = {'Content-type': 'application/json'}

        self.RESTendpoint = configuration.get("REST_statistics",
                                              "rest_endpoint")
        self.RESTtimeout = configuration.get("REST_statistics", "timeout")

        if self.RESTendpoint == "":
            logging.error(
                "Cant find the REST endpoint. Please check configuration file")
            sys.exit(1)

        if self.RESTtimeout == "":
            logging.error(
                "Cant find the REST endpoint timeout. Please check configuration file"
            )
            sys.exit(1)
Example #4
    def __init__(self, mysql_conn, Hive_DB, Hive_Table):
        logging.debug("Executing stage.__init__()")

        self.Hive_DB = Hive_DB
        self.Hive_Table = Hive_Table
        self.mysql_conn = mysql_conn
        self.mysql_cursor = self.mysql_conn.cursor(buffered=False)
        self.currentStage = None
        self.memoryStage = False
        self.stageTimeStart = None
        self.stageTimeStop = None
        self.stageDurationStart = float()
        self.stageDurationStop = float()
        self.stageDurationTime = float()

        if configuration.get("REST_statistics",
                             "post_import_data").lower() == "true":
            self.post_import_data = True
        else:
            self.post_import_data = False

        self.rest = rest.restInterface()
Example #5
    def __init__(self):
        logging.debug("Executing database.__init__()")

        self.mysql_conn = None
        self.mysql_cursor = None
        self.debugLogLevel = False

        if logging.root.level == 10:  # DEBUG
            self.debugLogLevel = True

        try:
            DBImport_Home = os.environ['DBIMPORT_HOME']
        except KeyError:
            print(
                "Error: System Environment Variable DBIMPORT_HOME is not set")
            #			self.remove_temporary_files()
            sys.exit(1)

        # Fetch configuration about MySQL database and how to connect to it
        self.configHostname = configuration.get("Database", "mysql_hostname")
        self.configPort = configuration.get("Database", "mysql_port")
        self.configDatabase = configuration.get("Database", "mysql_database")
        self.configUsername = configuration.get("Database", "mysql_username")
        self.configPassword = configuration.get("Database", "mysql_password")

        # Establish a SQLAlchemy connection to the DBImport database
        #		try:
        self.connectStr = "mysql+pymysql://%s:%s@%s:%s/%s" % (
            self.configUsername, self.configPassword, self.configHostname,
            self.configPort, self.configDatabase)

        try:
            self.configDB = sa.create_engine(self.connectStr,
                                             echo=self.debugLogLevel)
            self.configDB.connect()
            self.configDBSession = sessionmaker(bind=self.configDB)

        except sa.exc.OperationalError as err:
            logging.error("%s" % err)
            sys.exit(1)
        except Exception:
            print("Unexpected error: ")
            print(sys.exc_info())
            sys.exit(1)

        # Setup configuration for Alembic
        self.alembicSchemaDir = DBImport_Home + '/bin/SchemaUpgrade'
        self.alembicConfig = Config()
        self.alembicConfig.set_main_option('script_location',
                                           self.alembicSchemaDir)
        self.alembicConfig.set_main_option('sqlalchemy.url', self.connectStr)

        # Establish a connection to the DBImport database in MySQL
        try:
            self.mysql_conn = mysql.connector.connect(
                host=self.configHostname,
                port=self.configPort,
                database=self.configDatabase,
                user=self.configUsername,
                password=self.configPassword)
        except mysql.connector.errors.ProgrammingError as err:
            logging.error("%s" % err)
            #			self.remove_temporary_files()
            sys.exit(1)
        except mysql.connector.Error as err:
            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                logging.error(
                    "Something is wrong with your user name or password")
            elif err.errno == errorcode.ER_BAD_DB_ERROR:
                logging.error("Database does not exist")
            else:
                logging.error("%s" % err)
            logging.error(
                "Error: There was a problem connecting to the MySQL database. Please check the configuration and server status and try again"
            )
            #			self.remove_temporary_files()
            sys.exit(1)
        else:
            self.mysql_cursor = self.mysql_conn.cursor(buffered=False)

        logging.debug("Executing database.__init__() - Finished")
Example #6
    def __init__(self):
        logging.debug("Executing rest.__init__()")

        self.mysql_conn = None
        self.mysql_cursor_01 = None
        self.mysql_cursor_02 = None

        self.RESTendpoint = configuration.get("REST_statistics",
                                              "rest_endpoint")
        if self.RESTendpoint == "":
            logging.error(
                "Cant find the REST endpoint. Please check configuration file")
            sys.exit(1)

        # Fetch configuration about MySQL database and how to connect to it
        mysql_hostname = configuration.get("Database", "mysql_hostname")
        mysql_port = configuration.get("Database", "mysql_port")
        mysql_database = configuration.get("Database", "mysql_database")
        mysql_username = configuration.get("Database", "mysql_username")
        mysql_password = configuration.get("Database", "mysql_password")

        # Establish a connection to the DBImport database in MySQL
        try:
            self.mysql_conn = mysql.connector.connect(host=mysql_hostname,
                                                      port=mysql_port,
                                                      database=mysql_database,
                                                      user=mysql_username,
                                                      password=mysql_password)
        except mysql.connector.Error as err:
            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                logging.error(
                    "Something is wrong with your user name or password")
            elif err.errno == errorcode.ER_BAD_DB_ERROR:
                logging.error("Database does not exist")
            else:
                logging.error("%s" % err)
            logging.error(
                "Error: There was a problem connecting to the MySQL database. Please check the configuration and server status and try again"
            )
            self.remove_temporary_files()
            sys.exit(1)
        else:
            self.mysql_cursor_01 = self.mysql_conn.cursor(buffered=False)
            self.mysql_cursor_02 = self.mysql_conn.cursor(buffered=False)

        rest = restInterface()

        query = "select id, jsondata from json_to_rest"
        self.mysql_cursor_01.execute(query)
        logging.debug("SQL Statement executed: %s" %
                      (self.mysql_cursor_01.statement))

        successCounter = 0
        errorCounter = 0

        for row in self.mysql_cursor_01.fetchall():
            jsonID = row[0]
            jsonData = row[1]
            response_code = -1

            response_code = rest.sendData(jsonData)

            if response_code == 200:
                query = "delete from json_to_rest where id = %s"
                self.mysql_cursor_02.execute(query, (jsonID, ))
                logging.debug("SQL Statement executed: %s" %
                              (self.mysql_cursor_02.statement))
                self.mysql_conn.commit()
                successCounter += 1
            else:
                errorCounter += 1

        logging.info("Transmitted %s JSON documents to %s" %
                     (successCounter, self.RESTendpoint))
        if errorCounter > 0:
            logging.error("%s errors encountered" % (errorCounter))

        self.mysql_conn.close()
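
This example is an outbox pattern: rows queued in json_to_rest are posted to the REST endpoint and deleted only after a 200 response, so nothing is lost while the endpoint is down. A minimal sketch of the same loop; requests.post stands in for the rest.sendData() call (whose implementation is not shown), and the credentials and endpoint are assumptions:

import mysql.connector
import requests  # assumed stand-in for rest.sendData(), not shown above

conn = mysql.connector.connect(host="localhost", database="dbimport",
                               user="dbimport", password="secret")  # assumed credentials
readCursor = conn.cursor(buffered=False)
writeCursor = conn.cursor(buffered=False)

readCursor.execute("select id, jsondata from json_to_rest")
for jsonID, jsonData in readCursor.fetchall():
    response = requests.post("https://stats.example.com/endpoint",  # assumed endpoint
                             data=jsonData,
                             headers={"Content-type": "application/json"},
                             timeout=5)
    if response.status_code == 200:
        # Only delete the row once the receiver has acknowledged it
        writeCursor.execute("delete from json_to_rest where id = %s", (jsonID,))
        conn.commit()
conn.close()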
Example #7
    def run(self):
        log = logging.getLogger(self.loggerName)
        log.info("distCP %s started" % (self.name))

        yarnQueue = configuration.get("Server", "distCP_yarnqueue")

        while not self.threadStopEvent.is_set():
            distCPrequest = self.distCPreqQueue.get()
            if distCPrequest is None:
                time.sleep(1)
                break

            tableID = distCPrequest.get('tableID')
            hiveDB = distCPrequest.get('hiveDB')
            hiveTable = distCPrequest.get('hiveTable')
            destination = distCPrequest.get('destination')
            failures = distCPrequest.get('failures')
            HDFSsourcePath = distCPrequest.get('HDFSsourcePath')
            HDFStargetPath = distCPrequest.get('HDFStargetPath')

            log.info(
                "Thread %s: Starting a new distCP copy with the following parameters"
                % (self.name))
            log.info(
                "Thread %s: --------------------------------------------------------"
                % (self.name))
            log.info("Thread %s: tableID = %s" % (self.name, tableID))
            log.info("Thread %s: hiveDB = %s" % (self.name, hiveDB))
            log.info("Thread %s: hiveTable = %s" % (self.name, hiveTable))
            log.info("Thread %s: destination = %s" % (self.name, destination))
            log.info("Thread %s: HDFSsourcePath = %s" %
                     (self.name, HDFSsourcePath))
            log.info("Thread %s: HDFStargetPath = %s" %
                     (self.name, HDFStargetPath))
            log.info(
                "Thread %s: --------------------------------------------------------"
                % (self.name))

            distcpCommand = [
                "hadoop", "distcp", "-D",
                "yarn.timeline-service.enabled=false", "-D",
                "mapreduce.job.queuename=%s" % (yarnQueue), "-overwrite",
                "-delete",
                "%s" % (HDFSsourcePath),
                "%s" % (HDFStargetPath)
            ]

            log.info("Thread %s:  ______________________ " % (self.name))
            log.info("Thread %s: |                      |" % (self.name))
            log.info("Thread %s: | Hadoop distCp starts |" % (self.name))
            log.info("Thread %s: |______________________|" % (self.name))
            log.info("Thread %s: " % (self.name))

            # Start distcp
            sh_session = subprocess.Popen(distcpCommand,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.STDOUT)
            distCPoutput = ""

            # Print Stdout and stderr while distcp is running
            while sh_session.poll() is None:
                row = sh_session.stdout.readline().decode('utf-8').rstrip()
                if row != "":
                    log.info("Thread %s: %s" % (self.name, row))
                    distCPoutput += row + "\n"
                    sys.stdout.flush()

            # Print what is left in output after distcp is finished
            for row in sh_session.stdout.readlines():
                row = row.decode('utf-8').rstrip()
                if row != "":
                    log.info("Thread %s: %s" % (self.name, row))
                    distCPoutput += row + "\n"
                    sys.stdout.flush()

            log.info("Thread %s:  _________________________ " % (self.name))
            log.info("Thread %s: |                         |" % (self.name))
            log.info("Thread %s: | Hadoop distCp completed |" % (self.name))
            log.info("Thread %s: |_________________________|" % (self.name))
            log.info("Thread %s: " % (self.name))

            distCPresult = False
            if " ERROR " in distCPoutput:
                log.error("Thread %s: ERROR detected during distCP copy." %
                          (self.name))
                failures = failures + 1
            elif " completed successfully" in distCPoutput:
                distCPresult = True
                failures = 0
            else:
                log.error(
                    "Thread %s: Unknown distCP status. Marking as a failure since a successful completion could not be confirmed"
                    % (self.name))
                failures = failures + 1

            distCPresponse = {}
            distCPresponse["tableID"] = tableID
            distCPresponse["hiveDB"] = hiveDB
            distCPresponse["hiveTable"] = hiveTable
            distCPresponse["destination"] = destination
            distCPresponse["result"] = disCPresult
            distCPresponse["failures"] = failures

            self.distCPresQueue.put(distCPresponse)

        log.info("distCP %s stopped" % (self.name))
Example #8
    def run(self):
        # This is the main event loop where the 'real' daemon work happens
        log = logging.getLogger("server")
        log.debug("Executing daemon.serverDaemon.run()")
        log.info("Server initializing")
        self.mysql_conn = None
        self.mysql_cursor = None
        self.debugLogLevel = False

        if logging.root.level == 10:  # DEBUG
            self.debugLogLevel = True

        self.common_config = common_config.config()

        self.crypto = self.common_config.crypto
        self.crypto.setPrivateKeyFile(
            configuration.get("Credentials", "private_key"))
        self.crypto.setPublicKeyFile(
            configuration.get("Credentials", "public_key"))

        self.remoteDBImportEngines = {}
        self.remoteDBImportSessions = {}
        self.remoteInstanceConfigDB = None

        self.configDBSession = None
        self.configDBEngine = None

        self.distCPreqQueue = Queue()
        self.distCPresQueue = Queue()
        self.threadStopEvent = threading.Event()

        # Start the Atlas Discovery Thread
        self.atlasDiscoveryThread = atlasDiscovery.atlasDiscovery(
            self.threadStopEvent)
        self.atlasDiscoveryThread.daemon = True
        self.atlasDiscoveryThread.start()

        # Start the REST Server Thread
        self.restServerThread = restServer.restServer(self.threadStopEvent)
        self.restServerThread.daemon = True
        self.restServerThread.start()

        # Start the distCP threads
        if configuration.get("Server",
                             "distCP_separate_logs").lower() == "true":
            distCP_separate_logs = True
        else:
            distCP_separate_logs = False

        distCPobjects = []
        distCPthreads = int(configuration.get("Server", "distCP_threads"))
        if distCPthreads == 0:
            log.error(
                "'distCP_threads' in the config file must be larger than 0"
            )
            sys.exit(1)

        log.info("Starting %s distCp threads" % (distCPthreads))

        for threadID in range(0, distCPthreads):
            if distCP_separate_logs == False:
                distCPlogName = "distCP"
            else:
                distCPlogName = "distCP-thread%s" % (str(threadID))

            thread = distCP(name=str(threadID),
                            distCPreqQueue=self.distCPreqQueue,
                            distCPresQueue=self.distCPresQueue,
                            threadStopEvent=self.threadStopEvent,
                            loggerName=distCPlogName)
            thread.daemon = True
            thread.start()
            distCPobjects.append(thread)

        # Fetch configuration about MySQL database and how to connect to it
        self.configHostname = configuration.get("Database", "mysql_hostname")
        self.configPort = configuration.get("Database", "mysql_port")
        self.configDatabase = configuration.get("Database", "mysql_database")
        self.configUsername = configuration.get("Database", "mysql_username")
        self.configPassword = configuration.get("Database", "mysql_password")

        # Set all rows that have copy_status = 1 back to 0. If a row is still 1 at this point,
        # a previous server marked it but didn't finish the copy, so it must be retried here and now
        try:
            updateDict = {}
            updateDict["last_status_update"] = str(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))
            updateDict["copy_status"] = 0

            session = self.getDBImportSession()

            (session.query(configSchema.copyASyncStatus).filter(
                configSchema.copyASyncStatus.copy_status == 1).update(
                    updateDict))
            session.commit()
            session.close()

            log.debug("Init part of daemon.serverDaemon.run() completed")

            log.info("Server startup completed")

        except SQLAlchemyError as e:
            log.error(str(e.__dict__['orig']))
            log.error("Server startup failed")
            self.disconnectDBImportDB()

            # As we require this operation to complete successfully before entering the main loop, we will exit if there is a problem
            self.common_config.remove_temporary_files()
            sys.exit(1)

        except SQLerror:
            log.error("Server startup failed. Cant connect to config database")
            self.disconnectDBImportDB()
            self.common_config.remove_temporary_files()
            sys.exit(1)

        importTables = aliased(configSchema.importTables)
        dbimportInstances = aliased(configSchema.dbimportInstances)
        copyASyncStatus = aliased(configSchema.copyASyncStatus)

        while True:

            # ***********************************
            # Main Loop for server
            # ***********************************

            try:
                session = self.getDBImportSession()

                # status 0 = New data from import
                # status 1 = Data sent to distCP thread
                # status 2 = Data returned from distCP and was a failure
                # status 3 = Data returned from distCP and was a success

                # ------------------------------------------
                # Fetch all rows from copyASyncStatus with status 0, or status 2 older than the retry interval, and send them to the distCP threads
                # ------------------------------------------

                # TODO: make the 1 min interval a configured param
                status2checkTimestamp = (
                    datetime.now() -
                    timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S.%f')

                aSyncRow = pd.DataFrame(
                    session.query(
                        copyASyncStatus.table_id, copyASyncStatus.hive_db,
                        copyASyncStatus.hive_table,
                        copyASyncStatus.destination, copyASyncStatus.failures,
                        copyASyncStatus.hdfs_source_path,
                        copyASyncStatus.hdfs_target_path).select_from(
                            copyASyncStatus).filter(
                                (copyASyncStatus.copy_status == 0)
                                | ((copyASyncStatus.copy_status == 2)
                                   & (copyASyncStatus.last_status_update <=
                                      status2checkTimestamp))).all())

                for index, row in aSyncRow.iterrows():

                    tableID = row['table_id']
                    destination = row['destination']
                    hiveDB = row['hive_db']
                    hiveTable = row['hive_table']
                    failures = row['failures']
                    HDFSsourcePath = row['hdfs_source_path']
                    HDFStargetPath = row['hdfs_target_path']

                    log.info("New sync request for table %s.%s" %
                             (hiveDB, hiveTable))

                    updateDict = {}
                    updateDict["copy_status"] = 1
                    updateDict["last_status_update"] = str(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))

                    (session.query(configSchema.copyASyncStatus).filter(
                        configSchema.copyASyncStatus.table_id == tableID).
                     filter(configSchema.copyASyncStatus.destination ==
                            destination).update(updateDict))
                    session.commit()

                    distCPrequest = {}
                    distCPrequest["tableID"] = tableID
                    distCPrequest["hiveDB"] = hiveDB
                    distCPrequest["hiveTable"] = hiveTable
                    distCPrequest["destination"] = destination
                    distCPrequest["failures"] = failures
                    distCPrequest["HDFSsourcePath"] = HDFSsourcePath
                    distCPrequest["HDFStargetPath"] = HDFStargetPath
                    self.distCPreqQueue.put(distCPrequest)

                    log.debug(
                        "Status changed to 1 for table %s.%s and sent to distCP threads"
                        % (hiveDB, hiveTable))

                session.close()

            except SQLAlchemyError as e:
                log.error(str(e.__dict__['orig']))
                session.rollback()
                self.disconnectDBImportDB()

            except SQLerror:
                self.disconnectDBImportDB()

            # ------------------------------------------
            # Read the response from the distCP threads
            # ------------------------------------------
            try:
                distCPresponse = self.distCPresQueue.get(block=False)
            except Empty:
                pass
            else:
                updateDict = {}
                updateDict["last_status_update"] = str(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))
                updateDict["failures"] = distCPresponse.get("failures")

                distCPresult = distCPresponse.get("result")
                if distCPresult == True:
                    updateDict["copy_status"] = 3
                else:
                    updateDict["copy_status"] = 2

                try:
                    session = self.getDBImportSession()
                    (session.query(configSchema.copyASyncStatus).filter(
                        configSchema.copyASyncStatus.table_id ==
                        distCPresponse.get('tableID')).filter(
                            configSchema.copyASyncStatus.destination ==
                            distCPresponse.get('destination')).update(
                                updateDict))
                    session.commit()
                    session.close()

                except SQLAlchemyError as e:
                    log.error(str(e.__dict__['orig']))
                    session.rollback()
                    self.disconnectDBImportDB()

                except SQLerror:
                    self.disconnectDBImportDB()

            # ------------------------------------------
            # Fetch all rows from copyASyncStatus that have status 3 and update the remote DBImport instance database
            # Also delete the record from the copyASyncStatus table
            # ------------------------------------------

            try:
                session = self.getDBImportSession()
                aSyncRow = pd.DataFrame(
                    session.query(
                        copyASyncStatus.table_id, copyASyncStatus.hive_db,
                        copyASyncStatus.hive_table,
                        copyASyncStatus.destination, copyASyncStatus.failures,
                        copyASyncStatus.hdfs_source_path,
                        copyASyncStatus.hdfs_target_path).select_from(
                            copyASyncStatus).filter(
                                copyASyncStatus.copy_status == 3).all())
                session.close()

            except SQLAlchemyError as e:
                log.error(str(e.__dict__['orig']))
                session.rollback()
                self.disconnectDBImportDB()

            except SQLerror:
                self.disconnectDBImportDB()

            else:
                for index, row in aSyncRow.iterrows():

                    tableID = row['table_id']
                    destination = row['destination']
                    hiveDB = row['hive_db']
                    hiveTable = row['hive_table']
                    failures = row['failures']
                    HDFSsourcePath = row['hdfs_source_path']
                    HDFStargetPath = row['hdfs_target_path']

                    # Get the remote session. If a session is not available, just continue to the next item in the database
                    _remoteSession = self.getDBImportRemoteSession(destination)
                    if _remoteSession is None:
                        continue

                    try:
                        remoteSession = _remoteSession()

                        # Get the table_id from the table at the remote instance
                        remoteImportTableID = (remoteSession.query(
                            importTables.table_id
                        ).select_from(importTables).filter(
                            importTables.hive_db == hiveDB).filter(
                                importTables.hive_table == hiveTable).one())

                        remoteTableID = remoteImportTableID[0]

                        updateDict = {}
                        updateDict["copy_finished"] = str(
                            datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))

                        # Update the values in import_table on the remote instance
                        (remoteSession.query(configSchema.importTables).filter(
                            configSchema.importTables.table_id ==
                            remoteTableID).update(updateDict))
                        remoteSession.commit()

                        remoteSession.close()

                    except SQLAlchemyError as e:
                        log.error(str(e.__dict__['orig']))
                        remoteSession.rollback()
                        self.disconnectRemoteSession(destination)

                    else:
                        # Delete the record from copyASyncStatus
                        try:
                            session = self.getDBImportSession()
                            (session.query(
                                configSchema.copyASyncStatus).filter(
                                    configSchema.copyASyncStatus.table_id ==
                                    tableID).filter(
                                        configSchema.copyASyncStatus.
                                        destination == destination).delete())
                            session.commit()
                            session.close()

                        except SQLAlchemyError as e:
                            log.error(str(e.__dict__['orig']))
                            session.rollback()
                            self.disconnectDBImportDB()

                        except SQLerror:
                            self.disconnectDBImportDB()

                        else:
                            log.info(
                                "Table %s.%s copied successfully to '%s'" %
                                (hiveDB, hiveTable, destination))

            session.close()
            #			log.info("Starting wait")
            time.sleep(1)

        log.info("Server stopped")
        log.debug("Executing daemon.serverDaemon.run() - Finished")
Example #9
    def run(self):
        logger = "atlasDiscovery"
        log = logging.getLogger(logger)
        self.mysql_conn = None
        self.mysql_cursor = None
        self.configDBSession = None
        self.configDBEngine = None
        self.debugLogLevel = False
        atlasEnabled = True
        #		self.atlasOperation = atlas_operations.atlasOperation(logger)

        if logging.root.level == 10:  # DEBUG
            self.debugLogLevel = True

        self.atlasCrawlerProcessQueue = Queue()
        self.atlasCrawlerResultQueue = Queue()
        self.jdbcConnectionMutex = threading.Lock()

        # Fetch configuration about MySQL database and how to connect to it
        self.configHostname = configuration.get("Database", "mysql_hostname")
        self.configPort = configuration.get("Database", "mysql_port")
        self.configDatabase = configuration.get("Database", "mysql_database")
        self.configUsername = configuration.get("Database", "mysql_username")
        self.configPassword = configuration.get("Database", "mysql_password")

        atlasCrawlerObjects = []
        atlasCrawlerThreads = int(configuration.get("Server", "atlas_threads"))
        if atlasCrawlerThreads == 0:
            log.info(
                "Atlas discovery disabled as the number of threads is set to 0"
            )
            atlasEnabled = False
        else:
            log.info("Starting %s Atlas crawler threads" %
                     (atlasCrawlerThreads))

        for threadID in range(0, atlasCrawlerThreads):
            #			if distCP_separate_logs == False:
            atlasCrawlerLogName = "atlasCrawler-thread%s" % (str(threadID))
            #			else:
            #				distCPlogName = "distCP-thread%s"%(str(threadID))

            thread = atlasCrawler(
                name=str(threadID),
                atlasCrawlerProcessQueue=self.atlasCrawlerProcessQueue,
                atlasCrawlerResultQueue=self.atlasCrawlerResultQueue,
                threadStopEvent=self.threadStopEvent,
                loggerName=atlasCrawlerLogName,
                mutex=self.jdbcConnectionMutex)
            thread.daemon = True
            thread.start()
            atlasCrawlerObjects.append(thread)

        self.common_config = common_config.config()
        jdbcConnections = aliased(configSchema.jdbcConnections)
        self.failureLog = {}
        self.connectionsSentToCrawlers = []

        # The interval between the scans. This is in hours
        atlasDiscoveryInterval = self.common_config.getConfigValue(
            key="atlas_discovery_interval")

        #		if atlasEnabled == True:
        #			atlasEnabled = self.atlasOperation.checkAtlasSchema(logger=logger)

        if atlasEnabled == True:
            log.info("atlasDiscovery started")
            log.info("Atlas discovery interval is set to %s hours" %
                     (atlasDiscoveryInterval))

        while not self.threadStopEvent.is_set() and atlasEnabled == True:

            # ****************************************************************
            # Read data from jdbc_connection and put in queue for processing
            # ****************************************************************

            if self.atlasCrawlerProcessQueue.qsize() < atlasCrawlerThreads:
                # Only read the database if there aren't enough items in the queue for the crawlers to process. This saves
                # a large number of SQL requests when the queue is full
                try:
                    # Read a list of connection aliases that we are going to process in this iteration
                    session = self.getDBImportSession()
                    atlasDiscoveryCheckTime = datetime.utcnow() - timedelta(
                        hours=atlasDiscoveryInterval)

                    # TODO: Probably only run this if jdbcConnectionsDf is empty from the previous run
                    jdbcConnectionsDf = pd.DataFrame(
                        session.query(
                            jdbcConnections.dbalias,
                            jdbcConnections.atlas_last_discovery,
                            jdbcConnections.atlas_discovery,
                            jdbcConnections.contact_info,
                            jdbcConnections.description, jdbcConnections.owner,
                            jdbcConnections.atlas_include_filter,
                            jdbcConnections.atlas_exclude_filter).select_from(
                                jdbcConnections).filter(
                                    jdbcConnections.atlas_discovery == 1)
                        #						.filter((jdbcConnections.atlas_last_discovery < atlasDiscoveryCheckTime) | (jdbcConnections.atlas_last_discovery == None))
                        .order_by(jdbcConnections.atlas_last_discovery).all())
                    session.close()

                except SQLAlchemyError as e:
                    log.error(str(e.__dict__['orig']))
                    session.rollback()
                    self.disconnectDBImportDB()

                else:

                    for index, row in jdbcConnectionsDf.iterrows():
                        dbAlias = row['dbalias']

                        # TODO: Move this into the thread
                        if self.common_config.checkTimeWindow(
                                dbAlias, atlasDiscoveryMode=True) == False:
                            continue

                        # Find out if the dbAlias is blacklisted
                        if self.isConnectionBlacklisted(dbAlias) == True:
                            continue

                        if dbAlias in self.connectionsSentToCrawlers:
                            #							log.warning("This connection is already being processed. Skipping....")
                            continue

                        atlasOperationFailed = False
                        printBlackListWarning = True

                        self.common_config.mysql_conn.commit()
                        try:
                            self.common_config.lookupConnectionAlias(dbAlias)
                        except invalidConfiguration as err:
                            if self.common_config.atlasJdbcSourceSupport == True:
                                log.error(
                                    "Connection '%s' has an invalid configuration. Failed with '%s'"
                                    % (dbAlias, err))
                                atlasOperationFailed = True

                        if self.common_config.atlasJdbcSourceSupport == False:
                            # This source type does not support Atlas discovery
                            log.debug(
                                "Connection '%s' does not support Atlas discovery. Skipping..."
                                % (dbAlias))
                            atlasOperationFailed = True
                            printBlackListWarning = False

                        # Start the Jpype JVM as that needs to be running before the crawlers starts to use it
                        if jpype.isJVMStarted() == False:
                            log.debug("Starting jpype JVM")
                            self.common_config.connectToJDBC(
                                allJarFiles=True,
                                exitIfFailure=False,
                                logger=logger,
                                printError=False)
                            self.common_config.disconnectFromJDBC()

                        if atlasOperationFailed == False:
                            self.connectionsSentToCrawlers.append(dbAlias)

                            log.debug("Sending alias '%s' to queue" %
                                      (dbAlias))
                            atlasCrawlerRequest = {}
                            atlasCrawlerRequest["dbAlias"] = row['dbalias']
                            atlasCrawlerRequest["contactInfo"] = row[
                                'contact_info']
                            atlasCrawlerRequest["description"] = row[
                                'description']
                            atlasCrawlerRequest["owner"] = row['owner']
                            atlasCrawlerRequest["atlasIncludeFilter"] = row[
                                'atlas_include_filter']
                            atlasCrawlerRequest["atlasExcludeFilter"] = row[
                                'atlas_exclude_filter']
                            atlasCrawlerRequest[
                                "jdbc_hostname"] = self.common_config.jdbc_hostname
                            atlasCrawlerRequest[
                                "jdbc_port"] = self.common_config.jdbc_port
                            atlasCrawlerRequest[
                                "jdbc_servertype"] = self.common_config.jdbc_servertype
                            atlasCrawlerRequest[
                                "jdbc_database"] = self.common_config.jdbc_database
                            atlasCrawlerRequest[
                                "jdbc_oracle_sid"] = self.common_config.jdbc_oracle_sid
                            atlasCrawlerRequest[
                                "jdbc_oracle_servicename"] = self.common_config.jdbc_oracle_servicename
                            atlasCrawlerRequest[
                                "jdbc_username"] = self.common_config.jdbc_username
                            atlasCrawlerRequest[
                                "jdbc_password"] = self.common_config.jdbc_password
                            atlasCrawlerRequest[
                                "jdbc_driver"] = self.common_config.jdbc_driver
                            atlasCrawlerRequest[
                                "jdbc_url"] = self.common_config.jdbc_url
                            atlasCrawlerRequest[
                                "jdbc_classpath_for_python"] = self.common_config.jdbc_classpath_for_python
                            atlasCrawlerRequest[
                                "jdbc_environment"] = self.common_config.jdbc_environment
                            atlasCrawlerRequest["hdfs_address"] = None
                            atlasCrawlerRequest["cluster_name"] = None

                            self.atlasCrawlerProcessQueue.put(
                                atlasCrawlerRequest)

                        else:
                            self.blacklistConnection(dbAlias,
                                                     printBlackListWarning)

            # ********************************
            # Read response from atlasCrawler
            # ********************************

            try:
                atlasCrawlerResult = self.atlasCrawlerResultQueue.get(block=False)
            except Empty:
                atlasCrawlerResult = None

            if atlasCrawlerResult is not None:
                dbAlias = atlasCrawlerResult.get('dbAlias')
                result = atlasCrawlerResult.get('result')
                blacklist = atlasCrawlerResult.get('blacklist')
                log.debug("atlasCrawlerResultQueue: %s" % (atlasCrawlerResult))

                self.connectionsSentToCrawlers.remove(dbAlias)

                if result == True:
                    updateDict = {}
                    updateDict["atlas_last_discovery"] = str(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))

                    try:
                        session = self.getDBImportSession()
                        (session.query(configSchema.jdbcConnections).filter(
                            configSchema.jdbcConnections.dbalias ==
                            dbAlias).update(updateDict))
                        session.commit()
                        session.close()

                    except SQLAlchemyError as e:
                        log.error(str(e.__dict__['orig']))
                        session.rollback()
                        self.disconnectDBImportDB()

                    else:
                        self.removeBlacklist(dbAlias)
                else:
                    if blacklist == True:
                        log.error(
                            "Connection '%s' failed during crawling of database schema"
                            % (dbAlias))
                        self.blacklistConnection(dbAlias)
                    else:
                        log.warning(
                            "A warning was detected when crawling connection '%s'. It will not be marked as completed and the operation will be retried"
                            % (dbAlias))

            time.sleep(1)

        self.disconnectDBImportDB()
        if atlasEnabled == True:
            log.info("atlasDiscovery stopped")
Example #10
    def run(self):
        logger = "atlasDiscovery"
        log = logging.getLogger(logger)
        #		log.info("atlasDiscovery started")
        self.mysql_conn = None
        self.mysql_cursor = None
        self.configDBSession = None
        self.configDBEngine = None
        self.debugLogLevel = False

        if logging.root.level == 10:  # DEBUG
            self.debugLogLevel = True

        # Fetch configuration about MySQL database and how to connect to it
        self.configHostname = configuration.get("Database", "mysql_hostname")
        self.configPort = configuration.get("Database", "mysql_port")
        self.configDatabase = configuration.get("Database", "mysql_database")
        self.configUsername = configuration.get("Database", "mysql_username")
        self.configPassword = configuration.get("Database", "mysql_password")

        self.common_config = common_config.config()

        jdbcConnections = aliased(configSchema.jdbcConnections)

        failureLog = {}

        # The interval between the scans. This is in hours
        atlasDiscoveryInterval = self.common_config.getConfigValue(
            key="atlas_discovery_interval")

        atlasEnabled = self.common_config.checkAtlasSchema(logger=logger)
        if atlasEnabled == True:
            log.info("atlasDiscovery started")
            log.info("Atlas discovery interval is set to %s hours" %
                     (atlasDiscoveryInterval))

        while not self.threadStopEvent.is_set() and atlasEnabled == True:

            try:
                session = self.getDBImportSession()
                atlasDiscoveryCheckTime = datetime.utcnow() - timedelta(
                    hours=atlasDiscoveryInterval)

                jdbcConnectionsDf = pd.DataFrame(
                    session.query(jdbcConnections.dbalias,
                                  jdbcConnections.timewindow_start,
                                  jdbcConnections.timewindow_stop,
                                  jdbcConnections.atlas_last_discovery,
                                  jdbcConnections.atlas_discovery).
                    select_from(jdbcConnections).filter(
                        jdbcConnections.atlas_discovery == 1).filter(
                            (jdbcConnections.atlas_last_discovery <
                             atlasDiscoveryCheckTime)
                            | (jdbcConnections.atlas_last_discovery == None)).
                    order_by(jdbcConnections.atlas_last_discovery).all())
                session.close()

            except SQLAlchemyError as e:
                log.error(str(e.__dict__['orig']))
                session.rollback()
                self.disconnectDBImportDB()

            else:

                for index, row in jdbcConnectionsDf.iterrows():
                    dbAlias = row['dbalias']

                    currentTime = str(datetime.now().strftime('%H:%M:%S'))
                    timeWindowStart = None
                    timeWindowStop = None
                    dbAliasAllowedAtThisTime = False

                    if row['timewindow_start'] is not None:
                        timeWindowStart = str(row['timewindow_start'])
                    if row['timewindow_stop'] is not None:
                        timeWindowStop = str(row['timewindow_stop'])

                    if timeWindowStart is not None and re.search(
                            '^[0-9]:', timeWindowStart):
                        timeWindowStart = "0" + timeWindowStart
                    if timeWindowStop is not None and re.search(
                            '^[0-9]:', timeWindowStop):
                        timeWindowStop = "0" + timeWindowStop

                    if timeWindowStart is None and timeWindowStop is None:
                        dbAliasAllowedAtThisTime = True

                    elif (timeWindowStart is not None and timeWindowStop is not None
                          and timeWindowStart < currentTime < timeWindowStop):
                        dbAliasAllowedAtThisTime = True

                    # Find out if the dbAlias is blacklisted
                    if failureLog.get(dbAlias) is not None:
                        blackListEnableTime = failureLog[dbAlias][
                            'blackListStart'] + timedelta(
                                hours=failureLog[dbAlias]['blackListTime'])

                        if datetime.now() < blackListEnableTime:
                            # This dbAlias is still blacklisted
                            continue

                    if dbAliasAllowedAtThisTime == False:
                        # Not allowed to access this connection at this time
                        continue

                    self.common_config.mysql_conn.commit()
                    self.common_config.lookupConnectionAlias(dbAlias)

                    if self.common_config.atlasJdbcSourceSupport == False:
                        # This source type does not support Atlas discovery
                        continue

                    # We now have a valid connection in dbAlias that we can do a discovery on
                    log.info("Starting a Atlas discovery on connection '%s'" %
                             (dbAlias))

                    atlasOperationFailed = False
                    if self.common_config.connectToJDBC(allJarFiles=True,
                                                        exitIfFailure=False,
                                                        logger=logger) == True:
                        self.common_config.atlasEnabled = True
                        response = self.common_config.discoverAtlasRdbms(
                            dbAlias=dbAlias, logger=logger)
                        if response == False:
                            # Something went wrong when getting the source system schema
                            atlasOperationFailed = True
                            log.warning(
                                "There was an error/warning when discovering the source schema"
                            )
                        else:
                            log.info(
                                "Finished Atlas discovery on connection '%s'" %
                                (dbAlias))

                        self.common_config.disconnectFromJDBC()

                        if atlasOperationFailed == False:
                            updateDict = {}
                            updateDict["atlas_last_discovery"] = str(
                                datetime.now().strftime(
                                    '%Y-%m-%d %H:%M:%S.%f'))

                            try:
                                session = self.getDBImportSession()
                                (session.query(
                                    configSchema.jdbcConnections).filter(
                                        configSchema.jdbcConnections.dbalias ==
                                        dbAlias).update(updateDict))
                                session.commit()
                                session.close()

                            except SQLAlchemyError as e:
                                log.error(str(e.__dict__['orig']))
                                session.rollback()
                                self.disconnectDBImportDB()

                            else:
                                failureLog.pop(dbAlias, None)
                                break
                    else:
                        atlasOperationFailed = True

                    if atlasOperationFailed == True:

                        # Connection failed. We need to blacklist this connection for some time
                        blackListData = failureLog.get(dbAlias)
                        if blackListData is None:
                            blackListTime = 1
                        else:
                            blackListTime = failureLog[dbAlias][
                                'blackListTime'] * 2

                            # Max blacklist time is 24 hours
                            if blackListTime > 24: blackListTime = 24

                        failureLog[dbAlias] = {
                            'blackListTime': blackListTime,
                            'blackListStart': datetime.now()
                        }

                        log.warning(
                            "Atlas Discovery failed on connection '%s'" %
                            (dbAlias))
                        log.warning(
                            "This connection is now blacklisted for %s hours" %
                            (failureLog[dbAlias]['blackListTime']))

            time.sleep(1)

        self.disconnectDBImportDB()
        if atlasEnabled == True:
            log.info("atlasDiscovery stopped")