def run(self): logger = "restServer" log = logging.getLogger(logger) log.info("REST Server started") self.mysql_conn = None self.mysql_cursor = None self.configDBSession = None self.configDBEngine = None self.debugLogLevel = False if logging.root.level == 10: # DEBUG self.debugLogLevel = True restAddress = configuration.get("Server", "restServer_address") restPort = configuration.get("Server", "restServer_port") if restAddress.strip() != "" and restPort.strip() != "": app = Flask("restServer") api = Api(app) api.add_resource(restRoot, '/') api.add_resource(restStatus, '/status') api.add_resource(restJdbcConnections, '/jdbc_connections') log.info("Starting RESTserver on %s:%s"%(restAddress, restPort)) serve( TransLogger(app, setup_console_handler=False, logger=logging.getLogger("restServerAccess")), host=restAddress, port=restPort, ident="DBImport REST Server", url_scheme='https', _quiet=True)
def __init__(self, Hive_DB=None, Hive_Table=None):
    logging.debug("Executing common_operations.__init__()")

    self.Hive_DB = Hive_DB
    self.Hive_Table = Hive_Table
    # self.mysql_conn = None
    # self.mysql_cursor = None
    self.hive_conn = None
    self.hive_cursor = None
    self.debugLogLevel = False

    if logging.root.level == 10:        # DEBUG
        self.debugLogLevel = True

    # Fetch and initialize the Kerberos configuration
    self.kerberosPrincipal = configuration.get("Kerberos", "principal")
    self.webHCatAuth = HTTPKerberosAuth(force_preemptive=True, principal=self.kerberosPrincipal)

    self.common_config = common_config.config()

    # Fetch configuration details about Hive LLAP
    # self.hive_hostname = configuration.get("Hive", "hostname")
    # self.hive_port = configuration.get("Hive", "port")
    self.hive_servers = configuration.get("Hive", "servers")
    self.hive_kerberos_service_name = configuration.get("Hive", "kerberos_service_name")
    self.hive_kerberos_realm = configuration.get("Hive", "kerberos_realm")
    self.hive_print_messages = self.common_config.getConfigValue(key="hive_print_messages")

    if configuration.get("Hive", "use_ssl").lower() == "true":
        self.hive_use_ssl = True
    else:
        self.hive_use_ssl = False

    self.hive_min_buckets = int(configuration.get("Hive", "min_buckets"))
    self.hive_max_buckets = int(configuration.get("Hive", "max_buckets"))

    # HDFS settings
    self.hdfs_address = self.common_config.getConfigValue(key="hdfs_address")
    self.hdfs_basedir = self.common_config.getConfigValue(key="hdfs_basedir")
    self.hdfs_blocksize = self.common_config.getConfigValue(key="hdfs_blocksize")

    self.hiveConnectStr = configuration.get("Hive", "hive_metastore_alchemy_conn")

    try:
        self.hiveMetaDB = sa.create_engine(self.hiveConnectStr, echo=self.debugLogLevel)
        self.hiveMetaDB.connect()
        self.hiveMetaSession = sessionmaker(bind=self.hiveMetaDB)
    except sa.exc.OperationalError as err:
        logging.error("%s" % err)
        self.common_config.remove_temporary_files()
        sys.exit(1)
    except:
        print("Unexpected error: ")
        print(sys.exc_info())
        self.common_config.remove_temporary_files()
        sys.exit(1)

    logging.debug("Executing common_operations.__init__() - Finished")
def __init__(self): logging.debug("Executing rest.__init__()") self.headers = {'Content-type': 'application/json'} self.RESTendpoint = configuration.get("REST_statistics", "rest_endpoint") self.RESTtimeout = configuration.get("REST_statistics", "timeout") if self.RESTendpoint == "": logging.error( "Cant find the REST endpoint. Please check configuration file") sys.exit(1) if self.RESTtimeout == "": logging.error( "Cant find the REST endpoint timeout. Please check configuration file" ) sys.exit(1)
def __init__(self, mysql_conn, Hive_DB, Hive_Table):
    logging.debug("Executing stage.__init__()")

    self.Hive_DB = Hive_DB
    self.Hive_Table = Hive_Table
    self.mysql_conn = mysql_conn
    self.mysql_cursor = self.mysql_conn.cursor(buffered=False)
    self.currentStage = None
    self.memoryStage = False
    self.stageTimeStart = None
    self.stageTimeStop = None
    self.stageDurationStart = float()
    self.stageDurationStop = float()
    self.stageDurationTime = float()

    if configuration.get("REST_statistics", "post_import_data").lower() == "true":
        self.post_import_data = True
    else:
        self.post_import_data = False

    self.rest = rest.restInterface()
def __init__(self): logging.debug("Executing database.__init__()") self.mysql_conn = None self.mysql_cursor = None self.debugLogLevel = False if logging.root.level == 10: # DEBUG self.debugLogLevel = True try: DBImport_Home = os.environ['DBIMPORT_HOME'] except KeyError: print( "Error: System Environment Variable DBIMPORT_HOME is not set") # self.remove_temporary_files() sys.exit(1) # Fetch configuration about MySQL database and how to connect to it self.configHostname = configuration.get("Database", "mysql_hostname") self.configPort = configuration.get("Database", "mysql_port") self.configDatabase = configuration.get("Database", "mysql_database") self.configUsername = configuration.get("Database", "mysql_username") self.configPassword = configuration.get("Database", "mysql_password") # Esablish a SQLAlchemy connection to the DBImport database # try: self.connectStr = "mysql+pymysql://%s:%s@%s:%s/%s" % ( self.configUsername, self.configPassword, self.configHostname, self.configPort, self.configDatabase) try: self.configDB = sa.create_engine(self.connectStr, echo=self.debugLogLevel) self.configDB.connect() self.configDBSession = sessionmaker(bind=self.configDB) except sa.exc.OperationalError as err: logging.error("%s" % err) sys.exit(1) except: print("Unexpected error: ") print(sys.exc_info()) sys.exit(1) # Setup configuration for Alembic self.alembicSchemaDir = DBImport_Home + '/bin/SchemaUpgrade' self.alembicConfig = Config() self.alembicConfig.set_main_option('script_location', self.alembicSchemaDir) self.alembicConfig.set_main_option('sqlalchemy.url', self.connectStr) # Esablish a connection to the DBImport database in MySQL try: self.mysql_conn = mysql.connector.connect( host=self.configHostname, port=self.configPort, database=self.configDatabase, user=self.configUsername, password=self.configPassword) except mysql.connector.errors.ProgrammingError as err: logging.error("%s" % err) # self.remove_temporary_files() sys.exit(1) except mysql.connector.Error as err: if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: logging.error( "Something is wrong with your user name or password") elif err.errno == errorcode.ER_BAD_DB_ERROR: logging.error("Database does not exist") else: logging.error("%s" % err) logging.error( "Error: There was a problem connecting to the MySQL database. Please check configuration and serverstatus and try again" ) # self.remove_temporary_files() sys.exit(1) else: self.mysql_cursor = self.mysql_conn.cursor(buffered=False) logging.debug("Executing database.__init__() - Finished")
def __init__(self): logging.debug("Executing rest.__init__()") self.mysql_conn = None self.mysql_cursor_01 = None self.mysql_cursor_02 = None self.RESTendpoint = configuration.get("REST_statistics", "rest_endpoint") if self.RESTendpoint == "": logging.error( "Cant find the REST endpoint. Please check configuration file") sys.exit(1) # Fetch configuration about MySQL database and how to connect to it mysql_hostname = configuration.get("Database", "mysql_hostname") mysql_port = configuration.get("Database", "mysql_port") mysql_database = configuration.get("Database", "mysql_database") mysql_username = configuration.get("Database", "mysql_username") mysql_password = configuration.get("Database", "mysql_password") # Esablish a connection to the DBImport database in MySQL try: self.mysql_conn = mysql.connector.connect(host=mysql_hostname, port=mysql_port, database=mysql_database, user=mysql_username, password=mysql_password) except mysql.connector.Error as err: if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: logging.error( "Something is wrong with your user name or password") elif err.errno == errorcode.ER_BAD_DB_ERROR: logging.error("Database does not exist") else: logging.error("%s" % err) logging.error( "Error: There was a problem connecting to the MySQL database. Please check configuration and serverstatus and try again" ) self.remove_temporary_files() sys.exit(1) else: self.mysql_cursor_01 = self.mysql_conn.cursor(buffered=False) self.mysql_cursor_02 = self.mysql_conn.cursor(buffered=False) rest = restInterface() query = "select id, jsondata from json_to_rest" self.mysql_cursor_01.execute(query) logging.debug("SQL Statement executed: %s" % (self.mysql_cursor_01.statement)) successCounter = 0 errorCounter = 0 for row in self.mysql_cursor_01.fetchall(): jsonID = row[0] jsonData = row[1] response_code = -1 response_code = rest.sendData(jsonData) if response_code == 200: query = "delete from json_to_rest where id = %s" self.mysql_cursor_02.execute(query, (jsonID, )) logging.debug("SQL Statement executed: %s" % (self.mysql_cursor_02.statement)) self.mysql_conn.commit() successCounter += 1 else: errorCounter += 1 logging.info("Transmitted %s JSON documents to %s" % (successCounter, self.RESTendpoint)) if errorCounter > 0: logging.error("%s errors encountered" % (errorCounter)) self.mysql_conn.close()
def run(self):
    log = logging.getLogger(self.loggerName)
    log.info("distCP %s started" % (self.name))

    yarnQueue = configuration.get("Server", "distCP_yarnqueue")

    while not self.threadStopEvent.isSet():
        distCPrequest = self.distCPreqQueue.get()
        if distCPrequest is None:
            time.sleep(1)
            break

        tableID = distCPrequest.get('tableID')
        hiveDB = distCPrequest.get('hiveDB')
        hiveTable = distCPrequest.get('hiveTable')
        destination = distCPrequest.get('destination')
        failures = distCPrequest.get('failures')
        HDFSsourcePath = distCPrequest.get('HDFSsourcePath')
        HDFStargetPath = distCPrequest.get('HDFStargetPath')

        log.info("Thread %s: Starting a new distCP copy with the following parameters" % (self.name))
        log.info("Thread %s: --------------------------------------------------------" % (self.name))
        log.info("Thread %s: tableID = %s" % (self.name, tableID))
        log.info("Thread %s: hiveDB = %s" % (self.name, hiveDB))
        log.info("Thread %s: hiveTable = %s" % (self.name, hiveTable))
        log.info("Thread %s: destination = %s" % (self.name, destination))
        log.info("Thread %s: HDFSsourcePath = %s" % (self.name, HDFSsourcePath))
        log.info("Thread %s: HDFStargetPath = %s" % (self.name, HDFStargetPath))
        log.info("Thread %s: --------------------------------------------------------" % (self.name))

        distcpCommand = ["hadoop", "distcp",
                         "-D", "yarn.timeline-service.enabled=false",
                         "-D", "mapreduce.job.queuename=%s" % (yarnQueue),
                         "-overwrite", "-delete",
                         "%s" % (HDFSsourcePath),
                         "%s" % (HDFStargetPath)]

        log.info("Thread %s:  ______________________ " % (self.name))
        log.info("Thread %s: |                      |" % (self.name))
        log.info("Thread %s: | Hadoop distCp starts |" % (self.name))
        log.info("Thread %s: |______________________|" % (self.name))
        log.info("Thread %s: " % (self.name))

        # Start distcp
        sh_session = subprocess.Popen(distcpCommand, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        distCPoutput = ""

        # Print stdout and stderr while distcp is running
        while sh_session.poll() == None:
            row = sh_session.stdout.readline().decode('utf-8').rstrip()
            if row != "":
                log.info("Thread %s: %s" % (self.name, row))
                distCPoutput += row + "\n"
                sys.stdout.flush()

        # Print what is left in the output after distcp has finished
        for row in sh_session.stdout.readlines():
            row = row.decode('utf-8').rstrip()
            if row != "":
                log.info("Thread %s: %s" % (self.name, row))
                distCPoutput += row + "\n"
                sys.stdout.flush()

        log.info("Thread %s:  _________________________ " % (self.name))
        log.info("Thread %s: |                         |" % (self.name))
        log.info("Thread %s: | Hadoop distCp completed |" % (self.name))
        log.info("Thread %s: |_________________________|" % (self.name))
        log.info("Thread %s: " % (self.name))

        distCPresult = False
        if " ERROR " in distCPoutput:
            log.error("Thread %s: ERROR detected during distCP copy." % (self.name))
            failures = failures + 1
        elif " completed successfully" in distCPoutput:
            distCPresult = True
            failures = 0
        else:
            log.error("Thread %s: Unknown status of distCP. Marked as failure as it can't be verified that the copy finished successfully" % (self.name))
            failures = failures + 1

        distCPresponse = {}
        distCPresponse["tableID"] = tableID
        distCPresponse["hiveDB"] = hiveDB
        distCPresponse["hiveTable"] = hiveTable
        distCPresponse["destination"] = destination
        distCPresponse["result"] = distCPresult
        distCPresponse["failures"] = failures

        self.distCPresQueue.put(distCPresponse)

    log.info("distCP %s stopped" % (self.name))
def run(self):
    # This is the main event loop where the 'real' daemon work happens
    log = logging.getLogger("server")
    log.debug("Executing daemon.serverDaemon.run()")
    log.info("Server initializing")

    self.mysql_conn = None
    self.mysql_cursor = None
    self.debugLogLevel = False

    if logging.root.level == 10:        # DEBUG
        self.debugLogLevel = True

    self.common_config = common_config.config()

    self.crypto = self.common_config.crypto
    self.crypto.setPrivateKeyFile(configuration.get("Credentials", "private_key"))
    self.crypto.setPublicKeyFile(configuration.get("Credentials", "public_key"))

    self.remoteDBImportEngines = {}
    self.remoteDBImportSessions = {}
    self.remoteInstanceConfigDB = None
    self.configDBSession = None
    self.configDBEngine = None
    self.distCPreqQueue = Queue()
    self.distCPresQueue = Queue()
    self.threadStopEvent = threading.Event()

    # Start the Atlas Discovery Thread
    self.atlasDiscoveryThread = atlasDiscovery.atlasDiscovery(self.threadStopEvent)
    self.atlasDiscoveryThread.daemon = True
    self.atlasDiscoveryThread.start()

    # Start the REST Server Thread
    self.restServerThread = restServer.restServer(self.threadStopEvent)
    self.restServerThread.daemon = True
    self.restServerThread.start()

    # Start the distCP threads
    if configuration.get("Server", "distCP_separate_logs").lower() == "true":
        distCP_separate_logs = True
    else:
        distCP_separate_logs = False

    distCPobjects = []
    distCPthreads = int(configuration.get("Server", "distCP_threads"))
    if distCPthreads == 0:
        log.error("'distCP_threads' configuration in the config file must be larger than 0")
        sys.exit(1)

    log.info("Starting %s distCp threads" % (distCPthreads))
    for threadID in range(0, distCPthreads):
        if distCP_separate_logs == False:
            distCPlogName = "distCP"
        else:
            distCPlogName = "distCP-thread%s" % (str(threadID))

        thread = distCP(name=str(threadID),
                        distCPreqQueue=self.distCPreqQueue,
                        distCPresQueue=self.distCPresQueue,
                        threadStopEvent=self.threadStopEvent,
                        loggerName=distCPlogName)
        thread.daemon = True
        thread.start()
        distCPobjects.append(thread)

    # Fetch configuration about the MySQL database and how to connect to it
    self.configHostname = configuration.get("Database", "mysql_hostname")
    self.configPort = configuration.get("Database", "mysql_port")
    self.configDatabase = configuration.get("Database", "mysql_database")
    self.configUsername = configuration.get("Database", "mysql_username")
    self.configPassword = configuration.get("Database", "mysql_password")

    # Set all rows that have copy_status = 1 to 0. This is needed during startup because if they are 1 at this stage,
    # a previous server marked them as 1 but didn't finish the copy. We need to retry that copy here and now
    try:
        updateDict = {}
        updateDict["last_status_update"] = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))
        updateDict["copy_status"] = 0

        session = self.getDBImportSession()
        (session.query(configSchema.copyASyncStatus)
            .filter(configSchema.copyASyncStatus.copy_status == 1)
            .update(updateDict))
        session.commit()
        session.close()

        log.debug("Init part of daemon.serverDaemon.run() completed")
        log.info("Server startup completed")

    except SQLAlchemyError as e:
        log.error(str(e.__dict__['orig']))
        log.error("Server startup failed")
        self.disconnectDBImportDB()
        # As we require this operation to complete successfully before entering the main loop, we exit if there is a problem
        self.common_config.remove_temporary_files()
        sys.exit(1)

    except SQLerror:
        log.error("Server startup failed. Can't connect to the config database")
        self.disconnectDBImportDB()
        self.common_config.remove_temporary_files()
        sys.exit(1)

    importTables = aliased(configSchema.importTables)
    dbimportInstances = aliased(configSchema.dbimportInstances)
    copyASyncStatus = aliased(configSchema.copyASyncStatus)

    while True:
        # ***********************************
        # Main Loop for server
        # ***********************************

        try:
            session = self.getDBImportSession()

            # status 0 = New data from import
            # status 1 = Data sent to distCP thread
            # status 2 = Data returned from distCP and was a failure
            # status 3 = Data returned from distCP and was a success

            # ------------------------------------------
            # Fetch all rows from copyASyncStatus that have status 0 and send them to the distCP threads
            # ------------------------------------------

            # TODO: make the 1 min interval a configured param
            status2checkTimestamp = (datetime.now() - timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S.%f')

            aSyncRow = pd.DataFrame(session.query(
                    copyASyncStatus.table_id,
                    copyASyncStatus.hive_db,
                    copyASyncStatus.hive_table,
                    copyASyncStatus.destination,
                    copyASyncStatus.failures,
                    copyASyncStatus.hdfs_source_path,
                    copyASyncStatus.hdfs_target_path)
                .select_from(copyASyncStatus)
                .filter((copyASyncStatus.copy_status == 0) |
                        ((copyASyncStatus.copy_status == 2) & (copyASyncStatus.last_status_update <= status2checkTimestamp)))
                .all())

            for index, row in aSyncRow.iterrows():
                tableID = row['table_id']
                destination = row['destination']
                hiveDB = row['hive_db']
                hiveTable = row['hive_table']
                failures = row['failures']
                HDFSsourcePath = row['hdfs_source_path']
                HDFStargetPath = row['hdfs_target_path']

                log.info("New sync request for table %s.%s" % (hiveDB, hiveTable))

                updateDict = {}
                updateDict["copy_status"] = 1
                updateDict["last_status_update"] = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))

                (session.query(configSchema.copyASyncStatus)
                    .filter(configSchema.copyASyncStatus.table_id == tableID)
                    .filter(configSchema.copyASyncStatus.destination == destination)
                    .update(updateDict))
                session.commit()

                distCPrequest = {}
                distCPrequest["tableID"] = tableID
                distCPrequest["hiveDB"] = hiveDB
                distCPrequest["hiveTable"] = hiveTable
                distCPrequest["destination"] = destination
                distCPrequest["failures"] = failures
                distCPrequest["HDFSsourcePath"] = HDFSsourcePath
                distCPrequest["HDFStargetPath"] = HDFStargetPath
                self.distCPreqQueue.put(distCPrequest)

                log.debug("Status changed to 1 for table %s.%s and sent to distCP threads" % (hiveDB, hiveTable))

            session.close()

        except SQLAlchemyError as e:
            log.error(str(e.__dict__['orig']))
            session.rollback()
            self.disconnectDBImportDB()

        except SQLerror:
            self.disconnectDBImportDB()

        # ------------------------------------------
        # Read the response from the distCP threads
        # ------------------------------------------

        try:
            distCPresponse = self.distCPresQueue.get(block=False)
        except Empty:
            pass
        else:
            updateDict = {}
            updateDict["last_status_update"] = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))
            updateDict["failures"] = distCPresponse.get("failures")

            distCPresult = distCPresponse.get("result")
            if distCPresult == True:
                updateDict["copy_status"] = 3
            else:
                updateDict["copy_status"] = 2

            try:
                session = self.getDBImportSession()
                (session.query(configSchema.copyASyncStatus)
                    .filter(configSchema.copyASyncStatus.table_id == distCPresponse.get('tableID'))
                    .filter(configSchema.copyASyncStatus.destination == distCPresponse.get('destination'))
                    .update(updateDict))
                session.commit()
                session.close()

            except SQLAlchemyError as e:
                log.error(str(e.__dict__['orig']))
                session.rollback()
                self.disconnectDBImportDB()

            except SQLerror:
                self.disconnectDBImportDB()

        # ------------------------------------------
        # Fetch all rows from copyASyncStatus that have status 3 and update the remote DBImport instance database
        # Also delete the record from the copyASyncStatus table
        # ------------------------------------------

        try:
            session = self.getDBImportSession()
            aSyncRow = pd.DataFrame(session.query(
                    copyASyncStatus.table_id,
                    copyASyncStatus.hive_db,
                    copyASyncStatus.hive_table,
                    copyASyncStatus.destination,
                    copyASyncStatus.failures,
                    copyASyncStatus.hdfs_source_path,
                    copyASyncStatus.hdfs_target_path)
                .select_from(copyASyncStatus)
                .filter(copyASyncStatus.copy_status == 3)
                .all())
            session.close()

        except SQLAlchemyError as e:
            log.error(str(e.__dict__['orig']))
            session.rollback()
            self.disconnectDBImportDB()

        except SQLerror:
            self.disconnectDBImportDB()

        else:
            for index, row in aSyncRow.iterrows():
                tableID = row['table_id']
                destination = row['destination']
                hiveDB = row['hive_db']
                hiveTable = row['hive_table']
                failures = row['failures']
                HDFSsourcePath = row['hdfs_source_path']
                HDFStargetPath = row['hdfs_target_path']

                # Get the remote session. If the session is not available, we just continue to the next item in the database
                _remoteSession = self.getDBImportRemoteSession(destination)
                if _remoteSession == None:
                    continue

                try:
                    remoteSession = _remoteSession()

                    # Get the table_id from the table at the remote instance
                    remoteImportTableID = (remoteSession.query(importTables.table_id)
                        .select_from(importTables)
                        .filter(importTables.hive_db == hiveDB)
                        .filter(importTables.hive_table == hiveTable)
                        .one())
                    remoteTableID = remoteImportTableID[0]

                    updateDict = {}
                    updateDict["copy_finished"] = str(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'))

                    # Update the values in import_table on the remote instance
                    (remoteSession.query(configSchema.importTables)
                        .filter(configSchema.importTables.table_id == remoteTableID)
                        .update(updateDict))
                    remoteSession.commit()
                    remoteSession.close()

                except SQLAlchemyError as e:
                    log.error(str(e.__dict__['orig']))
                    remoteSession.rollback()
                    self.disconnectRemoteSession(destination)

                else:
                    # Delete the record from copyASyncStatus
                    try:
                        session = self.getDBImportSession()
                        (session.query(configSchema.copyASyncStatus)
                            .filter(configSchema.copyASyncStatus.table_id == tableID)
                            .filter(configSchema.copyASyncStatus.destination == destination)
                            .delete())
                        session.commit()
                        session.close()

                    except SQLAlchemyError as e:
                        log.error(str(e.__dict__['orig']))
                        session.rollback()
                        self.disconnectDBImportDB()

                    except SQLerror:
                        self.disconnectDBImportDB()

                    else:
                        log.info("Table %s.%s copied successfully to '%s'" % (hiveDB, hiveTable, destination))

            session.close()

        # log.info("Starting wait")
        time.sleep(1)

    log.info("Server stopped")
    log.debug("Executing daemon.serverDaemon.run() - Finished")
def run(self): logger = "atlasDiscovery" log = logging.getLogger(logger) self.mysql_conn = None self.mysql_cursor = None self.configDBSession = None self.configDBEngine = None self.debugLogLevel = False atlasEnabled = True # self.atlasOperation = atlas_operations.atlasOperation(logger) if logging.root.level == 10: # DEBUG self.debugLogLevel = True self.atlasCrawlerProcessQueue = Queue() self.atlasCrawlerResultQueue = Queue() self.jdbcConnectionMutex = threading.Lock() # Fetch configuration about MySQL database and how to connect to it self.configHostname = configuration.get("Database", "mysql_hostname") self.configPort = configuration.get("Database", "mysql_port") self.configDatabase = configuration.get("Database", "mysql_database") self.configUsername = configuration.get("Database", "mysql_username") self.configPassword = configuration.get("Database", "mysql_password") atlasCrawlerObjects = [] atlasCrawlerThreads = int(configuration.get("Server", "atlas_threads")) if atlasCrawlerThreads == 0: log.info( "Atlas discovery disabled as the number of threads is set to 0" ) atlasEnabled = False else: log.info("Starting %s Atlas crawler threads" % (atlasCrawlerThreads)) for threadID in range(0, atlasCrawlerThreads): # if distCP_separate_logs == False: atlasCrawlerLogName = "atlasCrawler-thread%s" % (str(threadID)) # else: # distCPlogName = "distCP-thread%s"%(str(threadID)) thread = atlasCrawler( name=str(threadID), atlasCrawlerProcessQueue=self.atlasCrawlerProcessQueue, atlasCrawlerResultQueue=self.atlasCrawlerResultQueue, threadStopEvent=self.threadStopEvent, loggerName=atlasCrawlerLogName, mutex=self.jdbcConnectionMutex) thread.daemon = True thread.start() atlasCrawlerObjects.append(thread) self.common_config = common_config.config() jdbcConnections = aliased(configSchema.jdbcConnections) self.failureLog = {} self.connectionsSentToCrawlers = [] # The interval between the scans. This is in hours atlasDiscoveryInterval = self.common_config.getConfigValue( key="atlas_discovery_interval") # if atlasEnabled == True: # atlasEnabled = self.atlasOperation.checkAtlasSchema(logger=logger) if atlasEnabled == True: log.info("atlasDiscovery started") log.info("Atlas discovery interval is set to %s hours" % (atlasDiscoveryInterval)) while not self.threadStopEvent.isSet() and atlasEnabled == True: # **************************************************************** # Read data from jdbc_connection and put in queue for processing # **************************************************************** if self.atlasCrawlerProcessQueue.qsize() < atlasCrawlerThreads: # Only read the database if there isn't enough items in the queue to the crawlers to processes. 
This will save # a large number of sql requests if the queue is full try: # Read a list of connection aliases that we are going to process in this iteration session = self.getDBImportSession() atlasDiscoveryCheckTime = datetime.utcnow() - timedelta( hours=atlasDiscoveryInterval) # TODO: Antagligen bara köra denna om jdbcConnectionsDf är tom från föregående körning jdbcConnectionsDf = pd.DataFrame( session.query( jdbcConnections.dbalias, jdbcConnections.atlas_last_discovery, jdbcConnections.atlas_discovery, jdbcConnections.contact_info, jdbcConnections.description, jdbcConnections.owner, jdbcConnections.atlas_include_filter, jdbcConnections.atlas_exclude_filter).select_from( jdbcConnections).filter( jdbcConnections.atlas_discovery == 1) # .filter((jdbcConnections.atlas_last_discovery < atlasDiscoveryCheckTime) | (jdbcConnections.atlas_last_discovery == None)) .order_by(jdbcConnections.atlas_last_discovery).all()) session.close() except SQLAlchemyError as e: log.error(str(e.__dict__['orig'])) session.rollback() self.disconnectDBImportDB() else: for index, row in jdbcConnectionsDf.iterrows(): dbAlias = row['dbalias'] # TODO: Flytta denna till thread if self.common_config.checkTimeWindow( dbAlias, atlasDiscoveryMode=True) == False: continue # Find out if the dbAlias is blacklisted if self.isConnectionBlacklisted(dbAlias) == True: continue if dbAlias in self.connectionsSentToCrawlers: # log.warning("This connection is already being processed. Skipping....") continue altasOperationFailed = False printBlackListWarning = True self.common_config.mysql_conn.commit() try: self.common_config.lookupConnectionAlias(dbAlias) except invalidConfiguration as err: if self.common_config.atlasJdbcSourceSupport == True: log.error( "Connection '%s' have invalid configuration. Failed with '%s'" % (dbAlias, err)) altasOperationFailed = True if self.common_config.atlasJdbcSourceSupport == False: # This source type does not support Atlas discovery log.debug( "Connection '%s' does not support Atlas discovery. Skipping..." 
% (dbAlias)) altasOperationFailed = True printBlackListWarning = False # Start the Jpype JVM as that needs to be running before the crawlers starts to use it if jpype.isJVMStarted() == False: log.debug("Starting jpype JVM") self.common_config.connectToJDBC( allJarFiles=True, exitIfFailure=False, logger=logger, printError=False) self.common_config.disconnectFromJDBC() # if altasOperationFailed == False and self.common_config.connectToJDBC(allJarFiles=True, exitIfFailure=False, logger=logger) == True: if altasOperationFailed == False: # self.common_config.atlasEnabled = True self.connectionsSentToCrawlers.append(dbAlias) log.debug("Sending alias '%s' to queue" % (dbAlias)) atlasCrawlerRequest = {} atlasCrawlerRequest["dbAlias"] = row['dbalias'] atlasCrawlerRequest["contactInfo"] = row[ 'contact_info'] atlasCrawlerRequest["description"] = row[ 'description'] atlasCrawlerRequest["owner"] = row['owner'] atlasCrawlerRequest["atlasIncludeFilter"] = row[ 'atlas_include_filter'] atlasCrawlerRequest["atlasExcludeFilter"] = row[ 'atlas_exclude_filter'] atlasCrawlerRequest[ "jdbc_hostname"] = self.common_config.jdbc_hostname atlasCrawlerRequest[ "jdbc_port"] = self.common_config.jdbc_port atlasCrawlerRequest[ "jdbc_servertype"] = self.common_config.jdbc_servertype atlasCrawlerRequest[ "jdbc_database"] = self.common_config.jdbc_database atlasCrawlerRequest[ "jdbc_oracle_sid"] = self.common_config.jdbc_oracle_sid atlasCrawlerRequest[ "jdbc_oracle_servicename"] = self.common_config.jdbc_oracle_servicename atlasCrawlerRequest[ "jdbc_username"] = self.common_config.jdbc_username atlasCrawlerRequest[ "jdbc_password"] = self.common_config.jdbc_password atlasCrawlerRequest[ "jdbc_driver"] = self.common_config.jdbc_driver atlasCrawlerRequest[ "jdbc_url"] = self.common_config.jdbc_url atlasCrawlerRequest[ "jdbc_classpath_for_python"] = self.common_config.jdbc_classpath_for_python atlasCrawlerRequest[ "jdbc_environment"] = self.common_config.jdbc_environment atlasCrawlerRequest["hdfs_address"] = None atlasCrawlerRequest["cluster_name"] = None self.atlasCrawlerProcessQueue.put( atlasCrawlerRequest) else: # altasOperationFailed = True # if altasOperationFailed == True: self.blacklistConnection(dbAlias, printBlackListWarning) # ******************************** # Read response from atlasCrawler # ******************************** try: atlasCrawlerResult = self.atlasCrawlerResultQueue.get( block=False, timeout=1) except Empty: atlasCrawlerResult = None if atlasCrawlerResult is not None: dbAlias = atlasCrawlerResult.get('dbAlias') result = atlasCrawlerResult.get('result') blacklist = atlasCrawlerResult.get('blacklist') log.debug("atlasCrawlerResultQueue: %s" % (atlasCrawlerResult)) self.connectionsSentToCrawlers.remove(dbAlias) if result == True: updateDict = {} updateDict["atlas_last_discovery"] = str( datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')) try: session = self.getDBImportSession() (session.query(configSchema.jdbcConnections).filter( configSchema.jdbcConnections.dbalias == dbAlias).update(updateDict)) session.commit() session.close() except SQLAlchemyError as e: log.error(str(e.__dict__['orig'])) session.rollback() self.disconnectDBImportDB() else: self.removeBlacklist(dbAlias) else: if blacklist == True: log.error( "Connection '%s' failed during crawling of database schema" % (dbAlias)) self.blacklistConnection(dbAlias) else: log.warning( "A Warning was detected when crawling connection '%s'. 
It will not be marked as completed and will retry the operation" % (dbAlias)) time.sleep(1) self.disconnectDBImportDB() if atlasEnabled == True: log.info("atlasDiscovery stopped")
def run(self): logger = "atlasDiscovery" log = logging.getLogger(logger) # log.info("atlasDiscovery started") self.mysql_conn = None self.mysql_cursor = None self.configDBSession = None self.configDBEngine = None self.debugLogLevel = False if logging.root.level == 10: # DEBUG self.debugLogLevel = True # Fetch configuration about MySQL database and how to connect to it self.configHostname = configuration.get("Database", "mysql_hostname") self.configPort = configuration.get("Database", "mysql_port") self.configDatabase = configuration.get("Database", "mysql_database") self.configUsername = configuration.get("Database", "mysql_username") self.configPassword = configuration.get("Database", "mysql_password") self.common_config = common_config.config() jdbcConnections = aliased(configSchema.jdbcConnections) failureLog = {} # The interval between the scans. This is in hours atlasDiscoveryInterval = self.common_config.getConfigValue( key="atlas_discovery_interval") atlasEnabled = self.common_config.checkAtlasSchema(logger=logger) if atlasEnabled == True: log.info("atlasDiscovery started") log.info("Atlas discovery interval is set to %s hours" % (atlasDiscoveryInterval)) while not self.threadStopEvent.isSet() and atlasEnabled == True: try: session = self.getDBImportSession() atlasDiscoveryCheckTime = datetime.utcnow() - timedelta( hours=atlasDiscoveryInterval) jdbcConnectionsDf = pd.DataFrame( session.query(jdbcConnections.dbalias, jdbcConnections.timewindow_start, jdbcConnections.timewindow_stop, jdbcConnections.atlas_last_discovery, jdbcConnections.atlas_discovery). select_from(jdbcConnections).filter( jdbcConnections.atlas_discovery == 1).filter( (jdbcConnections.atlas_last_discovery < atlasDiscoveryCheckTime) | (jdbcConnections.atlas_last_discovery == None)). 
order_by(jdbcConnections.atlas_last_discovery).all()) session.close() except SQLAlchemyError as e: log.error(str(e.__dict__['orig'])) session.rollback() self.disconnectDBImportDB() else: for index, row in jdbcConnectionsDf.iterrows(): dbAlias = row['dbalias'] currentTime = str(datetime.now().strftime('%H:%M:%S')) timeWindowStart = None timeWindowStop = None dbAliasAllowedAtThisTime = False if row['timewindow_start'] != None: timeWindowStart = str(row['timewindow_start']) if row['timewindow_stop'] != None: timeWindowStop = str(row['timewindow_stop']) if timeWindowStart != None and re.search( '^[0-9]:', timeWindowStart): timeWindowStart = "0" + timeWindowStart if timeWindowStop != None and re.search( '^[0-9]:', timeWindowStop): timeWindowStop = "0" + timeWindowStop if timeWindowStart == None and timeWindowStop == None: dbAliasAllowedAtThisTime = True elif currentTime > timeWindowStart and currentTime < timeWindowStop: dbAliasAllowedAtThisTime = True # Find out if the dbAlias is blacklisted if failureLog.get(dbAlias, None) != None: blackListEnableTime = failureLog[dbAlias][ 'blackListStart'] + timedelta( hours=failureLog[dbAlias]['blackListTime']) if datetime.now() < blackListEnableTime: # This dbAlias is still blacklisted continue if dbAliasAllowedAtThisTime == False: # Not allowed to access this connection at this time continue self.common_config.mysql_conn.commit() self.common_config.lookupConnectionAlias(dbAlias) if self.common_config.atlasJdbcSourceSupport == False: # This source type does not support Atlas discovery continue # We now have a valid connection in dbAlias that we can do a discovery on log.info("Starting a Atlas discovery on connection '%s'" % (dbAlias)) altasOperationFailed = False if self.common_config.connectToJDBC(allJarFiles=True, exitIfFailure=False, logger=logger) == True: self.common_config.atlasEnabled = True response = self.common_config.discoverAtlasRdbms( dbAlias=dbAlias, logger=logger) if response == False: # Something went wrong when getting source system schema altasOperationFailed = True log.warning( "There was an error/warning when discovering source schema" ) else: log.info( "Finished Atlas discovery on connection '%s'" % (dbAlias)) self.common_config.disconnectFromJDBC() if altasOperationFailed == False: updateDict = {} updateDict["atlas_last_discovery"] = str( datetime.now().strftime( '%Y-%m-%d %H:%M:%S.%f')) try: session = self.getDBImportSession() (session.query( configSchema.jdbcConnections).filter( configSchema.jdbcConnections.dbalias == dbAlias).update(updateDict)) session.commit() session.close() except SQLAlchemyError as e: log.error(str(e.__dict__['orig'])) session.rollback() self.disconnectDBImportDB() else: failureLog.pop(dbAlias, None) break else: altasOperationFailed = True if altasOperationFailed == True: # Connection failed. We need to blacklist this connection for some time blackListData = failureLog.get(dbAlias, None) if blackListData == None: blackListTime = 1 else: blackListTime = failureLog[dbAlias][ 'blackListTime'] * 2 # Max blacklist time is 24 hours if blackListTime > 24: blackListTime = 24 failureLog[dbAlias] = { 'blackListTime': blackListTime, 'blackListStart': datetime.now() } log.warning( "Atlas Discovery failed on connection '%s'" % (dbAlias)) log.warning( "This connection is now blacklisted for %s hours" % (failureLog[dbAlias]['blackListTime'])) time.sleep(1) self.disconnectDBImportDB() if atlasEnabled == True: log.info("atlasDiscovery stopped")