def run(cls):
    '''
    Main command-line entry point.

    :param cls: The class as implicit first argument.
    '''
    try:
        args = _parse_args()
        conf = json.loads(args.config_file.read())

        # .........................set up logger
        Util.get_logger('SPOT', args.log_level)

        # .........................check kerberos authentication
        if os.getenv('KRB_AUTH'):
            kb = Kerberos()
            kb.authenticate()

        conf['producer'] = {
            'bootstrap_servers': ['{0}:{1}'.format(
                conf['kafka']['kafka_server'], conf['kafka']['kafka_port'])]
        }

        conf['file_watcher'] = {
            'path': conf['pipelines'][args.type]['collector_path'],
            'supported_files': conf['pipelines'][args.type]['supported_files'],
            'recursive': True
        }

        # .........................migrate configs
        if 'local_staging' not in conf['pipelines'][args.type].keys():
            conf['pipelines'][args.type]['local_staging'] = '/tmp'

        if 'max_request_size' in conf['kafka'].keys():
            conf['producer']['max_request_size'] = conf['kafka']['max_request_size']

        if 'process_opt' not in conf['pipelines'][args.type].keys():
            conf['pipelines'][args.type]['process_opt'] = ''

        if 'recursive' in conf['pipelines'][args.type].keys():
            conf['file_watcher']['recursive'] = conf['pipelines'][args.type]['recursive']

        collector = cls(args.type, args.topic, args.skip_conversion, **conf)
        collector.start()

    except SystemExit:
        raise
    except:
        sys.excepthook(*sys.exc_info())
        sys.exit(1)
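# ---------------------------------------------------------------------------
# Illustrative sketch only: one possible shape of the parsed configuration
# that run() expects after json.loads(). The key names are taken from the
# lookups performed in run() above; every value (and the 'flow' pipeline
# name) is a placeholder, not a value from the repository.
# ---------------------------------------------------------------------------
example_conf = {
    'kafka': {
        'kafka_server': 'kafka.example.com',       # placeholder host
        'kafka_port': 9092,                        # placeholder port
        'max_request_size': 1048576                # optional; copied into conf['producer']
    },
    'pipelines': {
        'flow': {                                  # placeholder pipeline name
            'collector_path': '/collector/flow',   # directory watched for new files
            'supported_files': ['nfcapd.*'],       # placeholder file pattern
            'local_staging': '/tmp',               # optional; defaults to '/tmp'
            'process_opt': '',                     # optional; defaults to ''
            'recursive': True                      # optional; overrides the file watcher default
        }
    }
}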
def __init__(self):
    self._logger = Util.get_logger('SPOT.COMMON.KERBEROS')
    principal, keytab, sasl_mech, security_proto = config.kerberos()

    if os.getenv('KINITPATH'):
        self._kinit = os.getenv('KINITPATH')
    else:
        self._kinit = "kinit"

    self._kinitopts = os.getenv('KINITOPTS')
    self._keytab = "-kt {0}".format(keytab)
    self._krb_user = principal

    if self._kinit is None or self._keytab is None or self._krb_user is None:
        self._logger.error(
            "Please verify kerberos configuration, some environment variables are missing.")
        sys.exit(1)

    if self._kinitopts is None:
        self._kinit_cmd = "{0} {1} {2}".format(self._kinit, self._keytab, self._krb_user)
    else:
        self._kinit_cmd = "{0} {1} {2} {3}".format(
            self._kinit, self._kinitopts, self._keytab, self._krb_user)
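# ---------------------------------------------------------------------------
# Illustrative sketch only: what the kinit command assembled by __init__ ends
# up looking like. The keytab path and principal below are placeholders; the
# real values come from config.kerberos(), and KINITPATH / KINITOPTS may
# override the binary and add extra options.
# ---------------------------------------------------------------------------
kinit = 'kinit'                                      # default when KINITPATH is unset
keytab = '-kt /etc/security/keytabs/spot.keytab'     # placeholder keytab
krb_user = 'spot@EXAMPLE.COM'                        # placeholder principal
kinit_cmd = "{0} {1} {2}".format(kinit, keytab, krb_user)
# -> 'kinit -kt /etc/security/keytabs/spot.keytab spot@EXAMPLE.COM'
# If KINITOPTS were set (e.g. '-V'), it would be inserted between the binary
# and the '-kt' argument, matching the else branch above.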
def start_collector(type, workers_num, id=None):

    # generate ingest id
    ingest_id = str(datetime.datetime.time(datetime.datetime.now())).replace(
        ":", "_").replace(".", "_")

    # create logger.
    logger = Util.get_logger("SPOT.INGEST")

    # validate the given configuration exists in ingest_conf.json.
    if type not in master_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(master_conf["pipelines"][type]["type"]):
        logger.error(
            "'{0}' type is not configured. Please check your ingest conf file.".format(
                master_conf["pipelines"][type]["type"]))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = master_conf["kafka"]['kafka_server']
    k_port = master_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = master_conf["kafka"]['zookeper_server']
    zk_port = master_conf["kafka"]['zookeper_port']

    topic = "SPOT-INGEST-{0}_{1}".format(type, ingest_id) if not id else id
    kafka = KafkaTopic(topic, k_server, k_port, zk_server, zk_port, workers_num)

    # create a collector instance based on data source type.
    logger.info("Starting {0} ingest instance".format(topic))
    module = __import__("pipelines.{0}.collector".format(
        master_conf["pipelines"][type]["type"]), fromlist=['Collector'])

    # start collector.
    ingest_collector = module.Collector(master_conf['hdfs_app_path'], kafka, type)
    ingest_collector.start()
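# ---------------------------------------------------------------------------
# Illustrative usage sketch only (the pipeline name and worker count are
# placeholders, assuming the function is importable from its module):
#
#   start_collector('flow', workers_num=4)
#
# With no explicit id, the Kafka topic is derived from the current time in
# the same way as ingest_id above, e.g. 'SPOT-INGEST-flow_11_22_33_444555'.
# ---------------------------------------------------------------------------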
def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer, conf_type, processes):

    # get logger instance.
    self._logger = Util.get_logger('SPOT.INGEST.WRK.PROXY')

    self._db_name = db_name
    self._hdfs_app_path = hdfs_app_path
    self._kafka_consumer = kafka_consumer

    # read proxy configuration.
    self._script_path = os.path.dirname(os.path.abspath(__file__))
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._spark_conf = conf["spark-streaming"]
    self._conf = conf["pipelines"][conf_type]

    self._processes = processes
def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer, conf_type):

    # get logger instance.
    self._logger = Util.get_logger('SPOT.INGEST.WRK.FLOW')

    self._db_name = db_name
    self._hdfs_app_path = hdfs_app_path

    # read flow configuration.
    self._script_path = os.path.dirname(os.path.abspath(__file__))
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._conf = conf["pipelines"][conf_type]

    self._process_opt = self._conf['process_opt']
    self._local_staging = self._conf['local_staging']
    self.kafka_consumer = kafka_consumer
def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer, conf_type, processes):

    # get logger instance.
    self._logger = Util.get_logger('SPOT.INGEST.WRK.PROXY')

    self._db_name = db_name
    self._hdfs_app_path = hdfs_app_path
    self._kafka_consumer = kafka_consumer

    # read proxy configuration.
    self._script_path = os.path.dirname(os.path.abspath(__file__))
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._spark_conf = conf["spark-streaming"]
    self._conf = conf["pipelines"][conf_type]

    self._processes = processes
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("SPOT.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if type not in WORKER_CONF["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(WORKER_CONF["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(
        WORKER_CONF["pipelines"][type]["type"]), fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = WORKER_CONF["kafka"]['kafka_server']
    k_port = WORKER_CONF["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = WORKER_CONF["kafka"]['zookeper_server']
    zk_port = WORKER_CONF["kafka"]['zookeper_port']

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server, zk_port, id)

    # start worker.
    db_name = WORKER_CONF['dbname']
    app_path = WORKER_CONF['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type, processes)
    ingest_worker.start()
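# ---------------------------------------------------------------------------
# Illustrative usage sketch only (all values are placeholders; the topic
# would normally be the one created by the matching start_collector call):
#
#   start_worker('dns', 'SPOT-INGEST-dns_11_22_33_444555', id=0, processes=2)
#
# This resolves pipelines.dns.worker, binds a KafkaConsumer to the given
# topic and id, and hands it to Worker together with the dbname and
# hdfs_app_path entries from the worker configuration.
# ---------------------------------------------------------------------------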
def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer, conf_type):

    # get logger instance.
    self._logger = Util.get_logger('SPOT.INGEST.WRK.DNS')

    self._db_name = db_name
    self._hdfs_app_path = hdfs_app_path

    # read dns configuration.
    self._script_path = os.path.dirname(os.path.abspath(__file__))
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._conf = conf["pipelines"][conf_type]

    self._process_opt = self._conf['process_opt']
    self._local_staging = self._conf['local_staging']
    self.kafka_consumer = kafka_consumer
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("SPOT.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if type not in worker_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(worker_conf["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(
        worker_conf["pipelines"][type]["type"]), fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = worker_conf["kafka"]['kafka_server']
    k_port = worker_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = worker_conf["kafka"]['zookeper_server']
    zk_port = worker_conf["kafka"]['zookeper_port']

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server, zk_port, id)

    # start worker.
    db_name = worker_conf['dbname']
    app_path = worker_conf['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type, processes)
    ingest_worker.start()