Example #1
    def run(cls):
        '''
            Main command-line entry point.

        :param cls: The class as implicit first argument.
        '''
        try:
            args = _parse_args()
            conf = json.loads(args.config_file.read())

            # .........................set up logger
            Util.get_logger('SPOT', args.log_level)

            # .........................check kerberos authentication
            if os.getenv('KRB_AUTH'):
                kb = Kerberos()
                kb.authenticate()

            conf['producer'] = {
                'bootstrap_servers': [
                    '{0}:{1}'.format(conf['kafka']['kafka_server'],
                                     conf['kafka']['kafka_port'])
                ]
            }

            conf['file_watcher'] = {
                'path': conf['pipelines'][args.type]['collector_path'],
                'supported_files':
                conf['pipelines'][args.type]['supported_files'],
                'recursive': True
            }

            # .........................migrate configs
            if 'local_staging' not in conf['pipelines'][args.type]:
                conf['pipelines'][args.type]['local_staging'] = '/tmp'

            if 'max_request_size' in conf['kafka']:
                conf['producer']['max_request_size'] = conf['kafka'][
                    'max_request_size']

            if 'process_opt' not in conf['pipelines'][args.type]:
                conf['pipelines'][args.type]['process_opt'] = ''

            if 'recursive' in conf['pipelines'][args.type]:
                conf['file_watcher']['recursive'] = conf['pipelines'][
                    args.type]['recursive']

            collector = cls(args.type, args.topic, args.skip_conversion,
                            **conf)
            collector.start()

        except SystemExit:
            raise
        except:
            sys.excepthook(*sys.exc_info())
            sys.exit(1)
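
For reference, run() reads a handful of keys from the parsed configuration file. A minimal sketch of that structure as a Python dict; every value is a placeholder and only the key names are taken from the code above:

# Hypothetical ingest configuration consumed by run(); the values are
# placeholders, only the key names come from the code above.
sample_conf = {
    'kafka': {
        'kafka_server': 'kafka.example.com',   # assumed hostname
        'kafka_port': 9092,                    # assumed port
        # 'max_request_size': 1048576,         # optional, copied to producer
    },
    'pipelines': {
        'flow': {
            'collector_path': '/collector/flow',   # watched directory
            'supported_files': ['*.nfcapd'],       # assumed file pattern
            # 'local_staging': '/tmp',             # optional, defaults to /tmp
            # 'process_opt': '',                   # optional, defaults to ''
            # 'recursive': True,                   # optional watcher flag
        }
    }
}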
Example #2
    def __init__(self):

        self._logger = Util.get_logger('SPOT.COMMON.KERBEROS')
        principal, keytab, sasl_mech, security_proto = config.kerberos()

        if os.getenv('KINITPATH'):
            self._kinit = os.getenv('KINITPATH')
        else:
            self._kinit = "kinit"

        self._kinitopts = os.getenv('KINITOPTS')
        self._keytab = "-kt {0}".format(keytab)
        self._krb_user = principal

        if (self._kinit is None or self._keytab is None
                or self._krb_user is None):
            self._logger.error(
                "Please verify kerberos configuration, some environment variables are missing."
            )
            sys.exit(1)

        if self._kinitopts is None:
            self._kinit_cmd = "{0} {1} {2}".format(self._kinit, self._keytab,
                                                   self._krb_user)
        else:
            self._kinit_cmd = "{0} {1} {2} {3}".format(self._kinit,
                                                       self._kinitopts,
                                                       self._keytab,
                                                       self._krb_user)
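
The constructor only assembles the kinit command line; authenticate() itself is not part of this listing. A hedged sketch of what such a method could look like, assuming the command string is simply handed to subprocess (the project may run it differently):

import subprocess
import sys

def authenticate(self):
    """Hypothetical Kerberos.authenticate() sketch: run the kinit command
    assembled in __init__ and abort on failure."""
    return_code = subprocess.call(self._kinit_cmd, shell=True)
    if return_code != 0:
        self._logger.error(
            "Kerberos authentication failed: {0}".format(self._kinit_cmd))
        sys.exit(1)
    self._logger.info(
        "Kerberos ticket obtained for {0}".format(self._krb_user))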
Example #3
def start_collector(type, workers_num, id=None):

    # generate ingest id
    ingest_id = str(datetime.datetime.time(datetime.datetime.now())).replace(
        ":", "_").replace(".", "_")

    # create logger.
    logger = Util.get_logger("SPOT.INGEST")

    # validate the given configuration exists in ingest_conf.json.
    if type not in master_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(master_conf["pipelines"][type]["type"]):
        logger.error(
            "'{0}' type is not configured. Please check you ingest conf file".
            format(master_conf["pipelines"][type]["type"]))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = master_conf["kafka"]['kafka_server']
    k_port = master_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = master_conf["kafka"]['zookeper_server']
    zk_port = master_conf["kafka"]['zookeper_port']

    topic = "SPOT-INGEST-{0}_{1}".format(type, ingest_id) if not id else id
    kafka = KafkaTopic(topic, k_server, k_port, zk_server, zk_port,
                       workers_num)

    # create a collector instance based on data source type.
    logger.info("Starting {0} ingest instance".format(topic))
    module = __import__("pipelines.{0}.collector".format(
        master_conf["pipelines"][type]["type"]),
                        fromlist=['Collector'])

    # start collector.
    ingest_collector = module.Collector(master_conf['hdfs_app_path'], kafka,
                                        type)
    ingest_collector.start()
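
start_collector() is a module-level entry point, so it is presumably driven by a small command-line wrapper. A sketch of such a wrapper using argparse; the flag names are assumptions, only the start_collector() signature comes from the example above:

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wrapper around start_collector(); flag names are
    # illustrative, not taken from the project.
    parser = argparse.ArgumentParser(
        description='Start a SPOT ingest collector.')
    parser.add_argument('-t', '--type', required=True,
                        help='pipeline type defined in ingest_conf.json')
    parser.add_argument('-w', '--workers', type=int, default=1,
                        help='number of worker processes')
    parser.add_argument('--id', default=None,
                        help='optional ingest/topic id to reuse')
    args = parser.parse_args()

    start_collector(args.type, args.workers, args.id)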
Example #4
    def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer,
                            conf_type, processes):

        # get logger instance.
        self._logger = Util.get_logger('SPOT.INGEST.WRK.PROXY')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path
        self._kafka_consumer = kafka_consumer

        # read proxy configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._spark_conf  = conf["spark-streaming"]
        self._conf = conf["pipelines"][conf_type]
        self._processes = processes
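
Only _initialize_members() is shown in Examples #4, #5, #6 and #8; the enclosing worker class is not part of the listing. A minimal sketch of how such a class presumably wraps it, assuming __init__ simply delegates to this method:

class Worker(object):
    """Hypothetical wrapper: __init__ is assumed to delegate straight to
    _initialize_members(), which holds the actual setup logic."""

    def __init__(self, db_name, hdfs_app_path, kafka_consumer, conf_type,
                 processes=None):
        self._initialize_members(db_name, hdfs_app_path, kafka_consumer,
                                 conf_type, processes)

    def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer,
                            conf_type, processes):
        pass  # body as shown in Example #4 above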
Example #5
    def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer,
                            conf_type):

        # get logger instance.
        self._logger = Util.get_logger('SPOT.INGEST.WRK.FLOW')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path

        # read flow configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._conf = conf["pipelines"][conf_type]

        self._process_opt = self._conf['process_opt']
        self._local_staging = self._conf['local_staging']
        self.kafka_consumer = kafka_consumer
Example #6
    def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer,
                            conf_type, processes):

        # get logger instance.
        self._logger = Util.get_logger('SPOT.INGEST.WRK.PROXY')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path
        self._kafka_consumer = kafka_consumer

        # read proxy configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._spark_conf = conf["spark-streaming"]
        self._conf = conf["pipelines"][conf_type]
        self._processes = processes
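
A note on the conf_file construction shared by these workers: the nested os.path.dirname() calls point two directory levels above the worker module, which is where ingest_conf.json is assumed to live. A small sketch with an illustrative path:

import os

# Assuming the worker module lives at an illustrative path such as
#   /opt/spot-ingest/pipelines/proxy/worker.py
script_path = '/opt/spot-ingest/pipelines/proxy'
conf_file = "{0}/ingest_conf.json".format(
    os.path.dirname(os.path.dirname(script_path)))
# conf_file -> '/opt/spot-ingest/ingest_conf.json'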
Example #7
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("SPOT.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if type not in WORKER_CONF["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(WORKER_CONF["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(
        WORKER_CONF["pipelines"][type]["type"]),
                        fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = WORKER_CONF["kafka"]['kafka_server']
    k_port = WORKER_CONF["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = WORKER_CONF["kafka"]['zookeper_server']
    zk_port = WORKER_CONF["kafka"]['zookeper_port']

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server, zk_port,
                                   id)

    # start worker.
    db_name = WORKER_CONF['dbname']
    app_path = WORKER_CONF['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type,
                                  processes)
    ingest_worker.start()
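
A hedged usage sketch of start_worker(); the topic follows the SPOT-INGEST-<type>_<id> naming scheme produced by start_collector() in Example #3, and the concrete values are placeholders:

# Hypothetical invocation; every value is a placeholder.
start_worker('flow', 'SPOT-INGEST-flow_10_30_00_000000', 'worker-0',
             processes=2)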
Example #8
    def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer,
                            conf_type):

        # get logger instance.
        self._logger = Util.get_logger('SPOT.INGEST.WRK.DNS')

        self._db_name = db_name
        self._hdfs_app_path = hdfs_app_path

        # read dns configuration.
        self._script_path = os.path.dirname(os.path.abspath(__file__))
        conf_file = "{0}/ingest_conf.json".format(
            os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._conf = conf["pipelines"][conf_type]

        self._process_opt = self._conf['process_opt']
        self._local_staging = self._conf['local_staging']
        self.kafka_consumer = kafka_consumer
Example #9
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("SPOT.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if type not in worker_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(worker_conf["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__(
        "pipelines.{0}.worker".format(worker_conf["pipelines"][type]["type"]),
        fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = worker_conf["kafka"]['kafka_server']
    k_port = worker_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = worker_conf["kafka"]['zookeper_server']
    zk_port = worker_conf["kafka"]['zookeper_port']

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server,
                                   zk_port, id)

    # start worker.
    db_name = worker_conf['dbname']
    app_path = worker_conf['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type,
                                  processes)
    ingest_worker.start()