def _ingest_file(self, file, partition):

    # get file name and date.
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]
    file_date = file_name.split('.')[1]
    file_date_path = file_date[0:8]
    file_date_hour = file_date[8:10]

    # hdfs path with timestamp.
    hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path,
                                            file_date_path, file_date_hour)
    Util.creat_hdfs_folder(hdfs_path, self._logger)

    # load to hdfs.
    hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
    Util.load_to_hdfs(file, hdfs_file, self._logger)

    # create event for workers to process the file.
    self._logger.info("Sending file to worker number: {0}".format(partition))
    self.kafka_topic.send_message(hdfs_file, partition)
    self._logger.info("File {0} has been successfully sent to Kafka topic: {1}".format(
        file, self._kafka_topic.Topic))
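# Note: Util.creat_hdfs_folder and Util.load_to_hdfs are used throughout this section
# but their implementations are not shown here. A minimal sketch follows, assuming they
# simply shell out to the standard "hadoop fs" CLI (the same pattern the worker code in
# this section uses directly). The signatures below match the collector calls above;
# some older functions further down call a variant without a logger argument.
import subprocess


def creat_hdfs_folder(hdfs_path, logger):
    # create the destination folder (and any missing parents) in HDFS.
    mkdir_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_path)
    logger.info("Creating HDFS folder: {0}".format(mkdir_cmd))
    subprocess.call(mkdir_cmd, shell=True)


def load_to_hdfs(local_file, hdfs_file, logger):
    # copy the local file to the given HDFS destination path.
    put_cmd = "hadoop fs -put {0} {1}".format(local_file, hdfs_file)
    logger.info("Loading file to HDFS: {0}".format(put_cmd))
    subprocess.call(put_cmd, shell=True)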
def _initialize_members(self, conf):

    self._collector_path = None
    self._hdfs_root_path = None
    self._queue_name = None
    self._pkt_num = None
    self._pcap_split_staging = None
    self._time_to_wait = None
    self._dsource = 'dns'

    # validate configuration info.
    conf_err_msg = "Please provide a valid '{0}' in the configuration file"
    Util.validate_parameter(conf['collector_path'], conf_err_msg.format("collector_path"))
    Util.validate_parameter(conf['queue_name'], conf_err_msg.format("queue_name"))
    Util.validate_parameter(conf['pkt_num'], conf_err_msg.format("pkt_num"))
    Util.validate_parameter(conf['pcap_split_staging'], conf_err_msg.format("pcap_split_staging"))
    Util.validate_parameter(conf['time_to_wait'], conf_err_msg.format("time_to_wait"))

    # set configuration.
    self._collector_path = conf['collector_path']
    self._hdfs_root_path = "{0}/{1}".format(conf['huser'], self._dsource)
    self._time_to_wait = conf['time_to_wait']
    self._pkt_num = conf['pkt_num']
    self._pcap_split_staging = conf['pcap_split_staging']
    self._queue_name = conf['queue_name']
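# Note: Util.validate_parameter is referenced above but not defined in this section.
# A minimal sketch, assuming it only rejects empty or missing values and aborts start-up
# with the supplied message (an illustration, not the verified implementation):
import sys


def validate_parameter(parameter, message):
    # abort early when a required configuration value is empty or None.
    if parameter is None or parameter == "":
        print(message)
        sys.exit(1)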
def start(self): self._logger.info("Creating Spark Job for topic: {0}".format( self._kafka_consumer.Topic)) # parser parser = self._conf["parser"] # spark job command. spark_job_cmd = ( "spark-submit --master yarn " "--jars {0}/oni/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar " "{1}/{2} " "-zk {3} " "-t {4} " "-db {5} " "-dt {6} " "-w {7}".format( os.path.dirname(os.path.dirname(self._script_path)), self._script_path, parser, self._kafka_consumer.ZookeperServer, self._kafka_consumer.Topic, self._db_name, "proxy", self._processes)) # start spark job. Util.execute_cmd(spark_job_cmd, self._logger)
def _initialize_members(self, conf):

    self._collector_path = None
    self._hdfs_root_path = None
    self._queue_name = None
    self._pkt_num = None
    self._pcap_split_staging = None
    self._time_to_wait = None
    self._dsource = 'dns'

    # validate configuration info.
    conf_err_msg = "Please provide a valid '{0}' in the configuration file"
    Util.validate_parameter(conf['collector_path'], conf_err_msg.format("collector_path"))
    Util.validate_parameter(conf['queue_name'], conf_err_msg.format("queue_name"))
    Util.validate_parameter(conf['pkt_num'], conf_err_msg.format("pkt_num"))
    Util.validate_parameter(conf['pcap_split_staging'], conf_err_msg.format("pcap_split_staging"))
    Util.validate_parameter(conf['time_to_wait'], conf_err_msg.format("time_to_wait"))

    # set configuration.
    self._collector_path = conf['collector_path']
    self._hdfs_root_path = "{0}/{1}".format(conf['huser'], self._dsource)
    self._time_to_wait = conf['time_to_wait']
    self._pkt_num = conf['pkt_num']
    self._pcap_split_staging = conf['pcap_split_staging']
    self._queue_name = conf['queue_name']
def start_collector(type, workers_num, id=None):

    # generate ingest id
    ingest_id = str(datetime.datetime.time(datetime.datetime.now())).replace(
        ":", "_").replace(".", "_")

    # create logger.
    logger = Util.get_logger("ONI.INGEST")

    # validate the given configuration exists in ingest_conf.json.
    if not type in master_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(master_conf["pipelines"][type]["type"]):
        logger.error(
            "'{0}' type is not configured. Please check your ingest conf file.".format(
                master_conf["pipelines"][type]["type"]))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = master_conf["kafka"]['kafka_server']
    k_port = master_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = master_conf["kafka"]['zookeper_server']
    zk_port = master_conf["kafka"]['zookeper_port']

    topic = "ONI-INGEST-{0}_{1}".format(type, ingest_id) if not id else id
    kafka = KafkaTopic(topic, k_server, k_port, zk_server, zk_port, workers_num)

    # create a collector instance based on data source type.
    logger.info("Starting {0} ingest instance".format(topic))
    module = __import__("pipelines.{0}.collector".format(
        master_conf["pipelines"][type]["type"]), fromlist=['Collector'])

    # start collector.
    ingest_collector = module.Collector(master_conf['hdfs_app_path'], kafka, type)
    ingest_collector.start()
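# The collector class above is resolved at runtime from the pipeline type via __import__.
# The importlib-based lookup below is equivalent and shown only as a self-contained
# illustration of that dynamic import pattern; "flow" is just a sample pipeline type.
import importlib


def load_collector_class(pipeline_type):
    # import pipelines.<type>.collector and return its Collector class.
    module = importlib.import_module("pipelines.{0}.collector".format(pipeline_type))
    return getattr(module, "Collector")

# usage example: collector_cls = load_collector_class("flow")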
def _ingest_file(self, file):

    message = ""
    with open(file, "rb") as f:
        for line in f:
            message += line
            if len(message) > self._message_size:
                self._kafka_topic.send_message(message, self._kafka_topic.Partition)
                message = ""

        # send the last package.
        self._kafka_topic.send_message(message, self._kafka_topic.Partition)

    rm_file = "rm {0}".format(file)
    Util.execute_cmd(rm_file, self._logger)

    self._logger.info("File {0} has been successfully sent to Kafka topic: {1}".format(
        file, self._kafka_topic.Topic))
def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

    # getting parameters.
    self._logger = logging.getLogger('ONI.INGEST.DNS')
    self._hdfs_app_path = hdfs_app_path
    self._kafka_topic = kafka_topic

    # get script path
    self._script_path = os.path.dirname(os.path.abspath(__file__))

    # read dns configuration.
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._conf = conf["pipelines"][conf_type]

    # set configuration.
    self._collector_path = self._conf['collector_path']
    self._dsource = 'dns'
    self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)
    self._pkt_num = self._conf['pkt_num']
    self._pcap_split_staging = self._conf['pcap_split_staging']

    # initialize message broker client.
    self.kafka_topic = kafka_topic

    # create collector watcher
    self._watcher = Util.create_watcher(self._collector_path,
                                        NewFileEvent(self), self._logger)
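# Note: Util.create_watcher and NewFileEvent are not shown in this section. The
# start()/stop()/join() calls made on the watcher elsewhere in this section match the
# watchdog Observer API, so the sketch below is one plausible shape for these helpers
# (an assumption, not the verified implementation).
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler


class NewFileEvent(FileSystemEventHandler):

    def __init__(self, collector):
        self._collector = collector

    def on_created(self, event):
        # hand newly created files to the collector; the real handler presumably adapts
        # this call to each collector's _ingest_file signature, which differs per pipeline.
        if not event.is_directory:
            self._collector._ingest_file(event.src_path)


def create_watcher(collector_path, event_handler, logger):
    # schedule the event handler on the collector path and return the observer,
    # which the caller starts, stops, and joins.
    logger.info("Creating watcher for: {0}".format(collector_path))
    observer = Observer()
    observer.schedule(event_handler, collector_path)
    return observer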
def _create_topic(self):

    self._logger.info("Creating topic: {0} with {1} partitions".format(
        self._topic, self._num_of_partitions))

    # Create partitions for the workers.
    self._partitions = [TopicPartition(self._topic, p)
                        for p in range(int(self._num_of_partitions))]

    # create partitioner
    self._partitioner = RoundRobinPartitioner(self._partitions)

    # build zookeeper connection string and topic creation command.
    zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
    create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
        os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
        self._num_of_partitions)

    # execute create topic cmd
    Util.execute_cmd(create_topic_cmd, self._logger)
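# kafka_topic.sh is a project wrapper script whose contents are not shown here. For the
# Kafka 0.8/0.10 era this code targets, creating a topic through ZooKeeper is typically a
# call to the stock Kafka CLI, roughly as sketched below (an illustration of what the
# wrapper presumably delegates to, not its verified contents):
#
#   kafka-topics.sh --create --zookeeper <zk_server>:<zk_port> \
#       --topic <topic> --partitions <num_of_partitions> --replication-factor 1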
def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

    # getting parameters.
    self._logger = logging.getLogger('ONI.INGEST.FLOW')
    self._hdfs_app_path = hdfs_app_path
    self._kafka_topic = kafka_topic

    # get script path
    self._script_path = os.path.dirname(os.path.abspath(__file__))

    # read flow configuration.
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._conf = conf["pipelines"][conf_type]

    # set configuration.
    self._collector_path = self._conf['collector_path']
    self._dsource = 'flow'
    self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

    # initialize message broker client.
    self.kafka_topic = kafka_topic

    # create collector watcher
    self._watcher = Util.create_watcher(self._collector_path,
                                        NewFileEvent(self), self._logger)
def _initialize_members(self, conf):

    self._collector_path = None
    self._hdfs_root_path = None
    self._queue_name = None
    self._dsource = 'flow'

    # validate configuration info.
    conf_err_msg = "Please provide a valid '{0}' in the configuration file"
    Util.validate_parameter(conf['collector_path'], conf_err_msg.format("collector_path"))
    Util.validate_parameter(conf['queue_name'], conf_err_msg.format("queue_name"))

    # set configuration.
    self._collector_path = conf['collector_path']
    self._hdfs_root_path = "{0}/{1}".format(os.getenv('HUSER', '/user/duxbury'), self._dsource)
    self._queue_name = conf['queue_name']
def start(self): self._logger.info("Starting PROXY ingest") self._logger.info("Watching: {0}".format(self._collector_path)) self._watcher.start() try: while True: time.sleep(1) except KeyboardInterrupt: self._logger.info("Stopping PROXY collector...") self._watcher.stop() self._watcher.join() # remove kafka topic Util.remove_kafka_topic(self._kafka_topic.Zookeeper,self._kafka_topic.Topic,self._logger)
def process_new_binary_file(new_file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} ../stage/.".format(new_file)
    print get_file_cmd
    subprocess.call(get_file_cmd, shell=True)

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(new_file, ingest_type)

    # build process cmd.
    post_process_cmd = None
    process_opt = worker_conf[ingest_type]['process_opt']
    if ingest_type == 'dns':
        post_process_cmd = "tshark -r ../stage/{0} {1} >> ../stage/{0}.csv".format(file_name, process_opt)
    elif ingest_type == 'flow':
        post_process_cmd = "nfdump -o csv -r ../stage/{0} {1} > ../stage/{0}.csv".format(file_name, process_opt)
    else:
        print "Unsupported ingest type"
        sys.exit(1)
    print post_process_cmd
    subprocess.call(post_process_cmd, shell=True)

    # create folder if it does not exist
    h_base_path = "{0}/{1}".format(os.getenv('HUSER', '/user/oni'), ingest_type)
    h_csv_path = "{0}/csv".format(h_base_path)
    create_folder_cmd = "hadoop fs -mkdir -p {0}/y={1}/m={2}/d={3}/h={4}".format(
        h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print create_folder_cmd
    subprocess.call(create_folder_cmd, shell=True)

    # move to hdfs.
    upld_cmd = "hadoop fs -moveFromLocal ../stage/{0}.csv {1}/y={2}/m={3}/d={4}/h={5}/.".format(
        file_name, h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print upld_cmd
    subprocess.call(upld_cmd, shell=True)

    # make tmp folder in stage
    h_stage_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    h_stage_path = "{0}/stage/{1}".format(h_base_path, h_stage_timestamp)
    create_tmp_cmd = "hadoop fs -mkdir -p {0}".format(h_stage_path)
    print create_tmp_cmd
    subprocess.call(create_tmp_cmd, shell=True)

    # copy the csv into the hdfs staging folder so the hive load below has data to read.
    mv_to_stage = "hadoop fs -cp {0}/y={1}/m={2}/d={3}/h={4}/{5}.csv {6}/.".format(
        h_csv_path, binary_year, binary_month, binary_day, binary_hour, file_name, h_stage_path)
    print mv_to_stage
    subprocess.call(mv_to_stage, shell=True)

    # load to avro
    load_avro_cmd = "hive -hiveconf dbname={6} -hiveconf y={0} -hiveconf m={1} -hiveconf d={2} -hiveconf h={3} -hiveconf data_location='{4}' -f oni/load_{5}_avro_parquet.hql".format(
        binary_year, binary_month, binary_day, binary_hour, h_stage_path, ingest_type,
        os.getenv('DBNAME', 'default'))
    print load_avro_cmd
    subprocess.call(load_avro_cmd, shell=True)

    # remove from stage
    rm_tmp_cmd = "hadoop fs -rm -R -skipTrash {0}".format(h_stage_path)
    print rm_tmp_cmd
    subprocess.call(rm_tmp_cmd, shell=True)

    # can this delete other files when all is running on the same edge server?
    rm_tmp = "rm ../stage/{0}*".format(file_name)
    subprocess.call(rm_tmp, shell=True)

    print datetime.datetime.now()
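# Note: Util.build_hdfs_path is used above but not defined in this section. Based on how
# the collectors build paths ("{root}/binary/<YYYYMMDD>/<HH>/<file>") and how the dns
# worker later re-parses them, a minimal sketch is given below; it is an assumption about
# the helper, not its verified implementation.
def build_hdfs_path(hdfs_file, ingest_type):
    # expects paths shaped like .../binary/<YYYYMMDD>/<HH>/<file_name>
    parts = hdfs_file.split('/')
    file_name = parts[-1]
    binary_hour = parts[-2]
    binary_date_path = parts[-3]
    binary_year = binary_date_path[0:4]
    binary_month = binary_date_path[4:6]
    binary_day = binary_date_path[6:8]
    return binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name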
def start(self): self._logger.info("Starting PROXY ingest") self._logger.info("Watching: {0}".format(self._collector_path)) self._watcher.start() try: while True: time.sleep(1) except KeyboardInterrupt: self._logger.info("Stopping PROXY collector...") self._watcher.stop() self._watcher.join() # remove kafka topic Util.remove_kafka_topic(self._kafka_topic.Zookeeper, self._kafka_topic.Topic, self._logger)
def _initialize_members(self, conf):

    self._collector_path = None
    self._hdfs_root_path = None
    self._queue_name = None
    self._dsource = 'flow'

    # validate configuration info.
    conf_err_msg = "Please provide a valid '{0}' in the configuration file"
    Util.validate_parameter(conf['collector_path'], conf_err_msg.format("collector_path"))
    Util.validate_parameter(conf['queue_name'], conf_err_msg.format("queue_name"))

    # set configuration.
    self._collector_path = conf['collector_path']
    self._hdfs_root_path = "{0}/{1}".format(conf['huser'], self._dsource)
    self._queue_name = conf['queue_name']
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("ONI.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if not type in worker_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(worker_conf["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(
        worker_conf["pipelines"][type]["type"]), fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = worker_conf["kafka"]['kafka_server']
    k_port = worker_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = worker_conf["kafka"]['zookeper_server']
    zk_port = worker_conf["kafka"]['zookeper_port']
    topic = topic

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server, zk_port, id)

    # start worker.
    db_name = worker_conf['dbname']
    app_path = worker_conf['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type, processes)
    ingest_worker.start()
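# Note: the Kerberos helper used above is not shown in this section. A plausible sketch,
# assuming it shells out to kinit with a keytab and principal taken from environment
# variables; the variable names KRB_USER and KRB_KEYTAB below are illustrative only.
import os
import subprocess


class Kerberos(object):

    def __init__(self):
        # hypothetical environment variables for the principal and keytab location.
        self._principal = os.getenv('KRB_USER')
        self._keytab = os.getenv('KRB_KEYTAB')

    def authenticate(self):
        # obtain a ticket non-interactively using the keytab.
        kinit_cmd = "kinit -kt {0} {1}".format(self._keytab, self._principal)
        subprocess.call(kinit_cmd, shell=True)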
def _ingest_file(self, file):

    message = ""
    with open(file, "rb") as f:
        for line in f:
            message += line
            if len(message) > self._message_size:
                self._kafka_topic.send_message(message, self._kafka_topic.Partition)
                message = ""

        # send the last package.
        self._kafka_topic.send_message(message, self._kafka_topic.Partition)

    rm_file = "rm {0}".format(file)
    Util.execute_cmd(rm_file, self._logger)

    self._logger.info("File {0} has been successfully sent to Kafka topic: {1}".format(
        file, self._kafka_topic.Topic))
def start(self): self._logger.info("Creating Spark Job for topic: {0}".format(self._kafka_consumer.Topic)) # parser parser = self._conf["parser"] # spark job command. spark_job_cmd = ("spark-submit --master yarn " "--jars {0}/oni/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar " "{1}/{2} " "-zk {3} " "-t {4} " "-db {5} " "-dt {6} " "-w {7}".format(os.path.dirname(os.path.dirname(self._script_path)),self._script_path,parser,self._kafka_consumer.ZookeperServer,self._kafka_consumer.Topic,self._db_name,"proxy",self._processes)) # start spark job. Util.execute_cmd(spark_job_cmd,self._logger)
def _split_pcap_file(self, file_name, file_local_path, hdfs_path):

    # split file.
    name = file_name.split('.')[0]
    split_cmd = "editcap -c {0} {1} {2}/{3}_split.pcap".format(
        self._pkt_num, file_local_path, self._pcap_split_staging, name)
    print split_cmd
    subprocess.call(split_cmd, shell=True)

    for currdir, subdir, files in os.walk(self._pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_split".format(name) in file:

                # load file to hdfs.
                Util.load_to_hdfs(file, os.path.join(currdir, file), hdfs_path)

                # send rabbitmq notification.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

    rm_big_file = "rm {0}".format(file_local_path)
    print rm_big_file
    subprocess.call(rm_big_file, shell=True)
def _process_pcap_file(self, file_name, file_local_path, hdfs_root_path):

    # get timestamp from the file name.
    file_date = file_name.split('.')[0]
    pcap_hour = file_date[-4:-2]
    pcap_date_path = file_date[-12:-4]

    # hdfs path with timestamp.
    hdfs_path = "{0}/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)
    Util.creat_hdfs_folder(hdfs_path)

    # get file size.
    file_size = os.stat(file_local_path)
    if file_size.st_size > 1145498644:
        # split files larger than roughly 1 GB before loading them.
        self._split_pcap_file(file_name, file_local_path, hdfs_path)
    else:
        # load file to hdfs
        Util.load_to_hdfs(file_name, file_local_path, hdfs_path)

        # send rabbitmq notification.
        hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)
def start_worker(type, topic, id, processes=None):

    logger = Util.get_logger("ONI.INGEST.WORKER")

    # validate the given configuration exists in ingest_conf.json.
    if not type in worker_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(worker_conf["pipelines"][type]["type"]):
        logger.error("The provided data source {0} is not valid".format(type))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # create a worker instance based on the data source type.
    module = __import__("pipelines.{0}.worker".format(
        worker_conf["pipelines"][type]["type"]), fromlist=['Worker'])

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = worker_conf["kafka"]['kafka_server']
    k_port = worker_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = worker_conf["kafka"]['zookeper_server']
    zk_port = worker_conf["kafka"]['zookeper_port']
    topic = topic

    # create kafka consumer.
    kafka_consumer = KafkaConsumer(topic, k_server, k_port, zk_server, zk_port, id)

    # start worker.
    db_name = worker_conf['dbname']
    app_path = worker_conf['hdfs_app_path']
    ingest_worker = module.Worker(db_name, app_path, kafka_consumer, type, processes)
    ingest_worker.start()
def _create_topic(self):

    self._logger.info("Creating topic: {0} with {1} partitions".format(
        self._topic, self._num_of_partitions))

    # Create partitions for the workers.
    self._partitions = [TopicPartition(self._topic, p)
                        for p in range(int(self._num_of_partitions))]

    # create partitioner
    self._partitioner = RoundRobinPartitioner(self._partitions)

    # build zookeeper connection string and topic creation command.
    zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
    create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
        os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
        self._num_of_partitions)

    # execute create topic cmd
    Util.execute_cmd(create_topic_cmd, self._logger)
def _ingest_file(self, file, partition):

    # get file name and date.
    org_file = file
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]

    # split file.
    name = file_name.split('.')[0]
    split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
        self._pkt_num, file, self._pcap_split_staging, name)
    self._logger.info("Splitting file: {0}".format(split_cmd))
    Util.execute_cmd(split_cmd, self._logger)

    for currdir, subdir, files in os.walk(self._pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_oni".format(name) in file:

                # get timestamp from the file name to build hdfs path.
                file_date = file.split('.')[0]
                pcap_hour = file_date[-6:-4]
                pcap_date_path = file_date[-14:-6]

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(
                    self._hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                Util.creat_hdfs_folder(hdfs_path, self._logger)

                # load file to hdfs.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                Util.load_to_hdfs(os.path.join(currdir, file),
                                  hadoop_pcap_file, self._logger)

                # create event for workers to process the file.
                self._logger.info("Sending split file to worker number: {0}".format(partition))
                self._kafka_topic.send_message(hadoop_pcap_file, partition)
                self._logger.info("File {0} has been successfully sent to Kafka topic: {1}".format(
                    file, self._kafka_topic.Topic))

    self._logger.info("Removing file: {0}".format(org_file))
    rm_big_file = "rm {0}".format(org_file)
    Util.execute_cmd(rm_big_file, self._logger)
def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer, conf_type, processes):

    # get logger instance.
    self._logger = Util.get_logger('ONI.INGEST.WRK.PROXY')

    self._db_name = db_name
    self._hdfs_app_path = hdfs_app_path
    self._kafka_consumer = kafka_consumer

    # read proxy configuration.
    self._script_path = os.path.dirname(os.path.abspath(__file__))
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._conf = conf["pipelines"][conf_type]

    self._processes = processes
def _split_pcap_file(self, file_name, file_local_path, hdfs_path):

    # split file.
    name = file_name.split('.')[0]
    split_cmd = "editcap -c {0} {1} {2}/{3}_split.pcap".format(
        self._pkt_num, file_local_path, self._pcap_split_staging, name)
    print split_cmd
    subprocess.call(split_cmd, shell=True)

    for currdir, subdir, files in os.walk(self._pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_split".format(name) in file:

                # load file to hdfs.
                Util.load_to_hdfs(file, os.path.join(currdir, file), hdfs_path)

                # send rabbitmq notification.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

    rm_big_file = "rm {0}".format(file_local_path)
    print rm_big_file
    subprocess.call(rm_big_file, shell=True)
def _initialize_members(self, db_name, hdfs_app_path, kafka_consumer, conf_type, processes):

    # get logger instance.
    self._logger = Util.get_logger('ONI.INGEST.WRK.PROXY')

    self._db_name = db_name
    self._hdfs_app_path = hdfs_app_path
    self._kafka_consumer = kafka_consumer

    # read proxy configuration.
    self._script_path = os.path.dirname(os.path.abspath(__file__))
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._conf = conf["pipelines"][conf_type]

    self._processes = processes
def _ingest_file(self, file, partition):

    # get file name and date.
    org_file = file
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]

    # split file.
    name = file_name.split('.')[0]
    split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
        self._pkt_num, file, self._pcap_split_staging, name)
    self._logger.info("Splitting file: {0}".format(split_cmd))
    Util.execute_cmd(split_cmd, self._logger)

    for currdir, subdir, files in os.walk(self._pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_oni".format(name) in file:

                # get timestamp from the file name to build hdfs path.
                file_date = file.split('.')[0]
                pcap_hour = file_date[-6:-4]
                pcap_date_path = file_date[-14:-6]

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(
                    self._hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                Util.creat_hdfs_folder(hdfs_path, self._logger)

                # load file to hdfs.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                Util.load_to_hdfs(os.path.join(currdir, file),
                                  hadoop_pcap_file, self._logger)

                # create event for workers to process the file.
                self._logger.info("Sending split file to worker number: {0}".format(partition))
                self._kafka_topic.send_message(hadoop_pcap_file, partition)
                self._logger.info("File {0} has been successfully sent to Kafka topic: {1}".format(
                    file, self._kafka_topic.Topic))

    self._logger.info("Removing file: {0}".format(org_file))
    rm_big_file = "rm {0}".format(org_file)
    Util.execute_cmd(rm_big_file, self._logger)
def _load_to_hdfs(self, file):

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(file, 'flow')

    # hdfs path with timestamp.
    hdfs_path = "{0}/{1}/{2}".format(self._hdfs_root_path, binary_date_path, binary_hour)
    Util.creat_hdfs_folder(hdfs_path)

    # load to hdfs.
    Util.load_to_hdfs(file_name, file, hdfs_path)

    # send the notification to rabbitmq server.
    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
    Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

    print "Done !!!!!"
def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

    # getting parameters.
    self._logger = logging.getLogger('ONI.INGEST.PROXY')
    self._hdfs_app_path = hdfs_app_path
    self._kafka_topic = kafka_topic

    # get script path
    self._script_path = os.path.dirname(os.path.abspath(__file__))

    # read proxy configuration.
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._message_size = conf["kafka"]["message_size"]
    self._conf = conf["pipelines"][conf_type]

    # get collector path.
    self._collector_path = self._conf['collector_path']

    # create collector watcher
    self._watcher = Util.create_watcher(self._collector_path,
                                        NewFileEvent(self), self._logger)
def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

    # getting parameters.
    self._logger = logging.getLogger('ONI.INGEST.PROXY')
    self._hdfs_app_path = hdfs_app_path
    self._kafka_topic = kafka_topic

    # get script path
    self._script_path = os.path.dirname(os.path.abspath(__file__))

    # read proxy configuration.
    conf_file = "{0}/ingest_conf.json".format(
        os.path.dirname(os.path.dirname(self._script_path)))
    conf = json.loads(open(conf_file).read())
    self._message_size = conf["kafka"]["message_size"]
    self._conf = conf["pipelines"][conf_type]

    # get collector path.
    self._collector_path = self._conf['collector_path']

    # create collector watcher
    self._watcher = Util.create_watcher(self._collector_path,
                                        NewFileEvent(self), self._logger)
def _load_to_hdfs(self, file):

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(file, 'flow')

    # hdfs path with timestamp.
    hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path,
                                            binary_date_path, binary_hour)
    Util.creat_hdfs_folder(hdfs_path)

    # load to hdfs.
    Util.load_to_hdfs(file_name, file, hdfs_path)

    # send the notification to rabbitmq server.
    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
    Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

    print "Done !!!!!"
def _process_pcap_file(self, file_name, file_local_path, hdfs_root_path):

    # get timestamp from the file name.
    file_date = file_name.split('.')[0]
    pcap_hour = file_date[-4:-2]
    pcap_date_path = file_date[-12:-4]

    # hdfs path with timestamp.
    hdfs_path = "{0}/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)
    Util.creat_hdfs_folder(hdfs_path)

    # get file size.
    file_size = os.stat(file_local_path)
    if file_size.st_size > 1145498644:
        # split files larger than roughly 1 GB before loading them.
        self._split_pcap_file(file_name, file_local_path, hdfs_path)
    else:
        # load file to hdfs
        Util.load_to_hdfs(file_name, file_local_path, hdfs_path)

        # send rabbitmq notification.
        hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)
def process_new_binary_file(new_file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} ../stage/.".format(new_file)
    print get_file_cmd
    subprocess.call(get_file_cmd, shell=True)

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(new_file, ingest_type)

    # build process cmd.
    post_process_cmd = None
    process_opt = worker_conf[ingest_type]['process_opt']
    if ingest_type == 'dns':
        post_process_cmd = "tshark -r ../stage/{0} {1} >> ../stage/{0}.csv".format(file_name, process_opt)
    elif ingest_type == 'flow':
        post_process_cmd = "nfdump -o csv -r ../stage/{0} {1} > ../stage/{0}.csv".format(file_name, process_opt)
    else:
        print "Unsupported ingest type"
        sys.exit(1)
    print post_process_cmd
    subprocess.call(post_process_cmd, shell=True)

    # create folder if it does not exist
    h_base_path = "{0}/{1}".format(worker_conf['huser'], ingest_type)
    h_csv_path = "{0}/csv".format(h_base_path)
    create_folder_cmd = "hadoop fs -mkdir -p {0}/y={1}/m={2}/d={3}/h={4}".format(
        h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print create_folder_cmd
    subprocess.call(create_folder_cmd, shell=True)

    # move to hdfs.
    upld_cmd = "hadoop fs -moveFromLocal ../stage/{0}.csv {1}/y={2}/m={3}/d={4}/h={5}/.".format(
        file_name, h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print upld_cmd
    subprocess.call(upld_cmd, shell=True)

    # make tmp folder in stage
    h_stage_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    h_stage_path = "{0}/stage/{1}".format(h_base_path, h_stage_timestamp)
    create_tmp_cmd = "hadoop fs -mkdir -p {0}".format(h_stage_path)
    print create_tmp_cmd
    subprocess.call(create_tmp_cmd, shell=True)

    # move to stage.
    mv_to_stage = "hadoop fs -cp {0}/y={1}/m={2}/d={3}/h={4}/{5}.csv {6}/.".format(
        h_csv_path, binary_year, binary_month, binary_day, binary_hour, file_name, h_stage_path)
    print mv_to_stage
    subprocess.call(mv_to_stage, shell=True)

    # load to avro
    load_avro_cmd = "hive -hiveconf dbname={6} -hiveconf y={0} -hiveconf m={1} -hiveconf d={2} -hiveconf h={3} -hiveconf data_location='{4}' -f oni/load_{5}_avro_parquet.hql".format(
        binary_year, binary_month, binary_day, binary_hour, h_stage_path, ingest_type,
        worker_conf['dbname'])
    print load_avro_cmd
    subprocess.call(load_avro_cmd, shell=True)

    # remove from stage
    rm_tmp_cmd = "hadoop fs -rm -R -skipTrash {0}".format(h_stage_path)
    print rm_tmp_cmd
    subprocess.call(rm_tmp_cmd, shell=True)

    # can this delete other files when all is running on the same edge server?
    rm_tmp = "rm ../stage/{0}*".format(file_name)
    subprocess.call(rm_tmp, shell=True)

    print datetime.datetime.now()
def _process_new_file(self, file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} {1}.".format(file, self._local_staging)
    self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
    Util.execute_cmd(get_file_cmd, self._logger)

    # get file name and date
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]
    flow_date = file_name.split('.')[1]
    flow_year = flow_date[0:4]
    flow_month = flow_date[4:6]
    flow_day = flow_date[6:8]
    flow_hour = flow_date[8:10]

    # build process cmd.
    process_cmd = "nfdump -o csv -r {0}{1} {2} > {0}{1}.csv".format(
        self._local_staging, file_name, self._process_opt)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # create hdfs staging.
    hdfs_path = "{0}/flow".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
    self._logger.info("Creating staging: {0}".format(create_staging_cmd))
    Util.execute_cmd(create_staging_cmd, self._logger)

    # move to stage.
    mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
        self._local_staging, file_name, hdfs_staging_path)
    self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
    subprocess.call(mv_to_staging, shell=True)

    # load to avro
    load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/flow/load_flow_avro_parquet.hql".format(
        self._db_name, flow_year, flow_month, flow_day, flow_hour, hdfs_staging_path)
    self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
    Util.execute_cmd(load_to_avro_cmd, self._logger)

    # remove from hdfs staging
    rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(hdfs_staging_path)
    self._logger.info("Removing staging path: {0}".format(rm_hdfs_staging_cmd))
    Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

    # remove from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info("File {0} was successfully processed.".format(file_name))
def _process_new_file(self, file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} {1}.".format(file, self._local_staging)
    self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
    Util.execute_cmd(get_file_cmd, self._logger)

    # get file name and date
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]
    binary_hour = file_name_parts[len(file_name_parts) - 2]
    binary_date_path = file_name_parts[len(file_name_parts) - 3]
    binary_year = binary_date_path[0:4]
    binary_month = binary_date_path[4:6]
    binary_day = binary_date_path[6:8]

    # build process cmd.
    process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
        self._local_staging, file_name, self._process_opt)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # create hdfs staging.
    hdfs_path = "{0}/dns".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
    self._logger.info("Creating staging: {0}".format(create_staging_cmd))
    Util.execute_cmd(create_staging_cmd, self._logger)

    # move to stage.
    mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
        self._local_staging, file_name, hdfs_staging_path)
    self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
    Util.execute_cmd(mv_to_staging, self._logger)

    # load to avro
    load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/dns/load_dns_avro_parquet.hql".format(
        self._db_name, binary_year, binary_month, binary_day, binary_hour, hdfs_staging_path)
    self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
    Util.execute_cmd(load_to_avro_cmd, self._logger)

    # remove from hdfs staging
    rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(hdfs_staging_path)
    self._logger.info("Removing staging path: {0}".format(rm_hdfs_staging_cmd))
    Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

    # remove from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info("File {0} was successfully processed.".format(file_name))