def start(self):

    self._logger.info("Creating Spark Job for topic: {0}".format(
        self._kafka_consumer.Topic))

    # parser
    parser = self._conf["parser"]

    # spark job command.
    spark_job_cmd = (
        "spark-submit --master yarn "
        "--jars {0}/oni/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
        "{1}/{2} "
        "-zk {3} "
        "-t {4} "
        "-db {5} "
        "-dt {6} "
        "-w {7}".format(
            os.path.dirname(os.path.dirname(self._script_path)),
            self._script_path,
            parser,
            self._kafka_consumer.ZookeperServer,
            self._kafka_consumer.Topic,
            self._db_name,
            "proxy",
            self._processes))

    # start spark job.
    Util.execute_cmd(spark_job_cmd, self._logger)
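# Illustrative only: assuming a hypothetical layout where self._script_path is
# "/opt/oni-ingest/pipelines/proxy", the configured parser is "proxy_parser.py",
# zookeeper is "cluster01:2181", the topic is "oni-proxy", the database is "onidb"
# and self._processes is 4, the command assembled above would read roughly:
#
#   spark-submit --master yarn \
#       --jars /opt/oni-ingest/oni/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar \
#       /opt/oni-ingest/pipelines/proxy/proxy_parser.py \
#       -zk cluster01:2181 -t oni-proxy -db onidb -dt proxy -w 4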
def _ingest_file(self, file):

    message = ""
    with open(file, "rb") as f:

        for line in f:
            message += line
            if len(message) > self._message_size:
                self._kafka_topic.send_message(
                    message, self._kafka_topic.Partition)
                message = ""

        # send the last message.
        self._kafka_topic.send_message(message, self._kafka_topic.Partition)

    rm_file = "rm {0}".format(file)
    Util.execute_cmd(rm_file, self._logger)
    self._logger.info(
        "File {0} has been successfully sent to Kafka Topic: {1}".format(
            file, self._kafka_topic.Topic))
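# Note (descriptive only): the size check runs after a line is appended, so a single
# message may exceed self._message_size by up to one input line; the smaller
# remainder left in the buffer is sent once the loop finishes.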
def _create_topic(self):

    self._logger.info("Creating topic: {0} with {1} partitions".format(
        self._topic, self._num_of_partitions))

    # Create partitions for the workers.
    self._partitions = [
        TopicPartition(self._topic, p)
        for p in range(int(self._num_of_partitions))
    ]

    # create partitioner
    self._partitioner = RoundRobinPartitioner(self._partitions)

    # build the create-topic command from the script's directory and the
    # zookeeper connection string.
    zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
    create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
        os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
        self._num_of_partitions)

    # execute create topic cmd
    Util.execute_cmd(create_topic_cmd, self._logger)
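# Illustrative only: with hypothetical values self._topic = "oni-proxy",
# self._zk_server = "cluster01", self._zk_port = 2181, 3 partitions, and this
# module living in /opt/oni-ingest/common, the command executed above would be:
#
#   /opt/oni-ingest/common/kafka_topic.sh create oni-proxy cluster01:2181 3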
def _ingest_file(self, file, partition):

    # get file name and date.
    org_file = file
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]

    # split file.
    name = file_name.split('.')[0]
    split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
        self._pkt_num, file, self._pcap_split_staging, name)
    self._logger.info("Splitting file: {0}".format(split_cmd))
    Util.execute_cmd(split_cmd, self._logger)

    for currdir, subdir, files in os.walk(self._pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_oni".format(name) in file:

                # get timestamp from the file name to build hdfs path.
                file_date = file.split('.')[0]
                pcap_hour = file_date[-6:-4]
                pcap_date_path = file_date[-14:-6]

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(
                    self._hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                Util.creat_hdfs_folder(hdfs_path, self._logger)

                # load file to hdfs.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                Util.load_to_hdfs(os.path.join(currdir, file),
                                  hadoop_pcap_file, self._logger)

                # create event for workers to process the file.
                self._logger.info(
                    "Sending split file to worker number: {0}".format(
                        partition))
                self._kafka_topic.send_message(hadoop_pcap_file, partition)
                self._logger.info(
                    "File {0} has been successfully sent to Kafka Topic: {1}"
                    .format(file, self._kafka_topic.Topic))

    self._logger.info("Removing file: {0}".format(org_file))
    rm_big_file = "rm {0}".format(org_file)
    Util.execute_cmd(rm_big_file, self._logger)
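# Illustrative only: editcap names each split capture with a sequence number and the
# timestamp of its first packet; the hypothetical name below shows how the slicing
# above recovers the date path and hour used for the HDFS layout.
def _example_pcap_date_parsing():
    file_date = "trace_oni_00000_20161029101500"   # split file name without ".pcap"
    pcap_hour = file_date[-6:-4]                   # "10"
    pcap_date_path = file_date[-14:-6]             # "20161029"
    return pcap_date_path, pcap_hour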
def _process_new_file(self, file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} {1}.".format(file, self._local_staging)
    self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
    Util.execute_cmd(get_file_cmd, self._logger)

    # get file name and date
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]
    binary_hour = file_name_parts[len(file_name_parts) - 2]
    binary_date_path = file_name_parts[len(file_name_parts) - 3]
    binary_year = binary_date_path[0:4]
    binary_month = binary_date_path[4:6]
    binary_day = binary_date_path[6:8]

    # build process cmd.
    process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
        self._local_staging, file_name, self._process_opt)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # create hdfs staging.
    hdfs_path = "{0}/dns".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
    self._logger.info("Creating staging: {0}".format(create_staging_cmd))
    Util.execute_cmd(create_staging_cmd, self._logger)

    # move to stage.
    mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
        self._local_staging, file_name, hdfs_staging_path)
    self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
    Util.execute_cmd(mv_to_staging, self._logger)

    # load to avro
    load_to_avro_cmd = (
        "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} "
        "-hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' "
        "-f pipelines/dns/load_dns_avro_parquet.hql").format(
            self._db_name, binary_year, binary_month, binary_day,
            binary_hour, hdfs_staging_path)
    self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
    Util.execute_cmd(load_to_avro_cmd, self._logger)

    # remove from hdfs staging
    rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(
        hdfs_staging_path)
    self._logger.info(
        "Removing staging path: {0}".format(rm_hdfs_staging_cmd))
    Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

    # remove from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info(
        "Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info(
        "File {0} was successfully processed.".format(file_name))
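# Illustrative only, not part of the worker: a minimal, self-contained sketch of how
# the hive partition values are derived from a hypothetical pcap path following the
# binary/<date>/<hour> layout written by the collector shown earlier.
def _example_dns_partition_values():
    path = "/user/oni/binary/20161029/10/trace_oni_00000_20161029101500.pcap"
    parts = path.split('/')
    binary_hour = parts[-2]            # "10"
    binary_date_path = parts[-3]       # "20161029"
    return (binary_date_path[0:4], binary_date_path[4:6],
            binary_date_path[6:8], binary_hour)   # ("2016", "10", "29", "10")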
def _process_new_file(self, file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} {1}.".format(file, self._local_staging)
    self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
    Util.execute_cmd(get_file_cmd, self._logger)

    # get file name and date
    file_name_parts = file.split('/')
    file_name = file_name_parts[len(file_name_parts) - 1]
    flow_date = file_name.split('.')[1]
    flow_year = flow_date[0:4]
    flow_month = flow_date[4:6]
    flow_day = flow_date[6:8]
    flow_hour = flow_date[8:10]

    # build process cmd.
    process_cmd = "nfdump -o csv -r {0}{1} {2} > {0}{1}.csv".format(
        self._local_staging, file_name, self._process_opt)
    self._logger.info("Processing file: {0}".format(process_cmd))
    Util.execute_cmd(process_cmd, self._logger)

    # create hdfs staging.
    hdfs_path = "{0}/flow".format(self._hdfs_app_path)
    staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path, staging_timestamp)
    create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
    self._logger.info("Creating staging: {0}".format(create_staging_cmd))
    Util.execute_cmd(create_staging_cmd, self._logger)

    # move to stage.
    mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
        self._local_staging, file_name, hdfs_staging_path)
    self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
    subprocess.call(mv_to_staging, shell=True)

    # load to avro
    load_to_avro_cmd = (
        "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} "
        "-hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' "
        "-f pipelines/flow/load_flow_avro_parquet.hql").format(
            self._db_name, flow_year, flow_month, flow_day, flow_hour,
            hdfs_staging_path)
    self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
    Util.execute_cmd(load_to_avro_cmd, self._logger)

    # remove from hdfs staging
    rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(
        hdfs_staging_path)
    self._logger.info(
        "Removing staging path: {0}".format(rm_hdfs_staging_cmd))
    Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

    # remove from local staging.
    rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
    self._logger.info(
        "Removing files from local staging: {0}".format(rm_local_staging))
    Util.execute_cmd(rm_local_staging, self._logger)

    self._logger.info(
        "File {0} was successfully processed.".format(file_name))
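# Illustrative only, not part of the worker: a minimal, self-contained sketch of the
# date parsing above, run against a hypothetical rotated nfcapd file name.
def _example_flow_date_parsing():
    file_name = "nfcapd.201610291000"      # hypothetical nfcapd file (YYYYMMDDhhmm)
    flow_date = file_name.split('.')[1]    # "201610291000"
    return (flow_date[0:4], flow_date[4:6],
            flow_date[6:8], flow_date[8:10])   # ("2016", "10", "29", "10")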