def _ingest_file(self, file, partition):
    """Load a local binary file into HDFS and notify a Kafka worker.

    Builds a date/hour-partitioned HDFS path from the timestamp
    embedded in the file name (expected form: <name>.<YYYYMMDDhh...>.<ext>),
    copies the file there, then publishes the HDFS path to the Kafka
    partition assigned to the target worker.

    :param file: local path of the file to ingest.
    :param partition: Kafka partition number (worker) that should
        process the file.
    """
    # File name is the last path component; the timestamp is the
    # second dot-separated token of the name.
    file_name = file.split('/')[-1]
    file_date = file_name.split('.')[1]
    file_date_path = file_date[0:8]   # YYYYMMDD
    file_date_hour = file_date[8:10]  # hh

    # hdfs path with timestamp.
    hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path,
                                            file_date_path, file_date_hour)
    Util.creat_hdfs_folder(hdfs_path, self._logger)

    # load to hdfs.
    hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
    Util.load_to_hdfs(file, hdfs_file, self._logger)

    # create event for workers to process the file.
    self._logger.info(
        "Sending file to worker number: {0}".format(partition))
    # BUG FIX: the instance attribute is self._kafka_topic (used by the
    # log line below and by the sibling methods); the original called
    # self.kafka_topic, which raises AttributeError at runtime.
    self._kafka_topic.send_message(hdfs_file, partition)
    self._logger.info(
        "File {0} has been successfully sent to Kafka Topic to: {1}".
        format(file, self._kafka_topic.Topic))
def _process_pcap_file(self, file_name, file_local_path, hdfs_root_path):
    """Stage a pcap file into HDFS and announce it over RabbitMQ.

    The destination folder is derived from the timestamp embedded in
    the file name. Files larger than roughly 1 GB are split before
    loading; smaller ones are copied whole.

    :param file_name: base name of the pcap file (timestamp in the
        first dot-separated token; hour at [-4:-2], date at [-12:-4]).
    :param file_local_path: path of the file on the local filesystem.
    :param hdfs_root_path: root HDFS folder to stage under.
    """
    # Slice hour (hh) and date (YYYYMMDD) out of the name's timestamp.
    timestamp = file_name.split('.')[0]
    pcap_hour = timestamp[-4:-2]
    pcap_date_path = timestamp[-12:-4]

    # Date/hour-partitioned destination folder.
    hdfs_path = "{0}/{1}/{2}".format(hdfs_root_path, pcap_date_path,
                                     pcap_hour)
    Util.creat_hdfs_folder(hdfs_path)

    # Oversized captures are chunked first so single HDFS writes stay
    # manageable; the threshold is ~1.07 GiB.
    split_threshold_bytes = 1145498644
    if os.stat(file_local_path).st_size > split_threshold_bytes:
        self._split_pcap_file(file_name, file_local_path, hdfs_path)
    else:
        Util.load_to_hdfs(file_name, file_local_path, hdfs_path)

    # Tell downstream consumers where the new file landed.
    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
    Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)
def _ingest_file(self, file, partition):
    """Split a large pcap, load each piece into HDFS, notify Kafka.

    Runs editcap to chop the input into self._pkt_num-packet chunks in
    the split-staging folder, loads every resulting ``<name>_oni`` pcap
    into a date/hour-partitioned HDFS path, publishes each HDFS path to
    the given Kafka partition, and finally removes the original file.

    :param file: local path of the large pcap file to ingest.
    :param partition: Kafka partition number (worker) to notify.
    """
    # get file name and date.
    file_name = file.split('/')[-1]
    name = file_name.split('.')[0]

    # split file into fixed-size packet chunks with editcap.
    split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
        self._pkt_num, file, self._pcap_split_staging, name)
    self._logger.info("Splitting file: {0}".format(split_cmd))
    Util.execute_cmd(split_cmd, self._logger)

    for currdir, subdir, files in os.walk(self._pcap_split_staging):
        # FIX: use a distinct loop name instead of shadowing the `file`
        # parameter (and builtin); the original needed an org_file
        # backup variable only because of that shadowing.
        for split_file in files:
            if split_file.endswith(".pcap") and \
                    "{0}_oni".format(name) in split_file:
                # get timestamp from the file name to build hdfs path.
                file_date = split_file.split('.')[0]
                pcap_hour = file_date[-6:-4]      # hh
                pcap_date_path = file_date[-14:-6]  # YYYYMMDD

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(
                    self._hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                Util.creat_hdfs_folder(hdfs_path, self._logger)

                # load file to hdfs.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, split_file)
                Util.load_to_hdfs(os.path.join(currdir, split_file),
                                  hadoop_pcap_file, self._logger)

                # create event for workers to process the file.
                self._logger.info(
                    "Sending split file to worker number: {0}".format(
                        partition))
                self._kafka_topic.send_message(hadoop_pcap_file, partition)
                self._logger.info(
                    "File {0} has been successfully sent to Kafka Topic to: {1}"
                    .format(split_file, self._kafka_topic.Topic))

    # remove the original (pre-split) file from local staging.
    # NOTE(review): the shell string is built from the file path;
    # paths containing spaces/metacharacters would break here.
    self._logger.info("Removing file: {0}".format(file))
    rm_big_file = "rm {0}".format(file)
    Util.execute_cmd(rm_big_file, self._logger)
def _load_to_hdfs(self,file): # get file name and date binary_year,binary_month,binary_day,binary_hour,binary_date_path,file_name = Util.build_hdfs_path(file,'flow') # hdfs path with timestamp. hdfs_path = "{0}/{1}/{2}".format(self._hdfs_root_path,binary_date_path,binary_hour) Util.creat_hdfs_folder(hdfs_path) # load to hdfs. Util.load_to_hdfs(file_name,file,hdfs_path) # send the notification to rabbitmq server. hadoop_pcap_file = "{0}/{1}".format(hdfs_path,file_name) Util.send_new_file_notification(hadoop_pcap_file,self._queue_name) print "Done !!!!!"
def _load_to_hdfs(self, file): # get file name and date binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path( file, 'flow') # hdfs path with timestamp. hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path, binary_date_path, binary_hour) Util.creat_hdfs_folder(hdfs_path) # load to hdfs. Util.load_to_hdfs(file_name, file, hdfs_path) # send the notification to rabbitmq server. hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name) Util.send_new_file_notification(hadoop_pcap_file, self._queue_name) print "Done !!!!!"
def _ingest_file(self, file, partition):
    """Split a large pcap with editcap, load the chunks into HDFS and
    notify a Kafka worker for each chunk; remove the original at the end.

    :param file: local path of the large pcap file to ingest.
    :param partition: Kafka partition number (worker) to notify.
    """
    # get file name and date.
    file_name = file.split('/')[-1]
    name = file_name.split('.')[0]

    # split file into self._pkt_num-packet chunks.
    split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
        self._pkt_num, file, self._pcap_split_staging, name)
    self._logger.info("Splitting file: {0}".format(split_cmd))
    Util.execute_cmd(split_cmd, self._logger)

    for currdir, subdir, files in os.walk(self._pcap_split_staging):
        # FIX: distinct loop name; the original shadowed the `file`
        # parameter (and builtin), which is why it had to stash an
        # org_file backup before the loop.
        for chunk in files:
            if chunk.endswith(".pcap") and "{0}_oni".format(name) in chunk:
                # get timestamp from the file name to build hdfs path.
                file_date = chunk.split('.')[0]
                pcap_hour = file_date[-6:-4]        # hh
                pcap_date_path = file_date[-14:-6]  # YYYYMMDD

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(
                    self._hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                Util.creat_hdfs_folder(hdfs_path, self._logger)

                # load file to hdfs.
                hadoop_pcap_file = "{0}/{1}".format(hdfs_path, chunk)
                Util.load_to_hdfs(os.path.join(currdir, chunk),
                                  hadoop_pcap_file, self._logger)

                # create event for workers to process the file.
                self._logger.info(
                    "Sending split file to worker number: {0}".format(
                        partition))
                self._kafka_topic.send_message(hadoop_pcap_file, partition)
                self._logger.info(
                    "File {0} has been successfully sent to Kafka Topic to: {1}"
                    .format(chunk, self._kafka_topic.Topic))

    # remove the original (pre-split) file from local staging.
    # NOTE(review): shell string built from the path; would break on
    # names with spaces/metacharacters.
    self._logger.info("Removing file: {0}".format(file))
    rm_big_file = "rm {0}".format(file)
    Util.execute_cmd(rm_big_file, self._logger)
def _process_pcap_file(self, file_name, file_local_path, hdfs_root_path):
    """Move a pcap file into a date/hour HDFS folder and notify RabbitMQ.

    Large captures (over ~1 GB) are split before loading; the rest are
    loaded as-is.

    :param file_name: base name of the pcap (timestamp in its first
        dot-separated token; hour at [-4:-2], date at [-12:-4]).
    :param file_local_path: local filesystem path of the pcap.
    :param hdfs_root_path: HDFS root folder to stage under.
    """
    # Extract hour and date slices from the embedded timestamp.
    stamp = file_name.split('.')[0]
    hour_part = stamp[-4:-2]
    date_part = stamp[-12:-4]

    # Build and create the partitioned destination folder.
    hdfs_path = "{0}/{1}/{2}".format(hdfs_root_path, date_part, hour_part)
    Util.creat_hdfs_folder(hdfs_path)

    # Split oversized files (threshold ~1.07 GiB) before loading.
    if os.stat(file_local_path).st_size > 1145498644:
        self._split_pcap_file(file_name, file_local_path, hdfs_path)
    else:
        Util.load_to_hdfs(file_name, file_local_path, hdfs_path)

    # Announce the staged file to queue consumers.
    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
    Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)