Code Example #1
    def _ingest_file(self, file, partition):

        # get file name and date.
        file_name_parts = file.split('/')
        file_name = file_name_parts[-1]
        file_date = file_name.split('.')[1]

        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path,
                                                file_date_path, file_date_hour)
        Util.creat_hdfs_folder(hdfs_path, self._logger)

        # load to hdfs.
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.load_to_hdfs(file, hdfs_file, self._logger)

        # create event for workers to process the file.
        self._logger.info(
            "Sending file to worker number: {0}".format(partition))
        self._kafka_topic.send_message(hdfs_file, partition)

        self._logger.info(
            "File {0} has been successfully sent to Kafka topic: {1}".format(
                file, self._kafka_topic.Topic))
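The Util helpers called above are defined elsewhere in the project and are not shown on this page. As a rough sketch only, assuming they simply shell out to the hdfs CLI (an assumption, not the project's confirmed implementation):

    import subprocess

    def creat_hdfs_folder(hdfs_path, logger):
        # create the timestamped folder (and any parents) in HDFS.
        mkdir_cmd = "hdfs dfs -mkdir -p {0}".format(hdfs_path)
        logger.info("Creating HDFS folder: {0}".format(mkdir_cmd))
        subprocess.call(mkdir_cmd, shell=True)

    def load_to_hdfs(local_file, hdfs_file, logger):
        # copy the local file into the timestamped HDFS folder.
        put_cmd = "hdfs dfs -put {0} {1}".format(local_file, hdfs_file)
        logger.info("Loading file to HDFS: {0}".format(put_cmd))
        subprocess.call(put_cmd, shell=True)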
Code Example #2
    def _process_pcap_file(self, file_name, file_local_path, hdfs_root_path):

        # get timestamp from the file name.
        file_date = file_name.split('.')[0]
        pcap_hour = file_date[-4:-2]
        pcap_date_path = file_date[-12:-4]

        # hdfs path with timestamp.
        hdfs_path = "{0}/{1}/{2}".format(hdfs_root_path, pcap_date_path,
                                         pcap_hour)
        Util.creat_hdfs_folder(hdfs_path)

        # get file size; files larger than ~1.1 GB (1145498644 bytes) are split.
        file_size = os.stat(file_local_path)
        if file_size.st_size > 1145498644:

            # split file.
            self._split_pcap_file(file_name, file_local_path, hdfs_path)
        else:
            # load file to hdfs
            Util.load_to_hdfs(file_name, file_local_path, hdfs_path)

            # send rabbitmq notification.
            hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
            Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)
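Util.send_new_file_notification is likewise external to this snippet. A minimal sketch of the RabbitMQ publish it implies, assuming the pika client and a broker on localhost (both assumptions):

    import pika

    def send_new_file_notification(hdfs_file, queue_name):
        # publish the HDFS path of the new file so a worker can pick it up.
        connection = pika.BlockingConnection(pika.ConnectionParameters("localhost"))
        channel = connection.channel()
        channel.queue_declare(queue=queue_name)
        channel.basic_publish(exchange="", routing_key=queue_name, body=hdfs_file)
        connection.close()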
Code Example #3
File: collector.py Project: bulmanp/oni-ingest
    def _ingest_file(self, file, partition):

        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[-1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
            self._pkt_num, file, self._pcap_split_staging, name)
        self._logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, self._logger)

        for currdir, subdir, files in os.walk(self._pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_oni".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        self._hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, self._logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, self._logger)

                    # create event for workers to process the file.
                    self._logger.info(
                        "Sending split file to worker number: {0}".format(
                            partition))
                    self._kafka_topic.send_message(hadoop_pcap_file, partition)
                    self._logger.info(
                        "File {0} has been successfully sent to Kafka topic: {1}".format(
                            file, self._kafka_topic.Topic))

        self._logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, self._logger)
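The negative slice indices above assume that editcap names each split file with a trailing YYYYMMDDHHMMSS timestamp. With a hypothetical split-file name, the extraction works out as follows:

    # hypothetical name produced by "editcap -c": <prefix>_<seq>_YYYYMMDDHHMMSS.pcap
    file = "trace_oni_00000_20160510143000.pcap"

    file_date = file.split('.')[0]      # "trace_oni_00000_20160510143000"
    pcap_hour = file_date[-6:-4]        # "14"       (HH of HHMMSS)
    pcap_date_path = file_date[-14:-6]  # "20160510" (YYYYMMDD)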
Code Example #4
    def _load_to_hdfs(self, file):

        # get file name and date
        binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(
            file, 'flow')

        # hdfs path with timestamp.
        hdfs_path = "{0}/{1}/{2}".format(self._hdfs_root_path, binary_date_path, binary_hour)
        Util.creat_hdfs_folder(hdfs_path)

        # load to hdfs.
        Util.load_to_hdfs(file_name, file, hdfs_path)

        # send the notification to rabbitmq server.
        hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

        print "Done !!!!!"
Code Example #5
File: flow_master.py Project: ronkuhl/oni-ingest
    def _load_to_hdfs(self, file):

        # get file name and date
        binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(
            file, 'flow')

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path,
                                                binary_date_path, binary_hour)
        Util.creat_hdfs_folder(hdfs_path)

        # load to hdfs.
        Util.load_to_hdfs(file_name, file, hdfs_path)

        # send the notification to rabbitmq server.
        hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

        print "Done !!!!!"
Code Example #6
    def _ingest_file(self, file, partition):

        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[-1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
            self._pkt_num, file, self._pcap_split_staging, name)
        self._logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, self._logger)

        for currdir, subdir, files in os.walk(self._pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_oni".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        self._hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, self._logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, self._logger)

                    # create event for workers to process the file.
                    self._logger.info(
                        "Sending split file to worker number: {0}".format(partition))
                    self._kafka_topic.send_message(hadoop_pcap_file, partition)
                    self._logger.info(
                        "File {0} has been successfully sent to Kafka topic: {1}".format(
                            file, self._kafka_topic.Topic))

        self._logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, self._logger)
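Util.execute_cmd wraps the shell calls used for splitting and cleanup; a minimal sketch, assuming it is a thin wrapper around subprocess (an assumption, not the project's confirmed implementation):

    import subprocess

    def execute_cmd(command, logger):
        # run a shell command and log a failure instead of raising.
        logger.info("Executing: {0}".format(command))
        return_code = subprocess.call(command, shell=True)
        if return_code != 0:
            logger.error("Command failed ({0}): {1}".format(return_code, command))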
Code Example #7
    def _process_pcap_file(self, file_name, file_local_path, hdfs_root_path):

        # get timestamp from the file name.
        file_date = file_name.split('.')[0]
        pcap_hour = file_date[-4:-2]
        pcap_date_path = file_date[-12:-4]

        # hdfs path with timestamp.
        hdfs_path = "{0}/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)
        Util.creat_hdfs_folder(hdfs_path)

        # get file size; files larger than ~1.1 GB (1145498644 bytes) are split.
        file_size = os.stat(file_local_path)
        if file_size.st_size > 1145498644:

            # split file.
            self._split_pcap_file(file_name, file_local_path, hdfs_path)
        else:
            # load file to hdfs.
            Util.load_to_hdfs(file_name, file_local_path, hdfs_path)

            # send rabbitmq notification.
            hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
            Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)