Code Example #1
File: worker.py  Project: rahulpedduri/oni-ingest
    def start(self):

        self._logger.info("Creating Spark Job for topic: {0}".format(
            self._kafka_consumer.Topic))

        # parser
        parser = self._conf["parser"]

        # spark job command.
        spark_job_cmd = (
            "spark-submit --master yarn "
            "--jars {0}/oni/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
            "{1}/{2} "
            "-zk {3} "
            "-t {4} "
            "-db {5} "
            "-dt {6} "
            "-w {7}".format(
                os.path.dirname(os.path.dirname(self._script_path)),
                self._script_path, parser, self._kafka_consumer.ZookeperServer,
                self._kafka_consumer.Topic, self._db_name, "proxy",
                self._processes))

        # start spark job.
        Util.execute_cmd(spark_job_cmd, self._logger)
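For context, a minimal sketch of what a command runner like Util.execute_cmd could look like, assuming it simply shells out via subprocess and logs the result (the actual oni-ingest helper may differ):

import subprocess

def execute_cmd(command, logger):
    # Run the shell command and report its exit status (illustrative only).
    logger.info("Running command: {0}".format(command))
    return_code = subprocess.call(command, shell=True)
    if return_code != 0:
        logger.error("Command returned {0}: {1}".format(return_code, command))
    return return_code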
Code Example #2
    def _ingest_file(self, file):

        message = ""
        with open(file, "rb") as f:

            for line in f:
                message += line
                if len(message) > self._message_size:
                    self._kafka_topic.send_message(message,
                                                   self._kafka_topic.Partition)
                    message = ""
            # send the last package.
            self._kafka_topic.send_message(message,
                                           self._kafka_topic.Partition)
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, self._logger)
        self._logger.info(
            "File {0} has been successfully sent to Kafka Topic:{1}".format(
                file, self._kafka_topic.Topic))
Code Example #3
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} partitions".format(
            self._topic, self._num_of_partitions))

        # Create partitions for the workers.
        self._partitions = [
            TopicPartition(self._topic, p)
            for p in range(int(self._num_of_partitions))
        ]

        # create partitioner
        self._partitioner = RoundRobinPartitioner(self._partitions)

        # build the create-topic command using the script's directory.
        zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
            os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
            self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd, self._logger)
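Once the partitions exist, files can be distributed across them worker by worker. A minimal round-robin sketch using itertools.cycle; this is an assumption about how the partition numbers could be cycled, not the project's exact logic:

import itertools

# Cycle through partition numbers 0..N-1 so consecutive files land on
# consecutive workers (illustrative only).
num_of_partitions = 3
partition_cycle = itertools.cycle(range(num_of_partitions))

for file in ["a.pcap", "b.pcap", "c.pcap", "d.pcap"]:
    partition = next(partition_cycle)
    print("{0} -> partition {1}".format(file, partition))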
Code Example #4
File: collector.py  Project: bulmanp/oni-ingest
    def _ingest_file(self, file, partition):

        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
            self._pkt_num, file, self._pcap_split_staging, name)
        self._logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, self._logger)

        for currdir, subdir, files in os.walk(self._pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_oni".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        self._hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, self._logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, self._logger)

                    # create event for workers to process the file.
                    self._logger.info(
                        "Sending split file to worker number: {0}".format(
                            partition))
                    self._kafka_topic.send_message(hadoop_pcap_file, partition)
                    self._logger.info(
                        "File {0} has been successfully sent to Kafka Topic to: {1}"
                        .format(file, self._kafka_topic.Topic))

        self._logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, self._logger)
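The slicing of file_date above relies on editcap appending a sequence number and a YYYYMMDDHHMMSS timestamp to each split file. A small illustration with an assumed file name:

# Assumed split-file name of the form <name>_oni_<seq>_<YYYYMMDDHHMMSS>.pcap
file = "sample_oni_00000_20160707134512.pcap"
file_date = file.split('.')[0]       # "sample_oni_00000_20160707134512"
pcap_hour = file_date[-6:-4]         # "13"
pcap_date_path = file_date[-14:-6]   # "20160707"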
Code Example #5
File: collector.py  Project: rahulpedduri/oni-ingest
    def _ingest_file(self, file):

        message = ""
        with open(file, "rb") as f:

            for line in f:
                message += line
                if len(message) > self._message_size:
                    self._kafka_topic.send_message(message,
                                                   self._kafka_topic.Partition)
                    message = ""
            # send the last package.
            self._kafka_topic.send_message(message,
                                           self._kafka_topic.Partition)
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, self._logger)
        self._logger.info(
            "File {0} has been successfully sent to Kafka Topic:{1}".format(
                file, self._kafka_topic.Topic))
Code Example #6
    def _ingest_file(self, file, partition):

        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_oni.pcap".format(
            self._pkt_num, file, self._pcap_split_staging, name)
        self._logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, self._logger)

        for currdir, subdir, files in os.walk(self._pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_oni".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        self._hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, self._logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, self._logger)

                    # create event for workers to process the file.
                    self._logger.info(
                        "Sending split file to worker number: {0}".format(
                            partition))
                    self._kafka_topic.send_message(hadoop_pcap_file, partition)
                    self._logger.info(
                        "File {0} has been successfully sent to Kafka Topic to: {1}"
                        .format(file, self._kafka_topic.Topic))

        self._logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, self._logger)
Code Example #7
    def start(self):

        self._logger.info("Creating Spark Job for topic: {0}".format(
            self._kafka_consumer.Topic))

        # parser
        parser = self._conf["parser"]

        # spark job command.
        spark_job_cmd = (
            "spark-submit --master yarn "
            "--jars {0}/oni/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
            "{1}/{2} "
            "-zk {3} "
            "-t {4} "
            "-db {5} "
            "-dt {6} "
            "-w {7}".format(
                os.path.dirname(os.path.dirname(self._script_path)),
                self._script_path, parser, self._kafka_consumer.ZookeperServer,
                self._kafka_consumer.Topic, self._db_name, "proxy",
                self._processes))

        # start spark job.
        Util.execute_cmd(spark_job_cmd, self._logger)
Code Example #8
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} parititions".format(
            self._topic, self._num_of_partitions))

        # Create partitions for the workers.
        self._partitions = [
            TopicPartition(self._topic, p)
            for p in range(int(self._num_of_partitions))
        ]

        # create partitioner
        self._partitioner = RoundRobinPartitioner(self._partitions)

        # build the create-topic command using the script's directory.
        zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
            os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
            self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd, self._logger)
Code Example #9
    def _process_new_file(self, file):

        # get file from hdfs
        get_file_cmd = "hadoop fs -get {0} {1}.".format(
            file, self._local_staging)
        self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
        Util.execute_cmd(get_file_cmd, self._logger)

        # get file name and date
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        binary_hour = file_name_parts[len(file_name_parts) - 2]
        binary_date_path = file_name_parts[len(file_name_parts) - 3]
        binary_year = binary_date_path[0:4]
        binary_month = binary_date_path[4:6]
        binary_day = binary_date_path[6:8]

        # build process cmd.
        process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
            self._local_staging, file_name, self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd, self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/dns".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path,
                                                   staging_timestamp)
        create_staging_cmd = "hadoop fs -mkdir -p {0}".format(
            hdfs_staging_path)
        self._logger.info("Creating staging: {0}".format(create_staging_cmd))
        Util.execute_cmd(create_staging_cmd, self._logger)

        # move to stage.
        mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
            self._local_staging, file_name, hdfs_staging_path)
        self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
        Util.execute_cmd(mv_to_staging, self._logger)

        # load to avro
        load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/dns/load_dns_avro_parquet.hql".format(
            self._db_name, binary_year, binary_month, binary_day, binary_hour,
            hdfs_staging_path)

        self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
        Util.execute_cmd(load_to_avro_cmd, self._logger)

        # remove from hdfs staging
        rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(
            hdfs_staging_path)
        self._logger.info(
            "Removing staging path: {0}".format(rm_hdfs_staging_cmd))
        Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        self._logger.info(
            "File {0} was successfully processed.".format(file_name))
Code Example #10
    def _process_new_file(self, file):

        # get file from hdfs
        get_file_cmd = "hadoop fs -get {0} {1}.".format(
            file, self._local_staging)
        self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
        Util.execute_cmd(get_file_cmd, self._logger)

        # get file name and date
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        flow_date = file_name.split('.')[1]
        flow_year = flow_date[0:4]
        flow_month = flow_date[4:6]
        flow_day = flow_date[6:8]
        flow_hour = flow_date[8:10]

        # build process cmd.
        process_cmd = "nfdump -o csv -r {0}{1} {2} > {0}{1}.csv".format(
            self._local_staging, file_name, self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd, self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/flow".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path,
                                                   staging_timestamp)
        create_staging_cmd = "hadoop fs -mkdir -p {0}".format(
            hdfs_staging_path)
        self._logger.info("Creating staging: {0}".format(create_staging_cmd))
        Util.execute_cmd(create_staging_cmd, self._logger)

        # move to stage.
        mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
            self._local_staging, file_name, hdfs_staging_path)
        self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
        subprocess.call(mv_to_staging, shell=True)

        # load to avro
        load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/flow/load_flow_avro_parquet.hql".format(
            self._db_name, flow_year, flow_month, flow_day, flow_hour,
            hdfs_staging_path)

        self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
        Util.execute_cmd(load_to_avro_cmd, self._logger)

        # remove from hdfs staging
        rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(
            hdfs_staging_path)
        self._logger.info(
            "Removing staging path: {0}".format(rm_hdfs_staging_cmd))
        Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        self._logger.info(
            "File {0} was successfully processed.".format(file_name))