Code Example #1
def ingest_file(file, message_size, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))
    try:
        message = ""
        logger.info("Ingesting file: {0} process:{1}".format(
            file, os.getpid()))
        with open(file, "rb") as f:
            for line in f:
                message += line
                if len(message) > message_size:
                    KafkaProducer.SendMessage(message, kafka_servers, topic, 0)
                    message = ""
            #send the last package.
            KafkaProducer.SendMessage(message, kafka_servers, topic, 0)
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, logger)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic: {1}".format(
                file, topic))

    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message:{0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
Code Example #2
File: worker.py  Project: cgiraldo/incubator-spot
    def start(self):

        self._logger.info("Creating Spark Job for topic: {0}".format(self._kafka_consumer.Topic))                

        # parser
        parser = self._conf["parser"]

        #spark conf
        driver_memory = self._spark_conf["driver_memory"]
        num_exec = self._spark_conf["spark_exec"]
        exec_memory = self._spark_conf["spark_executor_memory"]
        exec_cores = self._spark_conf["spark_executor_cores"]
        batch_size = self._spark_conf["spark_batch_size"]
        
        jar_path = os.path.dirname(os.path.dirname(self._script_path))
        # spark job command.          
        spark_job_cmd = ("spark-submit --master yarn "
                        "--driver-memory {0} "
                        "--num-executors {1} "
                        "--conf spark.executor.memory={2} "
                        "--conf spark.executor.cores={3} "
                        "--jars {4}/common/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
                        "{5}/{6} "
                        "-zk {7} "
                        "-t {8} "
                        "-db {9} "
                        "-dt {10} "
                        "-w {11} "
                        "-bs {12}".format(diver_memory,num_exec,exec_memory,exec_cores,jar_path,self._script_path,parser,self._kafka_consumer.ZookeperServer,self._kafka_consumer.Topic,self._db_name,"proxy",self._processes,batch_size))
        
        # start spark job.
        Util.execute_cmd(spark_job_cmd,self._logger)
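
The start method above only reads a handful of keys from self._spark_conf; a configuration block with those keys might look like the following sketch (key names mirror the lookups in the code, the values are invented for illustration).

# Illustrative spark section of the ingest configuration; values are made up.
spark_conf_example = {
    "driver_memory": "2g",
    "spark_exec": "4",
    "spark_executor_memory": "4g",
    "spark_executor_cores": "2",
    "spark_batch_size": "30",
}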
Code Example #3
File: collector.py  Project: solrac901/incubator-spot
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path,
                topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

        for currdir, subdir, files in os.walk(pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, logger)

                    # create event for workers to process the file.
                    logger.info(
                        "Sending split file to worker number: {0}".format(
                            partition))
                    KafkaTopic.SendMessage(hadoop_pcap_file, kafka_servers,
                                           topic, partition)
                    logger.info(
                        "File {0} has been successfully sent to Kafka Topic to: {1}"
                        .format(file, topic))

    except Exception as err:

        logger.error(
            "There was a problem, please check the following error message:{0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
Code Example #4
File: kafka_client.py  Project: zy0001/incubator-spot
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} parititions".format(
            self._topic, self._num_of_partitions))

        # get script path
        zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
            os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
            self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd, self._logger)
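
Substituting illustrative values shows the shape of the command string this method builds; the script directory, topic, and ZooKeeper host below are hypothetical.

# Purely illustrative rendering of create_topic_cmd; none of these values come from the project.
script_dir = "/opt/spot-ingest/common"
topic, zk_conf, num_of_partitions = "SPOT-INGEST-dns", "zk01.example.com:2181", 3
create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
    script_dir, topic, zk_conf, num_of_partitions)
# -> "/opt/spot-ingest/common/kafka_topic.sh create SPOT-INGEST-dns zk01.example.com:2181 3"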
Code Example #5
File: collector.py  Project: zy0001/incubator-spot
def _ingest_file(new_file, hdfs_root_path, producer, topic):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

    try:

        # get file name and date.
        file_name_parts = new_file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]
        file_date = file_name.split('.')[1]
        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path,
                                                file_date_hour)
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)

        try:
            if len(hdfs.list_dir(hdfs_path)) == 0:
                logger.info('creating directory: ' + hdfs_path)
                hdfs.mkdir(hdfs_path)
            logger.info('uploading file to hdfs: ' + hdfs_file)
            result = hdfs.upload_file(hdfs_path, new_file)
            if not result:
                logger.error('File failed to upload: ' + hdfs_file)
                raise HdfsException
            else:
                rm_file = "rm {0}".format(new_file)
                logger.info(
                    "Removing files from local staging: {0}".format(rm_file))
                Util.execute_cmd(rm_file, logger)

        except HdfsException as err:
            logger.error('Exception: ' + err.exception)
            logger.info('Check Hdfs Connection settings and server health')

    except Exception as err:
        logger.error("There was a problem, Exception: {0}".format(err))

        # create event for workers to process the file.
        # logger.info("Sending file to worker number: {0}".format(partition))
    try:
        producer.SendMessage(hdfs_file, topic)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic to: {1}".
            format(hdfs_file, topic))
    except Exception as err:
        logger.info("File {0} failed to be sent to Kafka Topic to: {1}".format(
            hdfs_file, topic))
        logger.error("Error: {0}".format(err))
Code Example #6
File: collector.py  Project: cgiraldo/incubator-spot
def ingest_file(file,pkt_num,pcap_split_staging, partition,hdfs_root_path,topic,kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))
    
    try:
        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts)-1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(pkt_num,file,pcap_split_staging,name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd,logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file,logger)    

        for currdir, subdir, files in os.walk(pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file), hadoop_pcap_file, logger)

                    # create event for workers to process the file.
                    logger.info("Sending split file to worker number: {0}".format(partition))
                    KafkaTopic.SendMessage(hadoop_pcap_file, kafka_servers, topic, partition)
                    logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file, topic))

    except Exception as err:
        logger.error("There was a problem, please check the following error message:{0}".format(err.message))
        logger.error("Exception: {0}".format(err))
Code Example #7
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} parititions".format(self._topic,self._num_of_partitions))     

        # Create partitions for the workers.
        self._partitions = [ TopicPartition(self._topic,p) for p in range(int(self._num_of_partitions))]        

        # create partitioner
        self._partitioner = RoundRobinPartitioner(self._partitions)
        
        # get script path 
        zk_conf = "{0}:{1}".format(self._zk_server,self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(os.path.dirname(os.path.abspath(__file__)),self._topic,zk_conf,self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd,self._logger)
Code Example #8
File: collector.py  Project: cgiraldo/incubator-spot
def ingest_file(file,message_size,topic,kafka_servers):
    
    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))
    try:        
        message = ""
        logger.info("Ingesting file: {0} process:{1}".format(file,os.getpid())) 
        with open(file,"rb") as f:
            for line in f:
                message += line
                if len(message) > message_size:
                    KafkaTopic.SendMessage(message,kafka_servers,topic,0)
                    message = ""
            #send the last package.        
            KafkaTopic.SendMessage(message,kafka_servers,topic,0)            
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file,logger)
        logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file,topic))

    except Exception as err:        
        logger.error("There was a problem, please check the following error message:{0}".format(err.message))
        logger.error("Exception: {0}".format(err))
Code Example #9
    def _create_topic(self):

        self._logger.info("Creating topic: {0} with {1} parititions".format(
            self._topic, self._num_of_partitions))

        # Create partitions for the workers.
        self._partitions = [
            TopicPartition(self._topic, p)
            for p in range(int(self._num_of_partitions))
        ]

        # create partitioner
        self._partitioner = RoundRobinPartitioner(self._partitions)

        # get script path
        zk_conf = "{0}:{1}".format(self._zk_server, self._zk_port)
        create_topic_cmd = "{0}/kafka_topic.sh create {1} {2} {3}".format(
            os.path.dirname(os.path.abspath(__file__)), self._topic, zk_conf,
            self._num_of_partitions)

        # execute create topic cmd
        Util.execute_cmd(create_topic_cmd, self._logger)
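
The RoundRobinPartitioner above spreads messages evenly across the topic's partitions so that each worker process receives a similar share of files; stripped of any Kafka library, the idea is just cycling through partition numbers, as in this small sketch.

from itertools import cycle

# Library-free sketch of round-robin assignment (not the Kafka partitioner class used above).
partitions = cycle(range(3))  # e.g. a topic created with 3 partitions
for message in ["file-a", "file-b", "file-c", "file-d"]:
    print("{0} -> partition {1}".format(message, next(partitions)))
# file-a -> partition 0, file-b -> partition 1, file-c -> partition 2, file-d -> partition 0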
Code Example #10
File: worker.py  Project: zy0001/incubator-spot
    def start(self):

        self._logger.info("Creating Spark Job for topic: {0}".format(
            self._kafka_consumer.Topic))

        # parser
        parser = self._conf["parser"]

        #spark conf
        driver_memory = self._spark_conf["driver_memory"]
        num_exec = self._spark_conf["spark_exec"]
        exec_memory = self._spark_conf["spark_executor_memory"]
        exec_cores = self._spark_conf["spark_executor_cores"]
        batch_size = self._spark_conf["spark_batch_size"]

        jar_path = os.path.dirname(os.path.dirname(self._script_path))
        # spark job command.
        spark_job_cmd = (
            "spark-submit --master yarn "
            "--driver-memory {0} "
            "--num-executors {1} "
            "--conf spark.executor.memory={2} "
            "--conf spark.executor.cores={3} "
            "--jars {4}/common/spark-streaming-kafka-0-8-assembly_2.11-2.0.0.jar "
            "{5}/{6} "
            "-zk {7} "
            "-t {8} "
            "-db {9} "
            "-dt {10} "
            "-w {11} "
            "-bs {12}".format(diver_memory, num_exec, exec_memory, exec_cores,
                              jar_path, self._script_path, parser,
                              self._kafka_consumer.ZookeperServer,
                              self._kafka_consumer.Topic, self._db_name,
                              "proxy", self._processes, batch_size))

        # start spark job.
        Util.execute_cmd(spark_job_cmd, self._logger)
Code Example #11
def _ingest_file(hdfs_client, new_file, pkt_num, pcap_split_staging,
                 hdfs_root_path, producer, topic):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        org_file = new_file
        file_name_parts = new_file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, new_file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

    except Exception as err:
        logger.error("There was a problem splitting the file: {0}".format(
            err.message))
        logger.error("Exception: {0}".format(err))

    for currdir, subdir, files in os.walk(pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_spot".format(name) in file:
                # get timestamp from the file name to build hdfs path.
                file_date = file.split('.')[0]
                pcap_hour = file_date[-6:-4]
                pcap_date_path = file_date[-14:-6]

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(
                    hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                try:
                    if len(hdfs.list_dir(hdfs_path, hdfs_client)) == 0:
                        logger.info('creating directory: ' + hdfs_path)
                        hdfs_client.mkdir(hdfs_path, hdfs_client)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    result = hdfs_client.upload_file(
                        hadoop_pcap_file, os.path.join(currdir, file))
                    if not result:
                        logger.error('File failed to upload: ' +
                                     hadoop_pcap_file)
                        raise HdfsException

                    # create event for workers to process the file.
                    logger.info(
                        "Sending split file to Topic: {0}".format(topic))
                    producer.SendMessage(hadoop_pcap_file, topic)
                    logger.info(
                        "File {0} has been successfully sent to Kafka Topic to: {1}"
                        .format(file, topic))

                except HdfsException as err:
                    logger.error('Exception: ' + err.exception)
                    logger.info(
                        'Check Hdfs Connection settings and server health')

                except Exception as err:
                    logger.info(
                        "File {0} failed to be sent to Kafka Topic to: {1}".
                        format(new_file, topic))
                    logger.error("Error: {0}".format(err))
Code Example #12
    def _process_new_file(self, nf):

        # get file name and date
        file_name_parts = nf.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]
        nf_path = nf.rstrip(file_name)
        flow_date = file_name.split('.')[1]
        flow_year = flow_date[0:4]
        flow_month = flow_date[4:6]
        flow_day = flow_date[6:8]
        flow_hour = flow_date[8:10]

        # get file from hdfs
        if hdfs.file_exists(nf_path, file_name):
            self._logger.info("Getting file from hdfs: {0}".format(nf))
            hdfs.download_file(nf, self._local_staging)
        else:
            self._logger.info("file: {0} not found".format(nf))
            # TODO: error handling

        # build process cmd.
        sf = "{0}{1}.csv".format(self._local_staging, file_name)
        process_cmd = "nfdump -o csv -r {0}{1} {2} > {3}".format(
            self._local_staging, file_name, self._process_opt, sf)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd, self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/flow".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path,
                                                   staging_timestamp)
        self._logger.info("Creating staging: {0}".format(hdfs_staging_path))
        hdfs.mkdir(hdfs_staging_path)

        # move to stage.
        local_file = "{0}{1}.csv".format(self._local_staging, file_name)
        self._logger.info(
            "Moving data to staging: {0}".format(hdfs_staging_path))
        hdfs.upload_file(hdfs_staging_path, local_file)

        # load with impyla
        drop_table = "DROP TABLE IF EXISTS {0}.flow_tmp".format(self._db_name)
        self._logger.info("Dropping temp table: {0}".format(drop_table))
        self._cursor.execute_query(drop_table)

        create_external = (
            "\n"
            "CREATE EXTERNAL TABLE {0}.flow_tmp (\n"
            "  treceived STRING,\n"
            "  tryear INT,\n"
            "  trmonth INT,\n"
            "  trday INT,\n"
            "  trhour INT,\n"
            "  trminute INT,\n"
            "  trsec INT,\n"
            "  tdur FLOAT,\n"
            "  sip  STRING,\n"
            "  dip STRING,\n"
            "  sport INT,\n"
            "  dport INT,\n"
            "  proto STRING,\n"
            "  flag STRING,\n"
            "  fwd INT,\n"
            "  stos INT,\n"
            "  ipkt BIGINT,\n"
            "  ibyt BIGINT,\n"
            "  opkt BIGINT,\n"
            "  obyt BIGINT,\n"
            "  input INT,\n"
            "  output INT,\n"
            "  sas INT,\n"
            "  das INT,\n"
            "  dtos INT,\n"
            "  dir INT,\n"
            "  rip STRING\n"
            "  )\n"
            "  ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
            "  STORED AS TEXTFILE\n"
            "  LOCATION '{1}'\n"
            "  TBLPROPERTIES ('avro.schema.literal'='{{\n"
            "  \"type\":   \"record\"\n"
            "  , \"name\":   \"RawFlowRecord\"\n"
            "  , \"namespace\" : \"com.cloudera.accelerators.flows.avro\"\n"
            "  , \"fields\": [\n"
            "      {{\"name\": \"treceived\",             \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"tryear\",              \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trmonth\",             \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trday\",               \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trhour\",              \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trminute\",            \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"trsec\",               \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"tdur\",                \"type\":[\"float\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"sip\",                \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"sport\",                 \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"dip\",                \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"dport\",                 \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"proto\",              \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"flag\",               \"type\":[\"string\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"fwd\",                   \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"stos\",                  \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"ipkt\",               \"type\":[\"bigint\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"ibyt\",              \"type\":[\"bigint\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"opkt\",               \"type\":[\"bigint\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"obyt\",               \"type\":[\"bigint\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"input\",                 \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"output\",                \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"sas\",                   \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"das\",                   \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"dtos\",                  \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"dir\",                   \"type\":[\"int\",   \"null\"]}}\n"
            "      ,  {{\"name\": \"rip\",                \"type\":[\"string\",   \"null\"]}}\n"
            "      ]\n"
            "}}')\n").format(self._db_name, hdfs_staging_path)
        self._logger.info(
            "Creating external table: {0}".format(create_external))
        self._cursor.execute_query(create_external)

        insert_into_table = """
        INSERT INTO TABLE {0}.flow
        PARTITION (y={1}, m={2}, d={3}, h={4})
        SELECT   treceived,  unix_timestamp(treceived) AS unix_tstamp, tryear,  trmonth, trday,  trhour,  trminute,  trsec,
          tdur,  sip, dip, sport, dport,  proto,  flag,  fwd,  stos,  ipkt,  ibyt,  opkt,  obyt,  input,  output,
          sas,  das,  dtos,  dir,  rip
        FROM {0}.flow_tmp
        """.format(self._db_name, flow_year, flow_month, flow_day, flow_hour)
        self._logger.info("Loading data to {0}: {1}".format(
            self._db_name, insert_into_table))
        self._cursor.execute_query(insert_into_table)

        # remove from hdfs staging
        self._logger.info(
            "Removing staging path: {0}".format(hdfs_staging_path))
        hdfs.delete_folder(hdfs_staging_path)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        rm_local_staging = "rm {0}".format(sf)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        self._logger.info(
            "File {0} was successfully processed.".format(file_name))
Code Example #13
File: worker.py  Project: zy0001/incubator-spot
    def _process_new_file(self, nf):

        # get file from hdfs
        self._logger.info("Getting file from hdfs: {0}".format(nf))
        if hdfs.file_exists(nf):
            hdfs.download_file(nf, self._local_staging)
        else:
            self._logger.info("file: {0} not found".format(nf))
            # TODO: error handling

        # get file name and date
        file_name_parts = nf.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        binary_hour = file_name_parts[len(file_name_parts) - 2]
        binary_date_path = file_name_parts[len(file_name_parts) - 3]
        binary_year = binary_date_path[0:4]
        binary_month = binary_date_path[4:6]
        binary_day = binary_date_path[6:8]

        # build process cmd.
        process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
            self._local_staging, file_name, self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd, self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/dns".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path,
                                                   staging_timestamp)
        self._logger.info("Creating staging: {0}".format(hdfs_staging_path))
        hdfs.mkdir(hdfs_staging_path)

        # move to stage.
        local_file = "{0}{1}.csv".format(self._local_staging, file_name)
        self._logger.info(
            "Moving data to staging: {0}".format(hdfs_staging_path))
        hdfs.upload_file(hdfs_staging_path, local_file)

        #load to avro
        drop_table = 'DROP TABLE IF EXISTS {0}.dns_tmp'.format(self._db_name)
        self._cursor.execute(drop_table)

        # Create external table
        create_external = (
            "\n"
            "CREATE EXTERNAL TABLE {0}.dns_tmp (\n"
            "  frame_day STRING,\n"
            "  frame_time STRING,\n"
            "  unix_tstamp BIGINT,\n"
            "  frame_len INT,\n"
            "  ip_src STRING,\n"
            "  ip_dst STRING,\n"
            "  dns_qry_name STRING,\n"
            "  dns_qry_type INT,\n"
            "  dns_qry_class STRING,\n"
            "  dns_qry_rcode INT,\n"
            "  dns_a STRING  \n"
            "  )\n"
            "  ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n"
            "  STORED AS TEXTFILE\n"
            "  LOCATION '{1}'\n"
            "  TBLPROPERTIES ('avro.schema.literal'='{{\n"
            "  \"type\":   \"record\"\n"
            "  , \"name\":   \"RawDnsRecord\"\n"
            "  , \"namespace\" : \"com.cloudera.accelerators.dns.avro\"\n"
            "  , \"fields\": [\n"
            "      {{\"name\": \"frame_day\",        \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"frame_time\",     \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"unix_tstamp\",    \"type\":[\"bigint\", \"null\"]}\n"
            "      , {{\"name\": \"frame_len\",      \"type\":[\"int\",    \"null\"]}\n"
            "      , {{\"name\": \"ip_src\",         \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"ip_dst\",         \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"dns_qry_name\",   \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"dns_qry_type\",   \"type\":[\"int\",    \"null\"]}\n"
            "      , {{\"name\": \"dns_qry_class\",  \"type\":[\"string\", \"null\"]}\n"
            "      , {{\"name\": \"dns_qry_rcode\",  \"type\":[\"int\",    \"null\"]}\n"
            "      , {{\"name\": \"dns_a\",          \"type\":[\"string\", \"null\"]}\n"
            "      ]\n"
            "}')\n").format(self._db_name, hdfs_staging_path)
        self._logger.info(
            "Creating external table: {0}".format(create_external))
        self._cursor.execute(create_external)

        # Insert data
        insert_into_table = """
            INSERT INTO TABLE {0}.dns
            PARTITION (y={1}, m={2}, d={3}, h={4})
            SELECT   CONCAT(frame_day , frame_time) as treceived, unix_tstamp, frame_len, ip_dst, ip_src, dns_qry_name,
            dns_qry_class,dns_qry_type, dns_qry_rcode, dns_a 
            FROM {0}.dns_tmp
        """.format(self._db_name, binary_year, binary_month, binary_day,
                   binary_hour)
        self._logger.info("Loading data to {0}: {1}".format(
            self._db_name, insert_into_table))
        self._cursor.execute(insert_into_table)

        # remove from hdfs staging
        self._logger.info(
            "Removing staging path: {0}".format(hdfs_staging_path))
        hdfs.delete_folder(hdfs_staging_path)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        self._logger.info(
            "File {0} was successfully processed.".format(file_name))
Code Example #14
File: worker.py  Project: cgiraldo/incubator-spot
    def _process_new_file(self,file):

        # get file from hdfs
        get_file_cmd = "hadoop fs -get {0} {1}.".format(file,self._local_staging)
        self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
        Util.execute_cmd(get_file_cmd,self._logger)

        # get file name and date
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts)-1]

        flow_date = file_name.split('.')[1]
        flow_year = flow_date[0:4]
        flow_month = flow_date[4:6]
        flow_day = flow_date[6:8]
        flow_hour = flow_date[8:10]

        # build process cmd.
        process_cmd = "nfdump -o csv -r {0}{1} {2} > {0}{1}.csv".format(self._local_staging,file_name,self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd,self._logger)        

        # create hdfs staging.
        hdfs_path = "{0}/flow".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path =  "{0}/stage/{1}".format(hdfs_path,staging_timestamp)
        create_staging_cmd = "hadoop fs -mkdir -p {0}".format(hdfs_staging_path)
        self._logger.info("Creating staging: {0}".format(create_staging_cmd))
        Util.execute_cmd(create_staging_cmd,self._logger)

        # move to stage.
        mv_to_staging ="hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(self._local_staging,file_name,hdfs_staging_path)
        self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
        subprocess.call(mv_to_staging,shell=True)

        #load to avro
        load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/flow/load_flow_avro_parquet.hql".format(self._db_name,flow_year,flow_month,flow_day,flow_hour,hdfs_staging_path)

        self._logger.info( "Loading data to hive: {0}".format(load_to_avro_cmd))
        Util.execute_cmd(load_to_avro_cmd,self._logger)

        # remove from hdfs staging
        rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(hdfs_staging_path)
        self._logger.info("Removing staging path: {0}".format(rm_hdfs_staging_cmd))
        Util.execute_cmd(rm_hdfs_staging_cmd,self._logger)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging,file_name)
        self._logger.info("Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging,self._logger)

        self._logger.info("File {0} was successfully processed.".format(file_name))
Code Example #15
    def _process_new_file(self, file):

        # get file from hdfs
        get_file_cmd = "hadoop fs -get {0} {1}.".format(
            file, self._local_staging)
        self._logger.info("Getting file from hdfs: {0}".format(get_file_cmd))
        Util.execute_cmd(get_file_cmd, self._logger)

        # get file name and date
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        binary_hour = file_name_parts[len(file_name_parts) - 2]
        binary_date_path = file_name_parts[len(file_name_parts) - 3]
        binary_year = binary_date_path[0:4]
        binary_month = binary_date_path[4:6]
        binary_day = binary_date_path[6:8]

        # build process cmd.
        process_cmd = "tshark -r {0}{1} {2} > {0}{1}.csv".format(
            self._local_staging, file_name, self._process_opt)
        self._logger.info("Processing file: {0}".format(process_cmd))
        Util.execute_cmd(process_cmd, self._logger)

        # create hdfs staging.
        hdfs_path = "{0}/dns".format(self._hdfs_app_path)
        staging_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
        hdfs_staging_path = "{0}/stage/{1}".format(hdfs_path,
                                                   staging_timestamp)
        create_staging_cmd = "hadoop fs -mkdir -p {0}".format(
            hdfs_staging_path)
        self._logger.info("Creating staging: {0}".format(create_staging_cmd))
        Util.execute_cmd(create_staging_cmd, self._logger)

        # move to stage.
        mv_to_staging = "hadoop fs -moveFromLocal {0}{1}.csv {2}/.".format(
            self._local_staging, file_name, hdfs_staging_path)
        self._logger.info("Moving data to staging: {0}".format(mv_to_staging))
        Util.execute_cmd(mv_to_staging, self._logger)

        #load to avro
        load_to_avro_cmd = "hive -hiveconf dbname={0} -hiveconf y={1} -hiveconf m={2} -hiveconf d={3} -hiveconf h={4} -hiveconf data_location='{5}' -f pipelines/dns/load_dns_avro_parquet.hql".format(
            self._db_name, binary_year, binary_month, binary_day, binary_hour,
            hdfs_staging_path)

        self._logger.info("Loading data to hive: {0}".format(load_to_avro_cmd))
        Util.execute_cmd(load_to_avro_cmd, self._logger)

        # remove from hdfs staging
        rm_hdfs_staging_cmd = "hadoop fs -rm -R -skipTrash {0}".format(
            hdfs_staging_path)
        self._logger.info(
            "Removing staging path: {0}".format(rm_hdfs_staging_cmd))
        Util.execute_cmd(rm_hdfs_staging_cmd, self._logger)

        # remove from local staging.
        rm_local_staging = "rm {0}{1}".format(self._local_staging, file_name)
        self._logger.info(
            "Removing files from local staging: {0}".format(rm_local_staging))
        Util.execute_cmd(rm_local_staging, self._logger)

        self._logger.info(
            "File {0} was successfully processed.".format(file_name))
Code Example #16
File: kerberos.py  Project: zy0001/incubator-spot
    def authenticate(self):

        Util.execute_cmd(self._kinit_cmd, self._logger)
        self._logger.info("Kerberos ticket obtained")