Example #1
# Assumed imports: logging and os are standard library; Util and KafkaTopic
# are spot-ingest's common helper classes (module paths below are assumptions).
import logging
import os

from common.utils import Util
from common.kafka_client import KafkaTopic


def ingest_file(file, partition, hdfs_root_path, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

    try:

        # get file name and date.
        file_name_parts = file.split('/')
        file_name = file_name_parts[-1]
        file_date = file_name.split('.')[1]

        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path, file_date_hour)
        Util.creat_hdfs_folder(hdfs_path, logger)  # (sic) helper name as spelled upstream

        # load to hdfs.
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.load_to_hdfs(file, hdfs_file, logger)

        # create event for workers to process the file.
        logger.info("Sending file to worker number: {0}".format(partition))
        KafkaTopic.SendMessage(hdfs_file, kafka_servers, topic, partition)
        logger.info("File {0} has been successfully sent to Kafka topic: {1}".format(file, topic))

    except Exception as err:
        # err.message is Python 2-only; log the exception itself instead.
        logger.error("There was a problem, please check the following error message: {0}".format(err))
Example #2
def ingest_file(file, partition, hdfs_root_path, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

    try:

        # get file name and date.
        file_name_parts = file.split('/')
        file_name = file_name_parts[-1]
        file_date = file_name.split('.')[1]

        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path,
                                                file_date_hour)
        Util.creat_hdfs_folder(hdfs_path, logger)

        # load to hdfs.
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.load_to_hdfs(file, hdfs_file, logger)

        # create event for workers to process the file.
        logger.info("Sending file to worker number: {0}".format(partition))
        KafkaTopic.SendMessage(hdfs_file, kafka_servers, topic, partition)
        logger.info("File {0} has been successfully sent to Kafka topic: {1}".format(file, topic))

    except Exception as err:
        # err.message is Python 2-only; log the exception itself instead.
        logger.error(
            "There was a problem, please check the following error message: {0}"
            .format(err))
Example #3
# Util and KafkaTopic: see the assumed imports in Example #1. This variant
# also shells out to Wireshark's editcap to split large pcap files.
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path,
                topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[-1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

        for currdir, subdir, files in os.walk(pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, logger)

                    # create event for workers to process the file.
                    logger.info(
                        "Sending split file to worker number: {0}".format(
                            partition))
                    KafkaTopic.SendMessage(hadoop_pcap_file, kafka_servers,
                                           topic, partition)
                    logger.info(
                        "File {0} has been successfully sent to Kafka topic: {1}"
                        .format(file, topic))

    except Exception as err:
        # err.message is Python 2-only; log the exception itself instead.
        logger.error(
            "There was a problem, please check the following error message: {0}"
            .format(err))
Example #4
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path,
                topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))
    
    try:
        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[-1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(pkt_num,file,pcap_split_staging,name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd,logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file,logger)    

        for currdir, subdir, files in os.walk(pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                        # get timestamp from the file name to build hdfs path.
                        file_date = file.split('.')[0]
                        pcap_hour = file_date[-6:-4]
                        pcap_date_path = file_date[-14:-6]

                        # hdfs path with timestamp.
                        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path,pcap_date_path,pcap_hour)

                        # create hdfs path.
                        Util.creat_hdfs_folder(hdfs_path,logger)

                        # load file to hdfs.
                        hadoop_pcap_file = "{0}/{1}".format(hdfs_path,file)
                        Util.load_to_hdfs(os.path.join(currdir,file),hadoop_pcap_file,logger)

                        # create event for workers to process the file.
                        logger.info( "Sending split file to worker number: {0}".format(partition))
                        KafkaTopic.SendMessage(hadoop_pcap_file,kafka_servers,topic,partition)
                        logger.info("File {0} has been successfully sent to Kafka Topic to: {1}".format(file,topic))


  
    except Exception as err:
        # err.message is Python 2-only; log the exception itself instead.
        logger.error("There was a problem, please check the following error message: {0}".format(err))