Example 1
def ingest_file(file, message_size, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))
    try:
        message = ""
        logger.info("Ingesting file: {0} process:{1}".format(
            file, os.getpid()))
        with open(file, "rb") as f:
            for line in f:
                message += line
                if len(message) > message_size:
                    KafkaTopic.SendMessage(message, kafka_servers, topic, 0)
                    message = ""
            # send whatever is left over, if anything.
            if message:
                KafkaTopic.SendMessage(message, kafka_servers, topic, 0)
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, logger)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic: {1}".format(
                file, topic))

    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message: {0}"
            .format(err))
Example 2
def ingest_file(file, partition, hdfs_root_path, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

    try:

        # get the file name and its embedded timestamp from the path.
        file_name = file.split('/')[-1]
        file_date = file_name.split('.')[1]

        file_date_path = file_date[0:8]   # YYYYMMDD
        file_date_hour = file_date[8:10]  # HH

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path,
                                                file_date_hour)
        Util.creat_hdfs_folder(hdfs_path, logger)

        # load to hdfs.
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.load_to_hdfs(file, hdfs_file, logger)

        # create event for workers to process the file.
        logger.info("Sending file to worker number: {0}".format(partition))
        KafkaTopic.SendMessage(hdfs_file, kafka_servers, topic, partition)
        logger.info(
            "File {0} has been successfully sent to Kafka topic: {1}".format(
                file, topic))

    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message: {0}"
            .format(err))
Example 3
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path,
                topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # keep the original path; the walk below reuses 'file' for the splits.
        org_file = file
        file_name = file.split('/')[-1]

        # split the pcap into chunks of pkt_num packets with editcap.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

        for currdir, subdir, files in os.walk(pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                    # get the timestamp from the split file name to build the
                    # hdfs path; editcap appends _<seq>_<YYYYMMDDHHMMSS>
                    # before the extension.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]        # HH
                    pcap_date_path = file_date[-14:-6]  # YYYYMMDD

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, logger)

                    # create event for workers to process the file.
                    logger.info(
                        "Sending split file to worker number: {0}".format(
                            partition))
                    KafkaTopic.SendMessage(hadoop_pcap_file, kafka_servers,
                                           topic, partition)
                    logger.info(
                        "File {0} has been successfully sent to Kafka topic: {1}"
                        .format(file, topic))

    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message: {0}"
            .format(err))
Example 4
def start_collector(type, workers_num, id=None):

    # generate ingest id
    ingest_id = str(datetime.datetime.time(datetime.datetime.now())).replace(
        ":", "_").replace(".", "_")

    # create logger.
    logger = Util.get_logger("SPOT.INGEST")

    # validate the given configuration exists in ingest_conf.json.
    if type not in master_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(master_conf["pipelines"][type]["type"]):
        logger.error(
            "'{0}' type is not configured. Please check your ingest conf file.".
            format(master_conf["pipelines"][type]["type"]))
        sys.exit(1)

    # authenticate with Kerberos if required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = master_conf["kafka"]['kafka_server']
    k_port = master_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = master_conf["kafka"]['zookeper_server']
    zk_port = master_conf["kafka"]['zookeper_port']

    topic = "SPOT-INGEST-{0}_{1}".format(type, ingest_id) if not id else id
    kafka = KafkaTopic(topic, k_server, k_port, zk_server, zk_port,
                       workers_num)

    # create a collector instance based on data source type.
    logger.info("Starting {0} ingest instance".format(topic))
    module = __import__("pipelines.{0}.collector".format(
        master_conf["pipelines"][type]["type"]),
                        fromlist=['Collector'])

    # start collector.
    ingest_collector = module.Collector(master_conf['hdfs_app_path'], kafka,
                                        type)
    ingest_collector.start()
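start_collector only reads a handful of keys from master_conf (loaded from ingest_conf.json); a minimal sketch of the shape it expects, where the key names, including the 'zookeper' spelling, come from the code above and all values are placeholders:

# Minimal master_conf shape implied by start_collector; hosts and ports
# are placeholder values.
master_conf = {
    "hdfs_app_path": "/user/spot",
    "kafka": {
        "kafka_server": "kafka01",
        "kafka_port": "9092",
        "zookeper_server": "zk01",   # key spelled as in the code above
        "zookeper_port": "2181",
    },
    "pipelines": {
        "flow": {"type": "flow"},    # 'type' must name a pipelines.<type>.collector module
    },
}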