def ingest_file(file, message_size, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))
    try:
        message = ""
        logger.info("Ingesting file: {0} process:{1}".format(
            file, os.getpid()))
        with open(file, "rb") as f:
            for line in f:
                message += line
                if len(message) > message_size:
                    KafkaTopic.SendMessage(message, kafka_servers, topic, 0)
                    message = ""
            # send the last package.
            KafkaTopic.SendMessage(message, kafka_servers, topic, 0)
        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, logger)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic: {1}".format(
                file, topic))

    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message: {0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
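
# Illustrative usage sketch (not part of the original module): one way the
# proxy ingest_file above might be dispatched for a newly collected log,
# using a worker pool so collection is not blocked while the file streams
# to Kafka. The pool size, file path, message size, topic and broker list
# below are assumptions for the example only.
def _example_proxy_ingest():
    from multiprocessing import Pool

    pool = Pool(processes=1)
    pool.apply_async(ingest_file, args=(
        "/var/spot/collector/proxy/access.log",  # hypothetical local log file
        900000,                                  # flush to Kafka after ~900 KB of lines
        "SPOT-INGEST-proxy-example",             # hypothetical topic name
        ["kafka-node:9092"]                      # hypothetical broker list
    ))
    pool.close()
    pool.join()
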
def ingest_file(file, partition, hdfs_root_path, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))
    try:
        # get file name and date.
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]
        file_date = file_name.split('.')[1]
        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # hdfs path with timestamp.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path,
                                                file_date_path,
                                                file_date_hour)
        Util.creat_hdfs_folder(hdfs_path, logger)

        # load to hdfs.
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.load_to_hdfs(file, hdfs_file, logger)

        # create event for workers to process the file.
        logger.info("Sending file to worker number: {0}".format(partition))
        KafkaTopic.SendMessage(hdfs_file, kafka_servers, topic, partition)
        logger.info(
            "File {0} has been successfully sent to Kafka Topic: {1}".format(
                file, topic))

    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message: {0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
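
# Illustrative usage sketch (not part of the original module): invoking the
# flow ingest_file above for a single nfcapd capture. The file path, HDFS
# root path, topic and broker list are assumptions for the example only; the
# file name must carry a timestamp as its second dot-separated field
# (e.g. nfcapd.201801011200), since the HDFS date path is derived from it.
def _example_flow_ingest():
    ingest_file(
        "/var/spot/collector/flow/nfcapd.201801011200",  # hypothetical nfcapd file
        0,                                               # partition / worker number
        "/user/spot/flow",                               # hypothetical HDFS root path
        "SPOT-INGEST-flow-example",                      # hypothetical topic name
        ["kafka-node:9092"]                              # hypothetical broker list
    )
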
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path,
                topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))
    try:
        # get file name and date.
        org_file = file
        file_name_parts = file.split('/')
        file_name = file_name_parts[len(file_name_parts) - 1]

        # split file.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(
            pkt_num, file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

        for currdir, subdir, files in os.walk(pcap_split_staging):
            for file in files:
                if file.endswith(".pcap") and "{0}_spot".format(name) in file:

                    # get timestamp from the file name to build hdfs path.
                    file_date = file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # hdfs path with timestamp.
                    hdfs_path = "{0}/binary/{1}/{2}".format(
                        hdfs_root_path, pcap_date_path, pcap_hour)

                    # create hdfs path.
                    Util.creat_hdfs_folder(hdfs_path, logger)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    Util.load_to_hdfs(os.path.join(currdir, file),
                                      hadoop_pcap_file, logger)

                    # create event for workers to process the file.
                    logger.info(
                        "Sending split file to worker number: {0}".format(
                            partition))
                    KafkaTopic.SendMessage(hadoop_pcap_file, kafka_servers,
                                           topic, partition)
                    logger.info(
                        "File {0} has been successfully sent to Kafka Topic: {1}"
                        .format(file, topic))

    except Exception as err:
        logger.error(
            "There was a problem, please check the following error message: {0}"
            .format(err.message))
        logger.error("Exception: {0}".format(err))
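
# Illustrative usage sketch (not part of the original module): invoking the
# DNS ingest_file above for one captured pcap. The packet count per split,
# staging directory, HDFS root path, topic and broker list are assumptions
# for the example only. Note that editcap (shipped with Wireshark) must be
# on the PATH, since the function shells out to it to split the capture.
def _example_dns_ingest():
    ingest_file(
        "/var/spot/collector/dns/dnscapture.pcap",  # hypothetical pcap file
        650000,                                     # packets per split file
        "/var/spot/pcap_staging",                   # hypothetical split staging dir
        0,                                          # partition / worker number
        "/user/spot/dns",                           # hypothetical HDFS root path
        "SPOT-INGEST-dns-example",                  # hypothetical topic name
        ["kafka-node:9092"]                         # hypothetical broker list
    )
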
def start_collector(type, workers_num, id=None):

    # generate ingest id
    ingest_id = str(datetime.datetime.time(datetime.datetime.now())).replace(
        ":", "_").replace(".", "_")

    # create logger.
    logger = Util.get_logger("SPOT.INGEST")

    # validate the given configuration exists in ingest_conf.json.
    if type not in master_conf["pipelines"]:
        logger.error("'{0}' type is not a valid configuration.".format(type))
        sys.exit(1)

    # validate the type is a valid module.
    if not Util.validate_data_source(master_conf["pipelines"][type]["type"]):
        logger.error(
            "'{0}' type is not configured. Please check your ingest conf file."
            .format(master_conf["pipelines"][type]["type"]))
        sys.exit(1)

    # validate if kerberos authentication is required.
    if os.getenv('KRB_AUTH'):
        kb = Kerberos()
        kb.authenticate()

    # kafka server info.
    logger.info("Initializing kafka instance")
    k_server = master_conf["kafka"]['kafka_server']
    k_port = master_conf["kafka"]['kafka_port']

    # required zookeeper info.
    zk_server = master_conf["kafka"]['zookeper_server']
    zk_port = master_conf["kafka"]['zookeper_port']

    topic = "SPOT-INGEST-{0}_{1}".format(type, ingest_id) if not id else id
    kafka = KafkaTopic(topic, k_server, k_port, zk_server, zk_port,
                       workers_num)

    # create a collector instance based on data source type.
    logger.info("Starting {0} ingest instance".format(topic))
    module = __import__("pipelines.{0}.collector".format(
        master_conf["pipelines"][type]["type"]), fromlist=['Collector'])

    # start collector.
    ingest_collector = module.Collector(master_conf['hdfs_app_path'], kafka,
                                        type)
    ingest_collector.start()
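
# Illustrative usage sketch (not part of the original module): starting a
# collector programmatically instead of from the command line. The pipeline
# name and worker count are assumptions for the example; the name must match
# an entry under "pipelines" in ingest_conf.json, and KRB_AUTH must be set in
# the environment if Kerberos authentication is required.
def _example_start_flow_collector():
    start_collector("flow", 4)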