import logging
import os

# Project helper modules; these import paths follow the surrounding
# codebase's layout and may need adjusting.
from common.utils import Util
from common.kafka_client import KafkaTopic


def ingest_file(file, partition, hdfs_root_path, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.FLOW.{0}'.format(os.getpid()))

    try:
        # Get the file name and extract the date and hour from it.
        file_name = file.split('/')[-1]
        file_date = file_name.split('.')[1]
        file_date_path = file_date[0:8]
        file_date_hour = file_date[8:10]

        # Build the HDFS path with the timestamp and create the folder.
        hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, file_date_path, file_date_hour)
        Util.creat_hdfs_folder(hdfs_path, logger)

        # Load the file into HDFS.
        hdfs_file = "{0}/{1}".format(hdfs_path, file_name)
        Util.load_to_hdfs(file, hdfs_file, logger)

        # Create an event so a worker processes the file.
        logger.info("Sending file to worker number: {0}".format(partition))
        KafkaTopic.SendMessage(hdfs_file, kafka_servers, topic, partition)
        logger.info("File {0} has been successfully sent to Kafka topic: {1}".format(file, topic))

    except Exception as err:
        logger.error("There was a problem, please check the following error message: {0}".format(err))
        logger.error("Exception: {0}".format(err))
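# A minimal usage sketch, not part of the collector itself: all argument
# values below (paths, topic name, broker list) are placeholders, and the
# local file is assumed to follow the nfcapd naming convention
# (e.g. nfcapd.201801221000) so the date/hour slicing above holds.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    ingest_file(
        '/collector/flow/nfcapd.201801221000',  # local capture file (placeholder)
        0,                                      # Kafka partition / worker number
        '/user/spot/flow',                      # HDFS root path (placeholder)
        'SPOT-INGEST-FLOW-TOPIC',               # Kafka topic (placeholder)
        'kafka01:9092'                          # broker list (placeholder)
    )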
def ingest_file(file, pkt_num, pcap_split_staging, partition, hdfs_root_path, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # Get the file name and the base name used to tag the split chunks.
        org_file = file
        file_name = file.split('/')[-1]
        name = file_name.split('.')[0]

        # Split the pcap into chunks of pkt_num packets each.
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(pkt_num, file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        # Remove the original (large) capture once it has been split.
        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

        for currdir, subdirs, files in os.walk(pcap_split_staging):
            for split_file in files:
                if split_file.endswith(".pcap") and "{0}_spot".format(name) in split_file:

                    # Get the timestamp from the file name to build the HDFS path.
                    file_date = split_file.split('.')[0]
                    pcap_hour = file_date[-6:-4]
                    pcap_date_path = file_date[-14:-6]

                    # Build the HDFS path with the timestamp and create the folder.
                    hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)
                    Util.creat_hdfs_folder(hdfs_path, logger)

                    # Load the chunk into HDFS.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, split_file)
                    Util.load_to_hdfs(os.path.join(currdir, split_file), hadoop_pcap_file, logger)

                    # Create an event so a worker processes the chunk.
                    logger.info("Sending split file to worker number: {0}".format(partition))
                    KafkaTopic.SendMessage(hadoop_pcap_file, kafka_servers, topic, partition)
                    logger.info("File {0} has been successfully sent to Kafka topic: {1}".format(split_file, topic))

    except Exception as err:
        logger.error("There was a problem, please check the following error message: {0}".format(err))
        logger.error("Exception: {0}".format(err))
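# A minimal usage sketch, not part of the collector itself: argument values
# are placeholders, editcap (from Wireshark) is assumed to be on PATH, and
# split chunk names are assumed to end in a yyyymmddHHMMSS timestamp so the
# slicing above recovers the date and hour.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    ingest_file(
        '/collector/dns/20180122101500.pcap',  # local pcap capture (placeholder)
        650000,                                # packets per split chunk
        '/collector/pcap_split_staging',       # staging dir for chunks (placeholder)
        0,                                     # Kafka partition / worker number
        '/user/spot/dns',                      # HDFS root path (placeholder)
        'SPOT-INGEST-DNS-TOPIC',               # Kafka topic (placeholder)
        'kafka01:9092'                         # broker list (placeholder)
    )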