import datetime
import os
import subprocess
import sys

# Util, worker_conf and ingest_type are assumed to be provided by the
# surrounding module (shared helper utilities and the worker configuration).


def process_new_binary_file(new_file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} ../stage/.".format(new_file)
    print get_file_cmd
    subprocess.call(get_file_cmd, shell=True)

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(new_file, ingest_type)

    # build process cmd.
    post_process_cmd = None
    process_opt = worker_conf[ingest_type]['process_opt']
    if ingest_type == 'dns':
        post_process_cmd = "tshark -r ../stage/{0} {1} >> ../stage/{0}.csv".format(file_name, process_opt)
    elif ingest_type == 'flow':
        post_process_cmd = "nfdump -o csv -r ../stage/{0} {1} > ../stage/{0}.csv".format(file_name, process_opt)
    else:
        print "Unsupported ingest type"
        sys.exit(1)
    print post_process_cmd
    subprocess.call(post_process_cmd, shell=True)

    # create folder if it does not exist
    h_base_path = "{0}/{1}".format(os.getenv('HUSER', '/user/oni'), ingest_type)
    h_csv_path = "{0}/csv".format(h_base_path)
    create_folder_cmd = "hadoop fs -mkdir -p {0}/y={1}/m={2}/d={3}/h={4}".format(h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print create_folder_cmd
    subprocess.call(create_folder_cmd, shell=True)

    # move to hdfs.
    upld_cmd = "hadoop fs -moveFromLocal ../stage/{0}.csv {1}/y={2}/m={3}/d={4}/h={5}/.".format(file_name, h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print upld_cmd
    subprocess.call(upld_cmd, shell=True)

    # make tmp folder in stage (timestamp is minute + second + centiseconds)
    h_stage_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    h_stage_path = "{0}/stage/{1}".format(h_base_path, h_stage_timestamp)
    create_tmp_cmd = "hadoop fs -mkdir -p {0}".format(h_stage_path)
    print create_tmp_cmd
    subprocess.call(create_tmp_cmd, shell=True)

    # load to avro
    load_avro_cmd = "hive -hiveconf dbname={6} -hiveconf y={0} -hiveconf m={1} -hiveconf d={2} -hiveconf h={3} -hiveconf data_location='{4}' -f oni/load_{5}_avro_parquet.hql".format(binary_year, binary_month, binary_day, binary_hour, h_stage_path, ingest_type, os.getenv('DBNAME', 'default'))
    print load_avro_cmd
    subprocess.call(load_avro_cmd, shell=True)

    # remove from stage
    rm_tmp_cmd = "hadoop fs -rm -R -skipTrash {0}".format(h_stage_path)
    print rm_tmp_cmd
    subprocess.call(rm_tmp_cmd, shell=True)

    # can this delete other files when all is running on the same edge server?
    rm_tmp = "rm ../stage/{0}*".format(file_name)
    subprocess.call(rm_tmp, shell=True)

    print datetime.datetime.now()
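# NOTE: every step above shells out with subprocess.call(cmd, shell=True) and
# ignores the exit status, so a failed "hadoop fs -get" silently cascades into
# the post-processing and upload steps. Below is a minimal sketch of a checked
# wrapper; the run_cmd name is hypothetical, not part of this module.
def run_cmd(cmd):
    # Echo the command, run it through the shell, and abort on a non-zero
    # exit code instead of continuing with missing or stale data.
    print cmd
    rc = subprocess.call(cmd, shell=True)
    if rc != 0:
        print "command failed with exit code {0}: {1}".format(rc, cmd)
        sys.exit(rc)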
def _load_to_hdfs(self, file):

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(file, 'flow')

    # hdfs path with timestamp.
    hdfs_path = "{0}/{1}/{2}".format(self._hdfs_root_path, binary_date_path, binary_hour)
    Util.creat_hdfs_folder(hdfs_path)

    # load to hdfs.
    Util.load_to_hdfs(file_name, file, hdfs_path)

    # send the notification to rabbitmq server.
    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
    Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

    print "Done !!!!!"
def _load_to_hdfs(self, file):

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(file, 'flow')

    # hdfs path with timestamp.
    hdfs_path = "{0}/binary/{1}/{2}".format(self._hdfs_root_path, binary_date_path, binary_hour)
    Util.creat_hdfs_folder(hdfs_path)

    # load to hdfs.
    Util.load_to_hdfs(file_name, file, hdfs_path)

    # send the notification to rabbitmq server.
    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file_name)
    Util.send_new_file_notification(hadoop_pcap_file, self._queue_name)

    print "Done !!!!!"
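# The Util helpers called above are not shown in this section. Judging from
# the shell commands used by process_new_binary_file, they plausibly reduce to
# "hadoop fs -mkdir -p" and "hadoop fs -moveFromLocal" calls. The sketch below
# is illustrative only, under that assumption; it is not the actual Util code.
import subprocess

def creat_hdfs_folder(hdfs_path):
    # Assumed equivalent: create the target partition folder if it is missing.
    subprocess.call("hadoop fs -mkdir -p {0}".format(hdfs_path), shell=True)

def load_to_hdfs(file_name, local_file, hdfs_path):
    # Assumed equivalent: move the local capture file into the HDFS folder.
    mv_cmd = "hadoop fs -moveFromLocal {0} {1}/{2}".format(local_file, hdfs_path, file_name)
    subprocess.call(mv_cmd, shell=True)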
def process_new_binary_file(new_file):

    # get file from hdfs
    get_file_cmd = "hadoop fs -get {0} ../stage/.".format(new_file)
    print get_file_cmd
    subprocess.call(get_file_cmd, shell=True)

    # get file name and date
    binary_year, binary_month, binary_day, binary_hour, binary_date_path, file_name = Util.build_hdfs_path(new_file, ingest_type)

    # build process cmd.
    post_process_cmd = None
    process_opt = worker_conf[ingest_type]['process_opt']
    if ingest_type == 'dns':
        post_process_cmd = "tshark -r ../stage/{0} {1} >> ../stage/{0}.csv".format(file_name, process_opt)
    elif ingest_type == 'flow':
        post_process_cmd = "nfdump -o csv -r ../stage/{0} {1} > ../stage/{0}.csv".format(file_name, process_opt)
    else:
        print "Unsupported ingest type"
        sys.exit(1)
    print post_process_cmd
    subprocess.call(post_process_cmd, shell=True)

    # create folder if it does not exist
    h_base_path = "{0}/{1}".format(worker_conf['huser'], ingest_type)
    h_csv_path = "{0}/csv".format(h_base_path)
    create_folder_cmd = "hadoop fs -mkdir -p {0}/y={1}/m={2}/d={3}/h={4}".format(h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print create_folder_cmd
    subprocess.call(create_folder_cmd, shell=True)

    # move to hdfs.
    upld_cmd = "hadoop fs -moveFromLocal ../stage/{0}.csv {1}/y={2}/m={3}/d={4}/h={5}/.".format(file_name, h_csv_path, binary_year, binary_month, binary_day, binary_hour)
    print upld_cmd
    subprocess.call(upld_cmd, shell=True)

    # make tmp folder in stage
    h_stage_timestamp = datetime.datetime.now().strftime('%M%S%f')[:-4]
    h_stage_path = "{0}/stage/{1}".format(h_base_path, h_stage_timestamp)
    create_tmp_cmd = "hadoop fs -mkdir -p {0}".format(h_stage_path)
    print create_tmp_cmd
    subprocess.call(create_tmp_cmd, shell=True)

    # move to stage.
    mv_to_stage = "hadoop fs -cp {0}/y={1}/m={2}/d={3}/h={4}/{5}.csv {6}/.".format(h_csv_path, binary_year, binary_month, binary_day, binary_hour, file_name, h_stage_path)
    print mv_to_stage
    subprocess.call(mv_to_stage, shell=True)

    # load to avro
    load_avro_cmd = "hive -hiveconf dbname={6} -hiveconf y={0} -hiveconf m={1} -hiveconf d={2} -hiveconf h={3} -hiveconf data_location='{4}' -f oni/load_{5}_avro_parquet.hql".format(binary_year, binary_month, binary_day, binary_hour, h_stage_path, ingest_type, worker_conf['dbname'])
    print load_avro_cmd
    subprocess.call(load_avro_cmd, shell=True)

    # remove from stage
    rm_tmp_cmd = "hadoop fs -rm -R -skipTrash {0}".format(h_stage_path)
    print rm_tmp_cmd
    subprocess.call(rm_tmp_cmd, shell=True)

    # can this delete other files when all is running on the same edge server?
    rm_tmp = "rm ../stage/{0}*".format(file_name)
    subprocess.call(rm_tmp, shell=True)

    print datetime.datetime.now()
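# A hedged sketch of how process_new_binary_file might be driven. The
# worker_conf shape and the ingest_type value are inferred from the lookups
# inside the function; the HDFS path and the empty process_opt values are
# illustrative placeholders, not real configuration.
ingest_type = 'flow'
worker_conf = {
    'huser': '/user/oni',
    'dbname': 'oni',
    'flow': {'process_opt': ''},  # real deployments pass nfdump options here
    'dns': {'process_opt': ''},   # ...and tshark options here
}

process_new_binary_file('/user/oni/flow/binary/2016/07/14/04/nfcapd.201607140400')

# Inside load_flow_avro_parquet.hql the partition values are presumably read
# back with Hive's ${hiveconf:y}-style variable substitution.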