def feat_importance_ip(row_id_str, ds_id, hdfs_feat_dir, local_score_file, score_file_IT, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, jobname, uploadtype): # zip func in other files for Spark workers ================= ================ zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path # get_spark_context sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) ''' SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress')) SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize')) #SparkContext.setSystemProperty('spark.kryoserializer.buffer.mb', config.get('spark', 'spark_kryoserializer_buffer_mb')) SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory) SparkContext.setSystemProperty('spark.cores.max', args.core_max) sc = SparkContext(args.sp_master, 'feature_importance_2ways:'+str(args.row_id)) ''' t0 = time() # get folder list (labels) from hdfs data_out/<id>/metadata ============== dirFile_loc = os.path.join(hdfs_feat_dir, "metadata") dirFolders = sc.textFile(dirFile_loc) hash_Folders = dirFolders.collect() print "INFO: dirFile_loc=", dirFile_loc, ", hash_Folders=", hash_Folders folder_list = [x.encode('UTF8') for x in hash_Folders] print "INFO: folder_list=", folder_list # get feature seq : ngram hash mapping ================================== key = "dic_seq_hashes" #{"123":"136,345"} jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}' jstr_proj = '{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}' doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_list = doc['value'] dic_all_columns = dic_list feature_count = len(dic_list) #print "INFO: feature_count=",feature_count #print "dic_list=",dic_list #{u'123,345':u'136'} #print "INFO: dic_all_columns=",dic_all_columns # {1: u'8215,8216'} # end # get hash : raw string mapping ================================== key = "dic_hash_str" #{"123":"openFile"} jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}' jstr_proj = '{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}' doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_hash_str = doc['value'] ''' # get folder list (labels) from hdfs data_out/<id>/libsvm ============== libsvm_loc = os.path.join(hdfs_feat_dir , "libsvm_data") # based on label, divide RDD into arrays f_rdd = sc.textFile(libsvm_loc).map(lambda x: libsvm2tuple_arr(x)) arr_libsvm=sorted(f_rdd.collect(), key=lambda x:x[0]) # sorted by label ''' # filename for featured data libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data") print "INFO: libsvm_data_file=", libsvm_data_file print "INFO: feature_count=",feature_count\ # get sample array from hdfs arr_libsvm = zip_feature_util.get_sample_tuple_arr(sc, libsvm_data_file) # sorted by label arr_libsvm = sorted(arr_libsvm, key=lambda x: x[0]) # convert libsvm to features_list, row_list, col_list, sample count, col_num lbl_flag = -1 row_num_training = 0 sparse_mtx_list = [] # for feat impor calculation features_list = [] # for csc_matrix row_list = [] # for csc_matrix col_list = [] # for 
csc_matrix sample_numbers = [] # for csc_matrix feature_arr = None for idx, i in enumerate(arr_libsvm): #print "idx=",idx,",l=",i[0],",d=",i[1:] if lbl_flag != i[0]: if feature_arr and len(feature_arr) > 0: features_list.append(np.array(feature_arr)) row_list.append(np.array(row_arr)) col_list.append(np.array(col_arr)) sample_numbers.append(cnt) row_arr = [] col_arr = [] feature_arr = [] cnt = 0 lbl_flag += 1 for j in i[1:]: row_arr.append(cnt) col_arr.append(j[0] - 1) feature_arr.append(j[1]) cnt += 1 # for last part if len(feature_arr) > 0: features_list.append(np.array(feature_arr)) row_list.append(np.array(row_arr)) col_list.append(np.array(col_arr)) sample_numbers.append(cnt) #print ",features_list=",features_list #print ",row_list=",row_list #print ",col_list=",col_list print "INFO: sample_numbers=", sample_numbers col_num = len(dic_list) print "INFO: column number: ", col_num #, ",len(max_feat_list)=",len(max_feat_list) for i in range(0, len(features_list)): #print "i=",i #print "features_list=",features_list[i] #print "row_list=",row_list[i] #print "col_list=",col_list[i] #print "sample_numbers=",sample_numbers[i] sparse_mtx = csc_matrix((features_list[i], (row_list[i], col_list[i])), shape=(sample_numbers[i], col_num)) sparse_mtx_list.append(sparse_mtx) #print sparse_mtx_list[0] print "INFO: sparse_mtx_list[0].shape=", sparse_mtx_list[0].shape #print sparse_mtx_list[1] print "INFO: sparse_mtx_list[1].shape=", sparse_mtx_list[1].shape exclusive_feature_set_mal = [] exclusive_feature_set_clean = [] dic_feature_cnt_mal = {} dic_feature_cnt_clean = {} dic_score = {} dic_cnt_mal = {} dic_cnt_clean = {} dic_IT_grain = {} #################################################### ####feature importance algorithms: 2 methods ####### # Only for 2 classes ??? #################################################### if len(sample_numbers) == 2: ################################################### ################## calculate probability ############ ################################################### print "INFO: =======Feature Importance(probability) ================ " for j in range(0, col_num): curr_col_dirty = sparse_mtx_list[0].getcol(j) sum_col = curr_col_dirty.sum(0) cnt_mal = sum_col.tolist()[0][0] curr_col_clean = sparse_mtx_list[1].getcol(j) sum_col = curr_col_clean.sum(0) cnt_clean = sum_col.tolist()[0][0] percnt_mal = cnt_mal / float(sample_numbers[0]) percnt_clean = cnt_clean / float(sample_numbers[1]) score_j = (percnt_mal + 1 - percnt_clean) / 2 dic_score[j + 1] = score_j dic_cnt_clean[j + 1] = cnt_clean dic_cnt_mal[j + 1] = cnt_mal sorted_score = sorted(dic_score.items(), key=operator.itemgetter(1), reverse=True) #print "sorted_score:", sorted_score #print "dic_cnt_clean", dic_cnt_clean #print "dic_cnt_mal", dic_cnt_mal ############output result######################## if os.path.exists(local_score_file): try: os.remove(local_score_file) except OSError, e: print("Error: %s - %s." 
% (e.local_score_file, e.strerror)) for ii in range(0, len(sorted_score)): (feat, score) = sorted_score[ii] #print feat, score, dic_all_columns[feat] if dic_hash_str: description_str = feats2strs(dic_all_columns[str(feat)], dic_hash_str) else: description_str = "N/A" print "Warning: No mapping found for feature number" str01 = str(feat) + "\t" + str( score) + "\t" + description_str + "\n" with open(local_score_file, "a") as f: f.write(str01) ######################################################## ##################Information Gain (entropy)############ ######################################################## print "INFO: =======Information Gain================ " for j in range(0, col_num): cnt_mal = dic_cnt_mal[j + 1] cnt_clean = dic_cnt_clean[j + 1] total_samples = sample_numbers[0] + sample_numbers[1] p0 = float(sample_numbers[0]) / total_samples p1 = 1 - p0 if p0 == 0 or p1 == 0: parent_entropy = 0 else: parent_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(p1) if cnt_clean + cnt_mal == 0: information_gain = 0 elif total_samples - cnt_clean - cnt_mal == 0: information_gain = 0 else: p0 = float(cnt_mal) / (cnt_clean + cnt_mal) p1 = 1 - p0 if p0 == 0 or p1 == 0: child_left_entropy = 0 else: child_left_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2( p1) p0 = float(sample_numbers[0] - cnt_mal) / (total_samples - cnt_clean - cnt_mal) p1 = 1 - p0 if p0 == 0 or p1 == 0: child_right_entropy = 0 else: child_right_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2( p1) weighted_child_entropy = child_left_entropy * float( cnt_clean + cnt_mal) / total_samples + child_right_entropy * float( total_samples - cnt_clean - cnt_mal) / total_samples information_gain = parent_entropy - weighted_child_entropy dic_IT_grain[j + 1] = information_gain sorted_IT_gain = sorted(dic_IT_grain.items(), key=operator.itemgetter(1), reverse=True) if os.path.exists(score_file_IT): try: os.remove(score_file_IT) except OSError, e: print("Error: %s - %s." % (e.score_file_IT, e.strerror))
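
# ---------------------------------------------------------------------------
# Illustrative sketch (not called by the pipeline): the per-feature math used
# by feat_importance_ip above, restated as two standalone functions.
# Assumptions: per-class presence counts (cnt_mal, cnt_clean) taken from the
# column sums of sparse_mtx_list[0]/[1], and the two class sizes from
# sample_numbers. Function names here are illustrative only.
# ---------------------------------------------------------------------------
import numpy as np  # already imported at module level; repeated for clarity


def probability_score(cnt_mal, cnt_clean, n_mal, n_clean):
    # fraction of malicious samples containing the feature, averaged with the
    # fraction of clean samples NOT containing it; result is in [0, 1]
    pct_mal = cnt_mal / float(n_mal)
    pct_clean = cnt_clean / float(n_clean)
    return (pct_mal + 1 - pct_clean) / 2


def _binary_entropy(p0):
    p1 = 1 - p0
    if p0 == 0 or p1 == 0:
        return 0.0
    return -p0 * np.log2(p0) - p1 * np.log2(p1)


def information_gain_score(cnt_mal, cnt_clean, n_mal, n_clean):
    # entropy of the parent node minus the weighted entropy of the two
    # children induced by "feature present" vs "feature absent"
    total = n_mal + n_clean
    parent = _binary_entropy(float(n_mal) / total)
    n_with = cnt_mal + cnt_clean        # samples containing the feature
    n_without = total - n_with          # samples without it
    if n_with == 0 or n_without == 0:
        return 0.0
    left = _binary_entropy(float(cnt_mal) / n_with)
    right = _binary_entropy(float(n_mal - cnt_mal) / n_without)
    weighted = (left * float(n_with) + right * float(n_without)) / total
    return parent - weighted
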
def feat_importance_firm(row_id_str, ds_id, hdfs_feat_dir, local_score_file, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, training_fraction, jobname, uploadtype, description_file): # zip func in other files for Spark workers ================= ================ zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path # get_spark_context sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) t0 = time() # get feature seq mapping from mongo if uploadtype == "MD5 List IN-dynamic": ### connect to database to get the column list which contains all column number of the corresponding feature key = "dict_dynamic" jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}' jstr_proj = '{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}' doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_list = doc['value'] dic_all_columns = {} max_feature = 0 # reverse dict{hashes:sequence number} ====== for i in range(0, len(dic_list)): for key in dic_list[i]: dic_all_columns[eval(dic_list[i][key])] = key if eval(dic_list[i][key]) > max_feature: max_feature = eval(dic_list[i][key]) print "INFO: max_feature=", max_feature #print "dic_all_columns=",dic_all_columns # fid:numb,numb dirFile_loc = os.path.join(hdfs_feat_dir, "metadata") dirFolders = sc.textFile(dirFile_loc) hash_Folders = dirFolders.collect() #print "INFO: dirFile_loc=",dirFile_loc,", hash_Folders=",hash_Folders folder_list = [x.encode('UTF8') for x in hash_Folders] print "INFO: hdfs folder_list=", folder_list #['dirty/', 'clean/'] features_training = [] labels_training = [] names_training = [] row_training = [] col_training = [] max_feat_training = 0 row_num_training = 0 features_testing = [] labels_testing = [] names_testing = [] row_testing = [] col_testing = [] max_feat_testing = 0 row_num_testing = 0 # loop through hdfs folders; TBD for folder in folder_list: print "INFO: folder=", folder label = folder_list.index(folder) + 1 print 'INFO: label=', label logFile_name = os.path.join(hdfs_feat_dir, folder, mtx_name_list) #print "logFile_name=",logFile_name logFile_data = os.path.join(hdfs_feat_dir, folder, mtx_libsvm) #print "logFile_data=",logFile_data logNames = sc.textFile(logFile_name).cache() logData = sc.textFile(logFile_data).cache() names = logNames.collect() data = logData.collect() name_l = [x.encode('UTF8') for x in names] feature_l = [x.encode('UTF8') for x in data] name_list = [names.strip() for names in name_l] feature_list = [features.strip() for features in feature_l] ##########data seperation###### id_perm = data_seperation_random(name_list) num_names = len(name_list) print 'INFO: num of samples=', num_names num_train = int(training_portion * num_names) print 'INFO: num_train = ', num_train ########generate training data######### i = 0 #print "INFO: generate training data" #print "INFO: len(id_perm)=",len(id_perm) while i < num_train: #print i, id_perm[i] features = feature_list[id_perm[i]] features = features.strip() feature_array = features.split(' ') labels_training.append(label) length = len(feature_array) j = 0 while j < length: feature = feature_array[j] feat, value = feature.split(':', 2) row_training.append(i + row_num_training) col_training.append(int(feat) 
- 1) features_training.append(int(value)) max_feat_training = max(max_feat_training, int(feat)) j = j + 1 i = i + 1 row_num_training = row_num_training + num_train i = num_train ########generate testing data######### while i < num_names: ####for generating testing data folder#### test_file_name = name_list[id_perm[i]] features = feature_list[id_perm[i]] features = features.strip() feature_array = features.split(' ') labels_testing.append(label) length = len(feature_array) j = 0 while j < length: feature = feature_array[j] feat, value = feature.split(':', 2) row_testing.append(i - num_train + row_num_testing) col_testing.append(int(feat) - 1) features_testing.append(int(value)) max_feat_testing = max(max_feat_testing, int(feat)) j = j + 1 i = i + 1 row_num_testing = row_num_testing + (num_names - num_train) # end for loop here ======================== col_num = max(max_feat_training, max_feat_testing) if max_feat_training < col_num: for i in range(0, row_num_training): for j in range(max_feat_training, col_num): features_training.append(0) row_training.append(i) col_training.append(j) elif max_feat_testing < col_num: for i in range(0, row_num_testing): for j in range(max_feat_testing, col_num): features_testing.append(0) row_testing.append(i) col_testing.append(j) features_training = array(features_training) row_training = array(row_training) col_training = array(col_training) #print "row_training:", row_training #print "INFO: col_training:", col_training len_col = len(col_training) print "INFO: col_num:", col_num labels_training = array(labels_training) features_testing = array(features_testing) row_testing = array(row_testing) col_testing = array(col_testing) labels_testing = array(labels_testing) sparse_mtx = csc_matrix((features_training, (row_training, col_training)), shape=(row_num_training, col_num)) #print "sparse_mtx.todense(), sparse_mtx.shape=",sparse_mtx.todense(), sparse_mtx.shape sparse_test = csc_matrix((features_testing, (row_testing, col_testing)), shape=(row_num_testing, col_num)) #print " sparse_test.todense(), sparse_test.shape=",sparse_test.todense(), sparse_test.shape clf = svm.LinearSVC() #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None) #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None) #print "labels_training=",labels_training #print "sparse_mtx=",sparse_mtx clf.fit(sparse_mtx, labels_training) #print "INFO: model:intercept=",clf.intercept_ #print "INFO: model:coef=",clf.coef_ labels_pred = clf.predict(sparse_test) #print "labels_pred:", labels_pred accuracy = clf.score(sparse_test, labels_testing) #print "INFO: data folder=", hdfs_feat_dir print "INFO: accuracy=", accuracy ##################################################################### ##################calculate feature importance with predication labels####################### ##################################################################### AA = sparse_mtx.todense() BB = sparse_test.todense() labels_train_pred = clf.predict(sparse_mtx) labels_test_pred = labels_pred #print "###################################################################################" print "INFO: ======= Calculate feature importance with predication labels ==================" #print 
"###################################################################################" dic_importance_label = {} for j in range(0, col_num): ###for all features in the loop ############################## #print "====new way with sparse matrix=========" curr_col_train = sparse_mtx.getcol(j) sum_col = curr_col_train.sum(0) positive_feature_number = int(sum_col.tolist()[0][0]) labels_value = 3 - labels_train_pred dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_train) sum_product = dot_product.sum(1) labels_positive_sum = int(sum_product.tolist()[0][0]) sum_label_values = sum(labels_value) labels_negitive_sum = sum_label_values - labels_positive_sum ############################## #print "====new way with sparse matrix=========" curr_col_test = sparse_test.getcol(j) sum_col = curr_col_test.sum(0) positive_feature_number = positive_feature_number + int( sum_col.tolist()[0][0]) labels_value = 3 - labels_test_pred dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_test) sum_product = dot_product.sum(1) labels_positive_sum = labels_positive_sum + int( sum_product.tolist()[0][0]) sum_label_values = sum(labels_value) labels_negitive_sum = labels_negitive_sum + sum_label_values - int( sum_product.tolist()[0][0]) n_total = row_num_training + row_num_testing negitive_feature_number = n_total - positive_feature_number if positive_feature_number == 0: #print "feature ", j+1, "all 0s!" dic_importance_label[j + 1] = -100 elif negitive_feature_number == 0: #print "feature ", j+1, "all 1s!" dic_importance_label[j + 1] = -200 else: q_positive = float(labels_positive_sum) / positive_feature_number q_negitive = float(labels_negitive_sum) / negitive_feature_number Q = (q_positive - q_negitive) * sqrt( float(q_positive) * q_negitive / float(n_total) / float(n_total)) dic_importance_label[j + 1] = Q sorted_importance = sorted(dic_importance_label.items(), key=operator.itemgetter(1), reverse=True) print "INFO: ======= Feature Importance(FIRM score) ================" if os.path.exists(local_score_file): try: os.remove(local_score_file) except OSError, e: print("ERROR: %s - %s." % (e.local_score_file, e.strerror))
def feat_importance_2way(row_id_str, ds_id, hdfs_feat_dir, score_file_IT, score_file_prob, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, jobname, uploadtype, description_file): # zip func in other files for Spark workers ================= ================ zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path # get_spark_context sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) t0 = time() # get folder list (labels) from hdfs data_out/<id>/metadata dirFile_loc = os.path.join(hdfs_feat_dir, "metadata") dirFolders = sc.textFile(dirFile_loc) hash_Folders = dirFolders.collect() #print "hash_Folders=",hash_Folders folder_list = [x.encode('UTF8') for x in hash_Folders] #print "INFO: dirFile_loc=",dirFile_loc,", folder_list=",folder_list row_num_training = 0 sample_numbers = [] sparse_mtx_list = [] features_list = [] row_list = [] col_list = [] max_feat_list = [] for folder in folder_list: print "INFO: folder:", folder label = folder_list.index(folder) + 1 print 'INFO: label=', label # md5 list logFile_name = os.path.join(hdfs_feat_dir, folder, mtx_name_list) # libsvm data logFile_data = os.path.join(hdfs_feat_dir, folder, mtx_libsvm) #logFile_data = hdfs_feat_dir + folder + mtx_stat #print "INFO: logFile_name=",logFile_name #print "INFO: logFile_data=",logFile_data logNames = sc.textFile(logFile_name).cache() logData = sc.textFile(logFile_data).cache() names = logNames.collect() data = logData.collect() name_l = [x.encode('UTF8') for x in names] feature_l = [x.encode('UTF8') for x in data] name_list = [names.strip() for names in name_l] feature_list = [features.strip() for features in feature_l] num_names = len(name_list) sample_numbers.append(num_names) print 'INFO: sample_numbers=', sample_numbers print 'INFO: name_list count=', num_names ########generate list for csc_matrix creation ######### i = 0 features_training = [] row_training = [] col_training = [] labels_training = [] max_feat = 0 while i < num_names: features = feature_list[i] features = features.strip() feature_array = features.split(' ') labels_training.append(label) length = len(feature_array) j = 0 while j < length: feature = feature_array[j] feat, value = feature.split(':', 2) row_training.append(i) col_training.append(int(feat) - 1) features_training.append(int(value)) max_feat = max(max_feat, int(feat)) j = j + 1 i = i + 1 #print "features_training=",features_training #print "row_training=",row_training #print "col_training=",col_training features_training = array(features_training) row_training = array(row_training) col_training = array(col_training) #print "type(row_training)=",type(row_training),",row_training=",row_training #print "type(col_training)=",type(col_training),",col_training=",col_training max_feat_list.append(max_feat) features_list.append(features_training) row_list.append(row_training) col_list.append(col_training) row_num_training = row_num_training + num_names #print " END for folder in folder_list: " col_num = max(max_feat_list) print "INFO: column number=", col_num, ", len(max_feat_list)=", len( max_feat_list) for i in range(0, len(max_feat_list)): sparse_mtx = csc_matrix((features_list[i], (row_list[i], col_list[i])), shape=(sample_numbers[i], col_num)) sparse_mtx_list.append(sparse_mtx) #print sparse_mtx_list[0] 
#print "sparse_mtx_list[0].shape=",sparse_mtx_list[0].shape #print sparse_mtx_list[1] #print "sparse_mtx_list[1].shape=",sparse_mtx_list[1].shape exclusive_feature_set_mal = [] exclusive_feature_set_clean = [] dic_feature_cnt_mal = {} dic_feature_cnt_clean = {} dic_score = {} dic_cnt_mal = {} dic_cnt_clean = {} dic_IT_grain = {} #################################################### ####feature importance algorithms: 2 methods ####### # Only for 2 classes ??? #################################################### if len(sample_numbers) == 2: ################################################### ################## calculate probability ############ ################################################### print "INFO: ======= Feature Importance(probability) ================" for j in range(0, col_num): curr_col_dirty = sparse_mtx_list[0].getcol(j) sum_col = curr_col_dirty.sum(0) cnt_mal = sum_col.tolist()[0][0] curr_col_clean = sparse_mtx_list[1].getcol(j) sum_col = curr_col_clean.sum(0) cnt_clean = sum_col.tolist()[0][0] percnt_mal = cnt_mal / float(sample_numbers[0]) percnt_clean = cnt_clean / float(sample_numbers[1]) score_j = (percnt_mal + 1 - percnt_clean) / 2 dic_score[j + 1] = score_j dic_cnt_clean[j + 1] = cnt_clean dic_cnt_mal[j + 1] = cnt_mal sorted_score = sorted(dic_score.items(), key=operator.itemgetter(1), reverse=True) #print "sorted_score:", sorted_score #print "dic_cnt_clean", dic_cnt_clean #print "dic_cnt_mal", dic_cnt_mal if os.path.exists(score_file_prob): try: os.remove(score_file_prob) except OSError, e: print("Error: %s - %s." % (e.score_file_prob, e.strerror)) for ii in range(0, len(sorted_score)): (feat, score) = sorted_score[ii] #print feat, score, dic_all_columns[feat] str01 = str(feat) + "\t" + str( score) + "\t" + description_str + "\n" with open(score_file_prob, "a") as f: f.write(str01) ######################################################## ##################Information Gain (entropy)############ ######################################################## print "INFO: ======= Information Gain ================" for j in range(0, col_num): cnt_mal = dic_cnt_mal[j + 1] cnt_clean = dic_cnt_clean[j + 1] total_samples = sample_numbers[0] + sample_numbers[1] p0 = float(sample_numbers[0]) / total_samples p1 = 1 - p0 if p0 == 0 or p1 == 0: parent_entropy = 0 else: parent_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(p1) if cnt_clean + cnt_mal == 0: information_gain = 0 elif total_samples - cnt_clean - cnt_mal == 0: information_gain = 0 else: p0 = float(cnt_mal) / (cnt_clean + cnt_mal) p1 = 1 - p0 if p0 == 0 or p1 == 0: child_left_entropy = 0 else: child_left_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2( p1) p0 = float(sample_numbers[0] - cnt_mal) / (total_samples - cnt_clean - cnt_mal) p1 = 1 - p0 if p0 == 0 or p1 == 0: child_right_entropy = 0 else: child_right_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2( p1) weighted_child_entropy = child_left_entropy * float( cnt_clean + cnt_mal) / total_samples + child_right_entropy * float( total_samples - cnt_clean - cnt_mal) / total_samples information_gain = parent_entropy - weighted_child_entropy dic_IT_grain[j + 1] = information_gain sorted_IT_gain = sorted(dic_IT_grain.items(), key=operator.itemgetter(1), reverse=True) if os.path.exists(score_file_IT): try: os.remove(score_file_IT) except OSError, e: print("ERROR: %s - %s." % (e.score_file_IT, e.strerror))
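
# ---------------------------------------------------------------------------
# Illustrative sketch (not called by the pipeline): how the per-label
# csc_matrix instances above are assembled from parallel (row, col, value)
# lists. Assumptions: libsvm_rows holds the rows of ONE label, each row a
# list of (feature_index, value) pairs with 1-based feature indices, and
# col_num is the global feature count.
# ---------------------------------------------------------------------------
import numpy as np                    # already imported at module level
from scipy.sparse import csc_matrix   # already imported at module level


def build_class_matrix(libsvm_rows, col_num):
    data, rows, cols = [], [], []
    for i, row in enumerate(libsvm_rows):
        for feat, value in row:
            rows.append(i)
            cols.append(feat - 1)     # libsvm feature ids are 1-based
            data.append(value)
    return csc_matrix((np.array(data), (np.array(rows), np.array(cols))),
                      shape=(len(libsvm_rows), col_num))
# Column sums of the result, e.g. build_class_matrix(...).getcol(j).sum(0),
# give the per-class feature counts consumed by both scoring methods above.
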
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, labelnameflag, fromweb, training_fraction, jobname, random_seed=None): ### generate data folder and out folder, clean up if needed #local_out_dir = local_out_dir + "/" #if os.path.exists(local_out_dir): # shutil.rmtree(local_out_dir) # to keep smaplelist file if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) # create zip files for Spark workers ================= ================ zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path # get_spark_context sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) t0 = time() # check if ml_opts.has_excluded_feat ==1 =================================== has_excluded_feat = 0 ml_opts = {} if not ml_opts_jstr is None: ml_opts = json.loads(ml_opts_jstr) if "has_excluded_feat" in ml_opts: has_excluded_feat = ml_opts["has_excluded_feat"] #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist # get excluded feature list from mongo ========== === if str(has_excluded_feat) == "1" and excluded_feat_cslist is None: excluded_feat_cslist = ml_util.ml_get_excluded_feat( row_id_str, mongo_tuples) print "INFO: excluded_feat_cslist=", excluded_feat_cslist # filename for featured data libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data") print "INFO: libsvm_data_file:", libsvm_data_file # load feature count file feat_count_file = libsvm_data_file + "_feat_count" feature_count = zip_feature_util.get_feature_count(sc, feat_count_file) print "INFO: feature_count=", feature_count # load sample RDD from text file # also exclude selected features in sample ================ ===== # format (LabeledPoint,hash) from str2LabeledPoint_hash() #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file) samples_rdd, feature_count = zip_feature_util.get_sample_rdd( sc, libsvm_data_file, feature_count, excluded_feat_cslist) # get distinct label list labels_list_all = samples_rdd.map( lambda p: p[0].label).distinct().collect() # split samples to training and testing data, format (LabeledPoint,hash) training_rdd, testing_rdd = samples_rdd.randomSplit( [training_fraction, 1 - training_fraction], seed=int(random_seed)) training_rdd = training_rdd.map(lambda p: p[0]) # keep LabeledPoint only training_rdd.cache() training_sample_count = training_rdd.count() training_lbl_cnt_list = training_rdd.map( lambda p: (p.label, 1)).reduceByKey(add).collect() testing_rdd.cache() testing_sample_count = testing_rdd.count() testing_lbl_cnt_list = testing_rdd.map( lambda p: (p[0].label, 1)).reduceByKey(add).collect() sample_count = training_sample_count + testing_sample_count # random_seed testing if not random_seed is None: all_t = testing_rdd.collect() all_t = sorted(all_t, key=lambda x: x[1]) cnt = 0 for i in all_t: print i[1] cnt = cnt + 1 if cnt > 3: break t1 = time() print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list print "INFO: labels_list_all=", labels_list_all print "INFO: training and testing samples generated!" 
print 'INFO: running time: %f' % (t1 - t0) t0 = t1 ############################################### ###########build learning model################ ############################################### ### get the parameters### print "INFO: ======Learning Algorithm and Parameters=============" #ml_opts = json.loads(ml_opts_jstr) model_name = ml_opts[ 'learning_algorithm'] # 1: linear_svm_with_sgd; 2: logistic_regression_with_lbfgs; 3: logistic_regression_with_sgd iteration_num = 0 if 'iterations' in ml_opts: iteration_num = ml_opts['iterations'] C = 0 if 'c' in ml_opts: C = eval(ml_opts['c']) regularization = "" if 'regularization' in ml_opts: regularization = ml_opts['regularization'] print "INFO: Learning Algorithm: ", model_name print "INFO: C = ", C print "INFO: iterations = ", iteration_num print "INFO: regType = ", regularization regP = C / float(training_sample_count) print "INFO: Calculated: regParam = ", regP ### generate label names (family names) ##### ### connect to database to get the column list which contains all column number of the corresponding feature#### if labelnameflag == 1: ''' key = "dic_name_label" jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}' jstr_proj='{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}' doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_list = doc['value'] print "INFO: dic_list=",dic_list label_dic = {} for i in range(0, len(dic_list)): for key in dic_list[i]: label_dic[dic_list[i][key]] = key.encode('UTF8') ''' label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id) print "INFO: label_dic:", label_dic else: label_dic = {} label_set = set(labels_list_all) for label_value in label_set: label_dic[int(label_value)] = str(int(label_value)) print "INFO: generated label_dic:", label_dic labels_list = [] for key in sorted(label_dic): labels_list.append(label_dic[key]) print "INFO: labels:", labels_list class_num = len(labels_list) if class_num > 2: print "INFO: Multi-class classification! Number of classes = ", class_num ### build model ### if model_name == "linear_svm_with_sgd": ### 1: linearSVM print "INFO: ====================1: Linear SVM=============" model_classification = SVMWithSGD.train( training_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) #print model_classification elif model_name == "logistic_regression_with_lbfgs": ### 2: LogisticRegressionWithLBFGS print "INFO: ====================2: LogisticRegressionWithLBFGS=============" model_classification = LogisticRegressionWithLBFGS.train( training_rdd, regParam=regP, iterations=iteration_num, regType=regularization, numClasses=class_num) # regParam = 1/(sample_number*C) elif model_name == "logistic_regression_with_sgd": ### 3: LogisticRegressionWithSGD print "INFO: ====================3: LogisticRegressionWithSGD=============" model_classification = LogisticRegressionWithSGD.train( training_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) else: print "INFO: Training model selection error: no valid ML model selected!" return print "INFO: model type=", type(model_classification) # create feature coefficient file ================================ coef_arr = None intercept = None if model_classification.weights is None: print "WARNING: model weights not found!" 
else: coef_weights = model_classification.weights #print "coef_weights=",coef_weights #print type(coef_weights),coef_weights.shape coef_arr = coef_weights.toArray().tolist() # save coef_arr to mongo key = "coef_arr" ret = ml_util.save_json_t(row_id_str, key, coef_arr, mongo_tuples) # save coef_arr to local file if ret == 0: # drop old record in mongo filter = '{"rid":' + row_id_str + ',"key":"coef_arr"}' ret = query_mongo.delete_many(mongo_tuples, None, filter) if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) fn_ca = os.path.join(local_out_dir, row_id_str, row_id_str + "_coef_arr.pkl") print ml_util.ml_pickle_save(coef_arr, fn_ca) # save intercept to mongo intercept = model_classification.intercept key = "coef_intercept" ret = ml_util.save_json_t(row_id_str, key, intercept, mongo_tuples) # feature list + coef file ============= feat_filename = os.path.join(local_out_dir, row_id_str + "_feat_coef.json") print "INFO: feat_filename=", feat_filename # create feature, coef & raw string file =============================================== ============ # expect a dict of {"fid":(coef, feature_raw_string)} jret = ml_util.build_feat_list_t(row_id_str, feat_filename, None, None, coef_arr, ds_id, mongo_tuples) # special featuring for IN or libsvm if jret is None: jret = ml_util.build_feat_coef_raw_list_t(row_id_str, feat_filename, coef_arr, ds_id, mongo_tuples) if jret is None: print "WARNING: Cannot create sample list for testing dataset. " jfeat_coef_dict = jret print "INFO: coef_arr len=", len( coef_arr), ", feature_count=", feature_count # for multi-class if len(coef_arr) != feature_count: jfeat_coef_dict = {} print "WARNING: coef count didn't match feature count. multi-class classification was not supported" # Calculate prediction and Save testing dataset bt_coef_arr = sc.broadcast(coef_arr) bt_intercept = sc.broadcast(intercept) bt_jfeat_coef_dict = sc.broadcast(jfeat_coef_dict) ### Evaluating the model on testing dataset: label, predict label, score, feature list print "INFO: intercept=", intercept print "INFO: coef_arr len=", len(coef_arr), type(coef_arr) print "INFO: jfeat_coef_dict len=", len( jfeat_coef_dict) #, jfeat_coef_dict # get prediction of testing dataset : (tlabel, plabel, score, libsvm, raw feat str, hash) ============================== if len(coef_arr) == feature_count: testing_pred_rdd = testing_rdd.map(lambda p: ( p[0].label \ ,model_classification.predict(p[0].features) \ ,zip_feature_util.calculate_hypothesis(p[0].features, bt_coef_arr.value, bt_intercept.value, model_name) \ ,p[0].features \ ,p[1] \ ) ).cache() else: # for multi-class, no prediction score; TBD for better solution: how to display multiple weights for each class testing_pred_rdd = testing_rdd.map(lambda p: ( p[0].label \ ,model_classification.predict(p[0].features) \ ,"-" \ ,p[0].features \ ,p[1] \ ) ).cache() ''',p[0].features.dot(bt_coef_arr.value)+bt_intercept.value \ # Save testing dataset for analysis libsvm_testing_output = hdfs_feat_dir + "libsvm_testing_output_"+row_id_str print "INFO: libsvm_testing_output=", libsvm_testing_output try: hdfs.rmr(libsvm_testing_output) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Unexpected error at libsvm_testing_output file clean up:", sys.exc_info()[0] # save only false prediction? 
#testing_pred_rdd.filter(lambda p: p[0] != p[1]).saveAsTextFile(libsvm_testing_output) testing_pred_rdd.saveAsTextFile(libsvm_testing_output) ''' #test_tmp=testing_pred_rdd.collect() # save false prediction to local file false_pred_fname = os.path.join(local_out_dir, row_id_str + "_false_pred.json") print "INFO: false_pred_fname=", false_pred_fname false_pred_data=testing_pred_rdd.filter(lambda p: p[0] != p[1])\ .map(lambda p: (p[0],p[1],p[2] \ ,zip_feature_util.get_dict_coef_raw4feat(zip_feature_util.sparseVector2dict(p[3]), bt_jfeat_coef_dict.value) ,p[4] ) ) \ .collect() print "INFO: false predicted count=", len(false_pred_data) false_pred_arr = [] with open(false_pred_fname, "w") as fp: for sp in false_pred_data: jsp = { "tlabel": sp[0], "plabel": sp[1], "score": sp[2], "feat": sp[3], "hash": sp[4] } #print "jsp=",jsp false_pred_arr.append(jsp) fp.write(json.dumps(false_pred_arr)) # save prediction results, format: label, prediction, hash pred_ofname = os.path.join(local_out_dir, row_id_str + "_pred_output.pkl") print "INFO: pred_ofname=", pred_ofname pred_out_arr = testing_pred_rdd.map(lambda p: (p[0], p[1], p[4])).collect() ml_util.ml_pickle_save(pred_out_arr, pred_ofname) ''' one_item= testing_pred_rdd.first() print "one_item=",one_item sparse_arr=one_item[3] dict_feat=zip_feature_util.sparseVector2dict(sparse_arr) print "len=",len(dict_feat),"dict_feat=",dict_feat dict_weit=zip_feature_util.add_coef2dict(coef_arr,dict_feat) print "len=",len(dict_weit),"dict_weit=",dict_weit ''' # Calculate Accuracy. labelsAndPreds = (true_label,predict_label) labelsAndPreds = testing_pred_rdd.map(lambda p: (p[0], p[1])) labelsAndPreds.cache() testing_sample_number = testing_rdd.count() testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float( testing_sample_number) accuracy = 1 - testErr print "INFO: Accuracy = ", accuracy ### Save model #save_dir = config.get('app', 'HADOOP_MASTER')+'/user/hadoop/yigai/row_6/' #save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str save_dir = os.path.join(config.get('app', 'HADOOP_MASTER'), config.get('app', 'HDFS_MODEL_DIR'), row_id_str) try: hdfs.ls(save_dir) #print "find hdfs folder" hdfs.rmr(save_dir) #print "all files removed" except IOError as e: print "WARNING: I/O error({0}): {1}".format( e.errno, e.strerror), ". At HDFS=", save_dir except: print "WARNING: Unexpected error:", sys.exc_info( )[0], ". 
At HDFS=", save_dir model_classification.save(sc, save_dir) ###load model if needed #sameModel = SVMModel.load(sc, save_dir) t1 = time() print 'INFO: training run time: %f' % (t1 - t0) t0 = t1 ############################################### ###########plot prediction result figure ==================================================== =============== ############################################### labels = labelsAndPreds.collect() true_label_list = [x for x, _ in labels] pred_label_list = [x for _, x in labels] pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png") true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png") pred_xlabel = 'Prediction (Single Run)' true_xlabel = 'True Labels (Single Run)' test_cnt_dic = ml_util.ml_plot_predict_figures( pred_label_list, true_label_list, labels_list, label_dic, testing_sample_count, pred_xlabel, pred_fname, true_xlabel, true_fname) print "INFO: figure files: ", pred_fname, true_fname #print "INFO: Number of samples in each label is=", test_cnt_dic roc_auc = None perf_measures = None dataset_info = { "training_fraction": training_fraction, "class_count": class_num, "dataset_count": sample_count } ############################################################# ###################for 2 class only (plot ROC curve) ==================================================== =============== ############################################################# if len(labels_list) == 2: do_ROC = True reverse_label_dic = dict((v, k) for k, v in label_dic.items()) if 'clean' in reverse_label_dic: flag_clean = reverse_label_dic['clean'] elif 'benign' in reverse_label_dic: flag_clean = reverse_label_dic['benign'] elif '0' in reverse_label_dic: flag_clean = 0 else: print "INFO: No ROC curve generated: 'clean','benign' or '0' must be a label for indicating negative class!" 
do_ROC = False # build data file for score graph score_graph_fname = os.path.join(local_out_dir, row_id_str + "_score_graph.json") print "INFO: score_graph_fname=", score_graph_fname # build score_arr_0, score_arr_1 # format: tlabel, plabel, score, libsvm, raw feat str, hash graph_arr = testing_pred_rdd.map(lambda p: (int(p[0]), float(p[2]))).collect() score_arr_0 = [] score_arr_1 = [] max_score = 0 min_score = 0 for p in graph_arr: if p[0] == 0: score_arr_0.append(p[1]) else: score_arr_1.append(p[1]) # save max,min score if p[1] > max_score: max_score = p[1] elif p[1] < min_score: min_score = p[1] ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name, score_graph_fname, max_score, min_score) if do_ROC: perf_measures = ml_util.calculate_fscore(true_label_list, pred_label_list) print "RESULT: perf_measures=", perf_measures ''' # calculate fscore ========== tp = labelsAndPreds.filter(lambda (v, p): v == 1 and p==1 ).count() fp = labelsAndPreds.filter(lambda (v, p): v == 0 and p==1 ).count() fn = labelsAndPreds.filter(lambda (v, p): v == 1 and p==0 ).count() tn = labelsAndPreds.filter(lambda (v, p): v == 0 and p==0 ).count() print "RESULT: tp=",tp,",fp=",fp,",fn=",fn,",tn=",tn precision=float(tp)/(tp+fp) recall=float(tp)/(tp+fn) print "RESULT: precision=",precision,",recall=",recall acc=(tp+tn)/(float(testing_sample_number)) fscore=2*((precision*recall)/(precision+recall)) print "RESULT: fscore=",fscore,",acc=",acc ''' model_classification.clearThreshold() scoreAndLabels = testing_rdd.map(lambda p: ( model_classification.predict(p[0].features), int(p[0].label))) #metrics = BinaryClassificationMetrics(scoreAndLabels) #areROC = metrics.areaUnderROC #print areROC scoreAndLabels_list = scoreAndLabels.collect() if flag_clean == 0: scores = [x for x, _ in scoreAndLabels_list] s_labels = [x for _, x in scoreAndLabels_list] testing_N = test_cnt_dic[0] testing_P = test_cnt_dic[1] else: scores = [-x for x, _ in scoreAndLabels_list] s_labels = [1 - x for _, x in scoreAndLabels_list] testing_N = test_cnt_dic[1] testing_P = test_cnt_dic[0] # create ROC data file ======== ==== roc_auc = ml_create_roc_files(row_id_str, scores, s_labels, testing_N, testing_P, local_out_dir, row_id_str) #, local_out_dir, file_name_given) perf_measures["roc_auc"] = roc_auc # only update db for web request ==================================================== =============== if fromweb == "1": #print "database update" str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \ +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \ +"', perf_measures='"+json.dumps(perf_measures) \ +"', dataset_info='"+json.dumps(dataset_info) \ +"' where id="+row_id_str ret = exec_sqlite.exec_sql(str_sql) print "INFO: Data update done! ret=", str(ret) else: print "INFO: accuracy = '" + str(accuracy * 100) + "%" print 'INFO: Finished!' return 0
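
# ---------------------------------------------------------------------------
# Illustrative sketch (not called by the pipeline): the score/label
# re-orientation performed in train() above before the ROC files are built.
# Assumptions: binary labels in {0, 1} and raw decision scores from the
# cleared-threshold model; flag_clean is the label value of the clean/benign
# class. Function name is illustrative only.
# ---------------------------------------------------------------------------
def orient_scores_for_roc(score_and_labels, flag_clean):
    if flag_clean == 0:
        scores = [s for s, _ in score_and_labels]
        s_labels = [l for _, l in score_and_labels]
    else:
        # make "clean" the negative (0) class: negate scores, flip labels
        scores = [-s for s, _ in score_and_labels]
        s_labels = [1 - l for _, l in score_and_labels]
    return scores, s_labels
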
def feat_importance_firm(row_id_str, ds_id, hdfs_feat_dir, local_score_file, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, training_fraction, jobname, uploadtype): # zip func in other files for Spark workers ================= ================ zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path # get_spark_context sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) ''' SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress')) SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize')) SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory) SparkContext.setSystemProperty('spark.cores.max', args.core_max) sc = SparkContext(args.sp_master, 'feature_importance_FRIM:'+str(args.row_id)) ''' t0 = time() # get folder list (labels) from hdfs data_out/<id>/metadata ============== dirFile_loc = os.path.join(hdfs_feat_dir, "metadata") dirFolders = sc.textFile(dirFile_loc) hash_Folders = dirFolders.collect() print "INFO: dirFile_loc=", dirFile_loc, ", hash_Folders=", hash_Folders folder_list = [x.encode('UTF8') for x in hash_Folders] print "INFO: folder_list=", folder_list #['dirty/', 'clean/'] # get feature seq : ngram hash mapping ================================== key = "dic_seq_hashes" #{"123":"136,345"} jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}' jstr_proj = '{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}' doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_list = doc['value'] dic_all_columns = dic_list feature_count = len(dic_list) #print "INFO: feature_count=",feature_count #print "dic_list=",dic_list #{u'123,345':u'136'} #print "dic_all_columns=",dic_all_columns # {1: u'8215,8216'} # end # get {hash : raw string} mapping ================================== key = "dic_hash_str" #{"123":"openFile"} jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}' jstr_proj = '{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}' doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_hash_str = doc['value'] ''' # get folder list (labels) from hdfs data_out/<id>/libsvm ============== libsvm_loc = os.path.join(hdfs_feat_dir , "libsvm_data") print "INFO: libsvm_loc=", libsvm_loc samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_loc) ''' # filename for featured data libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data") print "INFO: libsvm_data_file=", libsvm_data_file # load feature count file #feat_count_file=libsvm_data_file+"_feat_count" #feature_count=zip_feature_util.get_feature_count(sc,feat_count_file) print "INFO: feature_count=", feature_count #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file) # load sample RDD from text file # also exclude selected features in sample ================ ===== # format (LabeledPoint,hash) from str2LabeledPoint_hash() samples_rdd, feature_count = zip_feature_util.get_sample_rdd( sc, libsvm_data_file, feature_count, excluded_feat_cslist=None) labels_and_features_rdd = samples_rdd.map(lambda p: (p[0].label, p[0].features)) all_data = labels_and_features_rdd.collect() 
features_list = [x.toArray() for _, x in all_data] labels_list_all = [x for x, _ in all_data] labels_list_all = np.array(labels_list_all) features_array = np.array(features_list) ### generate sparse matrix (csr) for all samples features_sparse_mtx = csr_matrix(features_array) ### randomly split the samples into training and testing data sparse_mtx, sparse_test, labels_training, labels_testing = \ cross_validation.train_test_split(features_sparse_mtx, labels_list_all, test_size=(1-training_fraction)) #print "INFO: sparse_mtx.shape=",sparse_mtx.shape #print "INFO: sparse_test.shape=",sparse_test.shape row_num_training = (sparse_mtx.shape)[0] row_num_testing = (sparse_test.shape)[0] # why use LinearSVC ? clf = svm.LinearSVC() #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None) #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None) #print "labels_training=",labels_training #print "sparse_mtx=",sparse_mtx clf.fit(sparse_mtx, labels_training) #print "**model:intercept***" #print clf.intercept_ #print "**model:coef***" #print clf.coef_ col_num = len(clf.coef_[0]) # for n_classes==2 print "INFO: col_num=", col_num labels_pred = clf.predict(sparse_test) #print "labels_pred:", labels_pred accuracy = clf.score(sparse_test, labels_testing) print "INFO: data folder:", hdfs_feat_dir print "INFO: accuracy: ", accuracy ##################################################################### ##################calculate feature importance with predication labels####################### ##################################################################### AA = sparse_mtx.todense() BB = sparse_test.todense() labels_train_pred = clf.predict(sparse_mtx) labels_test_pred = labels_pred print "INFO: ###################################################################################" print "INFO: ############calculate feature importance with predication labels###################" print "INFO: ###################################################################################" dic_importance_label = {} for j in range(0, col_num): ###for all features in the loop ############################## #print "====new way with sparse matrix=========" curr_col_train = sparse_mtx.getcol(j) sum_col = curr_col_train.sum(0) positive_feature_number = int(sum_col.tolist()[0][0]) labels_value = 3 - labels_train_pred dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_train) sum_product = dot_product.sum(1) labels_positive_sum = int(sum_product.tolist()[0][0]) sum_label_values = sum(labels_value) labels_negitive_sum = sum_label_values - labels_positive_sum ############################## #print "====new way with sparse matrix=========" curr_col_test = sparse_test.getcol(j) sum_col = curr_col_test.sum(0) positive_feature_number = positive_feature_number + int( sum_col.tolist()[0][0]) labels_value = 3 - labels_test_pred dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_test) sum_product = dot_product.sum(1) labels_positive_sum = labels_positive_sum + int( sum_product.tolist()[0][0]) sum_label_values = sum(labels_value) labels_negitive_sum = labels_negitive_sum + sum_label_values - int( sum_product.tolist()[0][0]) n_total = row_num_training + row_num_testing negitive_feature_number = n_total - positive_feature_number if positive_feature_number == 0: #print 
"feature ", j+1, "all 0s!" dic_importance_label[j + 1] = -100 elif negitive_feature_number == 0: #print "feature ", j+1, "all 1s!" dic_importance_label[j + 1] = -200 else: q_positive = float(labels_positive_sum) / positive_feature_number q_negitive = float(labels_negitive_sum) / negitive_feature_number Q = (q_positive - q_negitive) * sqrt( float(q_positive) * q_negitive / float(n_total) / float(n_total)) dic_importance_label[j + 1] = Q sorted_importance = sorted(dic_importance_label.items(), key=operator.itemgetter(1), reverse=True) print "INFO: =======Feature Importance(FIRM score)================" if os.path.exists(local_score_file): try: os.remove(local_score_file) except OSError, e: print("ERROR: %s - %s." % (e.local_score_file, e.strerror))
def feat_extraction(row_id_str, hdfs_dir_list, hdfs_feat_dir, model_data_folder, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, fromweb, label_arr, metadata_count, label_idx, data_idx, pattern_str, ln_delimitor, data_field_list, jkey_dict, jobname, num_gram, feature_count_threshold, token_dict=None, HDFS_RETR_DIR=None, remove_duplicated="N", cust_featuring=None, cust_featuring_params=None, local_out_dir=None, filter_ratio=None, binary_flag=False): # zip func in other files for Spark workers ================= ================ zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, user_custom=cust_featuring) # get_spark_context spark = ml_util.ml_get_spark_session(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, zip_file_path) if spark: sc = spark.sparkContext # log time ================================================================ ================ t0 = time() # input filename input_filename = "*" ext_type = '.gz' gz_list = None # single hdfs file if not ',' in hdfs_dir_list: # single dir having *.gz ==== ========= # read raw data from HDFS as .gz format ========== hdfs_files = os.path.join(hdfs_dir_list, input_filename + ext_type) # check if gz files in hdfs ============ try: gz_list = hdfs.ls(hdfs_dir_list) print "INFO: check hdfs folder=", hdfs_dir_list except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Error at checking HDFS file:", sys.exc_info()[0] # use whole folder #print "gz_list",gz_list if gz_list is None or len(gz_list) == 0: print "ERROR: No file found by ", input_filename + ext_type #,", use",hdfs_dir_list,"instead" return -2 elif len(gz_list) == 1: # use dir as filename hdfs_files = hdfs_dir_list[0:-1] else: # multiple dirs ==== ========= hdfs_files = "" cnt = 0 temp_lbl_list = [] comma = "" print "INFO: before label_arr=", label_arr # check each folder for dr in hdfs_dir_list.split(','): #print "****=",dr if not len(dr) > 0: continue try: # remove space etc. 
dr = dr.strip() fdr = os.path.join(HDFS_RETR_DIR, dr) # ls didn't like "*" if '*' in fdr: #gz_list=hdfs.ls(fdr.replace("*","")) dn = os.path.dirname(fdr).strip() bn = os.path.basename( fdr).strip() #print "dn=",dn,",bn=",bn # get all names under folder and do filtering gz_list = fnmatch.filter(hdfs.ls(dn), '*' + bn) else: gz_list = hdfs.ls(fdr) cnt = cnt + len(gz_list) if len(gz_list) > 0: hdfs_files = hdfs_files + comma + fdr comma = "," except IOError as e: print "WARNING: I/O error({0}): {1}".format( e.errno, e.strerror) except: print "WARNING: Error at checking HDFS file:", sys.exc_info( )[0] # use whole folder if cnt is None or cnt == 0: print "ERROR: No file found at", hdfs_files return -2 else: print "INFO: total file count=", cnt # set convert flag only when multiple dir and label_arr has dirty label if not label_arr is None and len( label_arr) == 2 and label_arr[1] == "dirty": convert2dirty = "Y" print "INFO: hdfs_dir_list=", hdfs_dir_list print "INFO: hdfs_files=", hdfs_files cust_featuring_jparams = None # custom featuring if not cust_featuring is None and len(cust_featuring) > 0: # load user module ======= user_func, cust_featuring_jparams = get_user_custom_func( cust_featuring, cust_featuring_params) # TBD apply user_func all_hashes_cnt_dic = None all_hash_str_dic = None all_hashes_seq_dic = None else: print "ERROR: custom featuring type is needed" print "INFO: cust_featuring=", cust_featuring, "cust_featuring_jparams=", cust_featuring_jparams dnn_flag = False has_header = None label_col = None label_index = None # get featuring params if cust_featuring_jparams: if 'label_index' in cust_featuring_jparams: # idx number for label, 0 based label_index = cust_featuring_jparams['label_index'] if 'has_header' in cust_featuring_jparams: # True/False has_header = eval(cust_featuring_jparams['has_header']) if has_header == 1: has_header = True if 'dnn_flag' in cust_featuring_jparams: # True/False dnn_flag = cust_featuring_jparams['dnn_flag'] if dnn_flag == 1: dnn_flag = True elif dnn_flag == 0: dnn_flag = False if label_index is None: label_index = 0 elif not isinstance(label_index, int): label_index = eval(label_index) print "INFO: label_index=", label_index, ",has_header=", has_header, ",dnn_flag=", dnn_flag # read as DataFrame =============================================== df = spark.read.csv(hdfs_files, header=has_header) df.show() print "INFO: col names=", df.columns # get column name for label label_col = None for i, v in enumerate(df.columns): if i == label_index: label_col = v # get all distinct labels into an array =============== provided by parameter? 
if label_arr is None and not label_col is None: label_arr = sorted([ rw[label_col] for rw in df.select(label_col).distinct().collect() ]) print "INFO: label_arr=", label_arr label_dic = {} # convert label_arr to dict; {label:number| for idx, label in enumerate(sorted(label_arr)): if not label in label_dic: label_dic[ label] = idx #starting from 0, value = idx, e.g., clean:0, dirty:1 # add params for dataframe conversion cust_featuring_jparams["label_dict"] = label_dic # convert to int cust_featuring_jparams["label_index"] = label_index featuring_params = json.dumps(cust_featuring_jparams) # convert DataFrame row to libsvm string libsvm_rdd = df.rdd.map(lambda x: user_func(list(x), featuring_params)) print "INFO: sample df row=", (libsvm_rdd.collect()[0]) print "INFO: featuring_params=", featuring_params total_input_count = df.count() print "INFO: Total input sample count=", total_input_count #print "INFO: feature_count_threshold=",feature_count_threshold #get all hashes and total occurring count =============== # all_hashes_cnt_dic: {'col index': total count,... } # build all_hashes_cnt_dic cnt_df = df.select( [count(when(~isnull(c), c)).alias(c) for c in df.columns]) #cnt_df.show() cnt_arr = cnt_df.rdd.map(lambda x: list(x)).collect() feat_sample_count_arr = cnt_arr[0] #print "feat_sample_count_arr=",feat_sample_count_arr if all_hashes_cnt_dic is None: all_hashes_cnt_dic = {} idx = 1 for i, v in enumerate(feat_sample_count_arr): if i != label_index: all_hashes_cnt_dic[idx] = v idx += 1 #print "all_hashes_cnt_dic=",all_hashes_cnt_dic #get all hashes and their extracted string =============== # all_hash_str_dic: {hash:'str1', ... if all_hash_str_dic is None: # convert header to dict=index:string; excude label column all_hash_str_dic = {} idx = 1 for i, v in enumerate(df.schema.names): if i != label_index: all_hash_str_dic[idx] = v idx += 1 #print "all_hash_str_dic=",all_hash_str_dic # save labels to hdfs as text file==================================== ============ hdfs_folder = hdfs_feat_dir #+ "/" # "/" is needed to create the folder correctly print "INFO: hdfs_folder=", hdfs_folder try: hdfs.mkdir(hdfs_folder) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Unexpected error at mkdir:", sys.exc_info()[0] # clean up metadata_file metadata_file = os.path.join(hdfs_folder, metadata) #"metadata" print "INFO: metadata_file=", metadata_file try: hdfs.rmr(metadata_file) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Unexpected error at rmr():", sys.exc_info()[0] sc.parallelize(label_arr, 1).saveAsTextFile(metadata_file) #remap all hash values to continuous key/feature number ============== # all_hashes_seq_dic: { hash : sequential_numb } if all_hashes_seq_dic is None: all_hashes_seq_dic = {} # csv column index as sequentail number remap2seq(all_hash_str_dic, all_hashes_seq_dic) #print "all_hashes_seq_dic=",all_hashes_seq_dic total_feature_numb = len(all_hashes_seq_dic) print "INFO: Total feature count=", len(all_hashes_seq_dic) # save feat_sample_count_arr data ==================================== ============ filter = '{"rid":' + row_id_str + ',"key":"feat_sample_count_arr"}' upsert_flag = True jo_insert = {} jo_insert["rid"] = eval(row_id_str) jo_insert["key"] = "feat_sample_count_arr" jo_insert["value"] = feat_sample_count_arr jstr_insert = json.dumps(jo_insert) ret = query_mongo.upsert_doc_t(mongo_tuples, filter, jstr_insert, upsert_flag) print "INFO: Upsert count for 
feat_sample_count_arr=", ret # insert failed, save to local if ret == 0: # drop old record in mongo ret = query_mongo.delete_many(mongo_tuples, None, filter) if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) fsca_hs = os.path.join(local_out_dir, row_id_str, row_id_str + "_feat_sample_count_arr.pkl") print "WARNING: save feat_sample_count_arr to local" ml_util.ml_pickle_save(feat_sample_count_arr, fsca_hs) # get rdd statistics info # remove duplicated libsvm string; only keep the first duplicated item, assume space following key_idx if remove_duplicated == "Y": libsvm_rdd=libsvm_rdd \ .map(lambda x: ( ','.join(x.split(' ')[metadata_count:]), x)) \ .groupByKey().map(lambda x: list(x[1])[0] ) \ .cache() cnt_list = libsvm_rdd.map(lambda x: (x.split(' ')[1], 1)).reduceByKey( add).collect() stats = libsvm_rdd.map( lambda x: len(x.split(' ')[metadata_count:])).stats() feat_count_max = stats.max() feat_count_stdev = stats.stdev() feat_count_mean = stats.mean() sample_count = stats.count() print "INFO: Non-Duplicated libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev print "INFO: ,max feature count=", feat_count_max print "INFO: Non-Duplicated Label count list=", cnt_list # clean up libsvm data ==================================== ============ libsvm_data_file = os.path.join(hdfs_folder, libsvm_alldata_filename) #"libsvm_data" print "INFO: libsvm_data_file=", libsvm_data_file try: hdfs.rmr(libsvm_data_file) except IOError as e: print "WARNING: I/O error({0}): {1} at libsvm_data_file clean up".format( e.errno, e.strerror) except: print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info( )[0] #codec = "org.apache.hadoop.io.compress.GzipCodec" #libsvm_rdd.saveAsTextFile(libsvm_data_file, codec) libsvm_rdd.saveAsTextFile(libsvm_data_file) # TBD encrypted feat_count_file = libsvm_data_file + "_feat_count" print "INFO: feat_count_file=", feat_count_file try: hdfs.rmr(feat_count_file) except IOError as e: print "WARNING: I/O error({0}): {1} at feat_count clean up".format( e.errno, e.strerror) except: print "WARNING: Unexpected error at libsvm feature count clean up:", sys.exc_info( )[0] sc.parallelize([total_feature_numb], 1).saveAsTextFile(feat_count_file) # TBD ??? output text for DNN:[meta-data1,meta-data2,..., [feature tokens]] ================= DNN =========== if dnn_flag: # special flag to tokenize and keep input orders print "INFO: processing data for DNN..." 
        # create token dict
        # str_hash_dict: string to hash
        # all_hashes_seq_dic: hash to seq id
        if token_dict is None or len(token_dict) == 0:
            token_dict = {}
            str_hash_dict = {v: k for k, v in all_hash_str_dic.iteritems()}
            for k, v in str_hash_dict.iteritems():
                token_dict[k] = int(all_hashes_seq_dic[str(v)])
        #print "token_dict=",len(token_dict),token_dict

        # TBD here: need to implement non-binary feature
        dnn_rdd = df.rdd \
            .map(lambda x: tokenize_by_dict(x, data_idx, token_dict, label_idx, label_dic)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list)  #.cache()
        # filter duplication here
        #print dnn_rdd.take(3)

        dnn_data_file = os.path.join(hdfs_folder, dnn_alldata_filename)  #"dnn_data"
        print "INFO: dnn_data_file=", dnn_data_file
        try:
            hdfs.rmr(dnn_data_file)
        except IOError as e:
            print "WARNING: I/O error({0}): {1} at dnn_data_file clean up".format(e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at dnn data file clean up:", sys.exc_info()[0]

        # clean up data
        dnn_npy_gz_file = os.path.join(hdfs_folder, row_id_str + "_dnn_")
        print "INFO: dnn_npy_gz_file=", dnn_npy_gz_file
        try:
            hdfs.rmr(dnn_npy_gz_file + "data.npy.gz")
            hdfs.rmr(dnn_npy_gz_file + "label.npy.gz")
            hdfs.rmr(dnn_npy_gz_file + "info.npy.gz")
        except IOError as e:
            print "WARNING: I/O error({0}): {1} at dnn_npy clean up".format(e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at dnn_npy file clean up:", sys.exc_info()[0]

        # save new data
        try:
            dnn_rdd.saveAsTextFile(dnn_data_file)
        except:
            print "WARNING: Unexpected error at saving dnn data:", sys.exc_info()[0]

        # show data statistics
        try:
            stats = dnn_rdd.map(lambda p: len(p[metadata_count])).stats()
            feat_count_max = stats.max()
            feat_count_stdev = stats.stdev()
            feat_count_mean = stats.mean()
            sample_count = stats.count()
            print "INFO: DNN data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
            print "INFO: ,max feature count=", feat_count_max
        except:
            print "WARNING: Unexpected error at getting stats of dnn_rdd:", sys.exc_info()[0]

    # clean up pca data in hdfs ============ ========================
    pca_files = '*' + libsvm_alldata_filename + "_pca_*"
    #print "INFO: pca_files=", pca_files
    try:
        f_list = hdfs.ls(hdfs_folder)
        if len(f_list) > 0:
            df_list = fnmatch.filter(f_list, pca_files)
            for f in df_list:
                print "INFO: rm ", f
                hdfs.rmr(f)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm pca file clean up:", sys.exc_info()[0]

    # clean up pca data in web local ============ ========================
    pca_fname = os.path.join(model_data_folder, row_id_str + '_pca_*.pkl*')
    print "INFO: pca_fname=", pca_fname
    try:
        for fl in glob.glob(pca_fname):
            print "INFO: remove ", fl
            os.remove(fl)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))
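# ---------------------------------------------------------------------------
# Illustrative sketch only: how the token dictionary above is assembled.
# `hash_str_dic` maps a feature hash (or column index) to its raw string and
# `hashes_seq_dic` maps the same hash to its sequential feature id, so chaining
# the two gives a direct string -> feature-id lookup for tokenization.  The
# helper name `build_token_dict` is hypothetical.
# ---------------------------------------------------------------------------
def build_token_dict(hash_str_dic, hashes_seq_dic):
    token_dict = {}
    str_hash_dict = {v: k for k, v in hash_str_dic.items()}   # string -> hash
    for token_str, hash_key in str_hash_dict.items():
        token_dict[token_str] = int(hashes_seq_dic[str(hash_key)])
    return token_dict

# e.g. build_token_dict({1: "openFile", 2: "closeFile"}, {"1": 1, "2": 2})
#      == {"openFile": 1, "closeFile": 2}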
def preprocess( row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr #, excluded_feat_cslist , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, data_fname #, mongo_tuples, labelnameflag, fromweb , jobname, dnn_data_suffix=config.get("machine_learning", "dnn_data_suffix"), dnn_label_suffix=config.get("machine_learning", "dnn_label_suffix"), dnn_info_suffix=config.get("machine_learning", "dnn_info_suffix")): ### generate data folder and out folder, clean up if needed if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) # create zip files for Spark workers ================= ================ zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path, ",ml_opts_jstr=", ml_opts_jstr # get_spark_context sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) t0 = time() # Check option ml_opts = {} try: ml_opts = json.loads(ml_opts_jstr) except: print "ERROR: string ml_opts is invalid!" return -1 learning_algorithm = ml_opts["learning_algorithm"] #return # create 3 arrays feat_list = [] label_list = [] info_list = [] feature_count = None sample_count = None if learning_algorithm == "cnn": #================================================== ========== # libsvm_data for featured data, not use "dnn_data" here libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data") print "INFO: libsvm_data_file=", libsvm_data_file # load sample RDD from libsvm file # output format: [([features], label, info)] , feature_count, feat_max, feat_min all_list, feature_count, feat_max, feat_min =\ zip_feature_util.get_sample_as_arr(sc, libsvm_data_file, None, None) c = 0 # convert to 3 list ========================= ========== for i in all_list: #print type(i[0]),type(i[1]), type(i[2]) if c % 10000 == 0: print ".", sys.stdout.flush() c = c + 1 # convert to float32 np array and normalize by max value (0-1.0) feat_list.append(np.array(i[0], dtype=np.float32) / feat_max) label_list.append(i[1]) info_list.append(i[2]) print "INFO: feat_list t=", type(feat_list), type(feat_list[0]), type( feat_list[0][0]) elif learning_algorithm == "lstm": #================================================== ========== # filename for featured data dnn_data_file = os.path.join(hdfs_feat_dir, data_fname) print "INFO: dnn_data_file=", dnn_data_file # output format: [([features], label, info)] , chucked_sample_count all_list, chucked_sample_count=\ zip_feature_util.get_sample_as_chunk(sc, dnn_data_file, None, None) c = 0 # convert to 3 list ========================= ========== for i in all_list: #print type(i[0]),type(i[1]), type(i[2]) if c % 10000 == 0: print ".", sys.stdout.flush() c = c + 1 # convert to float32 np array and normalize by max value (0-1.0) feat_list.append(np.array(i[0], dtype=np.int32)) label_list.append(i[1]) info_list.append(i[2]) print "INFO: feat_list t=", type(feat_list), type( feat_list[0]), feat_list[0][0] print "INFO: label_list t=", type(label_list), label_list[0] print "INFO: info_list t=", type(info_list), info_list[0] if not all_list is None and len(all_list) > 0: sample_count = len(all_list) else: sample_count = None print "INFO: sample_count=", sample_count tgt_prefix = row_id_str if (row_id_str != ds_id): tgt_prefix = ds_id # save data file ========================= ========== data_fname = os.path.join(local_out_dir, tgt_prefix + 
                              dnn_data_suffix)
    print "INFO: data_fname=", data_fname

    # reshape for image data
    #if not feature_count is None and feature_count > 0 and not sample_count is None and sample_count > 0:
    #    nparr_feat = np.asarray(feat_list, dtype=np.float32).reshape(sample_count, feature_count)
    #else:
    # lstm doesn't need to reshape
    nparr_feat = np.asarray(feat_list, dtype=np.int32)
    print "INFO: nparr_feat s=", nparr_feat.shape, nparr_feat[0].shape, type(nparr_feat[0][0]), nparr_feat[0][0]
    with gzip.open(data_fname, "wb") as fp:
        #pickle.dump(nparr_feat, fp, 2)
        # pickle may overflow (limited to 2B elements); use numpy.save() instead
        np.save(fp, nparr_feat, allow_pickle=False)  #62.9M for (465724, 2967)

    # save label file ========================= ==========
    lbl_fname = os.path.join(local_out_dir, tgt_prefix + dnn_label_suffix)
    print "INFO: lbl_fname=", lbl_fname
    # convert to numpy array of int32 to save space
    nparr_lbl = np.asarray(label_list, dtype=np.int32)
    #print "label=",nparr_lbl[0]
    with gzip.open(lbl_fname, "wb") as fp:
        #pickle.dump(nparr_lbl, fp, 2)
        np.save(fp, nparr_lbl, allow_pickle=False)

    # save info file ========================= ==========
    info_fname = os.path.join(local_out_dir, tgt_prefix + dnn_info_suffix)
    print "INFO: info_fname=", info_fname
    nparr_info = np.asarray(info_list)
    with gzip.open(info_fname, "wb") as fp:
        #pickle.dump(nparr_info, fp, 2)
        np.save(fp, nparr_info, allow_pickle=False)

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)
    print 'INFO: Finished!'
    return 0
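# ---------------------------------------------------------------------------
# Illustrative sketch only: round trip for the gzipped .npy files written
# above.  numpy.save() is used instead of pickle because pickled arrays of
# this size can overflow; np.load() on a gzip file object reads them back.
# The path "example_dnn_data.npy.gz" is hypothetical.
# ---------------------------------------------------------------------------
import gzip
import numpy as np

def save_npy_gz(path, arr):
    with gzip.open(path, "wb") as fp:
        np.save(fp, arr, allow_pickle=False)

def load_npy_gz(path):
    with gzip.open(path, "rb") as fp:
        return np.load(fp)

# e.g.
#   save_npy_gz("example_dnn_data.npy.gz", np.zeros((4, 3), dtype=np.int32))
#   arr = load_npy_gz("example_dnn_data.npy.gz")   # shape (4, 3)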
def feat_extr_ngram(row_id_str, hdfs_dir_list, hdfs_feat_dir, model_data_folder, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, fromweb, label_arr, metadata_count, label_idx, data_idx, pattern_str, ln_delimitor, data_field_list, jkey_dict, jobname, num_gram, feature_count_threshold, token_dict=None, HDFS_RETR_DIR=None, remove_duplicated="N", cust_featuring=None, cust_featuring_params=None, local_out_dir=None, filter_ratio=None, binary_flag=True): # zip func in other files for Spark workers ================= ================ zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, user_custom=cust_featuring) # get_spark_context sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) # log time ================================================================ ================ t0 = time() # input filename input_filename = "*" ext_type = '.gz' gz_list = None convert2dirty = "N" if not ',' in hdfs_dir_list: # single dir having *.gz ==== ========= # read raw data from HDFS as .gz format ========== rdd_files = os.path.join(hdfs_dir_list, input_filename + ext_type) # check if gz files in hdfs ============ try: gz_list = hdfs.ls(hdfs_dir_list) print "INFO: check hdfs folder=", hdfs_dir_list except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Error at checking HDFS file:", sys.exc_info()[0] # use whole folder if gz_list is None or len(gz_list) == 0: print "ERROR: No file found by ", input_filename + ext_type #,", use",hdfs_dir_list,"instead" return -2 elif len(gz_list) == 1: # use dir as filename rdd_files = hdfs_dir_list[0:-1] else: # multiple dirs ==== ========= rdd_files = "" cnt = 0 temp_lbl_list = [] comma = "" print "INFO: before label_arr=", label_arr # check each folder for dr in hdfs_dir_list.split(','): #print "****=",dr if not len(dr) > 0: continue try: # remove space etc. 
dr = dr.strip() fdr = os.path.join(HDFS_RETR_DIR, dr) #print "fdr=",fdr # ls didn't like "*" if '*' in fdr: #gz_list=hdfs.ls(fdr.replace("*","")) dn = os.path.dirname(fdr).strip() bn = os.path.basename(fdr).strip() #print "dn=",dn,",bn=",bn # get all names under folder and do filtering gz_list = fnmatch.filter(hdfs.ls(dn), '*' + bn) #print "gz_list=",gz_list else: gz_list = hdfs.ls(fdr) cnt = cnt + len(gz_list) if len(gz_list) > 0: rdd_files = rdd_files + comma + fdr comma = "," except IOError as e: print "WARNING: I/O error({0}): {1}".format( e.errno, e.strerror) except: print "WARNING: Error at checking HDFS file:", sys.exc_info( )[0] # use whole folder if cnt is None or cnt == 0: print "ERROR: No file found at", rdd_files return -2 else: print "INFO: total file count=", cnt # set convert flag only when multiple dir and label_arr has dirty label #if label_arr is None: # create label arr if None # label_arr=temp_lbl_list if not label_arr is None and len( label_arr) == 2 and label_arr[1] == "dirty": convert2dirty = "Y" print "INFO: rdd_files=", rdd_files txt_rdd = sc.textFile(rdd_files) #, use_unicode=False total_input_count = txt_rdd.count() print "INFO: Total input sample count=", total_input_count # debug only #for x in txt_rdd.collect(): # print "t=",x print "INFO: hdfs_dir_list=", hdfs_dir_list print "INFO: label_arr=", label_arr print "INFO: feature_count_threshold=", feature_count_threshold #jkey_dict={"meta_list":["label","md5","mdate"], "data_key":"logs"} # this dict depends on the format of input data if not data_field_list is None: jkey_dict = json.loads(jkey_dict) data_key = jkey_dict["data_key"] meta_list = jkey_dict["meta_list"] metadata_count = len(meta_list) data_idx = metadata_count print "INFO: jkey_dict=", jkey_dict print "INFO: meta_list=", meta_list print "INFO: data_key=", data_key print "INFO: data_field_list=", data_field_list print "INFO: metadata_count=", metadata_count featured_rdd = txt_rdd \ .map(lambda x: preprocess_json(x,meta_list,data_key,data_field_list)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is list) \ .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is dict) \ .filter(lambda x: type(x[metadata_count+1]) is dict) \ .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \ .cache() #print "INFO: featured_rdd=" #for x in featured_rdd.collect(): # print "INFO: **** f=",x # user custom code for featuring ============================================= ========== # input txt_rdd format (string): each text row for each sample # output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic] elif not cust_featuring is None and len(cust_featuring) > 0: user_module = None user_func = None user_func_dnn = None # load user module ======= try: modules = map(__import__, [CUSTOM_PREFIX + cust_featuring]) user_module = modules[0] user_func = getattr(user_module, CUSTOM_FUNC) except Exception as e: print "ERROR: module=", CUSTOM_PREFIX + cust_featuring print "ERROR: user module error.", e.__doc__, e.message return -101 try: jparams = json.loads(cust_featuring_params) if jparams and 'n-gram' in jparams: num_gram = jparams['n-gram'] elif jparams and 'ngram' in jparams: num_gram = jparams['ngram'] if jparams and 'binary_flag' in jparams: binary_flag = eval(jparams['binary_flag']) except Exception as e: print "ERROR: user params error.", e.__doc__, e.message return 
-200 # convert feast into array. output format: [ meta1,meta2,..., [feat1,feat2,...]] tmp_rdd = txt_rdd.map(lambda x: user_func(x, cust_featuring_params)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is list).cache() print " tmp_rdd cnt=", tmp_rdd.count( ), ",ix=", data_idx, ",max f=", MAX_FEATURES, "ngram=", num_gram print "take(1) rdd=", tmp_rdd.take(1) # TBD for multivariant output format: [ meta1,meta2,..., [[feat1,feat2,...],[feat1,feat2,...],...]] # TBD only for num_gram available # for traditional ML, feat in a dict # output format: [ meta1,meta2,..., [[feat1,feat2,...],[feat1,feat2,...],...]] featured_rdd = tmp_rdd \ .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is dict) \ .filter(lambda x: type(x[metadata_count+1]) is dict) \ .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \ .cache() all_hashes_cnt_dic = None all_hash_str_dic = None all_hashes_seq_dic = None else: print "INFO: pattern_str=", pattern_str + "<--" print "INFO: ln_delimitor=", ln_delimitor + "<--" print "INFO: label_idx=", label_idx print "INFO: data_idx=", data_idx print "INFO: metadata_count=", metadata_count print "INFO: filter_ratio=", filter_ratio # filter top and least percentage of feature if not filter_ratio is None and filter_ratio > 0 and filter_ratio < 1: # check total count here before continue upper_cnt = total_input_count * (1 - filter_ratio) lower_cnt = total_input_count * filter_ratio # set limit for lower bound. if total count is large, lower_cnt may exclude all features... # max lower count = min( MAX_FILTER_LOWER_CNT, total_input_count/100 ) if not MAX_FILTER_LOWER_CNT is None and lower_cnt > MAX_FILTER_LOWER_CNT: if MAX_FILTER_LOWER_CNT > total_input_count / 100: lower_cnt = total_input_count / 100 else: lower_cnt = MAX_FILTER_LOWER_CNT print "INFO: filtering by count, upper bound=", upper_cnt, ",lower bound=", lower_cnt # find unique feature, count them, remove them if in highest and lowest % and then create a dict f_feat_set = Set (txt_rdd.map(lambda x:x.split(ln_delimitor)).flatMap(lambda x:Set(x[metadata_count:])) \ .map(lambda x:(x,1)).reduceByKey(lambda a, b: a + b) \ .filter(lambda x:x[1]<= upper_cnt and x[1]>= lower_cnt) \ .map(lambda x:x[0]).collect() ) print "INFO: f_feat_set len=", len(f_feat_set) broadcast_f_set = sc.broadcast(f_feat_set) #txt_rdd=txt_rdd.map(lambda x: filter_by_list(x, metadata_count,ln_delimitor, broadcast_f_list.value )) txt_rdd=txt_rdd.map(lambda x: x.split(ln_delimitor)) \ .map(lambda x: x[:metadata_count]+ [w for w in x[metadata_count:] if w and w in broadcast_f_set.value]) \ .map(lambda x: ln_delimitor.join(x)) # preprocess by pattern matching and then extract n-gram features #.encode('UTF8') # input txt_rdd format (string): meta-data1\tmeta-data2\t...\tdataline1\tdataline2\t...datalineN\n # output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic] # hash_cnt_dic: {hash,hash:count,...} hash_str_dic: {hash: 'str1',... } tmp_rdd = txt_rdd \ .map(lambda x: preprocess_pattern(x, metadata_count, pattern_str, ln_delimitor \ , label_idx, label_arr, convert2dirty )) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is list) #.cache() memory issue... 
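# ---------------------------------------------------------------------------
# Illustrative sketch only: the filter_ratio step above in standalone form.
# Tokens are counted by the number of rows they occur in; tokens whose count
# falls outside [lower_cnt, upper_cnt] are dropped from every row.  The helper
# name `filter_tokens_by_count` is hypothetical; it expects rows already
# joined by `ln_delimitor`, as in the pipeline.
# ---------------------------------------------------------------------------
def filter_tokens_by_count(sc, txt_rdd, metadata_count, ln_delimitor,
                           lower_cnt, upper_cnt):
    keep_set = set(
        txt_rdd.map(lambda x: x.split(ln_delimitor))
               .flatMap(lambda x: set(x[metadata_count:]))
               .map(lambda t: (t, 1))
               .reduceByKey(lambda a, b: a + b)
               .filter(lambda tc: lower_cnt <= tc[1] <= upper_cnt)
               .map(lambda tc: tc[0])
               .collect())
    bc = sc.broadcast(keep_set)
    return txt_rdd.map(lambda x: x.split(ln_delimitor)) \
        .map(lambda x: x[:metadata_count] +
                       [w for w in x[metadata_count:] if w and w in bc.value]) \
        .map(lambda x: ln_delimitor.join(x))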
#tmp_rdd_count=tmp_rdd.count() #print "INFO: After preprocessing count=",tmp_rdd_count featured_rdd = tmp_rdd \ .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is dict) \ .filter(lambda x: type(x[metadata_count+1]) is dict) \ .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \ .cache() #feat_rdd_count=featured_rdd.count() #print "INFO: After featuring count=",feat_rdd_count all_hashes_cnt_dic = None all_hash_str_dic = None all_hashes_seq_dic = None #get all hashes and total occurring count =============== # all_hashes_cnt_dic: {'hash,hash': total count,... } if all_hashes_cnt_dic is None: #all_hashes_cnt_dic = featured_rdd.map(lambda x: x[metadata_count]).reduce(lambda a, b: combine_dic_cnt(a, b)) all_hashes_cnt_dic = dict( featured_rdd.flatMap(lambda x: x[metadata_count].items()). reduceByKey(lambda a, b: a + b).collect()) #get all hashes and their extracted string =============== # all_hash_str_dic: {hash:'str1', ... if all_hash_str_dic is None: #all_hash_str_dic = featured_rdd.map(lambda x: x[metadata_count+1]).reduce(lambda a, b: combine_dic(a, b)) all_hash_str_dic = dict( featured_rdd.flatMap( lambda x: x[metadata_count + 1].items()).distinct().collect()) # get all labels into an array =============== provided by parameter? if label_arr is None: # will force "clean" be 0 here label_arr = sorted( featured_rdd.map( lambda x: x[label_idx].lower()).distinct().collect()) # debug only print "INFO: label_arr.=", json.dumps(sorted(label_arr)) # save labels to hdfs as text file==================================== ============ hdfs_folder = hdfs_feat_dir #+ "/" # "/" is needed to create the folder correctly print "INFO: hdfs_folder=", hdfs_folder try: hdfs.mkdir(hdfs_folder) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Unexpected error at mkdir:", sys.exc_info()[0] # clean up metadata_file metadata_file = os.path.join(hdfs_folder, metadata) #"metadata" print "INFO: metadata_file=", metadata_file try: hdfs.rmr(metadata_file) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Unexpected error at rmr():", sys.exc_info()[0] sc.parallelize(label_arr, 1).saveAsTextFile(metadata_file) #remap all hash values to continuous key/feature number ============== # all_hashes_seq_dic: { hash : sequential_numb } if all_hashes_seq_dic is None: all_hashes_seq_dic = {} remap2seq( all_hashes_cnt_dic, all_hashes_seq_dic) #all_hashes_seq_dic has continuous key number #print "all_hashes_seq_dic=",all_hashes_seq_dic total_feature_numb = len(all_hashes_seq_dic) print "INFO: Total feature count=", len(all_hashes_seq_dic) # featured_rdd (list): [meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic] # seq_featured_rdd(list): [meta-data1,meta-data2,..., hash_cnthsh_dict, hash_str_dic] (feat id in sorted sequence) # hash_cnt_dic: {hash: count} hash_str_dic: {hash: 'str1,str2...' 
} # set binary_flag to True, all feature:value will be 1 broadcast_dic = sc.broadcast(all_hashes_seq_dic) seq_featured_rdd = featured_rdd.map(lambda x: convert2seq( x, label_idx, data_idx, broadcast_dic.value, binary_flag=binary_flag) ).cache() # get hash_cnthsh_dict then flatMap and reduce to (feat id, count) ct_rdd = seq_featured_rdd.flatMap(lambda x: [(i[0], i[1]) for i in x[ data_idx].iteritems()]).reduceByKey(lambda a, b: a + b) # sorted by feature id as int feat_sample_count_arr = ct_rdd.sortBy(lambda x: int(x[0])).map( lambda x: x[1]).collect() # sort after collect may fail when rdd is huge #feat_sample_count_arr=[] #for i in sorted(ct_rdd.collect(), key=lambda t: int(t[0])): # feat_sample_count_arr.append(i[1]) print "INFO: feat_sample_count_arr len=", len(feat_sample_count_arr) # save feat_sample_count_arr data ==================================== ============ filter = '{"rid":' + row_id_str + ',"key":"feat_sample_count_arr"}' upsert_flag = True jo_insert = {} jo_insert["rid"] = eval(row_id_str) jo_insert["key"] = "feat_sample_count_arr" jo_insert["value"] = feat_sample_count_arr jstr_insert = json.dumps(jo_insert) ret = query_mongo.upsert_doc_t(mongo_tuples, filter, jstr_insert, upsert_flag) print "INFO: Upsert count for feat_sample_count_arr=", ret # insert failed, save to local if ret == 0: # drop old record in mongo ret = query_mongo.delete_many(mongo_tuples, None, filter) if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) fsca_hs = os.path.join(local_out_dir, row_id_str, row_id_str + "_feat_sample_count_arr.pkl") print "WARNING: save feat_sample_count_arr to local" ml_util.ml_pickle_save(feat_sample_count_arr, fsca_hs) # save feature data; TBD. not used. ==================================== ============ #libsvm_rdd=seq_featured_rdd.map(lambda x: convert2libsvm(x,label_idx,data_idx,label_arr)) # put hash to the front of each row, assume hash is after label libsvm_rdd = seq_featured_rdd.map( lambda x: x[label_idx + 1] + " " + convert2libsvm( x, label_idx, data_idx, label_arr)) # debug only #print "libsvm_rdd=" #for i in libsvm_rdd.collect(): # print i # get rdd statistics info stats = featured_rdd.map(lambda p: len(p[metadata_count])).stats() feat_count_max = stats.max() feat_count_stdev = stats.stdev() feat_count_mean = stats.mean() sample_count = stats.count() print "INFO: libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev print "INFO: ,max feature count=", feat_count_max # find sample count lbl_arr = featured_rdd.map(lambda x: (x[label_idx], 1)).reduceByKey( add).collect() print "INFO: Sample count by label=", lbl_arr # remove duplicated libsvm string; only keep the first duplicated item, assume space following key_idx if remove_duplicated == "Y": libsvm_rdd=libsvm_rdd \ .map(lambda x: ( ','.join(x.split(' ')[metadata_count:]), x)) \ .groupByKey().map(lambda x: list(x[1])[0] ) \ .cache() cnt_list = libsvm_rdd.map(lambda x: (x.split(' ')[1], 1)).reduceByKey( add).collect() stats = libsvm_rdd.map( lambda x: len(x.split(' ')[metadata_count:])).stats() feat_count_max = stats.max() feat_count_stdev = stats.stdev() feat_count_mean = stats.mean() sample_count = stats.count() print "INFO: Non-Duplicated libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev print "INFO: ,max feature count=", feat_count_max print "INFO: Non-Duplicated Label count list=", cnt_list # clean up libsvm data ==================================== ============ libsvm_data_file = 
os.path.join(hdfs_folder, libsvm_alldata_filename) #"libsvm_data" print "INFO: libsvm_data_file=", libsvm_data_file try: #hdfs.ls(save_dir) #print "find hdfs folder" hdfs.rmr(libsvm_data_file) #if num_gram == 1: # hdfs.rmr(dnn_data_file) #print "all files removed" except IOError as e: print "WARNING: I/O error({0}): {1} at libsvm_data_file clean up".format( e.errno, e.strerror) except: print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info( )[0] #codec = "org.apache.hadoop.io.compress.GzipCodec" #libsvm_rdd.saveAsTextFile(libsvm_data_file, codec) libsvm_rdd.saveAsTextFile(libsvm_data_file) # TBD encrypted feat_count_file = libsvm_data_file + "_feat_count" print "INFO: feat_count_file=", feat_count_file try: hdfs.rmr(feat_count_file) except IOError as e: print "WARNING: I/O error({0}): {1} at feat_count clean up".format( e.errno, e.strerror) except: print "WARNING: Unexpected error at libsvm feature count clean up:", sys.exc_info( )[0] sc.parallelize([total_feature_numb], 1).saveAsTextFile(feat_count_file) label_dic = {} # assign label a number for idx, label in enumerate(sorted(label_arr)): if not label in label_dic: label_dic[ label] = idx #starting from 0, value = idx, e.g., clean:0, dirty:1 # output text for DNN:[meta-data1,meta-data2,..., [feature tokens]] ================= DNN =========== if num_gram == 1: # special flag to tokenize and keep input orders print "INFO: processing data for DNN..." # create token dict # str_hash_dict: string to hash # all_hashes_seq_dic: hash to seq id if token_dict is None or len(token_dict) == 0: token_dict = {} str_hash_dict = {v: k for k, v in all_hash_str_dic.iteritems()} for k, v in str_hash_dict.iteritems(): token_dict[k] = int(all_hashes_seq_dic[str(v)]) #print "token_dict=",len(token_dict),token_dict dnn_rdd = tmp_rdd \ .map(lambda x: tokenize_by_dict(x, data_idx, token_dict,label_idx, label_dic)) \ .filter(lambda x: len(x) > metadata_count) \ .filter(lambda x: type(x[metadata_count]) is list) #.cache() # filter duplication here #print dnn_rdd.take(3) dnn_data_file = os.path.join(hdfs_folder, dnn_alldata_filename) #"dnn_data" print "INFO: dnn_data_file=", dnn_data_file try: hdfs.rmr(dnn_data_file) except IOError as e: print "WARNING: I/O error({0}): {1} at dnn_data_file clean up".format( e.errno, e.strerror) except: print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info( )[0] # clean up data dnn_npy_gz_file = os.path.join(hdfs_folder, row_id_str + "_dnn_") print "INFO: dnn_npy_gz_file=", dnn_npy_gz_file try: hdfs.rmr(dnn_npy_gz_file + "data.npy.gz") hdfs.rmr(dnn_npy_gz_file + "label.npy.gz") hdfs.rmr(dnn_npy_gz_file + "info.npy.gz") except IOError as e: print "WARNING: I/O error({0}): {1} at dnn_npy clean up".format( e.errno, e.strerror) except: print "WARNING: Unexpected error at dnn_npy file clean up:", sys.exc_info( )[0] # save new data try: dnn_rdd.saveAsTextFile(dnn_data_file) except: print "WARNING: Unexpected error at saving dnn data:", sys.exc_info( )[0] # show data statistics try: stats = dnn_rdd.map(lambda p: len(p[metadata_count])).stats() feat_count_max = stats.max() feat_count_stdev = stats.stdev() feat_count_mean = stats.mean() sample_count = stats.count() print "INFO: DNN data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev print "INFO: ,max feature count=", feat_count_max except: print "WARNING: Unexpected error at getting stats of dnn_rdd:", sys.exc_info( )[0] # clean up pca data in hdfs ============ ======================== pca_files = '*' + 
                libsvm_alldata_filename + "_pca_*"
    #print "INFO: pca_files=", pca_files
    try:
        f_list = hdfs.ls(hdfs_folder)
        if len(f_list) > 0:
            df_list = fnmatch.filter(f_list, pca_files)
            for f in df_list:
                print "INFO: rm ", f
                hdfs.rmr(f)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm pca file clean up:", sys.exc_info()[0]

    # clean up pca data in web local ============ ========================
    pca_fname = os.path.join(model_data_folder, row_id_str + '_pca_*.pkl*')
    print "INFO: pca_fname=", pca_fname
    try:
        for fl in glob.glob(pca_fname):
            print "INFO: remove ", fl
            os.remove(fl)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))
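# ---------------------------------------------------------------------------
# Illustrative sketch only: the two mappings at the core of feat_extr_ngram,
# written as standalone helpers.  These are not the pipeline's remap2seq /
# convert2libsvm implementations, just a minimal version of the same idea:
# (1) assign every hash a 1-based sequential feature id, and (2) turn one
# sample's {hash: count} dict into a libsvm line "<label> id:value ..." with
# ids sorted and, when binary_flag is set, all values forced to 1.
# ---------------------------------------------------------------------------
def remap_hashes_to_seq(all_hashes_cnt_dic):
    """{hash: count} -> {hash: sequential feature id starting at 1}."""
    return {h: i + 1 for i, h in enumerate(sorted(all_hashes_cnt_dic))}

def to_libsvm_line(label_number, hash_cnt_dic, hashes_seq_dic, binary_flag=True):
    pairs = sorted((hashes_seq_dic[h], 1 if binary_flag else c)
                   for h, c in hash_cnt_dic.items())
    return str(label_number) + " " + " ".join("%d:%s" % p for p in pairs)

# e.g.
#   seq = remap_hashes_to_seq({"8215": 3, "136": 7})    # {"136": 1, "8215": 2}
#   to_libsvm_line(1, {"8215": 2, "136": 5}, seq)        # "1 1:1 2:1"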