def feat_importance_ip(row_id_str, ds_id, hdfs_feat_dir, local_score_file, score_file_IT,
                       sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max,
                       zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, jobname, uploadtype):

    # zip functions in other files for Spark workers ================================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname, [zip_file_path])
    '''
    SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
    #SparkContext.setSystemProperty('spark.kryoserializer.buffer.mb', config.get('spark', 'spark_kryoserializer_buffer_mb'))
    SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', args.core_max)
    sc = SparkContext(args.sp_master, 'feature_importance_2ways:'+str(args.row_id))
    '''
    t0 = time()

    # get folder list (labels) from hdfs data_out/<id>/metadata ==============
    dirFile_loc = os.path.join(hdfs_feat_dir, "metadata")
    dirFolders = sc.textFile(dirFile_loc)
    hash_Folders = dirFolders.collect()
    print "INFO: dirFile_loc=", dirFile_loc, ", hash_Folders=", hash_Folders
    folder_list = [x.encode('UTF8') for x in hash_Folders]
    print "INFO: folder_list=", folder_list

    # get feature seq : ngram hash mapping ==================================
    key = "dic_seq_hashes"  # {"123":"136,345"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'

    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_list = doc['value']
    dic_all_columns = dic_list
    feature_count = len(dic_list)
    #print "INFO: feature_count=",feature_count
    #print "dic_list=",dic_list                      # {u'123,345':u'136'}
    #print "INFO: dic_all_columns=",dic_all_columns  # {1: u'8215,8216'}
    # end

    # get hash : raw string mapping ==================================
    key = "dic_hash_str"  # {"123":"openFile"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'

    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_hash_str = doc['value']

    '''
    # get folder list (labels) from hdfs data_out/<id>/libsvm ==============
    libsvm_loc = os.path.join(hdfs_feat_dir, "libsvm_data")

    # based on label, divide RDD into arrays
    f_rdd = sc.textFile(libsvm_loc).map(lambda x: libsvm2tuple_arr(x))
    arr_libsvm = sorted(f_rdd.collect(), key=lambda x: x[0])  # sorted by label
    '''

    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file
    print "INFO: feature_count=", feature_count

    # get sample array from hdfs
    arr_libsvm = zip_feature_util.get_sample_tuple_arr(sc, libsvm_data_file)
    # sorted by label
    arr_libsvm = sorted(arr_libsvm, key=lambda x: x[0])

    # convert libsvm to features_list, row_list, col_list, sample count, col_num
    lbl_flag = -1
    row_num_training = 0
    sparse_mtx_list = []  # for feature importance calculation
    features_list = []    # for csc_matrix
    row_list = []         # for csc_matrix
    col_list = []         # for csc_matrix
    sample_numbers = []   # for csc_matrix
    feature_arr = None

    for idx, i in enumerate(arr_libsvm):
        #print "idx=",idx,",l=",i[0],",d=",i[1:]
        # when the label changes, flush the accumulated arrays for the previous label
        if lbl_flag != i[0]:
            if feature_arr and len(feature_arr) > 0:
                features_list.append(np.array(feature_arr))
                row_list.append(np.array(row_arr))
                col_list.append(np.array(col_arr))
                sample_numbers.append(cnt)
            row_arr = []
            col_arr = []
            feature_arr = []
            cnt = 0
            lbl_flag += 1
        for j in i[1:]:
            row_arr.append(cnt)
            col_arr.append(j[0] - 1)
            feature_arr.append(j[1])
        cnt += 1

    # flush the last label's arrays
    if len(feature_arr) > 0:
        features_list.append(np.array(feature_arr))
        row_list.append(np.array(row_arr))
        col_list.append(np.array(col_arr))
        sample_numbers.append(cnt)

    #print ",features_list=",features_list
    #print ",row_list=",row_list
    #print ",col_list=",col_list
    print "INFO: sample_numbers=", sample_numbers

    col_num = len(dic_list)
    print "INFO: column number: ", col_num  # ,",len(max_feat_list)=",len(max_feat_list)

    # build one csc_matrix per label
    for i in range(0, len(features_list)):
        #print "i=",i
        #print "features_list=",features_list[i]
        #print "row_list=",row_list[i]
        #print "col_list=",col_list[i]
        #print "sample_numbers=",sample_numbers[i]
        sparse_mtx = csc_matrix((features_list[i], (row_list[i], col_list[i])),
                                shape=(sample_numbers[i], col_num))
        sparse_mtx_list.append(sparse_mtx)

    #print sparse_mtx_list[0]
    print "INFO: sparse_mtx_list[0].shape=", sparse_mtx_list[0].shape
    #print sparse_mtx_list[1]
    print "INFO: sparse_mtx_list[1].shape=", sparse_mtx_list[1].shape

    exclusive_feature_set_mal = []
    exclusive_feature_set_clean = []
    dic_feature_cnt_mal = {}
    dic_feature_cnt_clean = {}
    dic_score = {}
    dic_cnt_mal = {}
    dic_cnt_clean = {}
    dic_IT_gain = {}

    ####################################################
    #### feature importance algorithms: 2 methods ######
    #### only for 2 classes ???                   ######
    ####################################################
    if len(sample_numbers) == 2:
        #####################################################
        ################ calculate probability ##############
        #####################################################
        print "INFO: ======= Feature Importance (probability) ================"
        for j in range(0, col_num):
            curr_col_dirty = sparse_mtx_list[0].getcol(j)
            sum_col = curr_col_dirty.sum(0)
            cnt_mal = sum_col.tolist()[0][0]

            curr_col_clean = sparse_mtx_list[1].getcol(j)
            sum_col = curr_col_clean.sum(0)
            cnt_clean = sum_col.tolist()[0][0]

            # high score: common in malicious samples, rare in clean ones
            percnt_mal = cnt_mal / float(sample_numbers[0])
            percnt_clean = cnt_clean / float(sample_numbers[1])
            score_j = (percnt_mal + 1 - percnt_clean) / 2

            dic_score[j + 1] = score_j
            dic_cnt_clean[j + 1] = cnt_clean
            dic_cnt_mal[j + 1] = cnt_mal

        sorted_score = sorted(dic_score.items(), key=operator.itemgetter(1), reverse=True)
        #print "sorted_score:", sorted_score
        #print "dic_cnt_clean", dic_cnt_clean
        #print "dic_cnt_mal", dic_cnt_mal

        ############ output result ########################
        if os.path.exists(local_score_file):
            try:
                os.remove(local_score_file)
            except OSError as e:
                print "Error: %s - %s." % (e.filename, e.strerror)

        for ii in range(0, len(sorted_score)):
            (feat, score) = sorted_score[ii]
            #print feat, score, dic_all_columns[feat]
            if dic_hash_str:
                description_str = feats2strs(dic_all_columns[str(feat)], dic_hash_str)
            else:
                description_str = "N/A"
                print "Warning: No mapping found for feature number"
            str01 = str(feat) + "\t" + str(score) + "\t" + description_str + "\n"
            with open(local_score_file, "a") as f:
                f.write(str01)

        ########################################################
        ############## Information Gain (entropy) ##############
        ########################################################
        print "INFO: ======= Information Gain ================"
        for j in range(0, col_num):
            cnt_mal = dic_cnt_mal[j + 1]
            cnt_clean = dic_cnt_clean[j + 1]
            total_samples = sample_numbers[0] + sample_numbers[1]

            # parent entropy from the class priors
            p0 = float(sample_numbers[0]) / total_samples
            p1 = 1 - p0
            if p0 == 0 or p1 == 0:
                parent_entropy = 0
            else:
                parent_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(p1)

            if cnt_clean + cnt_mal == 0:
                information_gain = 0
            elif total_samples - cnt_clean - cnt_mal == 0:
                information_gain = 0
            else:
                # entropy of the branch where the feature is present
                p0 = float(cnt_mal) / (cnt_clean + cnt_mal)
                p1 = 1 - p0
                if p0 == 0 or p1 == 0:
                    child_left_entropy = 0
                else:
                    child_left_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(p1)

                # entropy of the branch where the feature is absent
                p0 = float(sample_numbers[0] - cnt_mal) / (total_samples - cnt_clean - cnt_mal)
                p1 = 1 - p0
                if p0 == 0 or p1 == 0:
                    child_right_entropy = 0
                else:
                    child_right_entropy = 0 - p0 * np.log2(p0) - p1 * np.log2(p1)

                weighted_child_entropy = child_left_entropy * float(cnt_clean + cnt_mal) / total_samples \
                    + child_right_entropy * float(total_samples - cnt_clean - cnt_mal) / total_samples
                information_gain = parent_entropy - weighted_child_entropy

            dic_IT_gain[j + 1] = information_gain

        sorted_IT_gain = sorted(dic_IT_gain.items(), key=operator.itemgetter(1), reverse=True)

        if os.path.exists(score_file_IT):
            try:
                os.remove(score_file_IT)
            except OSError as e:
                print "Error: %s - %s." % (e.filename, e.strerror)
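
# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the two per-feature scores computed
# above, usable for testing outside Spark/MongoDB. The helper names
# (prob_score, info_gain) are illustrative only, not part of the pipeline.
import numpy as np

def prob_score(cnt_mal, cnt_clean, n_mal, n_clean):
    # probability score: high when a feature is common in malicious samples
    # and rare in clean ones
    return (cnt_mal / float(n_mal) + 1 - cnt_clean / float(n_clean)) / 2

def info_gain(cnt_mal, cnt_clean, n_mal, n_clean):
    # information gain of splitting on "feature present vs. absent"
    def entropy(p):
        return 0 if p in (0, 1) else -p * np.log2(p) - (1 - p) * np.log2(1 - p)
    total = n_mal + n_clean
    present = cnt_mal + cnt_clean
    absent = total - present
    if present == 0 or absent == 0:
        return 0
    parent = entropy(float(n_mal) / total)
    left = entropy(float(cnt_mal) / present)          # feature present
    right = entropy(float(n_mal - cnt_mal) / absent)  # feature absent
    return parent - (float(present) / total) * left - (float(absent) / total) * right

# e.g. a feature seen in 80/100 malicious and 5/100 clean samples:
#   prob_score(80, 5, 100, 100) -> 0.875
#   info_gain(80, 5, 100, 100)  -> ~0.48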
def feat_importance_comb(row_id_str, ds_id, num_to_show, w_FIRM, w_IT, w_Prob, mongo_tuples,
                         FIRM_score_file, IT_score_file, Prob_score_file, score_file_combine):
    human_verified = dict()
    all_verified = dict()
    print "INFO: ======= Combine all feature importance info ================"

    # get feature importance voting data from db
    all_verified, human_verified = exec_sqlite.get_dict(row_id_str)
    #print "INFO: human_verified dict=",human_verified
    #print "INFO: all_verified dict=",all_verified

    ############ begin ##################
    with open(FIRM_score_file, 'r') as f:
        FIRM_score = f.readlines()
    with open(IT_score_file, 'r') as f:
        IT_score = f.readlines()
    with open(Prob_score_file, 'r') as f:
        Prob_score = f.readlines()

    # create file for one table here ======================
    dir_name = os.path.dirname(FIRM_score_file)
    coef_filename = os.path.join(dir_name, row_id_str + '_score_coef_comb.json')
    #print "INFO: combined fname=",coef_filename

    # get data from mongo
    key = "coef_arr"
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'

    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    coef_arr = doc['value']
    #print "INFO: len(coef_arr)=",len(coef_arr)

    # get sample count ===========
    key = "feat_sample_count_arr"
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'

    # get parent dataset's data; need to check if new feature
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    feat_sample_count_arr = None
    if doc is not None:
        feat_sample_count_arr = doc['value']

    combine_with_coef(row_id_str, coef_arr, FIRM_score, IT_score, Prob_score, coef_filename,
                      feat_sample_count_arr)

    featurelist_FIRM = []
    featurelist_IT = []
    featurelist_Prob = []
    dic_all_columns = {}
    print "INFO: num_to_show=", num_to_show

    # collect the top-num_to_show feature ids from each ranked score file
    for i in range(0, num_to_show):
        #### FIRM ####
        str_in = FIRM_score[i]
        feature_id, score, descpt = str_in.split('\t', 2)
        featurelist_FIRM.append(feature_id)
        if feature_id not in dic_all_columns:
            dic_all_columns[feature_id] = descpt
        #### IT ####
        str_in = IT_score[i]
        feature_id, score, descpt = str_in.split('\t', 2)
        featurelist_IT.append(feature_id)
        if feature_id not in dic_all_columns:
            dic_all_columns[feature_id] = descpt
        #### Prob ####
        str_in = Prob_score[i]
        feature_id, score, descpt = str_in.split('\t', 2)
        featurelist_Prob.append(feature_id)
        if feature_id not in dic_all_columns:
            dic_all_columns[feature_id] = descpt

    # map each feature id to its rank (1-based) in each list
    list_i = [i + 1 for i in range(0, num_to_show)]
    zipped_FIRM = zip(featurelist_FIRM, list_i)
    zipped_IT = zip(featurelist_IT, list_i)
    zipped_Prob = zip(featurelist_Prob, list_i)
    FIRM_dict = dict(zipped_FIRM)
    IT_dict = dict(zipped_IT)
    Prob_dict = dict(zipped_Prob)

    list_combine = featurelist_FIRM + featurelist_IT + featurelist_Prob
    list_unique = OrderedDict.fromkeys(list_combine).keys()
    #print list_unique, len(list_unique)

    #human_verified = {'188':7, '218':6}
    ##### human_verified comes from the database: all features with click_number > 5 #####
    score_combine = {}
    for i in range(0, len(list_unique)):
        feat_id = list_unique[i]
        if feat_id in human_verified:
            print "INFO: found feat_id=", feat_id
            continue

        # weighted average of the three ranks; a feature absent from a list
        # is penalized with the worst possible rank, num_to_show + 1
        score = 0
        if feat_id in FIRM_dict:
            score = score + w_FIRM * FIRM_dict[feat_id]
        else:
            score = score + w_FIRM * (num_to_show + 1)
        if feat_id in IT_dict:
            score = score + w_IT * IT_dict[feat_id]
        else:
            score = score + w_IT * (num_to_show + 1)
        if feat_id in Prob_dict:
            score = score + w_Prob * Prob_dict[feat_id]
        else:
            score = score + w_Prob * (num_to_show + 1)
        score = score / float(3)
        #print feat_id, score

        ############# add human feedback ##########
        if feat_id in all_verified:
            click_number = all_verified[feat_id]  # click number from database
        else:
            click_number = 0
        #print feat_id, click_number, score
        #click_number = 3

        # each user click improves (lowers) the combined score;
        # 5 clicks pin the feature to the top
        if click_number == 1:
            score = score - 2
        elif click_number == 2:
            score = score - 4
        elif click_number == 3:
            score = score - 10
        elif click_number == 4:
            score = score - 20
        elif click_number == 5:
            score = 0
        if score < 0:
            score = 0
        #print "***=",feat_id, click_number, score
        score_combine[feat_id] = score

    if os.path.exists(score_file_combine):
        try:
            os.remove(score_file_combine)
        except OSError as e:
            print "Error: %s - %s." % (e.filename, e.strerror)
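
# ---------------------------------------------------------------------------
# A minimal sketch of the rank-combination logic above (hypothetical helper,
# not part of the pipeline). Lower combined score = better rank; a feature
# missing from a list gets the worst rank, num_to_show + 1, and user clicks
# push the score toward zero.
def combine_ranks(feat_id, rank_dicts, weights, num_to_show, clicks=0):
    penalty = {1: 2, 2: 4, 3: 10, 4: 20}
    score = sum(w * d.get(feat_id, num_to_show + 1)
                for d, w in zip(rank_dicts, weights)) / float(len(rank_dicts))
    if clicks >= 5:  # 5+ clicks pin the feature to the top
        return 0
    return max(0, score - penalty.get(clicks, 0))

# e.g. ranked 1st by FIRM, 3rd by IT, absent from Prob, with num_to_show=10
# and unit weights:
#   combine_ranks('42', [{'42': 1}, {'42': 3}, {}], [1, 1, 1], 10)
#   -> (1 + 3 + 11) / 3 = 5.0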
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
          sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max,
          zipout_dir, zipcode_dir, zip_file_name,
          mongo_tuples, labelnameflag, fromweb, src_filename, jobname):

    # create zip files for Spark workers ================================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    #data_folder = hdfs_feat_dir + "/"
    #local_out_dir = local_out_dir + "/"
    #if os.path.exists(local_out_dir):
    #    shutil.rmtree(local_out_dir)  # to keep samplelist file
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname, [zip_file_path])

    # start here ================================================================
    t0 = time()

    ### need to check if PCA is available here ===========================
    libsvm_data_file = os.path.join(hdfs_feat_dir, src_filename)  # need to set k number in filename somehow
    print "INFO: libsvm_data_file=", libsvm_data_file

    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file).cache()
    # load sample RDD from text file
    # format (LabeledPoint, hash) from str2LabeledPoint_hash()
    feature_count = 0
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, '')

    # get labels as a list
    labels_list_all = samples_rdd.map(lambda p: int(p[0].label)).collect()
    total_sample_count = len(labels_list_all)

    parsedData = samples_rdd.map(lambda p: p[0].features).cache()
    #for i in parsedData.collect():  # p.features: pyspark.mllib.linalg.SparseVector
    #    print "pd=",type(i),",i=",i

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ########## build learning model ###############
    ###############################################

    ### get the parameters ###
    print "INFO: ============ Learning Algorithm and Parameters ============="
    para_dict = json.loads(ml_opts_jstr)
    flag_model = para_dict['learning_algorithm']  # kmeans
    iteration_num = eval(para_dict['iterations'])
    k = 2
    if 'k' in para_dict:
        k = eval(para_dict['k'])
    print "INFO: Learning Algorithm:", flag_model
    print "INFO: iterations=", iteration_num
    #print "training_sample_number=", training_sample_number

    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column numbers of the corresponding features ####
    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels_list=", labels_list

    ### build model ###
    if flag_model == "kmeans":
        print "=================== KMeans ============"
        model = KMeans.train(parsedData, k, maxIterations=iteration_num)
        t_cost = model.computeCost(parsedData)
        print "INFO: cost for training set =", str(t_cost)
        clusterCenters = model.clusterCenters
        print "INFO: clusterCenters t=", type(clusterCenters)  # list
    elif flag_model == "gaussian_mixture_model":
        # didn't work: some native lib issue
        print "=================== Gaussian_Mixture_Model ============"
        model = GaussianMixture.train(parsedData, k, maxIterations=iteration_num)
        print "INFO: model.weights =", model.weights
    else:
        print "INFO: Training model selection error: no valid ML model selected!"
        return

    ### save model ###
    save_dir = config.get('app', 'HADOOP_MASTER') + config.get('app', 'HDFS_MODEL_DIR') + '/' + row_id_str
    try:
        hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(save_dir)
        #print "all files removed"
    except IOError as e:
        print "ERROR: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "ERROR: Unexpected error:", sys.exc_info()[0]

    print "INFO: model saved at hdfs=", save_dir
    print "INFO: model type=", type(model), " model=", model
    model.save(sc, save_dir)

    ### load model if needed
    #sameModel = SVMModel.load(sc, save_dir)

    # (true label, kmeans label, features list, hash)
    all_data = samples_rdd.map(lambda t: (t[0].label, model.predict(t[0].features), t[0].features, t[1])).collect()
    true_label_arr = np.asarray([int(x) for x, _, _, _ in all_data])
    labels_kmeans = np.asarray([int(x) for _, x, _, _ in all_data])
    hash_list = np.asarray([x for _, _, _, x in all_data])
    print "INFO: all_data len=", len(all_data), "all_data t=", type(all_data)
    print "INFO: true_label_arr.shape=", true_label_arr.shape, "labels_kmeans.shape=", labels_kmeans.shape
    print "INFO: true_label_arr t=", type(true_label_arr), "labels_kmeans t=", type(labels_kmeans)

    mtx_center = np.asarray(clusterCenters)
    features_array_reduced = np.asarray([x.toArray() for _, _, x, _ in all_data])
    print "INFO: mtx_center t=", type(mtx_center), "mtx_center.shape=", mtx_center.shape
    print "INFO: features_array_reduced t=", type(features_array_reduced), "features_array_reduced.shape=", features_array_reduced.shape

    # Adjusted Mutual Information between two clusterings
    amis = adjusted_mutual_info_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_mutual_info_score=", amis
    # similarity measure between two clusterings
    ars = adjusted_rand_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_rand_score=", ars
    accuracy = 0.0

    t1 = time()
    print 'INFO: training run time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ########## plot histogram #####################
    ###############################################
    n_clusters = k
    plot_col_num = int(math.ceil(math.sqrt(n_clusters)))
    figsize = (4 * plot_col_num, 3 * int(math.ceil(n_clusters * 1.0 / plot_col_num)))
    print "INFO: n_clusters=", n_clusters, ",label_dic=", label_dic
    print "INFO: plot_col_num=", plot_col_num, ",figsize=", figsize, ",local_out_dir=", local_out_dir

    # kmeans histogram
    _, p_true = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters,
                                                    names=label_dic, plot_col_num=plot_col_num,
                                                    figsize=figsize, folder=local_out_dir, rid=row_id_str)
    # normalized kmeans histogram
    _, p_true_norm = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters,
                                                         names=label_dic, plot_col_num=plot_col_num,
                                                         figsize=figsize, normalize=True,
                                                         folder=local_out_dir, rid=row_id_str)

    #### plot "reverse" histogram with labels ####
    num_bars = max(true_label_arr) + 1
    figsize = (4 * plot_col_num, 3 * int(math.ceil(num_bars * 1.0 / plot_col_num)))
    _, p_cluster = ml_plot_kmeans_histogram_subfigures(labels_kmeans, true_label_arr, num_bars,
                                                       names=label_dic, plot_col_num=plot_col_num,
                                                       figsize=figsize, reverse=True,
                                                       folder=local_out_dir, rid=row_id_str)

    #### plot dot figures ####
    # dot plot for KMeans ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, labels_kmeans, mtx_center, n_clusters,
                                       figsize=(10, 7), filename=filename, title='KMeans',
                                       filename_3d=filename_3d)

    # dot plot for True Labels ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster_tl.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d_tl.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, true_label_arr, mtx_center, n_clusters,
                                       figsize=(10, 7), filename=filename, title='True Labels',
                                       filename_3d=filename_3d)

    dataset_info = {"training_fraction": 1, "class_count": n_clusters, "dataset_count": total_sample_count}

    # only update db for web request
    if fromweb == "1":
        #print "database update"
        str_sql = "UPDATE atdml_document set " + "accuracy = '" \
            + "', status = 'learned', processed_date ='" + str(datetime.datetime.now()) \
            + "', total_feature_numb='" + str(feature_count) \
            + "', perf_measures='{}" \
            + "', dataset_info='" + json.dumps(dataset_info) \
            + "' where id=" + row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    print 'INFO: Finished!'
    return 0
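
# ---------------------------------------------------------------------------
# A small standalone illustration of the clustering-agreement metrics used
# above. Both scikit-learn metrics are permutation-invariant, so cluster ids
# need not match label ids; values near 1 mean strong agreement and values
# near 0 mean chance-level agreement. The demo function name is illustrative.
def demo_cluster_agreement():
    from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
    true_labels = [0, 0, 0, 1, 1, 1]
    cluster_ids = [1, 1, 1, 0, 0, 0]  # same grouping, swapped ids
    print(adjusted_mutual_info_score(true_labels, cluster_ids))  # 1.0
    print(adjusted_rand_score(true_labels, cluster_ids))         # 1.0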
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist,
          sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max,
          zipout_dir, zipcode_dir, zip_file_name,
          mongo_tuples, labelnameflag, fromweb,
          training_fraction, jobname, model_data_folder):

    # zip functions in other files for Spark workers ================================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # ML model filename ====
    model_fname = os.path.join(model_data_folder, row_id_str + '.pkl')
    print "INFO: model_data_folder=", model_data_folder
    # create out folders and clean up old model files ====
    ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir, model_data_folder, model_fname)

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname, [zip_file_path])

    t0 = time()
    t00 = t0

    # check if ml_opts.has_excluded_feat == 1 ===================================
    has_excluded_feat = 0
    if ml_opts_jstr is not None:
        ml_opts = json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat = ml_opts["has_excluded_feat"]

    # get excluded feature list from mongo =============
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=", excluded_feat_cslist

    # source libsvm filename
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    # load sample RDD from text file
    # also exclude selected features in sample =====================
    # format (LabeledPoint, hash) from str2LabeledPoint_hash()
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count,
                                                                 excluded_feat_cslist)
    all_data = samples_rdd.collect()
    sample_count = len(all_data)
    # 2-D array
    features_list = [x.features.toArray() for x, _ in all_data]
    # label array
    labels_list_all = [x.label for x, _ in all_data]
    # hash array
    hash_list_all = [x for _, x in all_data]

    # convert to np arrays
    labels_list_all = array(labels_list_all)
    features_array = np.array(features_list)
    hash_list_all = np.array(hash_list_all)

    # generate sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)

    ### randomly split the samples into training and testing data ===============
    X_train_sparse, X_test_sparse, labels_train, labels_test, train_hash_list, test_hash_list = \
        cross_validation.train_test_split(features_sparse_mtx, labels_list_all, hash_list_all,
                                          test_size=(1 - training_fraction))
    # X_test_sparse is scipy.sparse.csr.csr_matrix
    testing_sample_count = len(labels_test)
    training_sample_count = len(labels_train)
    training_lbl_cnt_list = Counter(labels_train)
    testing_lbl_cnt_list = Counter(labels_test)

    print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count, ", sample_count=", sample_count
    print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list
    print "INFO: train_hash_list count=", len(train_hash_list), ", test_hash_list count=", len(test_hash_list)
    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)

    ###############################################
    ########### build learning model ##############
    ###############################################

    ### parse parameters and generate the model ###
    (clf, model_name, api, cv, param_dic) = parse_param_and_get_model(ml_opts)
    if model_name == "none":
        print "ERROR: model name not found!"
        return -1

    #param_jobj = json.loads(ml_opts_jstr)
    #print "param_jobj=",param_jobj

    ########################################################
    ########## Grid Search with cross validation ###########
    ########################################################
    json2save = {}
    json2save["rid"] = int(row_id_str)
    json2save["key"] = "cv_result"
    #json2save["param_str"] = ml_opts_jstr
    json2save["param_dic"] = param_dic
    cv_grid = []

    if api == "centralized":
        ######### run with the scikit-learn API (for comparison) ######
        print "INFO: ****************** Grid Search with the scikit-learn API ************"
        t0 = time()

        # set the parameters by cross-validation
        #tuned_parameters = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}]
        #tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], \
        #                     'C': [1, 10, 100, 1000]}, \
        #                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
        scores = ['accuracy']
        json2save["scores"] = scores
        #print json2save

        for score in scores:  # one item only: score = accuracy
            print("INFO: # Tuning hyper-parameters for %s" % score)
            #print()

            grid = grid_search.GridSearchCV(estimator=clf, param_grid=param_dic, cv=cv, scoring=score)
            grid.fit(X_train_sparse, labels_train)

            print "INFO: Best parameter set found on development set:"
            print "INFO: grid.best_params_=", grid.best_params_
            print "INFO: Grid scores on development set:"
            for key in grid.best_params_:
                print "INFO: best_params[" + key + "]=", grid.best_params_[key]
                if key.lower() == "regtype":
                    ml_opts['regularization'] = str(grid.best_params_[key])  # add best param
                else:
                    ml_opts[key.lower()] = str(grid.best_params_[key])  # add best param

            # save best params to db as a json string
            j_str = json.dumps(ml_opts)
            json2save["param_str"] = j_str

            print "INFO: grid_scores_ with params:"
            for params, mean_score, scores in grid.grid_scores_:
                print "INFO: %0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
                #outstr = '%s,%0.3f,%0.03f,%s' % (params, mean_score, scores.std() * 2, "Selected" if params == grid.best_params_ else "")
                outj = {}
                outj["param"] = params
                outj["average_accuracy"] = "%0.3f" % (mean_score)
                outj["std_deviation"] = "%0.3f" % (scores.std() * 2)
                outj["selected"] = "%s" % ("Selected" if params == grid.best_params_ else "")
                cv_grid.append(outj)

        clf_best = grid.best_estimator_
        t1 = time()
        ############# END run with sklearn ######
        print 'INFO: Grid Search with sklearn running time: %f' % (t1 - t0)
        t0 = time()
    else:
        ############# run with SPARK ######
        print "INFO: ****************** Grid Search with SPARK ************"
        all_comb_list_of_dic = get_all_combination_list_of_dic(param_dic)
        print "INFO: Total number of search combinations=", len(all_comb_list_of_dic)
        #print "all_comb_list_of_dic: ", all_comb_list_of_dic
        params_rdd = sc.parallelize(all_comb_list_of_dic)

        ### broadcast clf and training data to all workers ###
        X_broadcast = sc.broadcast(X_train_sparse)
        y_broadcast = sc.broadcast(labels_train)
        clf_broadcast = sc.broadcast(clf)

        ### Grid Search with CV in multiple workers ###
        models = params_rdd.map(lambda x: learn_with_params(clf_broadcast.value, X_broadcast.value,
                                                            y_broadcast.value, cv, x)) \
                           .sortByKey(ascending=False).cache()
        (ave_accuracy, (clf_best, p_dic_best, std2)) = models.first()

        # output results
        print "INFO: Best parameter set found for ", model_name, " is: "
        print "INFO: ",
        for key in p_dic_best:
            print key, " = ", p_dic_best[key],
            if key.lower() == "regtype":
                ml_opts['regularization'] = str(p_dic_best[key])
            else:
                ml_opts[key.lower()] = str(p_dic_best[key])  # add best param

        # save best params to db as a json string
        print ""
        j_str = json.dumps(ml_opts)
        json2save["param_str"] = j_str
        print "INFO: Average accuracy with CV = ", cv, ": ", ave_accuracy

        ######## print complete report #######
        print "INFO: Grid scores on development set:"
        all_results = models.collect()
        for i in range(0, len(all_results)):
            (ave_accu_i, (clf_i, p_dic_i, std2_i)) = all_results[i]
            print "INFO: ", ave_accu_i, " for ", p_dic_i
            print "INFO: %0.3f (+/-%0.03f) for " % (ave_accu_i, std2_i), p_dic_i
            #outstr = '%s,%0.3f,%0.03f,%s' % (p_dic_i, ave_accu_i, std2_i, "Selected" if p_dic_i == p_dic_best else "")
            outj = {}
            outj["param"] = p_dic_i
            outj["average_accuracy"] = "%0.3f" % (ave_accu_i)
            outj["std_deviation"] = "%0.3f" % (std2_i)
            outj["selected"] = "%s" % ("Selected" if p_dic_i == p_dic_best else "")
            cv_grid.append(outj)
        print " "

        t1 = time()
        ############# END run with SPARK ######
        print 'INFO: Grid search with SPARK running time: %f' % (t1 - t0)

    ##################################################################################
    #print "cv_grid=",cv_grid
    #json2save["cv_grid_title"] = 'param,average_accuracy,std_deviation,selected'
    json2save["cv_grid_data"] = cv_grid
    json2save['clf_best'] = str(clf_best).replace("\n", "").replace(" ", "")
    cv_result = json.dumps(json2save)
    #print "INFO: cv_result=",cv_result
    filter = '{"rid":' + row_id_str + ',"key":"cv_result"}'
    upsert_flag = True
    ## write to mongoDB.myml.dataset_info; ignore doc with duplicated key
    # db.dataset_info.createIndex({"rid":1,"key":1},{unique:true})
    ret = query_mongo.upsert_doc_t(mongo_tuples, filter, cv_result, upsert_flag)
    print "INFO: Upsert count for cv_result: ret=", ret

    ##################################################################################
    ########## retrain with best model for training set and output results ##########
    ##################################################################################
    print "INFO: ********** Retrain with best model for training set and output results ************"
    clf_best.fit(X_train_sparse, labels_train)

    #### save clf_best for future use ####
    #joblib.dump(clf_best, model_data_folder + row_id_str + '.pkl')
    joblib.dump(clf_best, model_fname)

    ### evaluate the model on testing data
    labels_pred = clf_best.predict(X_test_sparse)
    accuracy = clf_best.score(X_test_sparse, labels_test)
    print "INFO: Accuracy = ", accuracy

    ###### the rest of the code is the same as train_sklearn.py (replace clf with clf_best) ######
    clf = clf_best
    print "INFO: model type=", type(clf), " clf=", clf

    # get data from model ================================
    coef = None
    intercept = None
    try:
        if type(clf) in (classes.SVC, classes.NuSVC):
            # kernel SVMs don't have coef_
            col_num = clf.support_vectors_.shape[1]
        else:  # linear only
            # coef_ is only available when using a linear kernel
            col_num = len(clf.coef_[0])
            coef = clf.coef_[0]
            intercept = clf.intercept_[0]  # only get the 1st item?
        #print "**model:clf.coef_[0] =",clf.coef_[0]
    except Exception as e:
        print "WARNING: Can't get clf.coef_[0]. e=", e, ", get total features from meta-data"
        col_num = 0  # how to get feature number for sparse array?

    print "INFO: total feature # in the model: ", col_num

    jfeat_coef_dict = {}
    # create feature coefficient file ================================
    if coef is None:
        print "WARNING: model weights not found!"
    else:
        feat_filename = os.path.join(local_out_dir, row_id_str + "_feat_coef.json")
        print "INFO: feat_filename=", feat_filename
        # save coef_arr to mongo & create jfeat_coef_dict ===
        jfeat_coef_dict = ml_util.ml_save_coef_build_feat_coef(row_id_str, mongo_tuples, coef, intercept,
                                                               feat_filename, ds_id)
    #print "INFO: jfeat_coef_dict=", jfeat_coef_dict
    print "INFO: jfeat_coef_dict len=", len(jfeat_coef_dict)

    # filename for false pred
    false_pred_fname = os.path.join(local_out_dir, row_id_str + "_false_pred.json")
    print "INFO: false_pred_fname=", false_pred_fname
    # build files for false pred & score graph
    (score_arr_0, score_arr_1, max_score, min_score) = ml_build_false_pred(X_test_sparse, coef, intercept,
                                                                           labels_test, labels_pred,
                                                                           test_hash_list, model_name,
                                                                           jfeat_coef_dict, false_pred_fname)

    # save pred output
    pred_out_arr = []
    for i in range(0, len(labels_test)):
        pred_out_arr.append((labels_test[i], labels_pred[i], test_hash_list[i]))
    pred_ofname = os.path.join(local_out_dir, row_id_str + "_pred_output.pkl")
    print "INFO: pred_ofname=", pred_ofname
    ml_util.ml_pickle_save(pred_out_arr, pred_ofname)

    ###################################################
    ### generate label names (family names) ###########
    ### connect to database to get the column list which contains all column numbers of the corresponding features ####
    ###################################################
    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: ******generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])

    ### generate sample counts of each family in testing data ###
    testing_sample_number = len(labels_test)
    print "INFO: testing_sample_number=", testing_sample_number
    test_cnt_dic = {}
    for key in label_dic:
        test_cnt_dic[key] = 0
    for i in range(0, testing_sample_number):
        for key in label_dic:
            if labels_test[i] == key:
                test_cnt_dic[key] = test_cnt_dic[key] + 1
    print "INFO: Number of samples in each label is=", test_cnt_dic

    ###############################################
    ########### plot prediction result figure #####
    ###############################################
    pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png")
    true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png")
    pred_xlabel = 'Prediction (Single Run)'
    true_xlabel = 'True Labels (Single Run)'
    test_cnt_dic = ml_util.ml_plot_predict_figures(labels_pred.tolist(), labels_test.tolist(),
                                                   labels_list, label_dic, testing_sample_count,
                                                   pred_xlabel, pred_fname, true_xlabel, true_fname)
    print "INFO: figure files: ", pred_fname, true_fname
    print "INFO: Number of samples in each label is=", test_cnt_dic

    roc_auc = None
    #fscore = None
    perf_measures = None
    class_count = len(labels_list)
    dataset_info = {"training_fraction": training_fraction, "class_count": class_count,
                    "dataset_count": sample_count}

    #############################################################
    ############### for 2 classes only (plot ROC curve) #########
    #############################################################
    if len(labels_list) == 2:
        # build data file for score graph
        score_graph_fname = os.path.join(local_out_dir, row_id_str + "_score_graph.json")
        print "INFO: score_graph_fname=", score_graph_fname
        ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name, score_graph_fname,
                                  max_score, min_score)

        do_ROC = True
        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print "No ROC curve generated: 'clean' or '0' must be a label indicating the negative class!"
            do_ROC = False

        if do_ROC:
            # calculate f-score ==========
            perf_measures = ml_util.calculate_fscore(labels_test, labels_pred)
            print "INFO: perf_measures=", perf_measures

            confidence_score = clf_best.decision_function(X_test_sparse)
            # flip scores and labels if the negative ("clean") class is label 1
            if flag_clean == 0:
                scores = [x for x in confidence_score]
                s_labels = [x for x in labels_test]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                scores = [-x for x in confidence_score]
                s_labels = [1 - x for x in labels_test]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]

            # create ROC data file ============
            roc_auc = ml_create_roc_files(row_id_str, scores, s_labels, testing_N, testing_P,
                                          local_out_dir, row_id_str)
            perf_measures["roc_auc"] = roc_auc

    # only update db for web request
    if fromweb == "1":
        #print "database update"
        str_sql = "UPDATE atdml_document set " + "accuracy = '" + str(accuracy * 100) + "%" \
            + "', status = 'learned', processed_date ='" + str(datetime.datetime.now()) \
            + "', ml_opts='" + j_str \
            + "', perf_measures='" + json.dumps(perf_measures) \
            + "', dataset_info='" + json.dumps(dataset_info) \
            + "' where id=" + row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    print 'INFO: total running time: %f' % (t1 - t00)
    print 'INFO: Finished!'
    return 0
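
# ---------------------------------------------------------------------------
# A compact sketch of the ROC bookkeeping above, computing AUC directly with
# scikit-learn's roc_auc_score (the pipeline's ml_create_roc_files also writes
# output files; this shows only the math). When the "clean"/negative class is
# label 1 instead of 0, both the decision scores and the labels are flipped so
# that larger scores still mean "more positive". The helper name is
# illustrative, not part of the pipeline.
def auc_with_neg_class(confidence_score, labels_test, flag_clean):
    from sklearn.metrics import roc_auc_score
    if flag_clean == 0:
        scores, s_labels = confidence_score, labels_test
    else:
        scores = [-x for x in confidence_score]
        s_labels = [1 - x for x in labels_test]
    return roc_auc_score(s_labels, scores)

# e.g. a perfectly separating classifier:
#   auc_with_neg_class([2.0, 1.0, -1.0, -2.0], [1, 1, 0, 0], 0) -> 1.0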
def feat_importance_firm(row_id_str, ds_id, hdfs_feat_dir, local_score_file,
                         sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max,
                         zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, training_fraction,
                         jobname, uploadtype, description_file):

    # zip functions in other files for Spark workers ================================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname, [zip_file_path])

    t0 = time()

    # get feature seq mapping from mongo
    if uploadtype == "MD5 List IN-dynamic":
        ### connect to database to get the column list which contains all column numbers of the corresponding features
        key = "dict_dynamic"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        dic_all_columns = {}
        max_feature = 0
        # reverse dict {hashes: sequence number} ======
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                dic_all_columns[eval(dic_list[i][key])] = key
                if eval(dic_list[i][key]) > max_feature:
                    max_feature = eval(dic_list[i][key])
        print "INFO: max_feature=", max_feature
        #print "dic_all_columns=",dic_all_columns  # fid:numb,numb

    dirFile_loc = os.path.join(hdfs_feat_dir, "metadata")
    dirFolders = sc.textFile(dirFile_loc)
    hash_Folders = dirFolders.collect()
    #print "INFO: dirFile_loc=",dirFile_loc,", hash_Folders=",hash_Folders
    folder_list = [x.encode('UTF8') for x in hash_Folders]
    print "INFO: hdfs folder_list=", folder_list  # ['dirty/', 'clean/']

    features_training = []
    labels_training = []
    names_training = []
    row_training = []
    col_training = []
    max_feat_training = 0
    row_num_training = 0
    features_testing = []
    labels_testing = []
    names_testing = []
    row_testing = []
    col_testing = []
    max_feat_testing = 0
    row_num_testing = 0

    # loop through hdfs folders; TBD
    for folder in folder_list:
        print "INFO: folder=", folder
        label = folder_list.index(folder) + 1
        print 'INFO: label=', label

        logFile_name = os.path.join(hdfs_feat_dir, folder, mtx_name_list)
        #print "logFile_name=",logFile_name
        logFile_data = os.path.join(hdfs_feat_dir, folder, mtx_libsvm)
        #print "logFile_data=",logFile_data

        logNames = sc.textFile(logFile_name).cache()
        logData = sc.textFile(logFile_data).cache()

        names = logNames.collect()
        data = logData.collect()
        name_l = [x.encode('UTF8') for x in names]
        feature_l = [x.encode('UTF8') for x in data]
        name_list = [names.strip() for names in name_l]
        feature_list = [features.strip() for features in feature_l]

        ########## data separation ######
        id_perm = data_seperation_random(name_list)
        num_names = len(name_list)
        print 'INFO: num of samples=', num_names
        num_train = int(training_portion * num_names)
        print 'INFO: num_train = ', num_train

        ######## generate training data #########
        i = 0
        #print "INFO: generate training data"
        #print "INFO: len(id_perm)=",len(id_perm)
        while i < num_train:
            #print i, id_perm[i]
            features = feature_list[id_perm[i]]
            features = features.strip()
            feature_array = features.split(' ')
            labels_training.append(label)

            length = len(feature_array)
            j = 0
            while j < length:
                feature = feature_array[j]
                feat, value = feature.split(':', 2)
                row_training.append(i + row_num_training)
                col_training.append(int(feat) - 1)
                features_training.append(int(value))
                max_feat_training = max(max_feat_training, int(feat))
                j = j + 1
            i = i + 1
        row_num_training = row_num_training + num_train

        i = num_train
        ######## generate testing data #########
        while i < num_names:
            #### for generating testing data folder ####
            test_file_name = name_list[id_perm[i]]

            features = feature_list[id_perm[i]]
            features = features.strip()
            feature_array = features.split(' ')
            labels_testing.append(label)

            length = len(feature_array)
            j = 0
            while j < length:
                feature = feature_array[j]
                feat, value = feature.split(':', 2)
                row_testing.append(i - num_train + row_num_testing)
                col_testing.append(int(feat) - 1)
                features_testing.append(int(value))
                max_feat_testing = max(max_feat_testing, int(feat))
                j = j + 1
            i = i + 1
        row_num_testing = row_num_testing + (num_names - num_train)
    # end for loop here ========================

    # pad zero columns so training and testing matrices share the same width
    col_num = max(max_feat_training, max_feat_testing)
    if max_feat_training < col_num:
        for i in range(0, row_num_training):
            for j in range(max_feat_training, col_num):
                features_training.append(0)
                row_training.append(i)
                col_training.append(j)
    elif max_feat_testing < col_num:
        for i in range(0, row_num_testing):
            for j in range(max_feat_testing, col_num):
                features_testing.append(0)
                row_testing.append(i)
                col_testing.append(j)

    features_training = array(features_training)
    row_training = array(row_training)
    col_training = array(col_training)
    #print "row_training:", row_training
    #print "INFO: col_training:", col_training
    len_col = len(col_training)
    print "INFO: col_num:", col_num
    labels_training = array(labels_training)

    features_testing = array(features_testing)
    row_testing = array(row_testing)
    col_testing = array(col_testing)
    labels_testing = array(labels_testing)

    sparse_mtx = csc_matrix((features_training, (row_training, col_training)),
                            shape=(row_num_training, col_num))
    #print "sparse_mtx.todense(), sparse_mtx.shape=",sparse_mtx.todense(), sparse_mtx.shape
    sparse_test = csc_matrix((features_testing, (row_testing, col_testing)),
                             shape=(row_num_testing, col_num))
    #print "sparse_test.todense(), sparse_test.shape=",sparse_test.todense(), sparse_test.shape

    clf = svm.LinearSVC()
    #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
    #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None)
    #print "labels_training=",labels_training
    #print "sparse_mtx=",sparse_mtx
    clf.fit(sparse_mtx, labels_training)
    #print "INFO: model:intercept=",clf.intercept_
    #print "INFO: model:coef=",clf.coef_

    labels_pred = clf.predict(sparse_test)
    #print "labels_pred:", labels_pred
    accuracy = clf.score(sparse_test, labels_testing)
    #print "INFO: data folder=", hdfs_feat_dir
    print "INFO: accuracy=", accuracy

    #####################################################################
    ######### calculate feature importance with prediction labels ######
    #####################################################################
    AA = sparse_mtx.todense()
    BB = sparse_test.todense()
    labels_train_pred = clf.predict(sparse_mtx)
    labels_test_pred = labels_pred

    #print "###################################################################################"
    print "INFO: ======= Calculate feature importance with prediction labels =================="
    #print "###################################################################################"

    dic_importance_label = {}
    for j in range(0, col_num):  # for all features in the loop
        ##############################
        # training part, with sparse matrix
        curr_col_train = sparse_mtx.getcol(j)
        sum_col = curr_col_train.sum(0)
        positive_feature_number = int(sum_col.tolist()[0][0])

        # labels are 1/2 here, so 3 - label maps {1,2} to weights {2,1}
        labels_value = 3 - labels_train_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_train)
        sum_product = dot_product.sum(1)
        labels_positive_sum = int(sum_product.tolist()[0][0])
        sum_label_values = sum(labels_value)
        labels_negative_sum = sum_label_values - labels_positive_sum

        ##############################
        # testing part, with sparse matrix
        curr_col_test = sparse_test.getcol(j)
        sum_col = curr_col_test.sum(0)
        positive_feature_number = positive_feature_number + int(sum_col.tolist()[0][0])

        labels_value = 3 - labels_test_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_test)
        sum_product = dot_product.sum(1)
        labels_positive_sum = labels_positive_sum + int(sum_product.tolist()[0][0])
        sum_label_values = sum(labels_value)
        labels_negative_sum = labels_negative_sum + sum_label_values - int(sum_product.tolist()[0][0])

        n_total = row_num_training + row_num_testing
        negative_feature_number = n_total - positive_feature_number

        if positive_feature_number == 0:
            #print "feature ", j+1, "all 0s!"
            dic_importance_label[j + 1] = -100
        elif negative_feature_number == 0:
            #print "feature ", j+1, "all 1s!"
            dic_importance_label[j + 1] = -200
        else:
            q_positive = float(labels_positive_sum) / positive_feature_number
            q_negative = float(labels_negative_sum) / negative_feature_number
            Q = (q_positive - q_negative) * sqrt(float(q_positive) * q_negative / float(n_total) / float(n_total))
            dic_importance_label[j + 1] = Q

    sorted_importance = sorted(dic_importance_label.items(), key=operator.itemgetter(1), reverse=True)

    print "INFO: ======= Feature Importance (FIRM score) ================"
    if os.path.exists(local_score_file):
        try:
            os.remove(local_score_file)
        except OSError as e:
            print "ERROR: %s - %s." % (e.filename, e.strerror)
def feat_importance_firm(row_id_str, ds_id, hdfs_feat_dir, local_score_file,
                         sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max,
                         zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, training_fraction,
                         jobname, uploadtype):

    # zip functions in other files for Spark workers ================================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname, [zip_file_path])
    '''
    SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
    SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', args.core_max)
    sc = SparkContext(args.sp_master, 'feature_importance_FIRM:'+str(args.row_id))
    '''
    t0 = time()

    # get folder list (labels) from hdfs data_out/<id>/metadata ==============
    dirFile_loc = os.path.join(hdfs_feat_dir, "metadata")
    dirFolders = sc.textFile(dirFile_loc)
    hash_Folders = dirFolders.collect()
    print "INFO: dirFile_loc=", dirFile_loc, ", hash_Folders=", hash_Folders
    folder_list = [x.encode('UTF8') for x in hash_Folders]
    print "INFO: folder_list=", folder_list  # ['dirty/', 'clean/']

    # get feature seq : ngram hash mapping ==================================
    key = "dic_seq_hashes"  # {"123":"136,345"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'

    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_list = doc['value']
    dic_all_columns = dic_list
    feature_count = len(dic_list)
    #print "INFO: feature_count=",feature_count
    #print "dic_list=",dic_list                # {u'123,345':u'136'}
    #print "dic_all_columns=",dic_all_columns  # {1: u'8215,8216'}
    # end

    # get {hash : raw string} mapping ==================================
    key = "dic_hash_str"  # {"123":"openFile"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'

    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_hash_str = doc['value']

    '''
    # get folder list (labels) from hdfs data_out/<id>/libsvm ==============
    libsvm_loc = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_loc=", libsvm_loc
    samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_loc)
    '''

    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file

    # load feature count file
    #feat_count_file = libsvm_data_file + "_feat_count"
    #feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    # load sample RDD from text file
    # also exclude selected features in sample =====================
    # format (LabeledPoint, hash) from str2LabeledPoint_hash()
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count,
                                                                 excluded_feat_cslist=None)
    labels_and_features_rdd = samples_rdd.map(lambda p: (p[0].label, p[0].features))
    all_data = labels_and_features_rdd.collect()
    features_list = [x.toArray() for _, x in all_data]
    labels_list_all = [x for x, _ in all_data]
    labels_list_all = np.array(labels_list_all)
    features_array = np.array(features_list)

    ### generate sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)

    ### randomly split the samples into training and testing data
    sparse_mtx, sparse_test, labels_training, labels_testing = \
        cross_validation.train_test_split(features_sparse_mtx, labels_list_all,
                                          test_size=(1 - training_fraction))
    #print "INFO: sparse_mtx.shape=",sparse_mtx.shape
    #print "INFO: sparse_test.shape=",sparse_test.shape
    row_num_training = (sparse_mtx.shape)[0]
    row_num_testing = (sparse_test.shape)[0]

    # why use LinearSVC?
    clf = svm.LinearSVC()
    #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
    #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None)
    #print "labels_training=",labels_training
    #print "sparse_mtx=",sparse_mtx
    clf.fit(sparse_mtx, labels_training)
    #print "**model:intercept***"
    #print clf.intercept_
    #print "**model:coef***"
    #print clf.coef_

    col_num = len(clf.coef_[0])  # for n_classes==2
    print "INFO: col_num=", col_num

    labels_pred = clf.predict(sparse_test)
    #print "labels_pred:", labels_pred
    accuracy = clf.score(sparse_test, labels_testing)
    print "INFO: data folder:", hdfs_feat_dir
    print "INFO: accuracy: ", accuracy

    #####################################################################
    ######### calculate feature importance with prediction labels ######
    #####################################################################
    AA = sparse_mtx.todense()
    BB = sparse_test.todense()
    labels_train_pred = clf.predict(sparse_mtx)
    labels_test_pred = labels_pred

    print "INFO: ###################################################################################"
    print "INFO: ############ calculate feature importance with prediction labels #################"
    print "INFO: ###################################################################################"

    dic_importance_label = {}
    for j in range(0, col_num):  # for all features in the loop
        ##############################
        # training part, with sparse matrix
        curr_col_train = sparse_mtx.getcol(j)
        sum_col = curr_col_train.sum(0)
        positive_feature_number = int(sum_col.tolist()[0][0])

        # weight labels as 3 - label (assumes the same 1/2 label convention
        # as the folder-based variant above)
        labels_value = 3 - labels_train_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_train)
        sum_product = dot_product.sum(1)
        labels_positive_sum = int(sum_product.tolist()[0][0])
        sum_label_values = sum(labels_value)
        labels_negative_sum = sum_label_values - labels_positive_sum

        ##############################
        # testing part, with sparse matrix
        curr_col_test = sparse_test.getcol(j)
        sum_col = curr_col_test.sum(0)
        positive_feature_number = positive_feature_number + int(sum_col.tolist()[0][0])

        labels_value = 3 - labels_test_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_test)
        sum_product = dot_product.sum(1)
        labels_positive_sum = labels_positive_sum + int(sum_product.tolist()[0][0])
        sum_label_values = sum(labels_value)
        labels_negative_sum = labels_negative_sum + sum_label_values - int(sum_product.tolist()[0][0])

        n_total = row_num_training + row_num_testing
        negative_feature_number = n_total - positive_feature_number

        if positive_feature_number == 0:
            #print "feature ", j+1, "all 0s!"
            dic_importance_label[j + 1] = -100
        elif negative_feature_number == 0:
            #print "feature ", j+1, "all 1s!"
            dic_importance_label[j + 1] = -200
        else:
            q_positive = float(labels_positive_sum) / positive_feature_number
            q_negative = float(labels_negative_sum) / negative_feature_number
            Q = (q_positive - q_negative) * sqrt(float(q_positive) * q_negative / float(n_total) / float(n_total))
            dic_importance_label[j + 1] = Q

    sorted_importance = sorted(dic_importance_label.items(), key=operator.itemgetter(1), reverse=True)

    print "INFO: ======= Feature Importance (FIRM score) ================"
    if os.path.exists(local_score_file):
        try:
            os.remove(local_score_file)
        except OSError as e:
            print "ERROR: %s - %s." % (e.filename, e.strerror)
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
          sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max,
          zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, labelnameflag, fromweb,
          src_filename, jobname, model_data_folder):

    # create zip files for Spark workers ================================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    #data_folder = hdfs_feat_dir + "/"
    #local_out_dir = local_out_dir + "/"
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # ML model filename ====
    model_fname = os.path.join(model_data_folder, row_id_str + '.pkl')
    print "INFO: model_data_folder=", model_data_folder
    # create out folders and clean up old model files ====
    ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir, model_data_folder, model_fname)

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname, [zip_file_path])

    # start here ================================================================
    t0 = time()

    ### load libsvm file: may or may not be PCA-ed ###
    libsvm_data_file = os.path.join(hdfs_feat_dir, src_filename)
    print "INFO: libsvm_data_file=", libsvm_data_file

    # feature count is a variable if PCA-ed
    feature_count = 0
    # samples_rdd may be from PCA-ed data
    # load sample RDD from text file
    # format (LabeledPoint, hash) from str2LabeledPoint_hash()
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, '')

    # collect all data locally for processing ===============
    all_data = samples_rdd.collect()
    total_sample_count = len(all_data)
    # 2-D array, may be PCA-ed
    features_list = [x.features.toArray() for x, _ in all_data]
    # label array
    labels_list_all = [x.label for x, _ in all_data]
    # hash array
    hash_list_all = [x for _, x in all_data]

    # convert to np arrays
    features_array_reduced = np.array(features_list)
    hash_list_all = np.array(hash_list_all)
    labels_list_all = np.array(labels_list_all)
    true_label_array = np.array(labels_list_all, dtype=np.int8)

    print "INFO: total_sample_count=", total_sample_count
    print "INFO: features_array_reduced.shape=", features_array_reduced.shape
    print "INFO: labels_list_all.shape=", labels_list_all.shape
    print "INFO: true_label_array.shape=", true_label_array.shape

    t1 = time()
    print 'INFO: data generating time: %f' % (t1 - t0)

    ###############################################
    ########## build learning model ###############
    ###############################################

    ### parse parameters and generate the model ###
    (model, alg, n_clusters) = parse_para_and_get_model(ml_opts_jstr)
    if model is None:
        return

    labels_kmeans = None
    #### fit the model to the training dataset ####
    try:
        model.fit(features_array_reduced)
        labels_kmeans = model.labels_  # numpy.ndarray
    except:
        print "ERROR: Error in model.fit(): ", "model=", model, ", sys.exc_info:", sys.exc_info()[0]
        return

    #### save model for future use ####
    #joblib.dump(model, model_data_folder + row_id_str + '.pkl')
    joblib.dump(model, model_fname)

    #print "**model:intercept***"
    #print clf.intercept_
    print "INFO: model type=", type(model), " model=", model

    ###################################################
    ### generate label names (family names) ###########
    ### connect to database to get the column list which contains all column numbers of the corresponding features ####
    ###################################################
    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels_list=", labels_list

    # Adjusted Mutual Information between two clusterings
    amis = adjusted_mutual_info_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_mutual_info_score=", amis
    # similarity measure between two clusterings
    ars = adjusted_rand_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_rand_score=", ars
    # clustering has no supervised accuracy; keep the field for the report path
    # below (the Spark KMeans variant of train() does the same)
    accuracy = 0.0

    ###################################################
    ####### plot histogram ############################
    ###################################################
    plot_col_num = int(math.ceil(math.sqrt(n_clusters)))
    figsize = (4 * plot_col_num, 3 * int(math.ceil(n_clusters * 1.0 / plot_col_num)))
    print "INFO: labels_list_all.shape=", labels_list_all.shape, "labels_kmeans.shape=", labels_kmeans.shape
    print "INFO: labels_list_all t=", type(labels_list_all), "labels_kmeans t=", type(labels_kmeans)
    print "INFO: n_clusters=", n_clusters, ",label_dic=", label_dic
    print "INFO: plot_col_num=", plot_col_num, ",figsize=", figsize, ",local_out_dir=", local_out_dir

    # kmeans histogram
    _, p_true = ml_plot_kmeans_histogram_subfigures(labels_list_all, labels_kmeans, n_clusters,
                                                    names=label_dic, plot_col_num=plot_col_num,
                                                    figsize=figsize, folder=local_out_dir, rid=row_id_str)
    # normalized kmeans histogram
    _, p_true_norm = ml_plot_kmeans_histogram_subfigures(labels_list_all, labels_kmeans, n_clusters,
                                                         names=label_dic, plot_col_num=plot_col_num,
                                                         figsize=figsize, normalize=True,
                                                         folder=local_out_dir, rid=row_id_str)

    #### plot "reverse" histogram with labels ####
    #num_bars = len(np.unique(labels_list_all))
    num_bars = max(labels_list_all) + 1
    figsize = (4 * plot_col_num, 3 * int(math.ceil(num_bars * 1.0 / plot_col_num)))
    _, p_cluster = ml_plot_kmeans_histogram_subfigures(labels_kmeans, labels_list_all, num_bars,
                                                       names=label_dic, plot_col_num=plot_col_num,
                                                       figsize=figsize, reverse=True,
                                                       folder=local_out_dir, rid=row_id_str)

    #### plot dot figures ####
    #mtx_label = model.labels_
    mtx_center = model.cluster_centers_

    # dot plot for KMeans ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, labels_kmeans, mtx_center, n_clusters,
                                       figsize=(10, 7), filename=filename, title='KMeans',
                                       filename_3d=filename_3d)
    #print "features_array_reduced s=",features_array_reduced.shape

    # dot plot for True Labels ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster_tl.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d_tl.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, true_label_array, mtx_center, n_clusters,
                                       figsize=(10, 7), filename=filename, title='True Labels',
                                       filename_3d=filename_3d)

    dataset_info = {"training_fraction": 1, "class_count": n_clusters,
                    "dataset_count": total_sample_count}

    # only update db for web request ===========
    if fromweb == "1":
        #print "database update"
        str_sql = "UPDATE atdml_document set accuracy = '" \
            + "', status = 'learned', processed_date ='" + str(datetime.datetime.now()) \
            + "', total_feature_numb='" + str(feature_count) \
            + "', perf_measures='{}" \
            + "', dataset_info='" + json.dumps(dataset_info) \
            + "' where id=" + row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)
    #print 'Finished!'
    return 0