def del_files(tuple_arr, tgt_dir, tgt_patterns, update_db=1, debug="Y"):
    """Delete (or, in dry-run mode, list) the per-document files of each row.

    Every comma-separated entry of ``tgt_patterns`` is a path relative to
    ``tgt_dir``.  A ``*`` in the file-name part of an entry is replaced by the
    document id (e.g. ``log/*predict.log`` -> ``log/<id>predict.log``); the
    directory part is used as written.  The resulting path is expanded with
    ``glob.glob`` and each match is passed to ``delete_a_file_or_dir``.

    Args:
        tuple_arr: iterable of rows; the first element of each row is the id.
        tgt_dir: base directory the patterns are resolved against.
        tgt_patterns: comma-separated patterns; ``None`` is treated as empty.
        update_db: when equal to 1 (and not in dry-run mode) the row's
            ``status_code`` is set to -1 in ``atdml_document``.
        debug: ``"N"`` really deletes and updates the database; any other
            value is a dry run that only prints ``rm <path>`` per match.

    Returns:
        None
    """
    # An entry whose file-name part is empty ("" from a trailing comma, or
    # "dir/") would resolve to tgt_dir (or a sub-directory) itself instead of
    # an id-specific file, and would then be deleted wholesale for every id.
    # Such entries are dropped up front.  Whitespace around entries is ignored.
    patterns = []
    for raw in (tgt_patterns or "").split(','):
        dir_part, name_part = os.path.split(raw.strip())
        if name_part:
            patterns.append((dir_part, name_part))

    really_delete = (debug == "N")
    count = 0
    for row in tuple_arr:
        doc_id = str(row[0])  # 1st tuple element is the id
        print("id= %s" % (doc_id,))
        count += 1

        for dir_part, name_part in patterns:
            fullname = os.path.join(tgt_dir, dir_part,
                                    name_part.replace("*", doc_id))
            print("fullname= %s" % (fullname,))
            for found in glob.glob(fullname):
                if really_delete:
                    delete_a_file_or_dir(found)
                else:
                    print("rm %s" % (found,))

        if update_db == 1 and really_delete:
            # NOTE(review): the id is concatenated into the SQL text; this is
            # only safe while ids come straight from the database -- confirm.
            usql = ("update atdml_document set status_code=-1 where id="
                    + doc_id + ";")
            exec_sqlite.exec_sql(usql)

    print("target count= %s" % (count,))
    return None
# NOTE(review): the keyword defaults below are evaluated once, at import time,
# and the port default runs eval() on a config value -- confirm the config file
# is trusted.
def predict(row_id_str, ds_id, cid_str, input_fname, model_fname, univ_pert_fname, label_fname
    , local_out_dirname, local_out_fname, model_type
    , ip_address=config.get('mongo', 'out_ip_address'), port=eval(config.get('mongo', 'out_port'))
    , db_name=config.get('mongo', 'out_db'), tb_name=config.get('mongo', 'out_tb')
    , username=config.get('mongo', 'out_username'), password=config.get('mongo', 'out_password')
    , flag_perturbation="Y", fromweb="1"  ):
    """Run a single image prediction and record the result for web requests.

    Dispatches to ``predict_inception`` or ``predict_yolo`` according to
    ``model_type`` and, when ``fromweb == "1"``, writes status, timestamp,
    predicted label and prediction value to the ``atdml_document`` row whose
    id is ``cid_str``.

    Args:
        row_id_str, ds_id, cid_str, input_fname, model_fname, univ_pert_fname,
        label_fname, local_out_dirname, local_out_fname, flag_perturbation:
            forwarded unchanged to the model-specific predict function.
        model_type: ``IMG_MDL_INCEPTION`` or ``IMG_MDL_YOLO``.
        ip_address, port, db_name, tb_name, username, password: Mongo output
            settings; they are not used inside this function body.
        fromweb: ``"1"`` enables the SQLite update.

    Returns:
        0 on success, -1 when ``model_type`` is not supported.
    """
    t0 = time()

    print("INFO: model_fname= %s" % (model_fname,))

    if model_type == IMG_MDL_INCEPTION:
        model_predict = predict_inception
    elif model_type == IMG_MDL_YOLO:
        model_predict = predict_yolo
    else:
        # Previously this fell through with status/label/value unbound, which
        # raised NameError for web requests and silently returned 0 otherwise.
        print("ERROR: unsupported model_type= %s" % (model_type,))
        return -1

    status, str_label_original, predict_val = model_predict(
        row_id_str, ds_id, cid_str, input_fname, model_fname, univ_pert_fname, label_fname
        , local_out_dirname, local_out_fname, flag_perturbation
    )

    # only update db for web request
    if fromweb == "1":
        # NOTE(review): values are concatenated into the SQL text; a quote in
        # status or the label would break the statement -- confirm the inputs.
        str_sql = ("UPDATE atdml_document set status = '" + status + "', processed_date ='"
            + str(datetime.datetime.now()) + "', prediction = '" + str(str_label_original)
            + "', predict_val = '" + str(predict_val)
            + "' where id=" + cid_str)
        exec_sqlite.exec_sql(str_sql)

    t1 = time()
    print('INFO: total running time: %f' % (t1 - t0))
    return 0
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name
    , mongo_tuples, labelnameflag, fromweb, src_filename
    , jobname ):
    """Train a clustering model on Spark, save it to HDFS and plot the result.

    Steps: build the worker zip, create the Spark context, load the sample
    RDD from ``hdfs_feat_dir/src_filename``, train KMeans or a Gaussian
    mixture, save the model under the configured HDFS model directory, assign
    every sample to a cluster, print clustering scores, write the histogram
    and dot plots into ``local_out_dir`` and, for web requests, update the
    ``atdml_document`` row.

    Args:
        row_id_str: id of the job row; used in file names, the HDFS save path
            and the SQL ``where`` clause.
        ds_id: id of the parent dataset; when it differs from ``row_id_str``
            the label dictionary is read for ``ds_id``.
        hdfs_feat_dir, src_filename: joined to locate the sample file.
        local_out_dir: output folder for the plots; created if missing.
        ml_opts_jstr: JSON string with ``learning_algorithm``
            (``"kmeans"`` or ``"gaussian_mixture_model"``), ``iterations``
            and optionally ``k`` (default 2).
        sp_master, spark_rdd_compress, spark_driver_maxResultSize,
        sp_exe_memory, sp_core_max, jobname: passed to
            ``ml_util.ml_get_spark_context``.
        zipout_dir, zipcode_dir, zip_file_name: passed to
            ``ml_build_zip_file``.
        mongo_tuples: passed to ``query_mongo.find_one_t``.
        labelnameflag: 1 reads label names from Mongo; otherwise the numeric
            labels found in the data are used as names.
        fromweb: ``"1"`` enables the SQLite update.

    Returns:
        0 on success, -1 when ``learning_algorithm`` is not supported
        (previously a bare ``return``; -1 matches the other train functions).
    """
    # create zip files for Spark workers
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print("INFO: zip_file_path= %s" % (zip_file_path,))

    # The output folder is deliberately not wiped: the sample-list file in it
    # has to survive.
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # init Spark context
    sc = ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path])

    t0 = time()

    libsvm_data_file = os.path.join(hdfs_feat_dir, src_filename)
    print("INFO: libsvm_data_file= %s" % (libsvm_data_file,))

    # load sample RDD from text file; elements are (LabeledPoint, hash)
    feature_count = 0
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, '')

    # true labels as a local list
    labels_list_all = samples_rdd.map(lambda p: int(p[0].label)).collect()
    total_sample_count = len(labels_list_all)

    # feature vectors only, cached because training makes several passes
    parsedData = samples_rdd.map(lambda p: p[0].features).cache()

    t1 = time()
    print('INFO: running time: %f' % (t1 - t0))
    t0 = t1

    ###############################################
    ########## build learning model ###############
    ###############################################
    print("INFO: ============Learning Algorithm and Parameters=============")
    para_dict = json.loads(ml_opts_jstr)
    flag_model = para_dict['learning_algorithm']
    # NOTE(review): eval() runs on option values taken from the request JSON;
    # confirm they are trusted, or switch to int() once callers are known to
    # send plain integers.
    iteration_num = eval(para_dict['iterations'])
    k = 2
    if 'k' in para_dict:
        k = eval(para_dict['k'])
    print("INFO: Learning Algorithm: %s" % (flag_model,))
    print("INFO: iterations= %s" % (iteration_num,))

    ### generate label names (family names) ###
    if labelnameflag == 1:
        dic_key = "dic_name_label"
        # the parent dataset owns the label dictionary when ids differ
        owner_id = ds_id if ds_id != row_id_str else row_id_str
        jstr_filter = '{"rid":' + owner_id + ',"key":"' + dic_key + '"}'
        jstr_proj = '{"value":1}'
        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        # stored as a list of {name: label_value}; invert to {label_value: name}
        label_dic = {}
        for entry in dic_list:
            for name in entry:
                label_dic[entry[name]] = name.encode('UTF8')
        print("INFO: label_dic: %s" % (label_dic,))
    else:
        label_dic = {}
        for label_value in set(labels_list_all):
            label_dic[int(label_value)] = str(int(label_value))
        print("INFO: generated label_dic: %s" % (label_dic,))

    labels_list = [label_dic[label_key] for label_key in sorted(label_dic)]
    print("INFO: labels_list= %s" % (labels_list,))

    ### build model ###
    if flag_model == "kmeans":
        print("=================== Kmeans ============")
        model = KMeans.train(parsedData, k, maxIterations=iteration_num)
        t_cost = model.computeCost(parsedData)
        print("INFO: cost for training set = %s" % (str(t_cost),))
        clusterCenters = model.clusterCenters
        print("INFO: clusterCenters t= %s" % (type(clusterCenters),))
    elif flag_model == "gaussian_mixture_model":
        # original author's note: didn't work, some native lib issue
        print("=================== Gaussian_Mixture_Model ============")
        model = GaussianMixture.train(parsedData, k, maxIterations=iteration_num)
        print("INFO: model.weights = %s" % (model.weights,))
        # The component means play the role of cluster centers for the plots;
        # without this the name was unbound further down on this branch.
        clusterCenters = [g.mu.toArray() for g in model.gaussians]
    else:
        print("INFO: Training model selection error: no valid ML model selected!")
        return -1

    ### Save model
    save_dir = config.get('app', 'HADOOP_MASTER') + config.get('app', 'HDFS_MODEL_DIR') + '/' + row_id_str
    # Best-effort removal of a previous model at the same location; a failure
    # here is reported but does not stop the run.
    try:
        hdfs.ls(save_dir)
        hdfs.rmr(save_dir)
    except IOError as e:
        print("ERROR: I/O error({0}): {1}".format(e.errno, e.strerror))
    except Exception:
        print("ERROR: Unexpected error: %s" % (sys.exc_info()[0],))

    print("INFO: model type= %s  model= %s" % (type(model), model))
    model.save(sc, save_dir)
    print("INFO: model saved at hdfs= %s" % (save_dir,))

    # (true label, cluster label, features, hash) for every sample
    # NOTE(review): model.predict() is called per sample inside an RDD
    # closure; this may not work for the Gaussian mixture model -- confirm
    # before relying on that branch.
    all_data = samples_rdd.map(lambda t: (t[0].label, model.predict(t[0].features), t[0].features, t[1])).collect()
    true_label_arr = np.asarray([int(x) for x, _, _, _ in all_data])
    labels_kmeans = np.asarray([int(x) for _, x, _, _ in all_data])
    print("INFO: all_data len= %s all_data t= %s" % (len(all_data), type(labels_list_all)))
    print("INFO: true_label_arr.shape= %s labels_kmeans.shape= %s" % (true_label_arr.shape, labels_kmeans.shape))
    print("INFO: true_label_arr t= %s labels_kmeans t= %s" % (type(true_label_arr), type(labels_kmeans)))

    mtx_center = np.asarray(clusterCenters)
    features_array_reduced = np.asarray([x.toArray() for _, _, x, _ in all_data])
    print("INFO: mtx_center t= %s mtx_center.shape= %s" % (type(mtx_center), mtx_center.shape))
    print("INFO: features_array_reduced t= %s features_array_reduced.shape %s"
          % (type(features_array_reduced), features_array_reduced.shape))

    # Adjusted Mutual Information between two clusterings
    amis = adjusted_mutual_info_score(labels_list_all, labels_kmeans)
    print("INFO: Adjusted_mutual_info_score= %s" % (amis,))
    # Similarity measure between two clusterings
    ars = adjusted_rand_score(labels_list_all, labels_kmeans)
    print("INFO: Adjusted_rand_score= %s" % (ars,))

    # no accuracy is computed for clustering; kept only for the log line below
    accuracy = 0.0

    t1 = time()
    print('INFO: training run time: %f' % (t1 - t0))
    t0 = t1

    ###############################################
    ########## plot histogram #####################
    ###############################################
    n_clusters = k
    plot_col_num = int(math.ceil(math.sqrt(n_clusters)))
    figsize = (4 * plot_col_num, 3 * int(math.ceil(n_clusters * 1.0 / plot_col_num)))
    print("INFO: n_clusters= %s ,label_dic= %s" % (n_clusters, label_dic))
    print("INFO: plot_col_num= %s ,figsize= %s ,local_out_dir= %s" % (plot_col_num, figsize, local_out_dir))

    # kmeans histogram
    _, p_true = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters, names=label_dic
        , plot_col_num=plot_col_num, figsize=figsize, folder=local_out_dir, rid=row_id_str)
    # normalized kmeans histogram
    _, p_true_norm = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters, names=label_dic
        , plot_col_num=plot_col_num, figsize=figsize, normalize=True, folder=local_out_dir, rid=row_id_str)

    # "reverse" histogram: one sub-figure per true label
    num_bars = max(true_label_arr) + 1
    figsize = (4 * plot_col_num, 3 * int(math.ceil(num_bars * 1.0 / plot_col_num)))
    _, p_cluster = ml_plot_kmeans_histogram_subfigures(labels_kmeans, true_label_arr, num_bars, names=label_dic
        , plot_col_num=plot_col_num, figsize=figsize, reverse=True, folder=local_out_dir, rid=row_id_str)

    # dot plot coloured by cluster assignment
    filename = os.path.join(local_out_dir, row_id_str + '_cluster.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, labels_kmeans, mtx_center, n_clusters, figsize=(10, 7), filename=filename
        , title='KMeans', filename_3d=filename_3d)

    # dot plot coloured by true label
    filename = os.path.join(local_out_dir, row_id_str + '_cluster_tl.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d_tl.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, true_label_arr, mtx_center, n_clusters, figsize=(10, 7), filename=filename
        , title='True Labels', filename_3d=filename_3d)

    dataset_info = {"training_fraction": 1, "class_count": n_clusters, "dataset_count": total_sample_count}

    # only update db for web request
    if fromweb == "1":
        # accuracy is written as an empty string on purpose: see above
        str_sql = ("UPDATE atdml_document set " + "accuracy = '"
            + "', status = 'learned', processed_date ='" + str(datetime.datetime.now())
            + "', total_feature_numb='" + str(feature_count)
            + "', perf_measures='{}"
            + "', dataset_info='" + json.dumps(dataset_info)
            + "' where id=" + row_id_str)
        ret = exec_sqlite.exec_sql(str_sql)
        print("INFO: Data update done! ret= %s" % (str(ret),))
    else:
        print("INFO: accuracy = '" + str(accuracy * 100) + "%")

    print('INFO: Finished!')
    return 0
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
          excluded_feat_cslist, sp_master, spark_rdd_compress,
          spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir,
          zipcode_dir, zip_file_name, mongo_tuples, labelnameflag, fromweb,
          training_fraction, jobname, model_data_folder, random_seed=None):
    """Train one scikit-learn classifier on a train/test split and report it.

    The samples are loaded through Spark, collected locally, split into
    training and testing sets, and a classifier chosen by
    ``parse_param_and_get_model`` is fitted and saved with ``joblib``.  The
    function then writes the feature-coefficient file, the false-prediction
    file, the pickled prediction output and the prediction figures into
    ``local_out_dir``; for two-class problems it also builds the score graph
    and ROC files.  For web requests the ``atdml_document`` row is updated.

    Args:
        row_id_str: id of the job row; used in output file names and in the
            SQL ``where`` clause.
        ds_id: dataset id forwarded to the helper functions.
        hdfs_feat_dir: folder holding ``libsvm_data`` and
            ``libsvm_data_feat_count``.
        local_out_dir: folder for all generated output files.
        ml_opts_jstr: JSON string of learning options (required); may hold
            ``has_excluded_feat``.
        excluded_feat_cslist: excluded features; when None and
            ``has_excluded_feat`` is 1 the list is fetched via
            ``ml_util.ml_get_excluded_feat``.
        sp_master, spark_rdd_compress, spark_driver_maxResultSize,
        sp_exe_memory, sp_core_max, jobname: passed to
            ``ml_util.ml_get_spark_context``.
        zipout_dir, zipcode_dir, zip_file_name: passed to
            ``ml_build_zip_file``.
        mongo_tuples: connection info forwarded to the Mongo helpers.
        labelnameflag: 1 reads label names via ``ml_util.ml_get_label_dict``;
            otherwise the numeric labels are used as names.
        fromweb: ``"1"`` enables the SQLite update.
        training_fraction: fraction of samples used for training.
        model_data_folder: folder where ``<row_id_str>.pkl`` is written.
        random_seed: when a positive integer, seeds NumPy's global random
            generator and sorts the samples by hash before splitting.

    Returns:
        0 on success; -1 when options are missing, no model name is found or
        fitting fails.
    """
    # ml_opts was previously bound only when ml_opts_jstr was not None, yet it
    # is needed to pick the model; fail before any output folder is touched.
    if ml_opts_jstr is None:
        print("ERROR: ml_opts_jstr is required to select the learning model!")
        return -1
    ml_opts = json.loads(ml_opts_jstr)

    # create zip files for Spark workers
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name,
                                      prefix='zip_feature_util')
    print("INFO: zip_file_path= %s" % (zip_file_path,))

    # ML model filename
    model_fname = os.path.join(model_data_folder, row_id_str + '.pkl')
    print("INFO: model_data_folder= %s" % (model_data_folder,))
    # create out folders and clean up old model files
    ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir,
                                   model_data_folder, model_fname)

    # init Spark context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()

    # resolve the excluded feature list
    has_excluded_feat = 0
    if "has_excluded_feat" in ml_opts:
        has_excluded_feat = ml_opts["has_excluded_feat"]
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(
            row_id_str, mongo_tuples)
    print("INFO: excluded_feat_cslist= %s" % (excluded_feat_cslist,))

    # source libsvm filename
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print("INFO: libsvm_data_file= %s" % (libsvm_data_file,))

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print("INFO: feature_count= %s" % (feature_count,))

    # load sample RDD, dropping excluded features; elements are
    # (LabeledPoint, hash)
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist)

    # collect all data to local for processing
    all_data = samples_rdd.collect()
    sample_count = len(all_data)

    if random_seed is not None and int(random_seed) > 0:
        np.random.seed(int(random_seed))
        # a fixed sample order is needed for the seeded split to repeat
        all_data = sorted(all_data, key=lambda x: x[1])

    features_list = [x.features.toArray() for x, _ in all_data]
    labels_list_all = array([x.label for x, _ in all_data])
    features_array = np.array(features_list)
    hash_list_all = np.array([x for _, x in all_data])

    # sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)

    # randomly split the samples into training and testing data
    X_train_sparse, X_test_sparse, labels_train, labels_test, train_hash_list, test_hash_list = \
        cross_validation.train_test_split(features_sparse_mtx, labels_list_all,
                                          hash_list_all,
                                          test_size=(1 - training_fraction))
    testing_sample_count = len(labels_test)
    training_sample_count = len(labels_train)
    training_lbl_cnt_list = Counter(labels_train)
    testing_lbl_cnt_list = Counter(labels_test)
    print("INFO: training sample count= %s , testing sample count= %s ,sample_count= %s"
          % (training_sample_count, testing_sample_count, sample_count))
    print("INFO: training label list= %s , testing label list= %s"
          % (training_lbl_cnt_list, testing_lbl_cnt_list))
    print("INFO: train_hash_list count= %s , test_hash_list count= %s"
          % (len(train_hash_list), len(test_hash_list)))

    # random_seed testing: show the first few training hashes
    if random_seed is not None:
        for sample_hash in train_hash_list[:4]:
            print(sample_hash)

    t1 = time()
    print('INFO: running time: %f' % (t1 - t0))

    ###############################################
    ########### build learning model ##############
    ###############################################
    (clf, model_name) = parse_param_and_get_model(ml_opts)
    if model_name == "none":
        print("ERROR: model name not found!")
        return -1

    # fit the model to the training dataset
    try:
        clf.fit(X_train_sparse, labels_train)
    except Exception as e:
        print("ERROR: clf.fit(): clf= %s" % (clf,))
        print("ERROR: sys.exc_info: %s" % (sys.exc_info()[0],))
        print("ERROR: %s" % (e,))
        return -1

    print("INFO: model type= %s  clf= %s" % (type(clf), clf))
    # save clf for future use
    joblib.dump(clf, model_fname)

    # get weights from the model; the SVC classes are skipped here because
    # coef_ is only available when using a linear kernel
    coef = None
    intercept = None
    try:
        if type(clf) in (classes.SVC, classes.NuSVC):
            col_num = clf.support_vectors_.shape[1]
        else:
            col_num = len(clf.coef_[0])
            coef = clf.coef_[0]
            intercept = clf.intercept_[0]
    except Exception as e:
        print("Warning: Can't get clf.coef_[0]. e= %s , get total features from meta-data" % (e,))
        col_num = 0
    print("INFO: total feature # in the model:  %s" % (col_num,))

    # create feature coefficient file
    jfeat_coef_dict = {}
    if coef is None:
        print("WARNING: model weights not found!")
    else:
        feat_filename = os.path.join(local_out_dir,
                                     row_id_str + "_feat_coef.json")
        print("INFO: feat_filename= %s" % (feat_filename,))
        jfeat_coef_dict = ml_util.ml_save_coef_build_feat_coef(
            row_id_str, mongo_tuples, coef, intercept, feat_filename, ds_id)
        print("INFO: jfeat_coef_dict len= %s" % (len(jfeat_coef_dict),))

    # evaluate the model on the testing dataset
    labels_pred = clf.predict(X_test_sparse)
    accuracy = clf.score(X_test_sparse, labels_test)
    print("INFO: Accuracy =  %s" % (accuracy,))

    # build files for false predictions & score graph
    false_pred_fname = os.path.join(local_out_dir,
                                    row_id_str + "_false_pred.json")
    print("INFO: false_pred_fname= %s" % (false_pred_fname,))
    (score_arr_0, score_arr_1, max_score, min_score) = ml_build_false_pred(
        X_test_sparse, coef, intercept, labels_test, labels_pred,
        test_hash_list, model_name, jfeat_coef_dict, false_pred_fname,
        row_id_str=row_id_str, ds_id=ds_id, mongo_tuples=mongo_tuples)

    # save (true label, predicted label, hash) for every test sample
    pred_out_arr = list(zip(labels_test, labels_pred, test_hash_list))
    pred_ofname = os.path.join(local_out_dir, row_id_str + "_pred_output.pkl")
    print("INFO: pred_ofname= %s" % (pred_ofname,))
    ml_util.ml_pickle_save(pred_out_arr, pred_ofname)

    ###################################################
    ### generate label names (family names) ###########
    ###################################################
    if labelnameflag == 1:
        label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id)
        print("INFO: label_dic = %s" % (label_dic,))
    else:
        label_dic = {}
        for label_value in set(labels_list_all):
            label_dic[int(label_value)] = str(int(label_value))
        print("INFO: label_dic= %s" % (label_dic,))
    labels_list = [label_dic[label_key] for label_key in sorted(label_dic)]

    ###############################################
    ########### plot prediction result figures ####
    ###############################################
    pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png")
    true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png")
    pred_xlabel = 'Prediction (Single Run)'
    true_xlabel = 'True Labels (Single Run)'
    test_cnt_dic = ml_util.ml_plot_predict_figures(
        labels_pred.tolist(), labels_test.tolist(), labels_list, label_dic,
        testing_sample_count, pred_xlabel, pred_fname, true_xlabel, true_fname)
    print("INFO: figure files:  %s %s" % (pred_fname, true_fname))

    roc_auc = None
    perf_measures = None
    class_count = len(labels_list)
    dataset_info = {
        "training_fraction": training_fraction,
        "class_count": class_count,
        "dataset_count": sample_count
    }

    #############################################################
    ################### for 2 class only (plot ROC curve) #######
    #############################################################
    if class_count == 2:
        # build data file for score graph
        score_graph_fname = os.path.join(local_out_dir,
                                         row_id_str + "_score_graph.json")
        print("INFO: score_graph_fname= %s" % (score_graph_fname,))
        ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name,
                                  score_graph_fname, max_score, min_score)

        # find which label value stands for the negative ("clean") class
        do_ROC = True
        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print("WARNING: No ROC curve generated: 'clean' or '0' must be a label for indicating negative class!")
            do_ROC = False

        if do_ROC:
            perf_measures = ml_util.calculate_fscore(labels_test, labels_pred)
            print("INFO: perf_measures= %s" % (perf_measures,))

            confidence_score = clf.decision_function(X_test_sparse)
            if flag_clean == 0:
                scores = [x for x in confidence_score]
                s_labels = [x for x in labels_test]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                # clean is labelled 1: flip scores and labels so that the
                # positive class is still the non-clean one
                scores = [-x for x in confidence_score]
                s_labels = [1 - x for x in labels_test]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]

            # create ROC data file
            roc_auc = ml_create_roc_files(row_id_str, scores, s_labels,
                                          testing_N, testing_P, local_out_dir,
                                          row_id_str)
            perf_measures["roc_auc"] = roc_auc

    # only update db for web request
    if fromweb == "1":
        str_sql = ("UPDATE atdml_document set " + "accuracy = '" + str(accuracy * 100) + "%"
            + "', status = 'learned', processed_date ='" + str(datetime.datetime.now())
            + "', perf_measures='" + json.dumps(perf_measures)
            + "', dataset_info='" + json.dumps(dataset_info)
            + "' where id=" + row_id_str)
        ret = exec_sqlite.exec_sql(str_sql)
        print("INFO: Sqlite update done! ret= %s" % (str(ret),))
    else:
        print("INFO: accuracy = '" + str(accuracy * 100) + "%")

    t1 = time()
    print('INFO: running time: %f' % (t1 - t0))
    print('INFO: Train Finished!')
    return 0
pred_label=str(sing_label_pred) else: pred_label = str(sing_label_pred) print "RESULT: prediction=", pred_label ################################### ############update DB############## ################################### # only update db for web request if fromweb=="1": #print "database update" str_sql="UPDATE atdml_document set status = 'predicted', processed_date ='" \ +str(datetime.datetime.now())+"', prediction = '"+ pred_label \ +"', predict_val = '"+str(predict_val) \ +"' where id="+cid_str ret=exec_sqlite.exec_sql(str_sql) #print "Data update done! ret=", str(ret) else: print "RESULT: prediction="+ pred_label+"" t1 = time() print 'INFO: total running time: %f' %(t1-t0) return 0 if __name__ == '__main__': __description__ = "single file prediction for generic" main()
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max , zipout_dir, zipcode_dir, zip_file_name , mongo_tuples, labelnameflag, fromweb , training_fraction, jobname, model_data_folder ): # zip func in other files for Spark workers ================= ================ zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=",zip_file_path # ML model filename ==== model_fname=os.path.join(model_data_folder, row_id_str+'.pkl') print "INFO: model_data_folder=",model_data_folder # create out folders and clean up old model files ==== ml_util.ml_prepare_output_dirs(row_id_str,local_out_dir,model_data_folder,model_fname) # init Spark context ==== sc=ml_util.ml_get_spark_context(sp_master , spark_rdd_compress , spark_driver_maxResultSize , sp_exe_memory , sp_core_max , jobname , [zip_file_path]) t0 = time() t00 = t0 # check if ml_opts.has_excluded_feat ==1 =================================== has_excluded_feat=0 if not ml_opts_jstr is None: ml_opts=json.loads(ml_opts_jstr) if "has_excluded_feat" in ml_opts: has_excluded_feat=ml_opts["has_excluded_feat"] # get excluded feature list from mongo ========== === if str(has_excluded_feat) == "1" and excluded_feat_cslist is None: excluded_feat_cslist=ml_util.ml_get_excluded_feat(row_id_str, mongo_tuples) print "INFO: excluded_feat_cslist=",excluded_feat_cslist # source libsvm filename libsvm_data_file = os.path.join(hdfs_feat_dir , "libsvm_data") print "INFO: libsvm_data_file=", libsvm_data_file # load feature count file feat_count_file=libsvm_data_file+"_feat_count" feature_count=zip_feature_util.get_feature_count(sc,feat_count_file) print "INFO: feature_count=",feature_count # load sample RDD from text file # also exclude selected features in sample ================ ===== # format (LabeledPoint,hash) from str2LabeledPoint_hash() 
#samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file) samples_rdd,feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, excluded_feat_cslist) all_data = samples_rdd.collect() sample_count=len(all_data) # 2-D array features_list = [x.features.toArray() for x,_ in all_data] # label array labels_list_all = [x.label for x,_ in all_data] # hash array hash_list_all = [x for _,x in all_data] # convert to np array labels_list_all = array(labels_list_all) features_array = np.array(features_list) hash_list_all=np.array(hash_list_all) # generate sparse matrix (csr) for all samples features_sparse_mtx = csr_matrix(features_array) ### randomly split the samples into training and testing data =============== X_train_sparse, X_test_sparse, labels_train, labels_test, train_hash_list, test_hash_list = \ cross_validation.train_test_split(features_sparse_mtx, labels_list_all, hash_list_all, test_size=(1-training_fraction) ) # X_test_sparse is scipy.sparse.csr.csr_matrix testing_sample_count = len(labels_test) training_sample_count=len(labels_train) training_lbl_cnt_list=Counter(labels_train) testing_lbl_cnt_list=Counter(labels_test) print "INFO: training sample count=",training_sample_count,", testing sample count=",testing_sample_count,",sample_count=",sample_count print "INFO: training label list=",training_lbl_cnt_list,", testing label list=",testing_lbl_cnt_list print "INFO: train_hash_list count=",len(train_hash_list),", test_hash_list count=",len(test_hash_list) t1 = time() print 'INFO: running time: %f' %(t1-t0) ############################################### ###########build learning model################ ############################################### ### parse parameters and generate the model ### (clf, model_name, api, cv, param_dic) = parse_param_and_get_model(ml_opts) if model_name == "none": print "ERROR: model name not found!" 
return -1 #param_jobj=json.loads(ml_opts_jstr); #print "param_jobj=",param_jobj ######################################################## ##########Grid Search with cross validation############# ######################################################## json2save={} json2save["rid"]=int(row_id_str) json2save["key"]="cv_result" #json2save["param_str"]=ml_opts_jstr json2save["param_dic"]=param_dic cv_grid=[] if api == "centralized": #########run with Scikit-learn API (for comparison)###### print "INFO: ******************Grid Search with Scikit-learn API************" t0 = time() # Set the parameters by cross-validation #tuned_parameters = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}] #tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], \ # 'C': [1, 10, 100, 1000]}, \ # {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}] scores = ['accuracy'] json2save["scores"]=scores #print json2save for score in scores: # for one item only? score=accuracy print("INFO: # Tuning hyper-parameters for %s" % score) #print() grid = grid_search.GridSearchCV(estimator = clf, param_grid = param_dic, cv=cv, scoring= score) grid.fit(X_train_sparse, labels_train) print "INFO: Best parameters set found on development set:" print "INFO: grid.best_params_=",grid.best_params_ print "INFO: Grid scores on development set:" for key in grid.best_params_: print "INFO: best_params["+key+"]=", grid.best_params_[key] if key.lower()=="regtype": ml_opts['regularization']=str(grid.best_params_[key]) # add best param to else: ml_opts[key.lower()]=str(grid.best_params_[key]) # add best param to # save best param to db as json string j_str=json.dumps(ml_opts); json2save["param_str"]=j_str; print "INFO: grid_scores_ with params:" for params, mean_score, scores in grid.grid_scores_: print "INFO: %0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params) #outstr='%s,%0.3f,%0.03f,%s' % (params,mean_score, scores.std() * 2,"Selected" if params==grid.best_params_ else "") outj={} 
outj["param"]=params outj["average_accuracy"]="%0.3f" % (mean_score) outj["std_deviation"]="%0.3f" % (scores.std() * 2) outj["selected"]="%s" % ("Selected" if params==grid.best_params_ else "") cv_grid.append(outj) clf_best = grid.best_estimator_ t1 = time() ############# END run with SKlearn ###### print 'INFO: Grid Search with SKlearn running time: %f' %(t1-t0) t0 = time() else: #############run with SPARK###### print "INFO: ******************Grid Search with SPARK************" all_comb_list_of_dic = get_all_combination_list_of_dic(param_dic) print "INFO: Total number of searching combinations=", len(all_comb_list_of_dic) #print "all_comb_list_of_dic: ", all_comb_list_of_dic params_rdd = sc.parallelize(all_comb_list_of_dic) ###broad cast clf, traning data, testing data to all workers### X_broadcast = sc.broadcast(X_train_sparse) y_broadcast = sc.broadcast(labels_train) clf_broadcast = sc.broadcast(clf) ### Grid Search with CV in multiple workers ### models = params_rdd.map(lambda x: learn_with_params(clf_broadcast.value, X_broadcast.value, y_broadcast.value, cv, x)).sortByKey(ascending = False).cache() (ave_accuracy, (clf_best, p_dic_best, std2)) = models.first() # output results # print "INFO: Best parameters set found for ", model_name, " is: " print "INFO: ", for key in p_dic_best: print key, " = ", p_dic_best[key], if key.lower()=="regtype": ml_opts['regularization']=str(p_dic_best[key]) else: ml_opts[key.lower()]=str(p_dic_best[key]) # add best param to # save best param to db as json string print "" j_str=json.dumps(ml_opts); json2save["param_str"]=j_str; print "INFO: Average accuracy with CV = ", cv, ": ", ave_accuracy ######## print complete report ####### print "INFO: Grid scores on development set:" all_results = models.collect() for i in range(0, len(all_results)): (ave_accu_i, (clf_i, p_dic_i, std2_i)) = all_results[i] print "INFO: ",ave_accu_i, " for ", p_dic_i print "INFO: %0.3f (+/-%0.03f) for " % (ave_accu_i, std2_i), p_dic_i 
#outstr='%s,%0.3f,%0.03f,%s' % ( p_dic_i, ave_accu_i, std2_i, "Selected" if p_dic_i==p_dic_best else "") outj={} outj["param"]=p_dic_i outj["average_accuracy"]="%0.3f" % (ave_accu_i) outj["std_deviation"]="%0.3f" % (std2_i) outj["selected"]="%s" % ("Selected" if p_dic_i==p_dic_best else "") cv_grid.append(outj) print " " t1 = time() ############# END run with SPARK###### print 'INFO: Grid search with SPARK running time: %f' %(t1-t0) ################################################################################## #print "cv_grid=",cv_grid #json2save["cv_grid_title"]='param,average_accuracy,std_deviation,selected' json2save["cv_grid_data"]=cv_grid json2save['clf_best']=str(clf_best).replace("\n","").replace(" ","") cv_result=json.dumps(json2save) #print "INFO: cv_result=",cv_result filter='{"rid":'+row_id_str+',"key":"cv_result"}' upsert_flag=True ## write to mongoDB.myml.dataset_info, ignore doc with duplicated key # db.dataset_info.createIndex({"rid":1,"key":1},{unique:true}) ret=query_mongo.upsert_doc_t(mongo_tuples,filter,cv_result,upsert_flag) print "INFO: Upsert count for cv_result: ret=",ret ################################################################################## ##########Retrain with best model for training set and output results############# ################################################################################## print "INFO: **********Retrain with best model for training set and output results************" clf_best.fit(X_train_sparse, labels_train) #### save clf_best for future use #### #joblib.dump(clf_best, model_data_folder + row_id_str+'.pkl') joblib.dump(clf_best, model_fname) ### Evaluating the model on testing data labels_pred = clf_best.predict(X_test_sparse) accuracy = clf_best.score(X_test_sparse, labels_test) print "INFO: Accuracy = ", accuracy ######################################the rest of the code is the same as train_sklean.py (replace clf with 
clf_best)##################################################################### clf=clf_best print "INFO: model type=",type(clf)," clf=",clf # get data from model ================================ coef=None intercept=None try: if type(clf) in ( classes.SVC , classes.NuSVC) :# svm didn't have coef_ col_num=clf.support_vectors_.shape[1] else: #linear only # coef_ is only available when using a linear kernel col_num = len(clf.coef_[0]) coef=clf.coef_[0] intercept=clf.intercept_[0] # only get 1st item? #print "**model:clf.coef_[0] =",clf.coef_[0] except Exception as e: print "WARNING: Can't get clf.coef_[0]. e=",e,", get total features from meta-data" col_num = 0 #how to get feature number for sparse array? print "INFO: total feature # in the model: ", col_num jfeat_coef_dict={} # create feature coefficient file ================================ if coef is None: print "WARNING: model weights not found!" else: feat_filename=os.path.join(local_out_dir,row_id_str+"_feat_coef.json") print "INFO: feat_filename=",feat_filename # save coef_arr to mongo & create jfeat_coef_dict=== jfeat_coef_dict=ml_util.ml_save_coef_build_feat_coef(row_id_str, mongo_tuples, coef, intercept, feat_filename, ds_id) #print "INFO: jfeat_coef_dict=", jfeat_coef_dict print "INFO: jfeat_coef_dict len=", len(jfeat_coef_dict ) # filename for false pred false_pred_fname=os.path.join(local_out_dir,row_id_str+"_false_pred.json") print "INFO: false_pred_fname=", false_pred_fname # build files for false pred & score graph (score_arr_0, score_arr_1, max_score,min_score)=ml_build_false_pred(X_test_sparse,coef,intercept , labels_test, labels_pred, test_hash_list, model_name, jfeat_coef_dict, false_pred_fname) # save pred output pred_out_arr=[] for i in range(0,len(labels_test)): pred_out_arr.append((labels_test[i], labels_pred[i], test_hash_list[i])) pred_ofname=os.path.join(local_out_dir,row_id_str+"_pred_output.pkl") print "INFO: pred_ofname=", pred_ofname ml_util.ml_pickle_save(pred_out_arr,pred_ofname) 
################################################### ### generate label names (family names) ########### ### connect to database to get the column list which contains all column number of the corresponding feature#### ################################################### if labelnameflag == 1: key = "dic_name_label" jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}' jstr_proj='{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}' doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_list = doc['value'] label_dic = {} for i in range(0, len(dic_list)): for key in dic_list[i]: label_dic[dic_list[i][key]] = key.encode('UTF8') print "INFO: label_dic:", label_dic else: label_dic = {} label_set = set(labels_list_all) for label_value in label_set: label_dic[int(label_value)] = str(int(label_value)) print "INFO: ******generated label_dic:", label_dic labels_list = [] for key in sorted(label_dic): labels_list.append(label_dic[key]) ### generate sample numbers of each family in testing data### testing_sample_number = len(labels_test) print "INFO: testing_sample_number=", testing_sample_number test_cnt_dic = {} for key in label_dic: test_cnt_dic[key] = 0 for i in range (0, testing_sample_number): for key in label_dic: if labels_test[i] == key: test_cnt_dic[key] = test_cnt_dic[key] + 1 print "INFO: Number of samples in each label is=", test_cnt_dic ############################################### ###########plot prediction result figure####### ############################################### pred_fname=os.path.join(local_out_dir,row_id_str+"_1"+".png") true_fname=os.path.join(local_out_dir,row_id_str+"_2"+".png") pred_xlabel='Prediction (Single Run)' true_xlabel='True Labels (Single Run)' test_cnt_dic=ml_util.ml_plot_predict_figures(labels_pred.tolist(), labels_test.tolist(), labels_list, label_dic, testing_sample_count , pred_xlabel, pred_fname, true_xlabel, true_fname) print "INFO: figure files: ", 
pred_fname, true_fname print "INFO: Number of samples in each label is=", test_cnt_dic roc_auc=None #fscore=None perf_measures=None class_count=len(labels_list) dataset_info={"training_fraction":training_fraction, "class_count":class_count,"dataset_count":sample_count} ############################################################# ###################for 2 class only (plot ROC curve)######### ############################################################# if len(labels_list) == 2: # build data file for score graph score_graph_fname=os.path.join(local_out_dir,row_id_str+"_score_graph.json") print "INFO: score_graph_fname=", score_graph_fname ml_build_pred_score_graph(score_arr_0,score_arr_1,model_name, score_graph_fname,max_score,min_score) do_ROC=True reverse_label_dic = dict((v,k) for k, v in label_dic.items()) if 'clean' in reverse_label_dic: flag_clean = reverse_label_dic['clean'] elif 'benign' in reverse_label_dic: flag_clean = reverse_label_dic['benign'] elif '0' in reverse_label_dic: flag_clean = 0 else: print "No ROC curve generated: 'clean' or '0' must be a label for indicating negative class!" 
do_ROC=False if do_ROC: # calculate fscore ========== perf_measures=ml_util.calculate_fscore(labels_test, labels_pred) print "INFO: perf_measures=",perf_measures confidence_score = clf_best.decision_function(X_test_sparse) if flag_clean == 0: scores = [x for x in confidence_score] s_labels = [x for x in labels_test] testing_N = test_cnt_dic[0] testing_P = test_cnt_dic[1] else: scores = [-x for x in confidence_score] s_labels = [1-x for x in labels_test] testing_N = test_cnt_dic[1] testing_P = test_cnt_dic[0] # create ROC data file ======== ==== roc_auc=ml_create_roc_files(row_id_str, scores, s_labels, testing_N, testing_P , local_out_dir, row_id_str) perf_measures["roc_auc"]=roc_auc # only update db for web request if fromweb=="1": #print "database update" str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \ +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \ +"',ml_opts='"+j_str \ +"', perf_measures='"+json.dumps(perf_measures) \ +"', dataset_info='"+json.dumps(dataset_info) \ +"' where id="+row_id_str ret=exec_sqlite.exec_sql(str_sql) print "INFO: Data update done! ret=", str(ret) else: print "INFO: accuracy = '"+str(accuracy*100)+"%" print 'INFO: total running time: %f' %(t1-t00) print 'INFO: Finished!' return 0
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, labelnameflag, fromweb, training_fraction, jobname): if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) # zip func in other files for Spark workers ================= ================ zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path # get_spark_context sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) t0 = time() t00 = t0 # check if ml_opts.has_excluded_feat ==1 =================================== has_excluded_feat = 0 ml_opts = {} if not ml_opts_jstr is None: ml_opts = json.loads(ml_opts_jstr) if "has_excluded_feat" in ml_opts: has_excluded_feat = ml_opts["has_excluded_feat"] #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist # get excluded feature list from mongo ========== === if str(has_excluded_feat) == "1" and excluded_feat_cslist is None: excluded_feat_cslist = ml_util.ml_get_excluded_feat( row_id_str, mongo_tuples) print "INFO: excluded_feat_cslist=", excluded_feat_cslist ### generate Labeled point libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data") print "INFO: libsvm_data_file:", libsvm_data_file # load feature count file feat_count_file = libsvm_data_file + "_feat_count" feature_count = zip_feature_util.get_feature_count(sc, feat_count_file) print "INFO: feature_count=", feature_count # load sample RDD from text file # also exclude selected features in sample ================ ===== # format (LabeledPoint,hash) from str2LabeledPoint_hash() #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file) samples_rdd, feature_count = zip_feature_util.get_sample_rdd( sc, 
libsvm_data_file, feature_count, excluded_feat_cslist) #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file) # get distinct label list labels_list_all = samples_rdd.map( lambda p: p[0].label).distinct().collect() ### generate training and testing data training_rdd, testing_rdd = samples_rdd.randomSplit( [training_fraction, 1 - training_fraction]) training_rdd = training_rdd.map(lambda p: p[0]) # keep LabeledPoint only training_rdd.cache() training_sample_count = training_rdd.count() training_lbl_cnt_list = training_rdd.map( lambda p: (p.label, 1)).reduceByKey(add).collect() testing_rdd.cache() testing_sample_count = testing_rdd.count() testing_lbl_cnt_list = testing_rdd.map( lambda p: (p[0].label, 1)).reduceByKey(add).collect() sample_count = training_sample_count + testing_sample_count t1 = time() print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list print "INFO: labels_list_all=", labels_list_all print "INFO: training and testing samples generated!" 
print 'INFO: running time: %f' % (t1 - t0) t0 = t1 ############################################## ########### Grid Search with CV ############## ############################################## ### get the parameters for cross validation and grid search ### (cv, model_name, param_dict) = generate_param(ml_opts) ### generate label names (family names) ##### ### connect to database to get the column list which contains all column number of the corresponding feature#### if labelnameflag == 1: label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id) print "INFO: label_dic:", label_dic else: label_dic = {} label_set = set(labels_list_all) for label_value in label_set: label_dic[int(label_value)] = str(int(label_value)) print "INFO: generated label_dic:", label_dic labels_list = [] for key in sorted(label_dic): labels_list.append(label_dic[key]) #print "labels:", labels_list class_num = len(labels_list) if class_num > 2: print "INFO: Multi-class classification! Number of classes = ", class_num #### generate training and testing rdd(s) for CV##### split_prob = 1.0 / float(cv) split_prob_list = [] for i in range(0, cv): split_prob_list.append(split_prob) list_rdd = training_rdd.randomSplit(split_prob_list) list_train_rdd = [] list_test_rdd = [] for i in range(0, cv): list_rdd[i].cache() for i in range(0, cv): tr_rdd = sc.emptyRDD() for j in range(0, cv): if j == i: pass else: tr_rdd = tr_rdd + list_rdd[j] tr_rdd.cache() list_train_rdd.append(tr_rdd) list_test_rdd.append(list_rdd[i]) all_comb_list_of_dic = get_all_combination_list_of_dic(param_dict) print "INFO: Total number of searching combinations:", len( all_comb_list_of_dic) ### loop for all parameter combinations and search the best parameters with CV### results = [] for p in range(0, len(all_comb_list_of_dic)): params = all_comb_list_of_dic[p] C = params['C'] iteration_num = params['iterations'] regularization = params['regType'] scores = [] for i in range(0, cv): train_rdd = list_train_rdd[i] test_rdd = 
list_test_rdd[i] train_number = train_rdd.count() regP = C / float(train_number) ### build model ### if model_name == "linear_svm_with_sgd": #print "====================1: Linear SVM=============" model_classification = SVMWithSGD.train( train_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) elif model_name == "logistic_regression_with_lbfgs": #print "====================2: LogisticRegressionWithLBFGS=============" model_classification = LogisticRegressionWithLBFGS.train( train_rdd, regParam=regP, iterations=iteration_num, regType=regularization, numClasses=class_num) # regParam = 1/(sample_number*C) elif model_name == "logistic_regression_with_sgd": #print "====================3: LogisticRegressionWithSGD=============" model_classification = LogisticRegressionWithSGD.train( train_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) else: print "ERROR: Training model selection error: no valid ML model selected!" 
return ### Evaluating the model on testing data labelsAndPreds = test_rdd.map( lambda p: (p.label, model_classification.predict(p.features))) labelsAndPreds.cache() test_sample_number = test_rdd.count() testErr = labelsAndPreds.filter( lambda (v, p): v != p).count() / float(test_sample_number) accuracy = 1 - testErr #print "Accuracy = ", accuracy scores.append(accuracy) ss = np.asarray(scores) #print "%0.3f (+/-%0.03f) for " % (ss.mean(), ss.std() * 2), params results.append((ss.mean(), ss.std() * 2, params)) sorted_results = sorted(results, key=lambda x: x[0], reverse=1) (best_accuracy, best_std2, best_param) = sorted_results[0] print "INFO: ml_opts_jstr=", ml_opts_jstr print "INFO: best_param=", best_param #ml_opts=json.loads(ml_opts_jstr); print "INFO: ml_opts=", ml_opts ############################################## ######output Grid Search results############## ############################################## json2save = {} json2save["rid"] = int(row_id_str) json2save["key"] = "cv_result" #json2save["param_str"]=ml_opts_jstr json2save["param_dic"] = param_dict cv_grid = [] print "" print "INFO: =====Grid Search Results for SPARK ======" print "INFO: Best parameters set found for ", model_name, " is: " for key in best_param: print "INFO:", key, "=", best_param[key] if key.lower() == "regtype": ml_opts['regularization'] = str(best_param[key]) else: ml_opts[key.lower()] = str(best_param[key]) # add best param to ml_opts_jstr = json.dumps(ml_opts) json2save["param_str"] = ml_opts_jstr print "INFO: Average accuracy with CV = ", cv, ": ", best_accuracy print "" print "INFO: Grid scores on development set:" for i in range(0, len(sorted_results)): (ave_accu_i, std2_i, param_i) = sorted_results[i] print "%0.3f (+/-%0.03f) for " % (ave_accu_i, std2_i), param_i #outstr='%s,%0.3f,%0.03f,%s' % (param_i,ave_accu_i, std2_i,"Selected" if param_i==best_param else "") outj = {} outj["param"] = param_i outj["average_accuracy"] = "%0.3f" % (ave_accu_i) outj["std_deviation"] = 
"%0.3f" % (std2_i) outj["selected"] = "%s" % ("Selected" if param_i == best_param else "") cv_grid.append(outj) print " " t1 = time() print 'INFO: Grid Search with CV run time: %f' % (t1 - t0) t0 = time() ################################################################################## json2save["cv_grid_data"] = cv_grid cv_result = json.dumps(json2save) print "INFO: cv_result=", cv_result filter = '{"rid":' + row_id_str + ',"key":"cv_result"}' upsert_flag = True ## write to mongoDB.myml.dataset_info, ignore doc with duplicated key # db.dataset_info.createIndex({"rid":1,"key":1},{unique:true}) ret = query_mongo.upsert_doc_t(mongo_tuples, filter, cv_result, upsert_flag) print "INFO: Upsert count for mllib cv_result: ret=", ret ############################################################################################ ########### retrain with all training data and generate the final model with results ####### ############################################################################################ C = best_param['C'] iteration_num = best_param['iterations'] regularization = best_param['regType'] regP = C / float(training_sample_count) ######################################the rest of the code is the same as train_MLlib.py ##################################################################### if model_name == "linear_svm_with_sgd": ### 1: linearSVM print "INFO: ====================1: Linear SVM=============" model_classification = SVMWithSGD.train( training_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) #print model_classification elif model_name == "logistic_regression_with_lbfgs": ### 2: LogisticRegressionWithLBFGS print "INFO: ====================2: LogisticRegressionWithLBFGS=============" model_classification = LogisticRegressionWithLBFGS.train( training_rdd, regParam=regP, iterations=iteration_num, regType=regularization, numClasses=class_num) # regParam = 1/(sample_number*C) elif model_name == 
"logistic_regression_with_sgd": ### 3: LogisticRegressionWithSGD print "INFO: ====================3: LogisticRegressionWithSGD=============" model_classification = LogisticRegressionWithSGD.train( training_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) else: print "INFO: Training model selection error: no valid ML model selected!" return print "INFO: model type=", type(model_classification) # create feature coefficient file ================================ coef_arr = None intercept = None if model_classification.weights is None: print "WARNING: model weights not found!" else: coef_arr = model_classification.weights.toArray().tolist() # save to mongo key = "coef_arr" ret = ml_util.save_json_t(row_id_str, key, coef_arr, mongo_tuples) # save intercept to mongo key = "coef_intercept" intercept = model_classification.intercept ret = ml_util.save_json_t(row_id_str, key, intercept, mongo_tuples) # feature list + coef file ============= feat_filename = os.path.join(local_out_dir, row_id_str + "_feat_coef.json") print "INFO: feat_filename=", feat_filename # create feature list + coef file =============================================== ============ # expect a dict of {"fid":(coef, feature_raw_string)} jret = ml_util.build_feat_list_t(row_id_str, feat_filename, None, None, coef_arr, ds_id, mongo_tuples) # special featuring for IN or libsvm if jret is None: jret = ml_util.build_feat_coef_raw_list_t(row_id_str, feat_filename, coef_arr, ds_id, mongo_tuples) if jret is None: print "WARNING: Cannot create sample list for testing dataset. 
" jfeat_coef_dict = jret print "INFO: coef_arr len=", len( coef_arr), ", feature_count=", feature_count # for multi-class if len(coef_arr) != feature_count: jfeat_coef_dict = {} print "WARNING: feature list can't be shown for multi-class classification" # Calculate prediction and Save testing dataset bt_coef_arr = sc.broadcast(coef_arr) bt_intercept = sc.broadcast(intercept) bt_jfeat_coef_dict = sc.broadcast(jfeat_coef_dict) ### Evaluating the model on testing dataset: label, predict label, score, feature list print "INFO: intercept=", intercept print "INFO: coef_arr len=", len(coef_arr) print "INFO: jfeat_coef_dict len=", len(jfeat_coef_dict) # get prediction of testing dataset : (tlabel, plabel, score, libsvm, raw feat str, hash) ============================== if len(coef_arr) == feature_count: testing_pred_rdd = testing_rdd.map(lambda p: ( p[0].label \ ,model_classification.predict(p[0].features) \ ,zip_feature_util.calculate_hypothesis(p[0].features, bt_coef_arr.value, bt_intercept.value, model_name) \ ,p[0].features \ ,p[1] \ ) ).cache() else: # for multi-class, no prediction score;, TBD for better solution: how to display multiple weights for each class testing_pred_rdd = testing_rdd.map(lambda p: ( p[0].label \ ,model_classification.predict(p[0].features) \ ,0 \ ,p[0].features \ ,p[1] \ ) ).cache() # save false prediction to local file false_pred_fname = os.path.join(local_out_dir, row_id_str + "_false_pred.json") print "INFO: false_pred_fname=", false_pred_fname false_pred_data=testing_pred_rdd.filter(lambda p: p[0] != p[1])\ .map(lambda p: (p[0],p[1],p[2] \ ,zip_feature_util.get_dict_coef_raw4feat(zip_feature_util.sparseVector2dict(p[3]), bt_jfeat_coef_dict.value) ,p[4] ) ) \ .collect() print "INFO: false predicted count=", len(false_pred_data) false_pred_arr = [] with open(false_pred_fname, "w") as fp: for sp in false_pred_data: jsp = { "tlabel": sp[0], "plabel": sp[1], "score": sp[2], "feat": sp[3], "hash": sp[4] } #print "jsp=",jsp 
false_pred_arr.append(jsp) fp.write(json.dumps(false_pred_arr)) # save prediction results, format: label, prediction, hash pred_ofname = os.path.join(local_out_dir, row_id_str + "_pred_output.pkl") print "INFO: pred_ofname=", pred_ofname pred_out_arr = testing_pred_rdd.map(lambda p: (p[0], p[1], p[4])).collect() ml_util.ml_pickle_save(pred_out_arr, pred_ofname) ### Evaluating the model on testing data #labelsAndPreds = testing_rdd.map(lambda p: (p.label, model_classification.predict(p.features))) labelsAndPreds = testing_pred_rdd.map(lambda p: (p[0], p[1])) labelsAndPreds.cache() #testing_sample_count = testing_rdd.count() testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float( testing_sample_count) accuracy = 1 - testErr print "INFO: Accuracy = ", accuracy ### Save model #save_dir = config.get('app', 'HADOOP_MASTER')+'/user/hadoop/yigai/row_6/' #save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str save_dir = os.path.join(config.get('app', 'HADOOP_MASTER'), config.get('app', 'HDFS_MODEL_DIR'), row_id_str) try: hdfs.ls(save_dir) #print "find hdfs folder" hdfs.rmr(save_dir) #print "all files removed" except IOError as e: print "WARNING: I/O error({0}): {1}".format( e.errno, e.strerror), ". At HDFS=", save_dir except: print "WARNING: Unexpected error:", sys.exc_info( )[0], ". 
At HDFS=", save_dir model_classification.save(sc, save_dir) ###load model if needed #sameModel = SVMModel.load(sc, save_dir) t1 = time() print 'INFO: training run time: %f' % (t1 - t0) t0 = t1 ############################################### ###########plot prediction result figure####### ############################################### labels = labelsAndPreds.collect() true_label_list = [x for x, _ in labels] pred_label_list = [x for _, x in labels] pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png") true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png") pred_xlabel = 'Prediction (Single Run)' true_xlabel = 'True Labels (Single Run)' test_cnt_dic = ml_util.ml_plot_predict_figures( pred_label_list, true_label_list, labels_list, label_dic, testing_sample_count, pred_xlabel, pred_fname, true_xlabel, true_fname) plt.show() perf_measures = None dataset_info = { "training_fraction": training_fraction, "class_count": class_num, "dataset_count": sample_count } ############################################################# ###################for 2 class only (plot ROC curve)######### ############################################################# if len(labels_list) == 2: do_ROC = True reverse_label_dic = dict((v, k) for k, v in label_dic.items()) if 'clean' in reverse_label_dic: flag_clean = reverse_label_dic['clean'] elif 'benign' in reverse_label_dic: flag_clean = reverse_label_dic['benign'] elif '0' in reverse_label_dic: flag_clean = 0 else: print "WARNING: No ROC curve generated: 'clean' or '0' must be a label for indicating negative class!" 
do_ROC = False # build data file for score graph score_graph_fname = os.path.join(local_out_dir, row_id_str + "_score_graph.json") print "INFO: score_graph_fname=", score_graph_fname # build score_arr_0, score_arr_1 # format: tlabel, plabel, score, libsvm, raw feat str, hash graph_arr = testing_pred_rdd.map(lambda p: (int(p[0]), float(p[2]))).collect() score_arr_0 = [] score_arr_1 = [] max_score = 0 min_score = 0 for p in graph_arr: if p[0] == 0: score_arr_0.append(p[1]) else: score_arr_1.append(p[1]) # save max,min score if p[1] > max_score: max_score = p[1] elif p[1] < min_score: min_score = p[1] ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name, score_graph_fname, max_score, min_score) #print "score_arr_0=",score_arr_0 #print "score_arr_1=",score_arr_1 #print "max_score=",max_score #print "min_score=",min_score if do_ROC: perf_measures = ml_util.calculate_fscore(true_label_list, pred_label_list) print "RESULT: perf_measures=", perf_measures model_classification.clearThreshold() scoreAndLabels = testing_rdd.map(lambda p: ( model_classification.predict(p[0].features), int(p[0].label))) #metrics = BinaryClassificationMetrics(scoreAndLabels) #areROC = metrics.areaUnderROC #print areROC scoreAndLabels_list = scoreAndLabels.collect() if flag_clean == 0: scores = [x for x, _ in scoreAndLabels_list] s_labels = [x for _, x in scoreAndLabels_list] testing_N = test_cnt_dic[0] testing_P = test_cnt_dic[1] else: scores = [-x for x, _ in scoreAndLabels_list] s_labels = [1 - x for _, x in scoreAndLabels_list] testing_N = test_cnt_dic[1] testing_P = test_cnt_dic[0] #print scores #print s_labels # create ROC data file ======== ==== roc_auc = ml_create_roc_files(row_id_str, scores, s_labels, testing_N, testing_P, local_out_dir, row_id_str) perf_measures["roc_auc"] = roc_auc # only update db for web request if fromweb == "1": #print "database update" str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \ +"', status = 'learned', processed_date 
='"+str(datetime.datetime.now()) \ +"',ml_opts='"+ml_opts_jstr \ +"', perf_measures='"+json.dumps(perf_measures) \ +"', dataset_info='"+json.dumps(dataset_info) \ +"' where id="+row_id_str ret = exec_sqlite.exec_sql(str_sql) print "INFO: Data update done! ret=", str(ret) else: print "INFO: accuracy = '" + str(accuracy * 100) + "%" t1 = time() print 'INFO: total run time: %f' % (t1 - t00) print 'INFO: Finished!' return 0
def pca(row_id_str, ds_id, hdfs_feat_dir, local_out_dir , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max , zipout_dir, zipcode_dir, zip_file_name , mongo_tuples, fromweb, pca_jstr , jobname, model_data_folder ): # create zip files for Spark workers ================= ================ zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=",zip_file_path # init Spark context ==== sc=ml_util.ml_get_spark_context(sp_master , spark_rdd_compress , spark_driver_maxResultSize , sp_exe_memory , sp_core_max , jobname , [zip_file_path]) pca_param=json.loads(pca_jstr) if "k" in pca_param: k=pca_param["k"] else: k=None if "threshold" in pca_param: threshold=pca_param["threshold"] else: threshold=None if "lib" in pca_param: lib=pca_param["lib"] else: lib='mllib' ret=-1 # start here =================================================================== =============== t0 = time() # source libsvm filename libsvm_data_file = os.path.join(hdfs_feat_dir , "libsvm_data") print "INFO: libsvm_data_file=", libsvm_data_file # load sample RDD from text file # format Row(label, features, hash) from get_sample_dataframe() samples_df, feature_count = zip_feature_util.get_sample_dataframe(sc, libsvm_data_file, 0, None) print "INFO: feature_count=",feature_count #df_pcaed format: hash,label, features (df_pcaed, k, pca_model)=PCA_transform(sc, samples_df,feature_count, threshold, k) print "INFO: Doing PCA... 
threshold=",threshold,",k=",k #print "df_pcaed=",df_pcaed.first() #print "k=",k #print "pca_model=",pca_model #print "pc=",pca_model.pc # pca model filename ============================= =============== if model_data_folder is None: if row_id_str != ds_id: # get from parent dataset model_data_folder = os.path.join(config.get('app', 'HADOOP_MASTER'),config.get('app', 'HDFS_MODEL_DIR'), ds_id+"_pca") else: model_data_folder = os.path.join(config.get('app', 'HADOOP_MASTER'),config.get('app', 'HDFS_MODEL_DIR'), row_id_str+"_pca") # create HDFS folder try: hdfs.mkdir(model_data_folder) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror),". At HDFS=", save_dir except: print "WARNING: Unexpected error:", sys.exc_info()[0] ,". At HDFS=", save_dir if not threshold is None: #pca_fname=os.path.join(hdfs_feat_dir , row_id_str+'_pca_'+str(threshold)+'.ml') pca_fname=os.path.join(model_data_folder , 'pca_model_'+str(threshold)) libsvm_data_pca = os.path.join(hdfs_feat_dir , "libsvm_data_pca_"+str(threshold)+'.ml') else: pca_fname=os.path.join(model_data_folder , 'pca_model_'+str(k)) libsvm_data_pca = os.path.join(hdfs_feat_dir , "libsvm_data_pca_"+str(k)+'.ml') # save pca model to HDFS =============== print "INFO: pca_fname=",pca_fname pca_model.write().overwrite().save(pca_fname) # save pca data to HDFS ============================= =============== print "INFO: libsvm_data_pca=",libsvm_data_pca # construct libsvm string libsvm_rdd=df_pcaed.rdd.map(lambda p: p[0]+" "+str(int(p[1]))+zip_feature_util.dv2libsvm(p[2].toArray())) # clean up old libsvm file ============================= =============== try: hdfs.rmr(libsvm_data_pca) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Unexpected error at rmr():", sys.exc_info()[0] # overwrite pca file at hdfs libsvm_rdd.saveAsTextFile(libsvm_data_pca) t1 = time() print 'INFO: PCA processing time: %f' %(t1-t0) ### insert pca_param into mongoDB 
### filter='{"rid":'+row_id_str+',"key":"pca_param"}' if not threshold is None: pca_param["threshold"]=threshold if not k is None: pca_param["k"]=k print "INFO: pca_param=",pca_param upsert_flag=True jstr_insert = '{ "rid":'+row_id_str+',"key":"pca_param", "value":'+json.dumps(pca_param)+'}' ret=query_mongo.upsert_doc_t(mongo_tuples,filter,jstr_insert,upsert_flag) print "INFO: Upsert count for pca_param=",ret # only update db for web request =========== if fromweb=="1": #print "database update" str_sql="UPDATE atdml_document set " \ +" status = 'pca-ed', processed_date ='"+str(datetime.datetime.now()) \ +"' , ml_pca_opts = '"+json.dumps(pca_param) \ +"' where id="+row_id_str ret=exec_sqlite.exec_sql(str_sql) print "INFO: Update Sqlite DB done! ret=", str(ret) t1 = time() print 'INFO: running time: %f' %(t1-t0) #print 'Finished!' return 0
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, labelnameflag, fromweb, training_fraction, jobname, random_seed=None): ### generate data folder and out folder, clean up if needed #local_out_dir = local_out_dir + "/" #if os.path.exists(local_out_dir): # shutil.rmtree(local_out_dir) # to keep smaplelist file if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) # create zip files for Spark workers ================= ================ zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path # get_spark_context sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) t0 = time() # check if ml_opts.has_excluded_feat ==1 =================================== has_excluded_feat = 0 ml_opts = {} if not ml_opts_jstr is None: ml_opts = json.loads(ml_opts_jstr) if "has_excluded_feat" in ml_opts: has_excluded_feat = ml_opts["has_excluded_feat"] #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist # get excluded feature list from mongo ========== === if str(has_excluded_feat) == "1" and excluded_feat_cslist is None: excluded_feat_cslist = ml_util.ml_get_excluded_feat( row_id_str, mongo_tuples) print "INFO: excluded_feat_cslist=", excluded_feat_cslist # filename for featured data libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data") print "INFO: libsvm_data_file:", libsvm_data_file # load feature count file feat_count_file = libsvm_data_file + "_feat_count" feature_count = zip_feature_util.get_feature_count(sc, feat_count_file) print "INFO: feature_count=", feature_count # load sample RDD from text file # also exclude selected features in sample 
================ ===== # format (LabeledPoint,hash) from str2LabeledPoint_hash() #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file) samples_rdd, feature_count = zip_feature_util.get_sample_rdd( sc, libsvm_data_file, feature_count, excluded_feat_cslist) # get distinct label list labels_list_all = samples_rdd.map( lambda p: p[0].label).distinct().collect() # split samples to training and testing data, format (LabeledPoint,hash) training_rdd, testing_rdd = samples_rdd.randomSplit( [training_fraction, 1 - training_fraction], seed=int(random_seed)) training_rdd = training_rdd.map(lambda p: p[0]) # keep LabeledPoint only training_rdd.cache() training_sample_count = training_rdd.count() training_lbl_cnt_list = training_rdd.map( lambda p: (p.label, 1)).reduceByKey(add).collect() testing_rdd.cache() testing_sample_count = testing_rdd.count() testing_lbl_cnt_list = testing_rdd.map( lambda p: (p[0].label, 1)).reduceByKey(add).collect() sample_count = training_sample_count + testing_sample_count # random_seed testing if not random_seed is None: all_t = testing_rdd.collect() all_t = sorted(all_t, key=lambda x: x[1]) cnt = 0 for i in all_t: print i[1] cnt = cnt + 1 if cnt > 3: break t1 = time() print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list print "INFO: labels_list_all=", labels_list_all print "INFO: training and testing samples generated!" 
print 'INFO: running time: %f' % (t1 - t0) t0 = t1 ############################################### ###########build learning model################ ############################################### ### get the parameters### print "INFO: ======Learning Algorithm and Parameters=============" #ml_opts = json.loads(ml_opts_jstr) model_name = ml_opts[ 'learning_algorithm'] # 1: linear_svm_with_sgd; 2: logistic_regression_with_lbfgs; 3: logistic_regression_with_sgd iteration_num = 0 if 'iterations' in ml_opts: iteration_num = ml_opts['iterations'] C = 0 if 'c' in ml_opts: C = eval(ml_opts['c']) regularization = "" if 'regularization' in ml_opts: regularization = ml_opts['regularization'] print "INFO: Learning Algorithm: ", model_name print "INFO: C = ", C print "INFO: iterations = ", iteration_num print "INFO: regType = ", regularization regP = C / float(training_sample_count) print "INFO: Calculated: regParam = ", regP ### generate label names (family names) ##### ### connect to database to get the column list which contains all column number of the corresponding feature#### if labelnameflag == 1: ''' key = "dic_name_label" jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}' jstr_proj='{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}' doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_list = doc['value'] print "INFO: dic_list=",dic_list label_dic = {} for i in range(0, len(dic_list)): for key in dic_list[i]: label_dic[dic_list[i][key]] = key.encode('UTF8') ''' label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id) print "INFO: label_dic:", label_dic else: label_dic = {} label_set = set(labels_list_all) for label_value in label_set: label_dic[int(label_value)] = str(int(label_value)) print "INFO: generated label_dic:", label_dic labels_list = [] for key in sorted(label_dic): labels_list.append(label_dic[key]) print "INFO: labels:", labels_list class_num = len(labels_list) if 
class_num > 2: print "INFO: Multi-class classification! Number of classes = ", class_num ### build model ### if model_name == "linear_svm_with_sgd": ### 1: linearSVM print "INFO: ====================1: Linear SVM=============" model_classification = SVMWithSGD.train( training_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) #print model_classification elif model_name == "logistic_regression_with_lbfgs": ### 2: LogisticRegressionWithLBFGS print "INFO: ====================2: LogisticRegressionWithLBFGS=============" model_classification = LogisticRegressionWithLBFGS.train( training_rdd, regParam=regP, iterations=iteration_num, regType=regularization, numClasses=class_num) # regParam = 1/(sample_number*C) elif model_name == "logistic_regression_with_sgd": ### 3: LogisticRegressionWithSGD print "INFO: ====================3: LogisticRegressionWithSGD=============" model_classification = LogisticRegressionWithSGD.train( training_rdd, regParam=regP, iterations=iteration_num, regType=regularization) # regParam = 1/(sample_number*C) else: print "INFO: Training model selection error: no valid ML model selected!" return print "INFO: model type=", type(model_classification) # create feature coefficient file ================================ coef_arr = None intercept = None if model_classification.weights is None: print "WARNING: model weights not found!" 
else: coef_weights = model_classification.weights #print "coef_weights=",coef_weights #print type(coef_weights),coef_weights.shape coef_arr = coef_weights.toArray().tolist() # save coef_arr to mongo key = "coef_arr" ret = ml_util.save_json_t(row_id_str, key, coef_arr, mongo_tuples) # save coef_arr to local file if ret == 0: # drop old record in mongo filter = '{"rid":' + row_id_str + ',"key":"coef_arr"}' ret = query_mongo.delete_many(mongo_tuples, None, filter) if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) fn_ca = os.path.join(local_out_dir, row_id_str, row_id_str + "_coef_arr.pkl") print ml_util.ml_pickle_save(coef_arr, fn_ca) # save intercept to mongo intercept = model_classification.intercept key = "coef_intercept" ret = ml_util.save_json_t(row_id_str, key, intercept, mongo_tuples) # feature list + coef file ============= feat_filename = os.path.join(local_out_dir, row_id_str + "_feat_coef.json") print "INFO: feat_filename=", feat_filename # create feature, coef & raw string file =============================================== ============ # expect a dict of {"fid":(coef, feature_raw_string)} jret = ml_util.build_feat_list_t(row_id_str, feat_filename, None, None, coef_arr, ds_id, mongo_tuples) # special featuring for IN or libsvm if jret is None: jret = ml_util.build_feat_coef_raw_list_t(row_id_str, feat_filename, coef_arr, ds_id, mongo_tuples) if jret is None: print "WARNING: Cannot create sample list for testing dataset. " jfeat_coef_dict = jret print "INFO: coef_arr len=", len( coef_arr), ", feature_count=", feature_count # for multi-class if len(coef_arr) != feature_count: jfeat_coef_dict = {} print "WARNING: coef count didn't match feature count. 
multi-class classification was not supported" # Calculate prediction and Save testing dataset bt_coef_arr = sc.broadcast(coef_arr) bt_intercept = sc.broadcast(intercept) bt_jfeat_coef_dict = sc.broadcast(jfeat_coef_dict) ### Evaluating the model on testing dataset: label, predict label, score, feature list print "INFO: intercept=", intercept print "INFO: coef_arr len=", len(coef_arr), type(coef_arr) print "INFO: jfeat_coef_dict len=", len( jfeat_coef_dict) #, jfeat_coef_dict # get prediction of testing dataset : (tlabel, plabel, score, libsvm, raw feat str, hash) ============================== if len(coef_arr) == feature_count: testing_pred_rdd = testing_rdd.map(lambda p: ( p[0].label \ ,model_classification.predict(p[0].features) \ ,zip_feature_util.calculate_hypothesis(p[0].features, bt_coef_arr.value, bt_intercept.value, model_name) \ ,p[0].features \ ,p[1] \ ) ).cache() else: # for multi-class, no prediction score; TBD for better solution: how to display multiple weights for each class testing_pred_rdd = testing_rdd.map(lambda p: ( p[0].label \ ,model_classification.predict(p[0].features) \ ,"-" \ ,p[0].features \ ,p[1] \ ) ).cache() ''',p[0].features.dot(bt_coef_arr.value)+bt_intercept.value \ # Save testing dataset for analysis libsvm_testing_output = hdfs_feat_dir + "libsvm_testing_output_"+row_id_str print "INFO: libsvm_testing_output=", libsvm_testing_output try: hdfs.rmr(libsvm_testing_output) except IOError as e: print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror) except: print "WARNING: Unexpected error at libsvm_testing_output file clean up:", sys.exc_info()[0] # save only false prediction? 
#testing_pred_rdd.filter(lambda p: p[0] != p[1]).saveAsTextFile(libsvm_testing_output) testing_pred_rdd.saveAsTextFile(libsvm_testing_output) ''' #test_tmp=testing_pred_rdd.collect() # save false prediction to local file false_pred_fname = os.path.join(local_out_dir, row_id_str + "_false_pred.json") print "INFO: false_pred_fname=", false_pred_fname false_pred_data=testing_pred_rdd.filter(lambda p: p[0] != p[1])\ .map(lambda p: (p[0],p[1],p[2] \ ,zip_feature_util.get_dict_coef_raw4feat(zip_feature_util.sparseVector2dict(p[3]), bt_jfeat_coef_dict.value) ,p[4] ) ) \ .collect() print "INFO: false predicted count=", len(false_pred_data) false_pred_arr = [] with open(false_pred_fname, "w") as fp: for sp in false_pred_data: jsp = { "tlabel": sp[0], "plabel": sp[1], "score": sp[2], "feat": sp[3], "hash": sp[4] } #print "jsp=",jsp false_pred_arr.append(jsp) fp.write(json.dumps(false_pred_arr)) # save prediction results, format: label, prediction, hash pred_ofname = os.path.join(local_out_dir, row_id_str + "_pred_output.pkl") print "INFO: pred_ofname=", pred_ofname pred_out_arr = testing_pred_rdd.map(lambda p: (p[0], p[1], p[4])).collect() ml_util.ml_pickle_save(pred_out_arr, pred_ofname) ''' one_item= testing_pred_rdd.first() print "one_item=",one_item sparse_arr=one_item[3] dict_feat=zip_feature_util.sparseVector2dict(sparse_arr) print "len=",len(dict_feat),"dict_feat=",dict_feat dict_weit=zip_feature_util.add_coef2dict(coef_arr,dict_feat) print "len=",len(dict_weit),"dict_weit=",dict_weit ''' # Calculate Accuracy. 
labelsAndPreds = (true_label,predict_label) labelsAndPreds = testing_pred_rdd.map(lambda p: (p[0], p[1])) labelsAndPreds.cache() testing_sample_number = testing_rdd.count() testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float( testing_sample_number) accuracy = 1 - testErr print "INFO: Accuracy = ", accuracy ### Save model #save_dir = config.get('app', 'HADOOP_MASTER')+'/user/hadoop/yigai/row_6/' #save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str save_dir = os.path.join(config.get('app', 'HADOOP_MASTER'), config.get('app', 'HDFS_MODEL_DIR'), row_id_str) try: hdfs.ls(save_dir) #print "find hdfs folder" hdfs.rmr(save_dir) #print "all files removed" except IOError as e: print "WARNING: I/O error({0}): {1}".format( e.errno, e.strerror), ". At HDFS=", save_dir except: print "WARNING: Unexpected error:", sys.exc_info( )[0], ". At HDFS=", save_dir model_classification.save(sc, save_dir) ###load model if needed #sameModel = SVMModel.load(sc, save_dir) t1 = time() print 'INFO: training run time: %f' % (t1 - t0) t0 = t1 ############################################### ###########plot prediction result figure ==================================================== =============== ############################################### labels = labelsAndPreds.collect() true_label_list = [x for x, _ in labels] pred_label_list = [x for _, x in labels] pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png") true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png") pred_xlabel = 'Prediction (Single Run)' true_xlabel = 'True Labels (Single Run)' test_cnt_dic = ml_util.ml_plot_predict_figures( pred_label_list, true_label_list, labels_list, label_dic, testing_sample_count, pred_xlabel, pred_fname, true_xlabel, true_fname) print "INFO: figure files: ", pred_fname, true_fname #print "INFO: Number of samples in each label is=", test_cnt_dic roc_auc = None perf_measures = None dataset_info = { 
"training_fraction": training_fraction, "class_count": class_num, "dataset_count": sample_count } ############################################################# ###################for 2 class only (plot ROC curve) ==================================================== =============== ############################################################# if len(labels_list) == 2: do_ROC = True reverse_label_dic = dict((v, k) for k, v in label_dic.items()) if 'clean' in reverse_label_dic: flag_clean = reverse_label_dic['clean'] elif 'benign' in reverse_label_dic: flag_clean = reverse_label_dic['benign'] elif '0' in reverse_label_dic: flag_clean = 0 else: print "INFO: No ROC curve generated: 'clean','benign' or '0' must be a label for indicating negative class!" do_ROC = False # build data file for score graph score_graph_fname = os.path.join(local_out_dir, row_id_str + "_score_graph.json") print "INFO: score_graph_fname=", score_graph_fname # build score_arr_0, score_arr_1 # format: tlabel, plabel, score, libsvm, raw feat str, hash graph_arr = testing_pred_rdd.map(lambda p: (int(p[0]), float(p[2]))).collect() score_arr_0 = [] score_arr_1 = [] max_score = 0 min_score = 0 for p in graph_arr: if p[0] == 0: score_arr_0.append(p[1]) else: score_arr_1.append(p[1]) # save max,min score if p[1] > max_score: max_score = p[1] elif p[1] < min_score: min_score = p[1] ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name, score_graph_fname, max_score, min_score) if do_ROC: perf_measures = ml_util.calculate_fscore(true_label_list, pred_label_list) print "RESULT: perf_measures=", perf_measures ''' # calculate fscore ========== tp = labelsAndPreds.filter(lambda (v, p): v == 1 and p==1 ).count() fp = labelsAndPreds.filter(lambda (v, p): v == 0 and p==1 ).count() fn = labelsAndPreds.filter(lambda (v, p): v == 1 and p==0 ).count() tn = labelsAndPreds.filter(lambda (v, p): v == 0 and p==0 ).count() print "RESULT: tp=",tp,",fp=",fp,",fn=",fn,",tn=",tn precision=float(tp)/(tp+fp) 
recall=float(tp)/(tp+fn) print "RESULT: precision=",precision,",recall=",recall acc=(tp+tn)/(float(testing_sample_number)) fscore=2*((precision*recall)/(precision+recall)) print "RESULT: fscore=",fscore,",acc=",acc ''' model_classification.clearThreshold() scoreAndLabels = testing_rdd.map(lambda p: ( model_classification.predict(p[0].features), int(p[0].label))) #metrics = BinaryClassificationMetrics(scoreAndLabels) #areROC = metrics.areaUnderROC #print areROC scoreAndLabels_list = scoreAndLabels.collect() if flag_clean == 0: scores = [x for x, _ in scoreAndLabels_list] s_labels = [x for _, x in scoreAndLabels_list] testing_N = test_cnt_dic[0] testing_P = test_cnt_dic[1] else: scores = [-x for x, _ in scoreAndLabels_list] s_labels = [1 - x for _, x in scoreAndLabels_list] testing_N = test_cnt_dic[1] testing_P = test_cnt_dic[0] # create ROC data file ======== ==== roc_auc = ml_create_roc_files(row_id_str, scores, s_labels, testing_N, testing_P, local_out_dir, row_id_str) #, local_out_dir, file_name_given) perf_measures["roc_auc"] = roc_auc # only update db for web request ==================================================== =============== if fromweb == "1": #print "database update" str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \ +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \ +"', perf_measures='"+json.dumps(perf_measures) \ +"', dataset_info='"+json.dumps(dataset_info) \ +"' where id="+row_id_str ret = exec_sqlite.exec_sql(str_sql) print "INFO: Data update done! ret=", str(ret) else: print "INFO: accuracy = '" + str(accuracy * 100) + "%" print 'INFO: Finished!' return 0
def main(): parser = ArgumentParser(description=__description__) parser.add_argument("-f", "--folder", type=str, metavar="folder of features", help="folder contains features", required=False) parser.add_argument("-n", "--name", type=str, metavar="file name", help="file name for sample folder", required=False) parser.add_argument("-o", "--out", type=str, metavar="out figure folder", help="folder contains output", required=False) parser.add_argument("-r", "--row_id", type=str, metavar="row_id number", help="row_id number in the table", required=False) parser.add_argument("-b", "--bin", type=str, metavar="bin number", help="number of bins for var plot", required=False) parser.add_argument("-mn", "--run", type=str, metavar="run number", help="number of runs for var plot", required=False) parser.add_argument("-u", "--uploadtype", type=str, metavar="upload type", help="data type", required=False) parser.add_argument("-w", "--fromweb", type=str, metavar="flag for web", help="flag for web", required=False) parser.add_argument('-sp', '--sp_master', type=str, dest='sp_master', help='spark.master', default=config.get('spark', 'spark_master')) parser.add_argument('-em', '--exe_memory', type=str, dest='exe_memory', help='spark.executor.memory', default=config.get('spark', 'spark_executor_memory')) parser.add_argument('-cm', '--core_max', type=str, dest='core_max', help='spark.cores.max', default=config.get('spark', 'spark_cores_max')) args = parser.parse_args() if args.folder: feat_dir = args.folder else: feat_dir = config.get( 'app', 'HADOOP_MASTER' ) + '/user/hadoop/yigai/sality_virut_zbot_backdoor_dic_000' if args.name: file_name_given = args.name else: file_name_given = 'bbbb' if args.out: out_dir = args.out else: out_dir = 'out_result' if args.row_id: row_id_str = args.row_id else: row_id_str = '1' if args.bin: bin_number = eval(args.bin) else: bin_number = 10 if args.run: run_number = eval(args.run) else: run_number = 2 if args.uploadtype: uploadtype = args.uploadtype 
else: uploadtype = None if args.fromweb: fromweb = args.fromweb else: fromweb = None data_folder = feat_dir + "/" out_dir = out_dir + "/" if not os.path.exists(out_dir): os.makedirs(out_dir) SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress')) SparkContext.setSystemProperty( 'spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize')) #SparkContext.setSystemProperty('spark.kryoserializer.buffer.mb', config.get('spark', 'spark_kryoserializer_buffer_mb')) SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory) SparkContext.setSystemProperty('spark.cores.max', args.core_max) sc = SparkContext(args.sp_master, 'multi_run:' + str(args.row_id)) dirFile_loc = data_folder + "metadata" dirFolders = sc.textFile(dirFile_loc) hash_Folders = dirFolders.collect() print hash_Folders folder_list = [x.encode('UTF8') for x in hash_Folders] print folder_list features_training = [] labels_training = [] names_training = [] row_training = [] col_training = [] max_feat_training = 0 row_num_training = 0 features_testing = [] labels_testing = [] names_testing = [] row_testing = [] col_testing = [] max_feat_testing = 0 row_num_testing = 0 for folder in folder_list: print "****folder:", folder logFile_name = data_folder + folder + mtx_name_list logFile_data = data_folder + folder + mtx_libsvm logNames = sc.textFile(logFile_name).cache() logData = sc.textFile(logFile_data).cache() names = logNames.collect() data = logData.collect() name_l = [x.encode('UTF8') for x in names] feature_l = [x.encode('UTF8') for x in data] name_list = [names.strip() for names in name_l] feature_list = [features.strip() for features in feature_l] ##########data seperation###### id_perm = data_seperation_random(name_list) num_names = len(name_list) print 'num of samples in ', logFile_data, ' = ', num_names num_train = int(portion * num_names) print 'num_train = ', num_train label = folder_list.index(folder) + 1 print 'labe of ', 
logFile_data, ' is ', label ########generate training data######### i = 0 print "here" print len(id_perm) while i < num_train: #print i, id_perm[i] features = feature_list[id_perm[i]] features = features.strip() feature_array = features.split(' ') labels_training.append(label) length = len(feature_array) j = 0 while j < length: feature = feature_array[j] feat, value = feature.split(':', 2) row_training.append(i + row_num_training) col_training.append(int(feat) - 1) features_training.append(int(value)) max_feat_training = max(max_feat_training, int(feat)) j = j + 1 i = i + 1 row_num_training = row_num_training + num_train i = num_train ########generate testing data######### while i < num_names: #print i, id_perm[i] features = feature_list[id_perm[i]] features = features.strip() feature_array = features.split(' ') labels_testing.append(label) length = len(feature_array) j = 0 while j < length: feature = feature_array[j] feat, value = feature.split(':', 2) row_testing.append(i - num_train + row_num_testing) col_testing.append(int(feat) - 1) features_testing.append(int(value)) max_feat_testing = max(max_feat_testing, int(feat)) j = j + 1 i = i + 1 row_num_testing = row_num_testing + (num_names - num_train) col_num = max(max_feat_training, max_feat_testing) if max_feat_training < col_num: for i in range(0, row_num_training): for j in range(max_feat_training, col_num): features_training.append(0) row_training.append(i) col_training.append(j) elif max_feat_testing < col_num: for i in range(0, row_num_testing): for j in range(max_feat_testing, col_num): features_testing.append(0) row_testing.append(i) col_testing.append(j) features_training = array(features_training) row_training = array(row_training) col_training = array(col_training) print "col_training:", col_training len_col = len(col_training) for ii in range(0, len_col): if col_training[ii] < 0: print "=======!! 
< 0 ====== index:", ii print "value: ", col_training[ii] print "col_num:", col_num labels_training = array(labels_training) features_testing = array(features_testing) row_testing = array(row_testing) col_testing = array(col_testing) labels_testing = array(labels_testing) print "***************" print features_training[0].shape, features_testing[0].shape sparse_mtx = csr_matrix((features_training, (row_training, col_training)), shape=(row_num_training, col_num)) #print sparse_mtx.todense(), sparse_mtx.shape sparse_test = csr_matrix((features_testing, (row_testing, col_testing)), shape=(row_num_testing, col_num)) #print sparse_test.todense(), sparse_test.shape clf = svm.LinearSVC() #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None) #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None) clf.fit(sparse_mtx, labels_training) labels_pred = clf.predict(sparse_test) print "************results*********" print "Predicting results:" print labels_pred print "True testing labels:" print labels_testing with open("result.txt", 'w') as f: f.write("prediction vs true\n") num_err = 0 for i in range(0, len(labels_pred)): if labels_pred[i] != labels_testing[i]: num_err = num_err + 1 with open("result.txt", 'a') as f: f.write('%d ' % (labels_pred[i])) f.write('%d\n' % (labels_testing[i])) print "correct percentage : ", 1 - float(num_err) / len(labels_pred) print labels_pred.shape, labels_testing.shape accuracy = clf.score(sparse_test, labels_testing) print "data folder:", data_folder print "accuracy: ", accuracy ####################################################################### #########plot accuracy variance and distribution######## t0 = time() num_run = run_number ###50 is default accuracy_array = 
np.zeros(num_run) for rnd in range(0, num_run): dirFile_loc = data_folder + "metadata" dirFolders = sc.textFile(dirFile_loc) hash_Folders = dirFolders.collect() folder_list = [x.encode('UTF8') for x in hash_Folders] features_training = [] labels_training = [] names_training = [] row_training = [] col_training = [] max_feat_training = 0 row_num_training = 0 features_testing = [] labels_testing = [] names_testing = [] row_testing = [] col_testing = [] max_feat_testing = 0 row_num_testing = 0 for folder in folder_list: logFile_name = data_folder + folder + mtx_name_list logFile_data = data_folder + folder + mtx_libsvm logNames = sc.textFile(logFile_name).cache() logData = sc.textFile(logFile_data).cache() names = logNames.collect() data = logData.collect() name_l = [x.encode('UTF8') for x in names] feature_l = [x.encode('UTF8') for x in data] name_list = [names.strip() for names in name_l] feature_list = [features.strip() for features in feature_l] ##########data separation###### id_perm = data_seperation_random(name_list) num_names = len(name_list) num_train = int(portion * num_names) label = folder_list.index(folder) + 1 ########generate training data######### i = 0 while i < num_train: features = feature_list[id_perm[i]] features = features.strip() feature_array = features.split(' ') labels_training.append(label) length = len(feature_array) j = 0 while j < length: feature = feature_array[j] feat, value = feature.split(':', 2) row_training.append(i + row_num_training) col_training.append(int(feat) - 1) features_training.append(int(value)) max_feat_training = max(max_feat_training, int(feat)) j = j + 1 i = i + 1 row_num_training = row_num_training + num_train i = num_train ########generate testing data######### while i < num_names: features = feature_list[id_perm[i]] features = features.strip() feature_array = features.split(' ') labels_testing.append(label) length = len(feature_array) j = 0 while j < length: feature = feature_array[j] feat, value = 
feature.split(':', 2) row_testing.append(i - num_train + row_num_testing) col_testing.append(int(feat) - 1) features_testing.append(int(value)) max_feat_testing = max(max_feat_testing, int(feat)) j = j + 1 i = i + 1 row_num_testing = row_num_testing + (num_names - num_train) col_num = max(max_feat_training, max_feat_testing) if max_feat_training < col_num: for i in range(0, row_num_training): for j in range(max_feat_training, col_num): features_training.append(0) row_training.append(i) col_training.append(j) elif max_feat_testing < col_num: for i in range(0, row_num_testing): for j in range(max_feat_testing, col_num): features_testing.append(0) row_testing.append(i) col_testing.append(j) features_training = array(features_training) row_training = array(row_training) col_training = array(col_training) len_col = len(col_training) labels_training = array(labels_training) features_testing = array(features_testing) row_testing = array(row_testing) col_testing = array(col_testing) labels_testing = array(labels_testing) sparse_mtx = csr_matrix( (features_training, (row_training, col_training)), shape=(row_num_training, col_num)) sparse_test = csr_matrix( (features_testing, (row_testing, col_testing)), shape=(row_num_testing, col_num)) clf = svm.LinearSVC() #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None) #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None) clf.fit(sparse_mtx, labels_training) labels_pred = clf.predict(sparse_test) with open("result.txt", 'w') as f: f.write("prediction vs true\n") num_err = 0 for i in range(0, len(labels_pred)): if labels_pred[i] != labels_testing[i]: num_err = num_err + 1 with open("result.txt", 'a') as f: f.write('%d ' % (labels_pred[i])) f.write('%d\n' % 
(labels_testing[i])) accuracy = clf.score(sparse_test, labels_testing) accuracy_array[rnd] = accuracy print "current round: ", rnd #######plot distribution and variance##### plt.figure(1) num_bins = bin_number ####10 is default n, bins, patches = plt.hist(accuracy_array, num_bins, normed=1, facecolor='green', alpha=0.5) ave = np.mean(accuracy_array) print "Accuracy mean: ", ave variance = np.std(accuracy_array) print "Accuracy variance: ", variance print "bins: ", bins # add a 'best fit' line y = mlab.normpdf(bins, ave, variance) print "y: ", y plt.plot(bins, y, 'r--') plt.title('Accuracy distribution of ' + str(num_run) + ' runs:') plt.xlabel('Accuracy Values') plt.ylabel('Probability') plt.savefig(out_dir + file_name_given + "_var_" + str(num_run) + ".png") t1 = time() print 'running time: %f' % (t1 - t0) # only update db for web request if fromweb == "1": #print "database update" str_sql="UPDATE atdml_document set "+"mean = '"+str(ave*100)+"%"+"', variance = '"+str(variance*100) \ +"%',status = 'mruned', processed_date ='"+str(datetime.datetime.now()) \ +"' where id="+row_id_str ret = exec_sqlite.exec_sql(str_sql) print "Data update done! ret=", str(ret) else: print "mean = '" + str(mean * 100) + "%" print "variance = '" + str(variance * 100) + "%" print 'Finished!'
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name, mongo_tuples, labelnameflag, fromweb, src_filename, jobname, model_data_folder): # create zip files for Spark workers ================= ================ zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util') print "INFO: zip_file_path=", zip_file_path #data_folder = hdfs_feat_dir + "/" #local_out_dir = local_out_dir + "/" if not os.path.exists(local_out_dir): os.makedirs(local_out_dir) # ML model filename ==== model_fname = os.path.join(model_data_folder, row_id_str + '.pkl') print "INFO: model_data_folder=", model_data_folder # create out folders and clean up old model files ==== ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir, model_data_folder, model_fname) # init Spark context ==== sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max, jobname, [zip_file_path]) # start here =================================================================== =============== t0 = time() ### load libsvm file: may or may not be PCA-ed ### libsvm_data_file = os.path.join(hdfs_feat_dir, src_filename) print "INFO: libsvm_data_file=", libsvm_data_file # feature count is a variable if PCA feature_count = 0 # samples_rdd may be from PCAed data # load sample RDD from text file # format (LabeledPoint,hash) from str2LabeledPoint_hash() samples_rdd, feature_count = zip_feature_util.get_sample_rdd( sc, libsvm_data_file, feature_count, '') # collect all data to local for processing =============== all_data = samples_rdd.collect() total_sample_count = len(all_data) # 2-D array, may be PCAed features_list = [x.features.toArray() for x, _ in all_data] # label array labels_list_all = [x.label for x, _ in all_data] # hash array hash_list_all = [x for _, x in all_data] # convert to np 
array features_array_reduced = np.array(features_list) hash_list_all = np.array(hash_list_all) labels_list_all = np.array(labels_list_all) true_label_array = np.array(labels_list_all, dtype=np.int8) print "INFO: total_sample_count=", total_sample_count print "INFO: features_array_reduced.shape=", features_array_reduced.shape print "INFO: labels_list_all.shape=", labels_list_all.shape print "INFO: true_label_array.shape=", true_label_array.shape t1 = time() print 'INFO: data generating time: %f' % (t1 - t0) ############################################### ########## build learning model ############### ############################################### ### parse parameters and generate the model ### (model, alg, n_clusters) = parse_para_and_get_model(ml_opts_jstr) if model is None: return labels_kmeans = None #### fit the model to training dataset #### try: model.fit(features_array_reduced) labels_kmeans = model.labels_ #'numpy.ndarray' except: print "ERROR: Error in model.fit(): ", "model=", model, ", sys.exc_info:", sys.exc_info( )[0] return #### save clf for future use #### #joblib.dump(model, model_data_folder + row_id_str+'.pkl') joblib.dump(model, model_fname) #print "**model:intercept***" #print clf.intercept_ print "INFO: model type=", type(model), " model=", model ################################################### ### generate label names (family names) ########### ### connect to database to get the column list which contains all column number of the corresponding feature#### ################################################### if labelnameflag == 1: key = "dic_name_label" jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}' jstr_proj = '{"value":1}' # get parent dataset's data if ds_id != row_id_str: jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}' doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj) dic_list = doc['value'] label_dic = {} for i in range(0, len(dic_list)): for key in dic_list[i]: label_dic[dic_list[i][key]] = 
key.encode('UTF8') print "INFO: label_dic:", label_dic else: label_dic = {} label_set = set(labels_list_all) for label_value in label_set: label_dic[int(label_value)] = str(int(label_value)) print "INFO: generated label_dic:", label_dic labels_list = [] for key in sorted(label_dic): labels_list.append(label_dic[key]) print "INFO: labels_list=", labels_list #Adjusted Mutual Information between two clusterings amis = adjusted_mutual_info_score(labels_list_all, labels_kmeans) print "INFO: Adjusted_mutual_info_score=", amis #Similarity measure between two clusterings ars = adjusted_rand_score(labels_list_all, labels_kmeans) print "INFO: Adjusted_rand_score=", ars ################################################### #######plot histogram #### ################################################### plot_col_num = int(math.ceil(math.sqrt(n_clusters))) figsize = (4 * plot_col_num, 3 * int(math.ceil(n_clusters * 1.0 / plot_col_num))) print "INFO: labels_list_all.shape=", labels_list_all.shape, "labels_kmeans.shape=", labels_kmeans.shape print "INFO: labels_list_all t=", type( labels_list_all), "labels_kmeans t=", type(labels_kmeans) print "INFO: n_clusters=", n_clusters, ",label_dic=", label_dic print "INFO: plot_col_num=", plot_col_num, ",figsize=", figsize, ",local_out_dir=", local_out_dir # kmeans histogram _, p_true = ml_plot_kmeans_histogram_subfigures(labels_list_all, labels_kmeans, n_clusters, names=label_dic, plot_col_num=plot_col_num, figsize=figsize, folder=local_out_dir, rid=row_id_str) # normalized kmeans histogram _, p_true_norm = ml_plot_kmeans_histogram_subfigures( labels_list_all, labels_kmeans, n_clusters, names=label_dic, plot_col_num=plot_col_num, figsize=figsize, normalize=True, folder=local_out_dir, rid=row_id_str) ####plot "reverse" histogram with labels #### #num_bars = len(np.unique(labels_list_all)) num_bars = max(labels_list_all) + 1 figsize = (4 * plot_col_num, 3 * int(math.ceil(num_bars * 1.0 / plot_col_num))) _, p_cluster = 
ml_plot_kmeans_histogram_subfigures( labels_kmeans, labels_list_all, num_bars, names=label_dic, plot_col_num=plot_col_num, figsize=figsize, reverse=True, folder=local_out_dir, rid=row_id_str) #### plot dot figures #### #mtx_label = model.labels_ mtx_center = model.cluster_centers_ # dot plot for Kmeans =========== filename = os.path.join(local_out_dir, row_id_str + '_cluster.png') filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d.json') ml_plot_kmeans_dot_graph_save_file(features_array_reduced, labels_kmeans, mtx_center, n_clusters, figsize=(10, 7), filename=filename, title='KMeans', filename_3d=filename_3d) #print "features_array_reduced s=",features_array_reduced.shape # dot plot for True Labels =========== filename = os.path.join(local_out_dir, row_id_str + '_cluster_tl.png') filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d_tl.json') ml_plot_kmeans_dot_graph_save_file(features_array_reduced, true_label_array, mtx_center, n_clusters, figsize=(10, 7), filename=filename, title='True Labels', filename_3d=filename_3d) dataset_info = { "training_fraction": 1, "class_count": n_clusters, "dataset_count": total_sample_count } # only update db for web request =========== if fromweb == "1": #print "database update" str_sql="UPDATE atdml_document set accuracy = '" \ +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \ +"', total_feature_numb='"+str(feature_count) \ +"', perf_measures='{}" \ +"', dataset_info='"+json.dumps(dataset_info) \ +"' where id="+row_id_str ret = exec_sqlite.exec_sql(str_sql) print "INFO: Data update done! ret=", str(ret) else: print "INFO: accuracy = '" + str(accuracy * 100) + "%" t1 = time() print 'INFO: running time: %f' % (t1 - t0) #print 'Finished!' return 0
def predict(row_id_str, pid_str, input_gz, local_out_dir, ds_list, fromweb, verbose='0', label_idx=0, data_idx=3, metadata_count=3, pattern_str=None, ln_delimitor='\t', sp_master=config.get('spark', 'spark_master'), exe_memory=config.get('spark', 'spark_executor_memory'), core_max=config.get('spark', 'spark_cores_max'), ip_address=config.get('mongo', 'out_ip_address'), port=eval(config.get('mongo', 'out_port')), db_name=config.get('mongo', 'out_db'), tb_name=config.get('mongo', 'out_tb'), username=config.get('mongo', 'out_username'), password=config.get('mongo', 'out_password'), feat_cnt_threshold=config.get('machine_learning', 'feature_count_threshold'), flag_local_model="Y"): t0 = time() if ds_list is None or len(ds_list) == 0: # get ds_list from ensemble record id=row_id_str str_sql = 'select id, ds_list from atdml_document where id=' + row_id_str + ' and file_type="ensemble" ' ret = exec_sqlite.query_db(str_sql) if not ret is None and len(ret) > 0: ds_list = eval(ret[0][1]) if ds_list is None or len(ds_list) == 0: print "ERROR: ensemble list not found for id=", row_id_str return -1 print "INFO: classifier count=", len(ds_list) #, ds_list # expect local_out_dir to result folder; remove rid in path if any bname = os.path.basename(local_out_dir) if bname.isdigit(): local_out_dir = os.path.dirname(local_out_dir) # one spark session for the for loop sc = None # for each id in ds_list ====================== ret_list = [] icnt = 1 # local parallel if flag_local_model == "Y": # get one_line from gz file one_line = None try: one_line = zip_preprocess_pattern.convert_to_line_by_bash( input_gz, metadata_count, ln_delimitor ) # check if one line, if raw file then convert to 1 line #print "one_line=",one_line[:100].replace('\t',',') #print "one_line=",one_line.replace('\t',',') print "INFO: one_line len=", len(one_line) except Exception as e: print "ERROR: load data file [" + input_gz + "] failed.", e return -5 # parallel processing 
ret_list=Parallel(n_jobs=PARALLEL_CNT)(delayed(predict_one) \ (id,one_line, local_out_dir,metadata_count,ln_delimitor,label_idx, data_idx,feat_cnt_threshold) for id in ds_list) else: # by Spark for id in ds_list: #t3 = time() pred_ret = None # get info from ds id (rid, num_gram,ml_opts_str,ds_id_str,lib_mode,option_state,pattern_str,label_arr,ml_opts \ ,learning_algorithm, file_name) = query_db(id) if id is None: # no result found for classifier print "WARNING: dataset id=", id, "not found!" continue # get one spark context if needed if lib_mode == 'mllib' and sc is None: sc = predict_single_file_pattern.get_sc( str(id), sp_master, exe_memory, core_max) #local_out_dir was used to get model/mapping files id_out_dir = os.path.join(local_out_dir, str(id)) # call prediction(): may need Spark context for mllib try: #print "before pred",str(id),ds_id_str pred_ret = predict_single_file_pattern.predict( row_id_str=str(id), ds_id=ds_id_str, num_gram=num_gram, j_str=ml_opts_str, lib_mode=lib_mode, cid_str=str(pid_str), input_gz=input_gz, local_out_dir=id_out_dir, fromweb="2" # force to return json output , verbose=str(verbose), label_idx=label_idx, data_idx=data_idx, metadata_count=metadata_count, pattern_str=pattern_str, ln_delimitor=ln_delimitor, feat_cnt_threshold=feat_cnt_threshold, sc=sc) #print "**** ret=",pred_ret ret_list.append(pred_ret) except: print "ERROR: pid=", row_id_str, ",classifier id=", id, ",msg=", sys.exc_info( )[0] ret_list.append({"id":int(pid_str),"opt_id":int(id),"ds_id": int(ds_id_str) \ ,"prediction":None,"predict_val":None,"learning_algorithm":None \ ,"lib":None, "ml_opts":None,"predict_index": None }) #print "INFO: id=",id,",ret=", ret icnt = icnt + 1 print "INFO: ret_list len=", len(ret_list) # TBD find max predict value to pick a label. 
need review ====================== ========== status = "ensemble predicted" max_val = sys.maxint * -1 max_id = 0 alg = None lib = None prediction = None mix_algs = 0 for rt in ret_list: alg = rt["learning_algorithm"] if not rt is None and "predict_val" in rt and rt[ "predict_val"] > max_val: max_val = rt["predict_val"] max_id = rt["opt_id"] prediction = rt["prediction"] lib = rt["lib"] if "svm" in alg: mix_algs = mix_algs | 1 if "logistic" in alg: mix_algs = mix_algs | 2 # check if mixing model types if mix_algs == 3: print "WARNING: Mixing SVM and Logistic Regression algorithms found in Ensemble datasets. The prediction may be INVALID!!!" outj={"prediction":prediction,"predict_val":max_val,"status":status,"predict_ds":max_id \ ,"lib":lib,"learning_algorithm":alg,"processed_date":str(datetime.datetime.now()),"returns":ret_list} result_fname = os.path.join(local_out_dir, pid_str, pid_str + "_predict_output.json") #print "INFO: result_fname=",result_fname print "RESULT: predict_val=", max_val, ",prediction=", prediction # create/clean up folder ml_util.ml_prepare_output_dirs(pid_str, os.path.join(local_out_dir, pid_str), local_out_dir, result_fname) # write output to a file with open(result_fname, "w") as fo: json.dump(outj, fo) # Update prediction records here ====================== ========== if fromweb == "1": #print "database update" str_sql="UPDATE atdml_document set status = '"+status+"', processed_date ='" \ +str(datetime.datetime.now())+"', prediction = '"+ str(prediction) \ +"', predict_val = '"+str(max_val) \ +"', dataset_info = '"+str(max_id) \ +"' where id="+pid_str ret = exec_sqlite.exec_sql(str_sql) t1 = time() print 'INFO: running time: %f' % (t1 - t0) return 0