# Code example #1
0
def train(row_id_str,
          ds_id,
          hdfs_feat_dir,
          local_out_dir,
          ml_opts_jstr,
          excluded_feat_cslist,
          sp_master,
          spark_rdd_compress,
          spark_driver_maxResultSize,
          sp_exe_memory,
          sp_core_max,
          zipout_dir,
          zipcode_dir,
          zip_file_name,
          mongo_tuples,
          labelnameflag,
          fromweb,
          training_fraction,
          jobname,
          model_data_folder,
          random_seed=None):

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir,
                                      zipcode_dir,
                                      zip_file_name,
                                      prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # ML model filename ====
    model_fname = os.path.join(model_data_folder, row_id_str + '.pkl')
    print "INFO: model_data_folder=", model_data_folder
    # create out folders and clean up old model files ====
    ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir,
                                   model_data_folder, model_fname)

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    # start here =================================================================== ===============
    t0 = time()

    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat = 0
    if not ml_opts_jstr is None:
        ml_opts = json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat = ml_opts["has_excluded_feat"]

    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(
            row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=", excluded_feat_cslist

    # source libsvm filename
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    # load sample RDD from text file
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist)

    # collect all data to local for processing ===============
    all_data = samples_rdd.collect()
    sample_count = len(all_data)

    if not random_seed is None and int(random_seed) > 0:
        np.random.seed(int(random_seed))
        all_data = sorted(all_data, key=lambda x: x[1])

    # 2-D array
    features_list = [x.features.toArray() for x, _ in all_data]
    # label array
    labels_list_all = [x.label for x, _ in all_data]
    # hash array
    hash_list_all = [x for _, x in all_data]

    # convert to np array
    labels_list_all = array(labels_list_all)
    features_array = np.array(features_list)
    hash_list_all = np.array(hash_list_all)
    #print "features_list=",features_list

    # generate sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)

    # if ensamble is on, do special split here
    ### randomly split the samples into training and testing data ===============
    X_train_sparse, X_test_sparse, labels_train, labels_test, train_hash_list, test_hash_list = \
            cross_validation.train_test_split(features_sparse_mtx, labels_list_all, hash_list_all, test_size=(1-training_fraction) )
    # X_test_sparse is scipy.sparse.csr.csr_matrix
    testing_sample_count = len(labels_test)
    training_sample_count = len(labels_train)
    training_lbl_cnt_list = Counter(labels_train)
    testing_lbl_cnt_list = Counter(labels_test)
    print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count, ",sample_count=", sample_count
    print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list
    print "INFO: train_hash_list count=", len(
        train_hash_list), ", test_hash_list count=", len(test_hash_list)

    # random_seed testing
    if not random_seed is None:
        cnt = 0
        for i in train_hash_list:
            print i
            cnt = cnt + 1
            if cnt > 3:
                break

    #print "INFO: labels_list_all=",labels_list_all # too big
    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)

    ###############################################
    ###########build learning model ==================================================== ===============
    ###############################################

    ### parse parameters and generate the model ###
    (clf, model_name) = parse_param_and_get_model(ml_opts)
    if model_name == "none":
        print "ERROR: model name not found!"
        return -1

    #####fit the model to training dataset  ===============
    try:
        clf.fit(X_train_sparse, labels_train)
    except:
        print "ERROR: clf.fit(): clf=", clf
        print "ERROR: sys.exc_info:", sys.exc_info()[0]
        return -1

    print "INFO: model type=", type(clf), " clf=", clf
    #### save clf for future use ================== ===============
    joblib.dump(clf, model_fname)

    # get data from model ================================
    coef = None
    intercept = None
    # get column size =====
    try:
        if type(clf) in (classes.SVC, classes.NuSVC):  # svm didn't have coef_
            col_num = clf.support_vectors_.shape[1]
        else:  #linear only
            # coef_ is only available when using a linear kernel
            col_num = len(clf.coef_[0])
            coef = clf.coef_[0]
            intercept = clf.intercept_[0]  # only get 1st item?
            #print "**model:clf.coef_[0] =",clf.coef_[0]
            # save coef_ to Mongo
    except Exception as e:
        print "Warning: Can't get clf.coef_[0]. e=", e, ", get total features from meta-data"
        col_num = 0  #how to get feature number for sparse array?
    print "INFO: total feature # in the model: ", col_num

    jfeat_coef_dict = {}
    # create feature coefficient file ================================
    if coef is None:
        print "WARNING: model weights not found!"
    else:
        feat_filename = os.path.join(local_out_dir,
                                     row_id_str + "_feat_coef.json")
        print "INFO: feat_filename=", feat_filename
        # save coef_arr to mongo ===
        #jfeat_coef_dict=save_coef2db(row_id_str, mongo_tuples, coef, intercept, feat_filename, ds_id)
        jfeat_coef_dict = ml_util.ml_save_coef_build_feat_coef(
            row_id_str, mongo_tuples, coef, intercept, feat_filename, ds_id)
    #print "INFO: jfeat_coef_dict=", jfeat_coef_dict
    print "INFO: jfeat_coef_dict len=", len(jfeat_coef_dict)

    ### Evaluating the model on testing dataset  ===============
    labels_pred = clf.predict(X_test_sparse)
    accuracy = clf.score(X_test_sparse, labels_test)
    print "INFO: Accuracy = ", accuracy

    # filename for false prediction samples  ===============
    false_pred_fname = os.path.join(local_out_dir,
                                    row_id_str + "_false_pred.json")
    print "INFO: false_pred_fname=", false_pred_fname
    # build files for false pred & score graph
    (score_arr_0, score_arr_1, max_score,
     min_score) = ml_build_false_pred(X_test_sparse,
                                      coef,
                                      intercept,
                                      labels_test,
                                      labels_pred,
                                      test_hash_list,
                                      model_name,
                                      jfeat_coef_dict,
                                      false_pred_fname,
                                      row_id_str=row_id_str,
                                      ds_id=ds_id,
                                      mongo_tuples=mongo_tuples)

    # save pred output
    pred_out_arr = []
    for i in range(0, len(labels_test)):
        pred_out_arr.append(
            (labels_test[i], labels_pred[i], test_hash_list[i]))
    pred_ofname = os.path.join(local_out_dir, row_id_str + "_pred_output.pkl")
    print "INFO: pred_ofname=", pred_ofname
    ml_util.ml_pickle_save(pred_out_arr, pred_ofname)

    ###################################################
    ### generate label names (family names) ==================================================== ===============
    ###################################################
    if labelnameflag == 1:
        label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id)
        print "INFO: label_dic =", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: label_dic=", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])

    ###############################################
    ###########plot prediction result figures ==================================================== ===============
    ###############################################
    pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png")
    true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png")
    pred_xlabel = 'Prediction (Single Run)'
    true_xlabel = 'True Labels (Single Run)'
    test_cnt_dic = ml_util.ml_plot_predict_figures(labels_pred.tolist(),
                                                   labels_test.tolist(),
                                                   labels_list, label_dic,
                                                   testing_sample_count,
                                                   pred_xlabel, pred_fname,
                                                   true_xlabel, true_fname)
    print "INFO: figure files: ", pred_fname, true_fname
    #print "INFO: Number of samples in each label is=", test_cnt_dic

    roc_auc = None
    #fscore=None
    perf_measures = None
    class_count = len(labels_list)
    dataset_info = {
        "training_fraction": training_fraction,
        "class_count": class_count,
        "dataset_count": sample_count
    }
    #############################################################
    ###################for 2 class only (plot ROC curve) ==================================================== ===============
    #############################################################
    if class_count == 2:

        # build data file for score graph
        score_graph_fname = os.path.join(local_out_dir,
                                         row_id_str + "_score_graph.json")
        print "INFO: score_graph_fname=", score_graph_fname
        ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name,
                                  score_graph_fname, max_score, min_score)

        do_ROC = True
        # clean is 0; dirty is 1
        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print "WARNING: No ROC curve generated: 'clean' or '0' must be a label for indicating negative class!"
            do_ROC = False

        if do_ROC:
            # calculate fscore  ==========
            perf_measures = ml_util.calculate_fscore(labels_test, labels_pred)
            #fscore=perf_measures["fscore"]
            #acc=perf_measures["accuracy"]
            #phi=perf_measures["phi"]
            print "INFO: perf_measures=", perf_measures

            confidence_score = clf.decision_function(X_test_sparse)
            #print "INFO:confidence_score=",confidence_score

            if flag_clean == 0:
                scores = [x for x in confidence_score]
                s_labels = [x for x in labels_test]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                scores = [-x for x in confidence_score]
                s_labels = [1 - x for x in labels_test]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]

            # create ROC data file ======== ====
            roc_auc = ml_create_roc_files(row_id_str, scores, s_labels,
                                          testing_N, testing_P, local_out_dir,
                                          row_id_str)

            perf_measures["roc_auc"] = roc_auc

    # only update db for web request ==================================================== ===============
    if fromweb == "1":
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', perf_measures='"+json.dumps(perf_measures) \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Sqlite update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)

    print 'INFO: Train Finished!'
    return 0
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
          sp_master, spark_rdd_compress, spark_driver_maxResultSize,
          sp_exe_memory, sp_core_max, zipout_dir, zipcode_dir, zip_file_name,
          mongo_tuples, labelnameflag, fromweb, src_filename, jobname,
          model_data_folder):

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir,
                                      zipcode_dir,
                                      zip_file_name,
                                      prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    #data_folder = hdfs_feat_dir + "/"
    #local_out_dir = local_out_dir + "/"
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # ML model filename ====
    model_fname = os.path.join(model_data_folder, row_id_str + '.pkl')
    print "INFO: model_data_folder=", model_data_folder
    # create out folders and clean up old model files ====
    ml_util.ml_prepare_output_dirs(row_id_str, local_out_dir,
                                   model_data_folder, model_fname)

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    # start here =================================================================== ===============
    t0 = time()

    ### load libsvm file: may or may not be PCA-ed ###
    libsvm_data_file = os.path.join(hdfs_feat_dir, src_filename)
    print "INFO: libsvm_data_file=", libsvm_data_file

    # feature count is a variable if PCA
    feature_count = 0

    # samples_rdd may be from PCAed data
    # load sample RDD from text file
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, '')

    # collect all data to local for processing ===============
    all_data = samples_rdd.collect()
    total_sample_count = len(all_data)
    # 2-D array, may be PCAed
    features_list = [x.features.toArray() for x, _ in all_data]
    # label array
    labels_list_all = [x.label for x, _ in all_data]
    # hash array
    hash_list_all = [x for _, x in all_data]
    # convert to np array
    features_array_reduced = np.array(features_list)
    hash_list_all = np.array(hash_list_all)
    labels_list_all = np.array(labels_list_all)
    true_label_array = np.array(labels_list_all, dtype=np.int8)

    print "INFO: total_sample_count=", total_sample_count
    print "INFO: features_array_reduced.shape=", features_array_reduced.shape
    print "INFO: labels_list_all.shape=", labels_list_all.shape
    print "INFO: true_label_array.shape=", true_label_array.shape

    t1 = time()
    print 'INFO: data generating time: %f' % (t1 - t0)

    ###############################################
    ########## build learning model ###############
    ###############################################

    ### parse parameters and generate the model ###
    (model, alg, n_clusters) = parse_para_and_get_model(ml_opts_jstr)
    if model is None:
        return

    labels_kmeans = None
    #### fit the model to training dataset ####
    try:
        model.fit(features_array_reduced)
        labels_kmeans = model.labels_  #'numpy.ndarray'

    except:
        print "ERROR: Error in model.fit(): ", "model=", model, ", sys.exc_info:", sys.exc_info(
        )[0]
        return

    #### save clf for future use ####
    #joblib.dump(model, model_data_folder + row_id_str+'.pkl')
    joblib.dump(model, model_fname)

    #print "**model:intercept***"
    #print clf.intercept_

    print "INFO: model type=", type(model), " model=", model

    ###################################################
    ### generate label names (family names) ###########
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    ###################################################

    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'

        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels_list=", labels_list

    #Adjusted Mutual Information between two clusterings
    amis = adjusted_mutual_info_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_mutual_info_score=", amis
    #Similarity measure between two clusterings
    ars = adjusted_rand_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_rand_score=", ars

    ###################################################
    #######plot histogram                       ####
    ###################################################
    plot_col_num = int(math.ceil(math.sqrt(n_clusters)))
    figsize = (4 * plot_col_num,
               3 * int(math.ceil(n_clusters * 1.0 / plot_col_num)))
    print "INFO: labels_list_all.shape=", labels_list_all.shape, "labels_kmeans.shape=", labels_kmeans.shape
    print "INFO: labels_list_all t=", type(
        labels_list_all), "labels_kmeans t=", type(labels_kmeans)
    print "INFO: n_clusters=", n_clusters, ",label_dic=", label_dic
    print "INFO: plot_col_num=", plot_col_num, ",figsize=", figsize, ",local_out_dir=", local_out_dir

    # kmeans histogram
    _, p_true = ml_plot_kmeans_histogram_subfigures(labels_list_all,
                                                    labels_kmeans,
                                                    n_clusters,
                                                    names=label_dic,
                                                    plot_col_num=plot_col_num,
                                                    figsize=figsize,
                                                    folder=local_out_dir,
                                                    rid=row_id_str)
    # normalized kmeans histogram
    _, p_true_norm = ml_plot_kmeans_histogram_subfigures(
        labels_list_all,
        labels_kmeans,
        n_clusters,
        names=label_dic,
        plot_col_num=plot_col_num,
        figsize=figsize,
        normalize=True,
        folder=local_out_dir,
        rid=row_id_str)

    ####plot "reverse" histogram with labels ####
    #num_bars = len(np.unique(labels_list_all))
    num_bars = max(labels_list_all) + 1
    figsize = (4 * plot_col_num,
               3 * int(math.ceil(num_bars * 1.0 / plot_col_num)))

    _, p_cluster = ml_plot_kmeans_histogram_subfigures(
        labels_kmeans,
        labels_list_all,
        num_bars,
        names=label_dic,
        plot_col_num=plot_col_num,
        figsize=figsize,
        reverse=True,
        folder=local_out_dir,
        rid=row_id_str)

    #### plot dot figures ####
    #mtx_label = model.labels_
    mtx_center = model.cluster_centers_
    # dot plot for Kmeans   ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced,
                                       labels_kmeans,
                                       mtx_center,
                                       n_clusters,
                                       figsize=(10, 7),
                                       filename=filename,
                                       title='KMeans',
                                       filename_3d=filename_3d)
    #print "features_array_reduced s=",features_array_reduced.shape

    # dot plot for True Labels  ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster_tl.png')
    filename_3d = os.path.join(local_out_dir,
                               row_id_str + '_cluster_3d_tl.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced,
                                       true_label_array,
                                       mtx_center,
                                       n_clusters,
                                       figsize=(10, 7),
                                       filename=filename,
                                       title='True Labels',
                                       filename_3d=filename_3d)

    dataset_info = {
        "training_fraction": 1,
        "class_count": n_clusters,
        "dataset_count": total_sample_count
    }
    # only update db for web request   ===========
    if fromweb == "1":
        #print "database update"
        str_sql="UPDATE atdml_document set accuracy = '" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', total_feature_numb='"+str(feature_count) \
            +"', perf_measures='{}" \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)

    #print 'Finished!'
    return 0
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr, excluded_feat_cslist
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name
    , mongo_tuples, labelnameflag, fromweb
    , training_fraction, jobname, model_data_folder ): 
    

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=",zip_file_path
    

    # ML model filename ====
    model_fname=os.path.join(model_data_folder, row_id_str+'.pkl')
    print "INFO: model_data_folder=",model_data_folder    
    # create out folders and clean up old model files ====
    ml_util.ml_prepare_output_dirs(row_id_str,local_out_dir,model_data_folder,model_fname)   

    # init Spark context ====
    sc=ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path]) 

    
    t0 = time()
    t00 = t0
    
    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat=0
    if not ml_opts_jstr is None:
        ml_opts=json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat=ml_opts["has_excluded_feat"]

    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist=ml_util.ml_get_excluded_feat(row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=",excluded_feat_cslist
            
    # source libsvm filename  
    libsvm_data_file = os.path.join(hdfs_feat_dir , "libsvm_data")
    print "INFO: libsvm_data_file=", libsvm_data_file

    # load feature count file
    feat_count_file=libsvm_data_file+"_feat_count"
    feature_count=zip_feature_util.get_feature_count(sc,feat_count_file)
    print "INFO: feature_count=",feature_count

    
    # load sample RDD from text file   
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash() 
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd,feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, excluded_feat_cslist)

    all_data = samples_rdd.collect()
    sample_count=len(all_data)
    # 2-D array
    features_list = [x.features.toArray() for x,_ in all_data]
    # label array
    labels_list_all = [x.label for x,_ in all_data]
    # hash array
    hash_list_all = [x for _,x in all_data]

    # convert to np array
    labels_list_all = array(labels_list_all)
    features_array = np.array(features_list)
    hash_list_all=np.array(hash_list_all)
    
    # generate sparse matrix (csr) for all samples
    features_sparse_mtx = csr_matrix(features_array)

    ### randomly split the samples into training and testing data ===============
    X_train_sparse, X_test_sparse, labels_train, labels_test, train_hash_list, test_hash_list = \
            cross_validation.train_test_split(features_sparse_mtx, labels_list_all, hash_list_all, test_size=(1-training_fraction) )
    # X_test_sparse is scipy.sparse.csr.csr_matrix
    testing_sample_count = len(labels_test)
    training_sample_count=len(labels_train)
    training_lbl_cnt_list=Counter(labels_train)
    testing_lbl_cnt_list=Counter(labels_test)
    
    print "INFO: training sample count=",training_sample_count,", testing sample count=",testing_sample_count,",sample_count=",sample_count
    print "INFO: training label list=",training_lbl_cnt_list,", testing label list=",testing_lbl_cnt_list
    print "INFO: train_hash_list count=",len(train_hash_list),", test_hash_list count=",len(test_hash_list)
    t1 = time()
    print 'INFO: running time: %f' %(t1-t0)
    
    ###############################################
    ###########build learning model################
    ###############################################
    
    ### parse parameters and generate the model ###
    (clf, model_name, api, cv, param_dic) = parse_param_and_get_model(ml_opts)
    if model_name == "none":
        print "ERROR: model name not found!"
        return -1

    #param_jobj=json.loads(ml_opts_jstr);
    #print "param_jobj=",param_jobj
        
    ########################################################
    ##########Grid Search with cross validation#############
    ########################################################    
    json2save={}
    json2save["rid"]=int(row_id_str)
    json2save["key"]="cv_result"
    #json2save["param_str"]=ml_opts_jstr
    json2save["param_dic"]=param_dic
    cv_grid=[]
    if api == "centralized":
        #########run with Scikit-learn API (for comparison)######
        print "INFO: ******************Grid Search with Scikit-learn API************"

        t0 = time()
        
        # Set the parameters by cross-validation
        #tuned_parameters = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}]
        #tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], \
        #                 'C': [1, 10, 100, 1000]}, \
        #                {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

        scores = ['accuracy']
        json2save["scores"]=scores
        #print json2save
        
        for score in scores: # for one item only? score=accuracy
            print("INFO: # Tuning hyper-parameters for %s" % score)
            #print()

            grid = grid_search.GridSearchCV(estimator = clf, param_grid = param_dic, cv=cv, scoring= score)
            grid.fit(X_train_sparse, labels_train)
            
            print "INFO: Best parameters set found on development set:"
            print "INFO: grid.best_params_=",grid.best_params_
            print "INFO: Grid scores on development set:" 
            for key in grid.best_params_:
                print "INFO: best_params["+key+"]=", grid.best_params_[key]
                if key.lower()=="regtype":
                    ml_opts['regularization']=str(grid.best_params_[key]) # add best param to 
                else:
                    ml_opts[key.lower()]=str(grid.best_params_[key]) # add best param to 
            # save best param to db as json string
            j_str=json.dumps(ml_opts);
            json2save["param_str"]=j_str;
            print "INFO: grid_scores_ with params:"
            for params, mean_score, scores in grid.grid_scores_:
                print "INFO: %0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
                #outstr='%s,%0.3f,%0.03f,%s' % (params,mean_score, scores.std() * 2,"Selected" if params==grid.best_params_ else "")
                outj={}
                outj["param"]=params
                outj["average_accuracy"]="%0.3f" % (mean_score)
                outj["std_deviation"]="%0.3f" % (scores.std() * 2)
                outj["selected"]="%s" % ("Selected" if params==grid.best_params_ else "")
                
                cv_grid.append(outj)
        
        clf_best = grid.best_estimator_
        t1 = time()
        ############# END run with SKlearn ######
        print 'INFO: Grid Search with SKlearn running time: %f' %(t1-t0)
        t0 = time()
    else:
    
        #############run with SPARK######
        
        print "INFO: ******************Grid Search with SPARK************"
            
        all_comb_list_of_dic = get_all_combination_list_of_dic(param_dic) 
        print "INFO: Total number of searching combinations=", len(all_comb_list_of_dic) 
        #print "all_comb_list_of_dic: ", all_comb_list_of_dic
        params_rdd = sc.parallelize(all_comb_list_of_dic)
        
        ###broad cast clf, traning data, testing data to all workers###
        X_broadcast = sc.broadcast(X_train_sparse)
        y_broadcast = sc.broadcast(labels_train)
        clf_broadcast = sc.broadcast(clf)
        
        ### Grid Search with CV in multiple workers ###
        models = params_rdd.map(lambda x: learn_with_params(clf_broadcast.value, X_broadcast.value, y_broadcast.value, cv, x)).sortByKey(ascending = False).cache()
        
        (ave_accuracy, (clf_best, p_dic_best, std2))  = models.first()
        # output results #

        print "INFO: Best parameters set found for ", model_name, " is: "
        print "INFO: ",
        for key in p_dic_best:
            print key, " = ", p_dic_best[key],
            if key.lower()=="regtype":
                ml_opts['regularization']=str(p_dic_best[key]) 
            else:
                ml_opts[key.lower()]=str(p_dic_best[key]) # add best param to 
            # save best param to db as json string
        print ""
        j_str=json.dumps(ml_opts);
        json2save["param_str"]=j_str;

        print "INFO: Average accuracy with CV = ", cv, ": ", ave_accuracy
        
        ######## print complete report #######
        print "INFO: Grid scores on development set:"
        all_results = models.collect()
        for i in range(0, len(all_results)):
            (ave_accu_i, (clf_i, p_dic_i, std2_i)) = all_results[i]
            print "INFO: ",ave_accu_i, " for ", p_dic_i
            print "INFO: %0.3f (+/-%0.03f) for " % (ave_accu_i, std2_i), p_dic_i
            #outstr='%s,%0.3f,%0.03f,%s' % ( p_dic_i, ave_accu_i, std2_i, "Selected" if p_dic_i==p_dic_best else "")
            outj={}
            outj["param"]=p_dic_i
            outj["average_accuracy"]="%0.3f" % (ave_accu_i)
            outj["std_deviation"]="%0.3f" % (std2_i)
            outj["selected"]="%s" % ("Selected" if p_dic_i==p_dic_best else "")
            
            cv_grid.append(outj)
        print " "
        
        t1 = time()
        
        ############# END run with SPARK######
        print 'INFO: Grid search with SPARK running time: %f' %(t1-t0)
    
    ##################################################################################
    #print "cv_grid=",cv_grid
    #json2save["cv_grid_title"]='param,average_accuracy,std_deviation,selected' 
    json2save["cv_grid_data"]=cv_grid
    json2save['clf_best']=str(clf_best).replace("\n","").replace("    ","")
    cv_result=json.dumps(json2save)
    #print "INFO: cv_result=",cv_result
    filter='{"rid":'+row_id_str+',"key":"cv_result"}'
    upsert_flag=True
    ## write to mongoDB.myml.dataset_info, ignore doc with duplicated key
    # db.dataset_info.createIndex({"rid":1,"key":1},{unique:true})
    ret=query_mongo.upsert_doc_t(mongo_tuples,filter,cv_result,upsert_flag)
    print "INFO: Upsert count for cv_result: ret=",ret
 
    ##################################################################################
    ##########Retrain with best model for training set and output results#############
    ##################################################################################
    print "INFO: **********Retrain with best model for training set and output results************"
    
    clf_best.fit(X_train_sparse, labels_train)
    #### save clf_best for future use ####
    #joblib.dump(clf_best, model_data_folder + row_id_str+'.pkl')
    joblib.dump(clf_best, model_fname) 
    
    ### Evaluating the model on testing data
    labels_pred = clf_best.predict(X_test_sparse)
    accuracy = clf_best.score(X_test_sparse, labels_test)
    print "INFO: Accuracy = ", accuracy
    
    
    ######################################the rest of the code is the same as train_sklean.py (replace clf with clf_best)#####################################################################
    clf=clf_best
    print "INFO: model type=",type(clf)," clf=",clf

    # get data from model ================================
    coef=None
    intercept=None
    try:
        if type(clf) in ( classes.SVC , classes.NuSVC) :# svm didn't have coef_
            col_num=clf.support_vectors_.shape[1]
        else: #linear only
            # coef_ is only available when using a linear kernel
            col_num = len(clf.coef_[0])
            coef=clf.coef_[0]
            intercept=clf.intercept_[0] # only get 1st item?
            #print "**model:clf.coef_[0] =",clf.coef_[0]
    except Exception as e:
        print "WARNING: Can't get clf.coef_[0]. e=",e,", get total features from meta-data"
        col_num = 0 #how to get feature number for sparse array? 
    print "INFO: total feature # in the model: ", col_num

    jfeat_coef_dict={}
    # create feature coefficient file ================================
    if coef is None:
        print "WARNING: model weights not found!"    
    else:
        feat_filename=os.path.join(local_out_dir,row_id_str+"_feat_coef.json")
        print "INFO: feat_filename=",feat_filename
        # save coef_arr to mongo & create jfeat_coef_dict===
        jfeat_coef_dict=ml_util.ml_save_coef_build_feat_coef(row_id_str, mongo_tuples, coef, intercept, feat_filename, ds_id)
    #print "INFO: jfeat_coef_dict=", jfeat_coef_dict
    print "INFO: jfeat_coef_dict len=", len(jfeat_coef_dict )


    # filename for false pred 
    false_pred_fname=os.path.join(local_out_dir,row_id_str+"_false_pred.json")
    print "INFO: false_pred_fname=", false_pred_fname

    # build files for false pred & score graph
    (score_arr_0, score_arr_1, max_score,min_score)=ml_build_false_pred(X_test_sparse,coef,intercept
        , labels_test, labels_pred, test_hash_list, model_name, jfeat_coef_dict, false_pred_fname) 

    # save pred output
    pred_out_arr=[]
    for i in range(0,len(labels_test)):
        pred_out_arr.append((labels_test[i], labels_pred[i], test_hash_list[i]))
    pred_ofname=os.path.join(local_out_dir,row_id_str+"_pred_output.pkl")
    print "INFO: pred_ofname=", pred_ofname
    ml_util.ml_pickle_save(pred_out_arr,pred_ofname)
    
    ###################################################
    ### generate label names (family names) ###########
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    ###################################################
    
    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
        
        doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']
        
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: ******generated label_dic:", label_dic 
    
    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    
    ### generate sample numbers of each family in testing data###
    testing_sample_number = len(labels_test)
    print "INFO: testing_sample_number=", testing_sample_number
    test_cnt_dic = {}
    for key in label_dic:
        test_cnt_dic[key] = 0
    for i in range (0, testing_sample_number):
        for key in label_dic:
            if labels_test[i] == key:
                test_cnt_dic[key] = test_cnt_dic[key] + 1
    print "INFO: Number of samples in each label is=", test_cnt_dic
    
    ###############################################
    ###########plot prediction result figure#######
    ###############################################
    pred_fname=os.path.join(local_out_dir,row_id_str+"_1"+".png")
    true_fname=os.path.join(local_out_dir,row_id_str+"_2"+".png")
    pred_xlabel='Prediction (Single Run)'
    true_xlabel='True Labels (Single Run)'
    test_cnt_dic=ml_util.ml_plot_predict_figures(labels_pred.tolist(), labels_test.tolist(), labels_list, label_dic, testing_sample_count 
        , pred_xlabel, pred_fname, true_xlabel, true_fname)
    print "INFO: figure files: ", pred_fname, true_fname
    print "INFO: Number of samples in each label is=", test_cnt_dic

    roc_auc=None
    #fscore=None 
    perf_measures=None
    class_count=len(labels_list)
    dataset_info={"training_fraction":training_fraction, "class_count":class_count,"dataset_count":sample_count}
    #############################################################
    ###################for 2 class only (plot ROC curve)#########
    #############################################################
    if len(labels_list) == 2:

        # build data file for score graph
        score_graph_fname=os.path.join(local_out_dir,row_id_str+"_score_graph.json")
        print "INFO: score_graph_fname=", score_graph_fname
        ml_build_pred_score_graph(score_arr_0,score_arr_1,model_name, score_graph_fname,max_score,min_score)

            
        do_ROC=True
        reverse_label_dic = dict((v,k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print "No ROC curve generated: 'clean' or '0' must be a label for indicating negative class!"
            do_ROC=False
            
        if do_ROC:
            # calculate fscore  ==========
            perf_measures=ml_util.calculate_fscore(labels_test, labels_pred)
            print "INFO: perf_measures=",perf_measures
            
            confidence_score = clf_best.decision_function(X_test_sparse)
                    
            if flag_clean == 0:
                scores = [x for x in confidence_score]
                s_labels = [x for x in labels_test]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                scores = [-x for x in confidence_score]
                s_labels = [1-x for x in labels_test]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]
                
            # create ROC data file ======== ==== 
            roc_auc=ml_create_roc_files(row_id_str, scores, s_labels, testing_N, testing_P
                , local_out_dir, row_id_str)
                
            perf_measures["roc_auc"]=roc_auc
            
                
    # only update db for web request
    if fromweb=="1": 
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"',ml_opts='"+j_str \
            +"', perf_measures='"+json.dumps(perf_measures) \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret=exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '"+str(accuracy*100)+"%"
    
    print 'INFO: total running time: %f' %(t1-t00)
    
    print 'INFO: Finished!'
    return 0
コード例 #4
0
def predict(row_id_str,
            pid_str,
            input_gz,
            local_out_dir,
            ds_list,
            fromweb,
            verbose='0',
            label_idx=0,
            data_idx=3,
            metadata_count=3,
            pattern_str=None,
            ln_delimitor='\t',
            sp_master=config.get('spark', 'spark_master'),
            exe_memory=config.get('spark', 'spark_executor_memory'),
            core_max=config.get('spark', 'spark_cores_max'),
            ip_address=config.get('mongo', 'out_ip_address'),
            port=eval(config.get('mongo', 'out_port')),
            db_name=config.get('mongo', 'out_db'),
            tb_name=config.get('mongo', 'out_tb'),
            username=config.get('mongo', 'out_username'),
            password=config.get('mongo', 'out_password'),
            feat_cnt_threshold=config.get('machine_learning',
                                          'feature_count_threshold'),
            flag_local_model="Y"):
    """Run every classifier of an ensemble on one input and keep the top score.

    Each classifier id in ``ds_list`` produces one result dict; the result
    with the largest ``predict_val`` supplies the ensemble's prediction.
    The combined result is written to
    ``<local_out_dir>/<pid_str>/<pid_str>_predict_output.json`` and, when
    ``fromweb == "1"``, also stored in the ``atdml_document`` table.

    Args:
        row_id_str: id of the ensemble record; used to look up ``ds_list``
            in ``atdml_document`` when ``ds_list`` is empty.
        pid_str: id of the prediction record; names the output folder/file
            and selects the row updated in ``atdml_document``.
        input_gz: path of the input data file to predict on.
        local_out_dir: result folder; a trailing all-digit path component
            is stripped before use.
        ds_list: classifier ids of the ensemble, or None/empty to load them
            from the database.
        fromweb: "1" to update the database record after prediction.
        verbose: forwarded (as str) to the per-classifier predict call in
            the Spark branch.
        label_idx, data_idx, metadata_count, ln_delimitor,
        feat_cnt_threshold: forwarded unchanged to the per-classifier
            prediction routines.
        pattern_str: overwritten by the value returned from ``query_db`` in
            the Spark branch and not read in the local branch.
        sp_master, exe_memory, core_max: used to create a Spark context when
            a classifier's lib_mode is 'mllib'.
        ip_address, port, db_name, tb_name, username, password: accepted for
            interface compatibility; not read by this function.
        flag_local_model: "Y" runs ``predict_one`` for all classifiers via
            ``Parallel``; any other value runs the classifiers one by one
            through ``predict_single_file_pattern.predict``.

    Returns:
        0 on success, -1 when no ensemble list could be found, -5 when the
        input file could not be loaded (local branch only).
    """
    t0 = time()
    if ds_list is None or len(ds_list) == 0:
        # get ds_list from ensemble record id=row_id_str
        # NOTE(review): this SQL is built by string concatenation, so
        # row_id_str must come from a trusted source.
        str_sql = 'select id, ds_list from atdml_document where id=' + row_id_str + ' and file_type="ensemble" '
        ret = exec_sqlite.query_db(str_sql)
        if ret is not None and len(ret) > 0:
            # NOTE(review): eval() on a value read from the database runs
            # arbitrary code if that column can be written by an untrusted
            # party -- consider ast.literal_eval once the format is confirmed.
            ds_list = eval(ret[0][1])
        if ds_list is None or len(ds_list) == 0:
            print "ERROR: ensemble list not found for id=", row_id_str
            return -1
    print "INFO: classifier count=", len(ds_list)  #, ds_list

    # expect local_out_dir to result folder; remove rid in path if any
    bname = os.path.basename(local_out_dir)
    if bname.isdigit():
        local_out_dir = os.path.dirname(local_out_dir)

    # one spark session for the for loop
    # NOTE(review): the context is never stopped in this function -- confirm
    # that releasing it is left to process exit.
    sc = None
    # for each id in ds_list  ======================
    ret_list = []
    # local parallel
    if flag_local_model == "Y":
        # get one_line from gz file
        one_line = None
        try:
            one_line = zip_preprocess_pattern.convert_to_line_by_bash(
                input_gz, metadata_count, ln_delimitor
            )  # check if one line, if raw file then convert to 1 line
            print "INFO: one_line len=", len(one_line)
        except Exception as e:
            print "ERROR: load data file [" + input_gz + "] failed.", e
            return -5

        # parallel processing
        ret_list=Parallel(n_jobs=PARALLEL_CNT)(delayed(predict_one) \
            (clf_id,one_line, local_out_dir,metadata_count,ln_delimitor,label_idx, data_idx,feat_cnt_threshold) for clf_id in ds_list)

    else:  # by Spark
        for clf_id in ds_list:
            pred_ret = None

            # a None id cannot be looked up; skip it before querying
            if clf_id is None:
                print "WARNING: dataset id=", clf_id, "not found!"
                continue

            # get info from ds id
            (rid, num_gram,ml_opts_str,ds_id_str,lib_mode,option_state,pattern_str,label_arr,ml_opts \
                ,learning_algorithm, file_name) = query_db(clf_id)

            # NOTE(review): assumes query_db returns rid=None when no record
            # exists for the classifier -- confirm against query_db.
            if rid is None:  # no result found for classifier
                print "WARNING: dataset id=", clf_id, "not found!"
                continue

            # get one spark context if needed
            if lib_mode == 'mllib' and sc is None:
                sc = predict_single_file_pattern.get_sc(
                    str(clf_id), sp_master, exe_memory, core_max)

            #local_out_dir was used to get model/mapping files
            id_out_dir = os.path.join(local_out_dir, str(clf_id))

            # call prediction(): may need Spark context for mllib
            try:
                pred_ret = predict_single_file_pattern.predict(
                    row_id_str=str(clf_id),
                    ds_id=ds_id_str,
                    num_gram=num_gram,
                    j_str=ml_opts_str,
                    lib_mode=lib_mode,
                    cid_str=str(pid_str),
                    input_gz=input_gz,
                    local_out_dir=id_out_dir,
                    fromweb="2"  # force to return json output
                    ,
                    verbose=str(verbose),
                    label_idx=label_idx,
                    data_idx=data_idx,
                    metadata_count=metadata_count,
                    pattern_str=pattern_str,
                    ln_delimitor=ln_delimitor,
                    feat_cnt_threshold=feat_cnt_threshold,
                    sc=sc)
                ret_list.append(pred_ret)
            except Exception:
                # one failing classifier must not abort the ensemble: record
                # an empty placeholder so it still shows up in "returns".
                # (Exception, not a bare except, so Ctrl-C / SystemExit pass.)
                print "ERROR: pid=", row_id_str, ",classifier id=", clf_id, ",msg=", sys.exc_info(
                )[0]
                ret_list.append({"id":int(pid_str),"opt_id":int(clf_id),"ds_id": int(ds_id_str) \
                    ,"prediction":None,"predict_val":None,"learning_algorithm":None \
                    ,"lib":None, "ml_opts":None,"predict_index": None })

    print "INFO: ret_list len=", len(ret_list)

    # TBD find max predict value to pick a label. need review  ====================== ==========
    status = "ensemble predicted"
    max_val = sys.maxint * -1
    max_id = 0
    alg = None
    lib = None
    prediction = None
    mix_algs = 0  # bit 1: an SVM model was seen, bit 2: a logistic model was seen
    for rt in ret_list:
        if rt is None:
            continue
        rt_alg = rt.get("learning_algorithm")
        rt_val = rt.get("predict_val")
        # placeholders of failed classifiers carry predict_val=None; skip them
        if rt_val is not None and rt_val > max_val:
            max_val = rt_val
            max_id = rt["opt_id"]
            prediction = rt["prediction"]
            lib = rt["lib"]
            # report the algorithm of the selected classifier, not of
            # whichever entry happens to be last in ret_list
            alg = rt_alg
        # failed classifiers have no algorithm name to inspect
        if rt_alg:
            if "svm" in rt_alg:
                mix_algs = mix_algs | 1
            if "logistic" in rt_alg:
                mix_algs = mix_algs | 2
    # check if mixing model types
    if mix_algs == 3:
        print "WARNING: Mixing SVM and Logistic Regression algorithms found in Ensemble datasets. The prediction may be INVALID!!!"
    outj={"prediction":prediction,"predict_val":max_val,"status":status,"predict_ds":max_id \
        ,"lib":lib,"learning_algorithm":alg,"processed_date":str(datetime.datetime.now()),"returns":ret_list}

    result_fname = os.path.join(local_out_dir, pid_str,
                                pid_str + "_predict_output.json")
    print "RESULT: predict_val=", max_val, ",prediction=", prediction
    # create/clean up folder
    ml_util.ml_prepare_output_dirs(pid_str,
                                   os.path.join(local_out_dir, pid_str),
                                   local_out_dir, result_fname)

    # write output to a file
    with open(result_fname, "w") as fo:
        json.dump(outj, fo)

    # Update prediction records here     ====================== ==========
    if fromweb == "1":
        # NOTE(review): values are concatenated into the SQL text; a quote in
        # `prediction` would break the statement -- switch to bound
        # parameters if exec_sqlite supports them.
        str_sql="UPDATE atdml_document set status = '"+status+"', processed_date ='" \
            +str(datetime.datetime.now())+"', prediction = '"+ str(prediction)  \
            +"', predict_val = '"+str(max_val) \
            +"', dataset_info = '"+str(max_id) \
            +"' where id="+pid_str
        ret = exec_sqlite.exec_sql(str_sql)

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)
    return 0