Python delete_many Exemples, query_mongo.delete_many Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : train_MLlib.py Projet : yanhuizen/Resilient-ML-Research-Platform

def train(row_id_str,
          ds_id,
          hdfs_feat_dir,
          local_out_dir,
          ml_opts_jstr,
          excluded_feat_cslist,
          sp_master,
          spark_rdd_compress,
          spark_driver_maxResultSize,
          sp_exe_memory,
          sp_core_max,
          zipout_dir,
          zipcode_dir,
          zip_file_name,
          mongo_tuples,
          labelnameflag,
          fromweb,
          training_fraction,
          jobname,
          random_seed=None):

    ### generate data folder and out folder, clean up if needed
    #local_out_dir = local_out_dir + "/"
    #if os.path.exists(local_out_dir):
    #    shutil.rmtree(local_out_dir) # to keep smaplelist file
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # create zip files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])

    t0 = time()

    # check if ml_opts.has_excluded_feat ==1 ===================================
    has_excluded_feat = 0
    ml_opts = {}
    if not ml_opts_jstr is None:
        ml_opts = json.loads(ml_opts_jstr)
        if "has_excluded_feat" in ml_opts:
            has_excluded_feat = ml_opts["has_excluded_feat"]
    #print "has_excluded_feat=",has_excluded_feat,",excluded_feat_cslist=",excluded_feat_cslist

    # get excluded feature list from mongo ========== ===
    if str(has_excluded_feat) == "1" and excluded_feat_cslist is None:
        excluded_feat_cslist = ml_util.ml_get_excluded_feat(
            row_id_str, mongo_tuples)
    print "INFO: excluded_feat_cslist=", excluded_feat_cslist

    # filename for featured data
    libsvm_data_file = os.path.join(hdfs_feat_dir, "libsvm_data")
    print "INFO: libsvm_data_file:", libsvm_data_file

    # load feature count file
    feat_count_file = libsvm_data_file + "_feat_count"
    feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    print "INFO: feature_count=", feature_count

    # load sample RDD from text file
    #   also exclude selected features in sample ================ =====
    # format (LabeledPoint,hash) from str2LabeledPoint_hash()
    #samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file)
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(
        sc, libsvm_data_file, feature_count, excluded_feat_cslist)

    # get distinct label list
    labels_list_all = samples_rdd.map(
        lambda p: p[0].label).distinct().collect()

    # split samples to training and testing data, format (LabeledPoint,hash)
    training_rdd, testing_rdd = samples_rdd.randomSplit(
        [training_fraction, 1 - training_fraction], seed=int(random_seed))
    training_rdd = training_rdd.map(lambda p: p[0])  # keep LabeledPoint only
    training_rdd.cache()
    training_sample_count = training_rdd.count()
    training_lbl_cnt_list = training_rdd.map(
        lambda p: (p.label, 1)).reduceByKey(add).collect()
    testing_rdd.cache()
    testing_sample_count = testing_rdd.count()
    testing_lbl_cnt_list = testing_rdd.map(
        lambda p: (p[0].label, 1)).reduceByKey(add).collect()
    sample_count = training_sample_count + testing_sample_count

    # random_seed testing
    if not random_seed is None:
        all_t = testing_rdd.collect()
        all_t = sorted(all_t, key=lambda x: x[1])
        cnt = 0
        for i in all_t:
            print i[1]
            cnt = cnt + 1
            if cnt > 3:
                break

    t1 = time()
    print "INFO: training sample count=", training_sample_count, ", testing sample count=", testing_sample_count
    print "INFO: training label list=", training_lbl_cnt_list, ", testing label list=", testing_lbl_cnt_list
    print "INFO: labels_list_all=", labels_list_all
    print "INFO: training and testing samples generated!"
    print 'INFO: running time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ###########build learning model################
    ###############################################

    ### get the parameters###
    print "INFO: ======Learning Algorithm and Parameters============="
    #ml_opts = json.loads(ml_opts_jstr)
    model_name = ml_opts[
        'learning_algorithm']  # 1: linear_svm_with_sgd; 2: logistic_regression_with_lbfgs; 3: logistic_regression_with_sgd
    iteration_num = 0
    if 'iterations' in ml_opts:
        iteration_num = ml_opts['iterations']
    C = 0
    if 'c' in ml_opts:
        C = eval(ml_opts['c'])
    regularization = ""
    if 'regularization' in ml_opts:
        regularization = ml_opts['regularization']

    print "INFO: Learning Algorithm: ", model_name
    print "INFO: C = ", C
    print "INFO: iterations = ", iteration_num
    print "INFO: regType = ", regularization
    regP = C / float(training_sample_count)
    print "INFO: Calculated: regParam = ", regP

    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature####
    if labelnameflag == 1:
        '''
        key = "dic_name_label"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'
 
        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
 
        doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']
        print "INFO: dic_list=",dic_list
        
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        '''
        label_dic = ml_util.ml_get_label_dict(row_id_str, mongo_tuples, ds_id)
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels:", labels_list
    class_num = len(labels_list)
    if class_num > 2:
        print "INFO: Multi-class classification! Number of classes = ", class_num

    ### build model ###

    if model_name == "linear_svm_with_sgd":
        ### 1: linearSVM
        print "INFO: ====================1: Linear SVM============="
        model_classification = SVMWithSGD.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization)  # regParam = 1/(sample_number*C)
        #print model_classification
    elif model_name == "logistic_regression_with_lbfgs":
        ### 2: LogisticRegressionWithLBFGS
        print "INFO: ====================2: LogisticRegressionWithLBFGS============="
        model_classification = LogisticRegressionWithLBFGS.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization,
            numClasses=class_num)  # regParam = 1/(sample_number*C)
    elif model_name == "logistic_regression_with_sgd":
        ### 3: LogisticRegressionWithSGD
        print "INFO: ====================3: LogisticRegressionWithSGD============="
        model_classification = LogisticRegressionWithSGD.train(
            training_rdd,
            regParam=regP,
            iterations=iteration_num,
            regType=regularization)  # regParam = 1/(sample_number*C)
    else:
        print "INFO: Training model selection error: no valid ML model selected!"
        return

    print "INFO: model type=", type(model_classification)

    # create feature coefficient file ================================
    coef_arr = None
    intercept = None
    if model_classification.weights is None:
        print "WARNING: model weights not found!"
    else:
        coef_weights = model_classification.weights
        #print "coef_weights=",coef_weights
        #print type(coef_weights),coef_weights.shape
        coef_arr = coef_weights.toArray().tolist()
        # save coef_arr to mongo
        key = "coef_arr"
        ret = ml_util.save_json_t(row_id_str, key, coef_arr, mongo_tuples)

        # save coef_arr to local file
        if ret == 0:
            # drop old record in mongo
            filter = '{"rid":' + row_id_str + ',"key":"coef_arr"}'
            ret = query_mongo.delete_many(mongo_tuples, None, filter)
            if not os.path.exists(local_out_dir):
                os.makedirs(local_out_dir)
            fn_ca = os.path.join(local_out_dir, row_id_str,
                                 row_id_str + "_coef_arr.pkl")
            print
            ml_util.ml_pickle_save(coef_arr, fn_ca)

        # save intercept to mongo
        intercept = model_classification.intercept
        key = "coef_intercept"
        ret = ml_util.save_json_t(row_id_str, key, intercept, mongo_tuples)

        # feature list + coef file =============
        feat_filename = os.path.join(local_out_dir,
                                     row_id_str + "_feat_coef.json")
        print "INFO: feat_filename=", feat_filename

        # create feature, coef & raw string file =============================================== ============
        # expect a dict of {"fid":(coef, feature_raw_string)}
        jret = ml_util.build_feat_list_t(row_id_str, feat_filename, None, None,
                                         coef_arr, ds_id, mongo_tuples)

        # special featuring for IN or libsvm
        if jret is None:
            jret = ml_util.build_feat_coef_raw_list_t(row_id_str,
                                                      feat_filename, coef_arr,
                                                      ds_id, mongo_tuples)
        if jret is None:
            print "WARNING: Cannot create sample list for testing dataset. "

        jfeat_coef_dict = jret
        print "INFO: coef_arr len=", len(
            coef_arr), ", feature_count=", feature_count
        # for multi-class
        if len(coef_arr) != feature_count:
            jfeat_coef_dict = {}
            print "WARNING: coef count didn't match feature count.  multi-class classification was not supported"

        # Calculate prediction and Save testing dataset
        bt_coef_arr = sc.broadcast(coef_arr)
        bt_intercept = sc.broadcast(intercept)
        bt_jfeat_coef_dict = sc.broadcast(jfeat_coef_dict)
        ### Evaluating the model on testing dataset: label, predict label, score, feature list
        print "INFO: intercept=", intercept
        print "INFO: coef_arr len=", len(coef_arr), type(coef_arr)
        print "INFO: jfeat_coef_dict len=", len(
            jfeat_coef_dict)  #, jfeat_coef_dict

        # get prediction of testing dataset : (tlabel, plabel, score, libsvm, raw feat str, hash) ==============================
        if len(coef_arr) == feature_count:
            testing_pred_rdd = testing_rdd.map(lambda p: (
                 p[0].label \
                ,model_classification.predict(p[0].features) \
                ,zip_feature_util.calculate_hypothesis(p[0].features, bt_coef_arr.value, bt_intercept.value, model_name) \
                ,p[0].features \
                ,p[1] \
            ) ).cache()
        else:  # for multi-class, no prediction score; TBD for better solution: how to display multiple weights for each class
            testing_pred_rdd = testing_rdd.map(lambda p: (
                 p[0].label \
                ,model_classification.predict(p[0].features) \
                ,"-" \
                ,p[0].features \
                ,p[1] \
            ) ).cache()
        ''',p[0].features.dot(bt_coef_arr.value)+bt_intercept.value \
        # Save testing dataset for analysis
        libsvm_testing_output = hdfs_feat_dir + "libsvm_testing_output_"+row_id_str
        print "INFO: libsvm_testing_output=", libsvm_testing_output
        try:
            hdfs.rmr(libsvm_testing_output)
        except IOError as e:
            print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at libsvm_testing_output file clean up:", sys.exc_info()[0] 
        # save only false prediction?
        #testing_pred_rdd.filter(lambda p: p[0] != p[1]).saveAsTextFile(libsvm_testing_output)
        testing_pred_rdd.saveAsTextFile(libsvm_testing_output)
        
        '''
        #test_tmp=testing_pred_rdd.collect()

        # save false prediction to local file
        false_pred_fname = os.path.join(local_out_dir,
                                        row_id_str + "_false_pred.json")
        print "INFO: false_pred_fname=", false_pred_fname
        false_pred_data=testing_pred_rdd.filter(lambda p: p[0] != p[1])\
            .map(lambda p: (p[0],p[1],p[2] \
            ,zip_feature_util.get_dict_coef_raw4feat(zip_feature_util.sparseVector2dict(p[3]), bt_jfeat_coef_dict.value)
            ,p[4]  ) ) \
            .collect()
        print "INFO: false predicted count=", len(false_pred_data)
        false_pred_arr = []
        with open(false_pred_fname, "w") as fp:
            for sp in false_pred_data:
                jsp = {
                    "tlabel": sp[0],
                    "plabel": sp[1],
                    "score": sp[2],
                    "feat": sp[3],
                    "hash": sp[4]
                }
                #print "jsp=",jsp
                false_pred_arr.append(jsp)
            fp.write(json.dumps(false_pred_arr))

        # save prediction results, format: label, prediction, hash
        pred_ofname = os.path.join(local_out_dir,
                                   row_id_str + "_pred_output.pkl")
        print "INFO: pred_ofname=", pred_ofname
        pred_out_arr = testing_pred_rdd.map(lambda p:
                                            (p[0], p[1], p[4])).collect()
        ml_util.ml_pickle_save(pred_out_arr, pred_ofname)
        '''
        one_item= testing_pred_rdd.first()
        print "one_item=",one_item
        sparse_arr=one_item[3]

        dict_feat=zip_feature_util.sparseVector2dict(sparse_arr)
        print "len=",len(dict_feat),"dict_feat=",dict_feat
        dict_weit=zip_feature_util.add_coef2dict(coef_arr,dict_feat)
        print "len=",len(dict_weit),"dict_weit=",dict_weit
        '''
    # Calculate Accuracy. labelsAndPreds = (true_label,predict_label)
    labelsAndPreds = testing_pred_rdd.map(lambda p: (p[0], p[1]))
    labelsAndPreds.cache()
    testing_sample_number = testing_rdd.count()
    testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(
        testing_sample_number)
    accuracy = 1 - testErr
    print "INFO: Accuracy = ", accuracy

    ### Save model
    #save_dir = config.get('app', 'HADOOP_MASTER')+'/user/hadoop/yigai/row_6/'
    #save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str
    save_dir = os.path.join(config.get('app', 'HADOOP_MASTER'),
                            config.get('app', 'HDFS_MODEL_DIR'), row_id_str)
    try:
        hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(save_dir)
        #print "all files removed"
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(
            e.errno, e.strerror), ". At HDFS=", save_dir
    except:
        print "WARNING: Unexpected error:", sys.exc_info(
        )[0], ". At HDFS=", save_dir
    model_classification.save(sc, save_dir)

    ###load model if needed
    #sameModel = SVMModel.load(sc, save_dir)

    t1 = time()
    print 'INFO: training run time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ###########plot prediction result figure ==================================================== ===============
    ###############################################

    labels = labelsAndPreds.collect()
    true_label_list = [x for x, _ in labels]
    pred_label_list = [x for _, x in labels]

    pred_fname = os.path.join(local_out_dir, row_id_str + "_1" + ".png")
    true_fname = os.path.join(local_out_dir, row_id_str + "_2" + ".png")
    pred_xlabel = 'Prediction (Single Run)'
    true_xlabel = 'True Labels (Single Run)'
    test_cnt_dic = ml_util.ml_plot_predict_figures(
        pred_label_list, true_label_list, labels_list, label_dic,
        testing_sample_count, pred_xlabel, pred_fname, true_xlabel, true_fname)
    print "INFO: figure files: ", pred_fname, true_fname
    #print "INFO: Number of samples in each label is=", test_cnt_dic

    roc_auc = None
    perf_measures = None
    dataset_info = {
        "training_fraction": training_fraction,
        "class_count": class_num,
        "dataset_count": sample_count
    }
    #############################################################
    ###################for 2 class only (plot ROC curve) ==================================================== ===============
    #############################################################
    if len(labels_list) == 2:

        do_ROC = True
        reverse_label_dic = dict((v, k) for k, v in label_dic.items())
        if 'clean' in reverse_label_dic:
            flag_clean = reverse_label_dic['clean']
        elif 'benign' in reverse_label_dic:
            flag_clean = reverse_label_dic['benign']
        elif '0' in reverse_label_dic:
            flag_clean = 0
        else:
            print "INFO: No ROC curve generated: 'clean','benign' or '0' must be a label for indicating negative class!"
            do_ROC = False

        # build data file for score graph
        score_graph_fname = os.path.join(local_out_dir,
                                         row_id_str + "_score_graph.json")
        print "INFO: score_graph_fname=", score_graph_fname

        # build score_arr_0, score_arr_1
        #    format: tlabel, plabel, score, libsvm, raw feat str, hash
        graph_arr = testing_pred_rdd.map(lambda p:
                                         (int(p[0]), float(p[2]))).collect()
        score_arr_0 = []
        score_arr_1 = []
        max_score = 0
        min_score = 0
        for p in graph_arr:
            if p[0] == 0:
                score_arr_0.append(p[1])
            else:
                score_arr_1.append(p[1])
            # save max,min score
            if p[1] > max_score:
                max_score = p[1]
            elif p[1] < min_score:
                min_score = p[1]

        ml_build_pred_score_graph(score_arr_0, score_arr_1, model_name,
                                  score_graph_fname, max_score, min_score)

        if do_ROC:

            perf_measures = ml_util.calculate_fscore(true_label_list,
                                                     pred_label_list)
            print "RESULT: perf_measures=", perf_measures
            '''
            # calculate fscore  ==========
            tp = labelsAndPreds.filter(lambda (v, p): v == 1 and p==1 ).count() 
            fp = labelsAndPreds.filter(lambda (v, p): v == 0 and p==1 ).count() 
            fn = labelsAndPreds.filter(lambda (v, p): v == 1 and p==0 ).count() 
            tn = labelsAndPreds.filter(lambda (v, p): v == 0 and p==0 ).count() 
            print "RESULT: tp=",tp,",fp=",fp,",fn=",fn,",tn=",tn
            precision=float(tp)/(tp+fp)
            recall=float(tp)/(tp+fn)
            print "RESULT: precision=",precision,",recall=",recall
            acc=(tp+tn)/(float(testing_sample_number))
            fscore=2*((precision*recall)/(precision+recall))
            print "RESULT: fscore=",fscore,",acc=",acc  
            '''
            model_classification.clearThreshold()
            scoreAndLabels = testing_rdd.map(lambda p: (
                model_classification.predict(p[0].features), int(p[0].label)))
            #metrics = BinaryClassificationMetrics(scoreAndLabels)
            #areROC = metrics.areaUnderROC
            #print areROC
            scoreAndLabels_list = scoreAndLabels.collect()

            if flag_clean == 0:
                scores = [x for x, _ in scoreAndLabels_list]
                s_labels = [x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[0]
                testing_P = test_cnt_dic[1]
            else:
                scores = [-x for x, _ in scoreAndLabels_list]
                s_labels = [1 - x for _, x in scoreAndLabels_list]
                testing_N = test_cnt_dic[1]
                testing_P = test_cnt_dic[0]

            # create ROC data file ======== ====
            roc_auc = ml_create_roc_files(row_id_str, scores, s_labels,
                                          testing_N, testing_P, local_out_dir,
                                          row_id_str)
            #, local_out_dir, file_name_given)

            perf_measures["roc_auc"] = roc_auc

    # only update db for web request ==================================================== ===============
    if fromweb == "1":
        #print "database update"
        str_sql="UPDATE atdml_document set "+"accuracy = '"+str(accuracy*100)+"%" \
            +"', status = 'learned', processed_date ='"+str(datetime.datetime.now()) \
            +"', perf_measures='"+json.dumps(perf_measures) \
            +"', dataset_info='"+json.dumps(dataset_info) \
            +"' where id="+row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    print 'INFO: Finished!'
    return 0

Exemple #2

0

Afficher le fichier

Fichier : feature_extraction_custom.py Projet : yanhuizen/Resilient-ML-Research-Platform

    # insert into mongoDB; all_hash_str_dic:  {hash:'str1', ... }
    filter = '{"rid":' + row_id_str + ',"key":"dic_hash_str"}'
    upsert_flag = True
    jo_insert = {}
    jo_insert["rid"] = eval(row_id_str)
    jo_insert["key"] = "dic_hash_str"
    jo_insert["value"] = all_hash_str_dic
    jstr_insert = json.dumps(jo_insert)
    #print "jstr_insert=",jstr_insert
    ret = query_mongo.upsert_doc_t(mongo_tuples, filter, jstr_insert,
                                   upsert_flag)
    print "INFO: Upsert count for hash_str_dic=", ret
    # insert failed, save to local
    if ret == 0:
        # drop old record in mongo
        ret = query_mongo.delete_many(mongo_tuples, None, filter)
        if not os.path.exists(local_out_dir):
            os.makedirs(local_out_dir)
        fn_hs = os.path.join(local_out_dir, row_id_str,
                             row_id_str + "_dic_hash_str.pkl")
        print "WARNING: save hash_str_dic to local"
        ml_util.ml_pickle_save(all_hash_str_dic, fn_hs)

    # reverse key value from all_hashes_seq_dic-> all_seq_hashes_dic: { sequential_numb : hash }
    all_seq_hashes_dic = {y: str(x) for x, y in all_hashes_seq_dic.iteritems()}

    # insert all_seq_hashes_dic into mongoDB  ================ TBD may over 16Mb limit
    filter = '{"rid":' + row_id_str + ',"key":"dic_seq_hashes"}'
    upsert_flag = True
    jo_insert = {}
    jo_insert["rid"] = eval(row_id_str)

Exemple #3

0

Afficher le fichier

Fichier : feature_extraction_custom.py Projet : yanhuizen/Resilient-ML-Research-Platform

def feat_extraction(row_id_str,
                    hdfs_dir_list,
                    hdfs_feat_dir,
                    model_data_folder,
                    sp_master,
                    spark_rdd_compress,
                    spark_driver_maxResultSize,
                    sp_exe_memory,
                    sp_core_max,
                    zipout_dir,
                    zipcode_dir,
                    zip_file_name,
                    mongo_tuples,
                    fromweb,
                    label_arr,
                    metadata_count,
                    label_idx,
                    data_idx,
                    pattern_str,
                    ln_delimitor,
                    data_field_list,
                    jkey_dict,
                    jobname,
                    num_gram,
                    feature_count_threshold,
                    token_dict=None,
                    HDFS_RETR_DIR=None,
                    remove_duplicated="N",
                    cust_featuring=None,
                    cust_featuring_params=None,
                    local_out_dir=None,
                    filter_ratio=None,
                    binary_flag=False):

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              user_custom=cust_featuring)

    # get_spark_context
    spark = ml_util.ml_get_spark_session(sp_master, spark_rdd_compress,
                                         spark_driver_maxResultSize,
                                         sp_exe_memory, sp_core_max, jobname,
                                         zip_file_path)
    if spark:
        sc = spark.sparkContext
    # log time ================================================================ ================
    t0 = time()

    # input filename
    input_filename = "*"
    ext_type = '.gz'
    gz_list = None

    # single hdfs file
    if not ',' in hdfs_dir_list:  # single dir having *.gz ==== =========
        # read raw data from HDFS as .gz format ==========
        hdfs_files = os.path.join(hdfs_dir_list, input_filename + ext_type)
        # check if gz files in hdfs ============
        try:
            gz_list = hdfs.ls(hdfs_dir_list)
            print "INFO: check hdfs folder=", hdfs_dir_list

        except IOError as e:
            print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
        except:
            print "WARNING: Error at checking HDFS file:", sys.exc_info()[0]
        # use whole folder
        #print "gz_list",gz_list
        if gz_list is None or len(gz_list) == 0:
            print "ERROR: No file found by ", input_filename + ext_type  #,", use",hdfs_dir_list,"instead"
            return -2
        elif len(gz_list) == 1:
            # use dir as filename
            hdfs_files = hdfs_dir_list[0:-1]

    else:  # multiple dirs ==== =========
        hdfs_files = ""
        cnt = 0
        temp_lbl_list = []
        comma = ""
        print "INFO: before label_arr=", label_arr

        # check each folder
        for dr in hdfs_dir_list.split(','):
            #print "****=",dr
            if not len(dr) > 0:
                continue
            try:
                # remove space etc.
                dr = dr.strip()
                fdr = os.path.join(HDFS_RETR_DIR, dr)
                # ls didn't like "*"
                if '*' in fdr:
                    #gz_list=hdfs.ls(fdr.replace("*",""))
                    dn = os.path.dirname(fdr).strip()
                    bn = os.path.basename(
                        fdr).strip()  #print "dn=",dn,",bn=",bn
                    # get all names under folder and do filtering
                    gz_list = fnmatch.filter(hdfs.ls(dn), '*' + bn)
                else:
                    gz_list = hdfs.ls(fdr)
                cnt = cnt + len(gz_list)

                if len(gz_list) > 0:
                    hdfs_files = hdfs_files + comma + fdr
                    comma = ","
            except IOError as e:
                print "WARNING: I/O error({0}): {1}".format(
                    e.errno, e.strerror)
            except:
                print "WARNING: Error at checking HDFS file:", sys.exc_info(
                )[0]
        # use whole folder
        if cnt is None or cnt == 0:
            print "ERROR: No file found at", hdfs_files
            return -2
        else:
            print "INFO: total file count=", cnt
        # set convert flag only when multiple dir and label_arr has dirty label

        if not label_arr is None and len(
                label_arr) == 2 and label_arr[1] == "dirty":
            convert2dirty = "Y"
    print "INFO: hdfs_dir_list=", hdfs_dir_list
    print "INFO: hdfs_files=", hdfs_files

    cust_featuring_jparams = None
    # custom featuring
    if not cust_featuring is None and len(cust_featuring) > 0:
        # load user module =======
        user_func, cust_featuring_jparams = get_user_custom_func(
            cust_featuring, cust_featuring_params)
        # TBD apply    user_func

        all_hashes_cnt_dic = None
        all_hash_str_dic = None
        all_hashes_seq_dic = None
    else:
        print "ERROR: custom featuring type is needed"
    print "INFO: cust_featuring=", cust_featuring, "cust_featuring_jparams=", cust_featuring_jparams

    dnn_flag = False
    has_header = None
    label_col = None
    label_index = None
    # get featuring params
    if cust_featuring_jparams:
        if 'label_index' in cust_featuring_jparams:  # idx number for label, 0 based
            label_index = cust_featuring_jparams['label_index']
        if 'has_header' in cust_featuring_jparams:  # True/False
            has_header = eval(cust_featuring_jparams['has_header'])
            if has_header == 1:
                has_header = True
        if 'dnn_flag' in cust_featuring_jparams:  # True/False
            dnn_flag = cust_featuring_jparams['dnn_flag']
            if dnn_flag == 1:
                dnn_flag = True
            elif dnn_flag == 0:
                dnn_flag = False

    if label_index is None:
        label_index = 0
    elif not isinstance(label_index, int):
        label_index = eval(label_index)

    print "INFO: label_index=", label_index, ",has_header=", has_header, ",dnn_flag=", dnn_flag

    # read as DataFrame ===============================================
    df = spark.read.csv(hdfs_files, header=has_header)

    df.show()
    print "INFO: col names=", df.columns

    # get column name for label
    label_col = None
    for i, v in enumerate(df.columns):
        if i == label_index:
            label_col = v

    # get all distinct labels into an array  =============== provided by parameter?
    if label_arr is None and not label_col is None:
        label_arr = sorted([
            rw[label_col] for rw in df.select(label_col).distinct().collect()
        ])
    print "INFO: label_arr=", label_arr

    label_dic = {}
    # convert label_arr to dict; {label:number|
    for idx, label in enumerate(sorted(label_arr)):
        if not label in label_dic:
            label_dic[
                label] = idx  #starting from 0, value = idx, e.g., clean:0, dirty:1

    # add params for dataframe conversion
    cust_featuring_jparams["label_dict"] = label_dic
    # convert to int
    cust_featuring_jparams["label_index"] = label_index
    featuring_params = json.dumps(cust_featuring_jparams)
    # convert DataFrame row to libsvm string
    libsvm_rdd = df.rdd.map(lambda x: user_func(list(x), featuring_params))
    print "INFO: sample df row=", (libsvm_rdd.collect()[0])
    print "INFO: featuring_params=", featuring_params

    total_input_count = df.count()
    print "INFO: Total input sample count=", total_input_count
    #print "INFO: feature_count_threshold=",feature_count_threshold

    #get all hashes and total occurring count ===============
    #   all_hashes_cnt_dic: {'col index': total count,... }
    # build all_hashes_cnt_dic
    cnt_df = df.select(
        [count(when(~isnull(c), c)).alias(c) for c in df.columns])
    #cnt_df.show()
    cnt_arr = cnt_df.rdd.map(lambda x: list(x)).collect()
    feat_sample_count_arr = cnt_arr[0]
    #print "feat_sample_count_arr=",feat_sample_count_arr

    if all_hashes_cnt_dic is None:
        all_hashes_cnt_dic = {}
        idx = 1
        for i, v in enumerate(feat_sample_count_arr):
            if i != label_index:
                all_hashes_cnt_dic[idx] = v
                idx += 1
    #print "all_hashes_cnt_dic=",all_hashes_cnt_dic

    #get all hashes and their extracted string  ===============
    #   all_hash_str_dic: {hash:'str1', ...
    if all_hash_str_dic is None:
        # convert header to dict=index:string; excude label column
        all_hash_str_dic = {}
        idx = 1
        for i, v in enumerate(df.schema.names):
            if i != label_index:
                all_hash_str_dic[idx] = v
                idx += 1
    #print "all_hash_str_dic=",all_hash_str_dic

    # save labels to hdfs as text file==================================== ============
    hdfs_folder = hdfs_feat_dir  #+ "/"   # "/" is needed to create the folder correctly
    print "INFO: hdfs_folder=", hdfs_folder
    try:
        hdfs.mkdir(hdfs_folder)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at mkdir:", sys.exc_info()[0]

    # clean up metadata_file
    metadata_file = os.path.join(hdfs_folder, metadata)  #"metadata"
    print "INFO: metadata_file=", metadata_file
    try:
        hdfs.rmr(metadata_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at rmr():", sys.exc_info()[0]
    sc.parallelize(label_arr, 1).saveAsTextFile(metadata_file)

    #remap all hash values to continuous key/feature number ==============
    #     all_hashes_seq_dic: { hash : sequential_numb }
    if all_hashes_seq_dic is None:
        all_hashes_seq_dic = {}
        # csv column index as sequentail number
        remap2seq(all_hash_str_dic, all_hashes_seq_dic)
    #print "all_hashes_seq_dic=",all_hashes_seq_dic
    total_feature_numb = len(all_hashes_seq_dic)
    print "INFO: Total feature count=", len(all_hashes_seq_dic)

    # save feat_sample_count_arr data ==================================== ============
    filter = '{"rid":' + row_id_str + ',"key":"feat_sample_count_arr"}'
    upsert_flag = True
    jo_insert = {}
    jo_insert["rid"] = eval(row_id_str)
    jo_insert["key"] = "feat_sample_count_arr"
    jo_insert["value"] = feat_sample_count_arr
    jstr_insert = json.dumps(jo_insert)
    ret = query_mongo.upsert_doc_t(mongo_tuples, filter, jstr_insert,
                                   upsert_flag)
    print "INFO: Upsert count for feat_sample_count_arr=", ret
    # insert failed, save to local
    if ret == 0:
        # drop old record in mongo
        ret = query_mongo.delete_many(mongo_tuples, None, filter)
        if not os.path.exists(local_out_dir):
            os.makedirs(local_out_dir)
        fsca_hs = os.path.join(local_out_dir, row_id_str,
                               row_id_str + "_feat_sample_count_arr.pkl")
        print "WARNING: save feat_sample_count_arr to local"
        ml_util.ml_pickle_save(feat_sample_count_arr, fsca_hs)

    # get rdd statistics info
    # remove duplicated libsvm string; only keep the first duplicated item, assume space following key_idx
    if remove_duplicated == "Y":
        libsvm_rdd=libsvm_rdd \
            .map(lambda x: ( ','.join(x.split(' ')[metadata_count:]), x)) \
            .groupByKey().map(lambda x: list(x[1])[0] ) \
            .cache()
    cnt_list = libsvm_rdd.map(lambda x: (x.split(' ')[1], 1)).reduceByKey(
        add).collect()
    stats = libsvm_rdd.map(
        lambda x: len(x.split(' ')[metadata_count:])).stats()
    feat_count_max = stats.max()
    feat_count_stdev = stats.stdev()
    feat_count_mean = stats.mean()
    sample_count = stats.count()
    print "INFO: Non-Duplicated libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
    print "INFO:   ,max feature count=", feat_count_max
    print "INFO: Non-Duplicated Label count list=", cnt_list

    # clean up libsvm data ==================================== ============
    libsvm_data_file = os.path.join(hdfs_folder,
                                    libsvm_alldata_filename)  #"libsvm_data"
    print "INFO: libsvm_data_file=", libsvm_data_file
    try:
        hdfs.rmr(libsvm_data_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1} at libsvm_data_file clean up".format(
            e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info(
        )[0]

    #codec = "org.apache.hadoop.io.compress.GzipCodec"
    #libsvm_rdd.saveAsTextFile(libsvm_data_file, codec)
    libsvm_rdd.saveAsTextFile(libsvm_data_file)  # TBD encrypted

    feat_count_file = libsvm_data_file + "_feat_count"
    print "INFO: feat_count_file=", feat_count_file
    try:
        hdfs.rmr(feat_count_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1} at feat_count clean up".format(
            e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm feature count clean up:", sys.exc_info(
        )[0]
    sc.parallelize([total_feature_numb], 1).saveAsTextFile(feat_count_file)

    # TBD ???  output text for DNN:[meta-data1,meta-data2,..., [feature tokens]] ================= DNN ===========
    if dnn_flag:  # special flag to tokenize and keep input orders
        print "INFO: processing data for DNN..."
        # create token dict
        # str_hash_dict: string to hash
        # all_hashes_seq_dic: hash to seq id
        if token_dict is None or len(token_dict) == 0:
            token_dict = {}
            str_hash_dict = {v: k for k, v in all_hash_str_dic.iteritems()}
            for k, v in str_hash_dict.iteritems():
                token_dict[k] = int(all_hashes_seq_dic[str(v)])
            #print "token_dict=",len(token_dict),token_dict

        # TBD here: need to implement non-binary feature
        dnn_rdd=df.rdd \
            .map(lambda x: tokenize_by_dict(x, data_idx, token_dict,label_idx, label_dic)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list)
        #.cache()
        # filter duplication here
        #print dnn_rdd.take(3)

        dnn_data_file = os.path.join(hdfs_folder,
                                     dnn_alldata_filename)  #"dnn_data"
        print "INFO: dnn_data_file=", dnn_data_file
        try:
            hdfs.rmr(dnn_data_file)
        except IOError as e:
            print "WARNING: I/O error({0}): {1} at dnn_data_file clean up".format(
                e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info(
            )[0]

        # clean up data
        dnn_npy_gz_file = os.path.join(hdfs_folder, row_id_str + "_dnn_")
        print "INFO: dnn_npy_gz_file=", dnn_npy_gz_file
        try:
            hdfs.rmr(dnn_npy_gz_file + "data.npy.gz")
            hdfs.rmr(dnn_npy_gz_file + "label.npy.gz")
            hdfs.rmr(dnn_npy_gz_file + "info.npy.gz")
        except IOError as e:
            print "WARNING: I/O error({0}): {1} at dnn_npy clean up".format(
                e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at dnn_npy file clean up:", sys.exc_info(
            )[0]
        # save new data
        try:
            dnn_rdd.saveAsTextFile(dnn_data_file)
        except:
            print "WARNING: Unexpected error at saving dnn data:", sys.exc_info(
            )[0]
        # show data statistics
        try:
            stats = dnn_rdd.map(lambda p: len(p[metadata_count])).stats()
            feat_count_max = stats.max()
            feat_count_stdev = stats.stdev()
            feat_count_mean = stats.mean()
            sample_count = stats.count()
            print "INFO: DNN data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
            print "INFO:   ,max feature count=", feat_count_max
        except:
            print "WARNING: Unexpected error at getting stats of dnn_rdd:", sys.exc_info(
            )[0]

    # clean up pca data in hdfs ============ ========================
    pca_files = '*' + libsvm_alldata_filename + "_pca_*"
    #print "INFO: pca_files=", pca_files
    try:
        f_list = hdfs.ls(hdfs_folder)
        if len(f_list) > 0:
            df_list = fnmatch.filter(f_list, pca_files)
            for f in df_list:
                print "INFO: rm ", f
                hdfs.rmr(f)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm pca file clean up:", sys.exc_info(
        )[0]

    # clean up pca data in web local ============ ========================
    pca_fname = os.path.join(model_data_folder, row_id_str + '_pca_*.pkl*')
    print "INFO: pca_fname=", pca_fname

    try:
        for fl in glob.glob(pca_fname):
            print "INFO: remove ", fl
            os.remove(fl)
    except OSError, e:
        print("Error: %s - %s." % (e.pca_fname, e.strerror))

Exemple #4

0

Afficher le fichier

Fichier : feature_extraction_ngram.py Projet : yanhuizen/Resilient-ML-Research-Platform

def feat_extr_ngram(row_id_str,
                    hdfs_dir_list,
                    hdfs_feat_dir,
                    model_data_folder,
                    sp_master,
                    spark_rdd_compress,
                    spark_driver_maxResultSize,
                    sp_exe_memory,
                    sp_core_max,
                    zipout_dir,
                    zipcode_dir,
                    zip_file_name,
                    mongo_tuples,
                    fromweb,
                    label_arr,
                    metadata_count,
                    label_idx,
                    data_idx,
                    pattern_str,
                    ln_delimitor,
                    data_field_list,
                    jkey_dict,
                    jobname,
                    num_gram,
                    feature_count_threshold,
                    token_dict=None,
                    HDFS_RETR_DIR=None,
                    remove_duplicated="N",
                    cust_featuring=None,
                    cust_featuring_params=None,
                    local_out_dir=None,
                    filter_ratio=None,
                    binary_flag=True):

    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir,
                                              zipcode_dir,
                                              zip_file_name,
                                              user_custom=cust_featuring)
    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max, jobname,
                                      [zip_file_path])
    # log time ================================================================ ================
    t0 = time()

    # input filename
    input_filename = "*"
    ext_type = '.gz'
    gz_list = None
    convert2dirty = "N"
    if not ',' in hdfs_dir_list:  # single dir having *.gz ==== =========
        # read raw data from HDFS as .gz format ==========
        rdd_files = os.path.join(hdfs_dir_list, input_filename + ext_type)
        # check if gz files in hdfs ============
        try:
            gz_list = hdfs.ls(hdfs_dir_list)
            print "INFO: check hdfs folder=", hdfs_dir_list

        except IOError as e:
            print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
        except:
            print "WARNING: Error at checking HDFS file:", sys.exc_info()[0]
        # use whole folder
        if gz_list is None or len(gz_list) == 0:
            print "ERROR: No file found by ", input_filename + ext_type  #,", use",hdfs_dir_list,"instead"
            return -2
        elif len(gz_list) == 1:
            # use dir as filename
            rdd_files = hdfs_dir_list[0:-1]

    else:  # multiple dirs ==== =========
        rdd_files = ""
        cnt = 0
        temp_lbl_list = []
        comma = ""
        print "INFO: before label_arr=", label_arr

        # check each folder
        for dr in hdfs_dir_list.split(','):
            #print "****=",dr
            if not len(dr) > 0:
                continue
            try:
                # remove space etc.
                dr = dr.strip()
                fdr = os.path.join(HDFS_RETR_DIR, dr)
                #print "fdr=",fdr
                # ls didn't like "*"
                if '*' in fdr:
                    #gz_list=hdfs.ls(fdr.replace("*",""))
                    dn = os.path.dirname(fdr).strip()
                    bn = os.path.basename(fdr).strip()
                    #print "dn=",dn,",bn=",bn
                    # get all names under folder and do filtering
                    gz_list = fnmatch.filter(hdfs.ls(dn), '*' + bn)
                    #print "gz_list=",gz_list
                else:
                    gz_list = hdfs.ls(fdr)
                cnt = cnt + len(gz_list)

                if len(gz_list) > 0:
                    rdd_files = rdd_files + comma + fdr
                    comma = ","
            except IOError as e:
                print "WARNING: I/O error({0}): {1}".format(
                    e.errno, e.strerror)
            except:
                print "WARNING: Error at checking HDFS file:", sys.exc_info(
                )[0]
        # use whole folder
        if cnt is None or cnt == 0:
            print "ERROR: No file found at", rdd_files
            return -2
        else:
            print "INFO: total file count=", cnt
        # set convert flag only when multiple dir and label_arr has dirty label
        #if label_arr is None: # create label arr if None
        #    label_arr=temp_lbl_list
        if not label_arr is None and len(
                label_arr) == 2 and label_arr[1] == "dirty":
            convert2dirty = "Y"
    print "INFO: rdd_files=", rdd_files

    txt_rdd = sc.textFile(rdd_files)  #, use_unicode=False

    total_input_count = txt_rdd.count()
    print "INFO: Total input sample count=", total_input_count
    # debug only
    #for x in txt_rdd.collect():
    #    print "t=",x
    print "INFO: hdfs_dir_list=", hdfs_dir_list
    print "INFO: label_arr=", label_arr
    print "INFO: feature_count_threshold=", feature_count_threshold

    #jkey_dict={"meta_list":["label","md5","mdate"], "data_key":"logs"}
    #   this dict depends on the format of input data
    if not data_field_list is None:
        jkey_dict = json.loads(jkey_dict)

        data_key = jkey_dict["data_key"]
        meta_list = jkey_dict["meta_list"]

        metadata_count = len(meta_list)
        data_idx = metadata_count
        print "INFO: jkey_dict=", jkey_dict
        print "INFO: meta_list=", meta_list
        print "INFO: data_key=", data_key
        print "INFO: data_field_list=", data_field_list
        print "INFO: metadata_count=", metadata_count

        featured_rdd = txt_rdd \
            .map(lambda x: preprocess_json(x,meta_list,data_key,data_field_list)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list) \
            .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is dict) \
            .filter(lambda x: type(x[metadata_count+1]) is dict) \
            .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \
            .cache()

        #print "INFO: featured_rdd="
        #for x in featured_rdd.collect():
        #    print "INFO: **** f=",x
    # user custom code for featuring  ============================================= ==========
    #   input txt_rdd format (string):  each text row for each sample
    #   output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
    elif not cust_featuring is None and len(cust_featuring) > 0:
        user_module = None
        user_func = None
        user_func_dnn = None
        # load user module =======
        try:
            modules = map(__import__, [CUSTOM_PREFIX + cust_featuring])
            user_module = modules[0]
            user_func = getattr(user_module, CUSTOM_FUNC)
        except Exception as e:
            print "ERROR: module=", CUSTOM_PREFIX + cust_featuring
            print "ERROR: user module error.", e.__doc__, e.message
            return -101
        try:
            jparams = json.loads(cust_featuring_params)
            if jparams and 'n-gram' in jparams:
                num_gram = jparams['n-gram']
            elif jparams and 'ngram' in jparams:
                num_gram = jparams['ngram']
            if jparams and 'binary_flag' in jparams:
                binary_flag = eval(jparams['binary_flag'])
        except Exception as e:
            print "ERROR: user params error.", e.__doc__, e.message
            return -200

        # convert feast into array. output format: [ meta1,meta2,..., [feat1,feat2,...]]
        tmp_rdd = txt_rdd.map(lambda x: user_func(x, cust_featuring_params)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list).cache()
        print " tmp_rdd cnt=", tmp_rdd.count(
        ), ",ix=", data_idx, ",max f=", MAX_FEATURES, "ngram=", num_gram
        print "take(1) rdd=", tmp_rdd.take(1)

        # TBD for multivariant output format: [ meta1,meta2,..., [[feat1,feat2,...],[feat1,feat2,...],...]]

        # TBD only for num_gram available
        # for traditional ML, feat in a dict
        # output format: [ meta1,meta2,..., [[feat1,feat2,...],[feat1,feat2,...],...]]
        featured_rdd = tmp_rdd \
            .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is dict) \
            .filter(lambda x: type(x[metadata_count+1]) is dict) \
            .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \
            .cache()

        all_hashes_cnt_dic = None
        all_hash_str_dic = None
        all_hashes_seq_dic = None
    else:
        print "INFO: pattern_str=", pattern_str + "<--"
        print "INFO: ln_delimitor=", ln_delimitor + "<--"
        print "INFO: label_idx=", label_idx
        print "INFO: data_idx=", data_idx
        print "INFO: metadata_count=", metadata_count
        print "INFO: filter_ratio=", filter_ratio

        # filter top and least percentage of feature
        if not filter_ratio is None and filter_ratio > 0 and filter_ratio < 1:
            # check total count here before continue
            upper_cnt = total_input_count * (1 - filter_ratio)
            lower_cnt = total_input_count * filter_ratio
            # set limit for lower bound. if total count is large, lower_cnt may exclude all features...
            # max lower count =  min( MAX_FILTER_LOWER_CNT, total_input_count/100 )
            if not MAX_FILTER_LOWER_CNT is None and lower_cnt > MAX_FILTER_LOWER_CNT:
                if MAX_FILTER_LOWER_CNT > total_input_count / 100:
                    lower_cnt = total_input_count / 100
                else:
                    lower_cnt = MAX_FILTER_LOWER_CNT

            print "INFO: filtering by count, upper bound=", upper_cnt, ",lower bound=", lower_cnt
            # find unique feature, count them, remove them if in highest and lowest % and then create a dict
            f_feat_set = Set (txt_rdd.map(lambda x:x.split(ln_delimitor)).flatMap(lambda x:Set(x[metadata_count:])) \
                .map(lambda x:(x,1)).reduceByKey(lambda a, b: a + b) \
                .filter(lambda x:x[1]<= upper_cnt and x[1]>= lower_cnt) \
                .map(lambda x:x[0]).collect() )

            print "INFO: f_feat_set len=", len(f_feat_set)
            broadcast_f_set = sc.broadcast(f_feat_set)

            #txt_rdd=txt_rdd.map(lambda x: filter_by_list(x, metadata_count,ln_delimitor, broadcast_f_list.value ))
            txt_rdd=txt_rdd.map(lambda x: x.split(ln_delimitor)) \
                        .map(lambda x: x[:metadata_count]+ [w for w in x[metadata_count:] if w and w in broadcast_f_set.value]) \
                        .map(lambda x: ln_delimitor.join(x))

        # preprocess by pattern matching and then extract n-gram features   #.encode('UTF8')
        #   input txt_rdd format (string):  meta-data1\tmeta-data2\t...\tdataline1\tdataline2\t...datalineN\n
        #   output featured_rdd format (list):[meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
        #       hash_cnt_dic: {hash,hash:count,...}  hash_str_dic: {hash: 'str1',... }
        tmp_rdd = txt_rdd \
            .map(lambda x: preprocess_pattern(x, metadata_count, pattern_str, ln_delimitor \
                                                , label_idx, label_arr, convert2dirty )) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list) #.cache() memory issue...
        #tmp_rdd_count=tmp_rdd.count()
        #print "INFO: After preprocessing count=",tmp_rdd_count
        featured_rdd = tmp_rdd \
            .map(lambda x: feature_extraction_ngram(x, data_idx, MAX_FEATURES, num_gram)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is dict) \
            .filter(lambda x: type(x[metadata_count+1]) is dict) \
            .filter(lambda x: len(x[metadata_count])> int(feature_count_threshold) ) \
            .cache()
        #feat_rdd_count=featured_rdd.count()
        #print "INFO: After featuring count=",feat_rdd_count

        all_hashes_cnt_dic = None
        all_hash_str_dic = None
        all_hashes_seq_dic = None

    #get all hashes and total occurring count ===============
    #   all_hashes_cnt_dic: {'hash,hash': total count,... }
    if all_hashes_cnt_dic is None:
        #all_hashes_cnt_dic = featured_rdd.map(lambda x: x[metadata_count]).reduce(lambda a, b: combine_dic_cnt(a, b))
        all_hashes_cnt_dic = dict(
            featured_rdd.flatMap(lambda x: x[metadata_count].items()).
            reduceByKey(lambda a, b: a + b).collect())

    #get all hashes and their extracted string  ===============
    #   all_hash_str_dic: {hash:'str1', ...
    if all_hash_str_dic is None:
        #all_hash_str_dic = featured_rdd.map(lambda x: x[metadata_count+1]).reduce(lambda a, b: combine_dic(a, b))
        all_hash_str_dic = dict(
            featured_rdd.flatMap(
                lambda x: x[metadata_count + 1].items()).distinct().collect())

    # get all labels into an array  =============== provided by parameter?
    if label_arr is None:
        # will force "clean" be 0 here
        label_arr = sorted(
            featured_rdd.map(
                lambda x: x[label_idx].lower()).distinct().collect())
        # debug only
        print "INFO: label_arr.=", json.dumps(sorted(label_arr))

    # save labels to hdfs as text file==================================== ============
    hdfs_folder = hdfs_feat_dir  #+ "/"   # "/" is needed to create the folder correctly
    print "INFO: hdfs_folder=", hdfs_folder
    try:
        hdfs.mkdir(hdfs_folder)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at mkdir:", sys.exc_info()[0]

    # clean up metadata_file
    metadata_file = os.path.join(hdfs_folder, metadata)  #"metadata"
    print "INFO: metadata_file=", metadata_file
    try:
        hdfs.rmr(metadata_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at rmr():", sys.exc_info()[0]
    sc.parallelize(label_arr, 1).saveAsTextFile(metadata_file)

    #remap all hash values to continuous key/feature number ==============
    #     all_hashes_seq_dic: { hash : sequential_numb }
    if all_hashes_seq_dic is None:
        all_hashes_seq_dic = {}
        remap2seq(
            all_hashes_cnt_dic,
            all_hashes_seq_dic)  #all_hashes_seq_dic has continuous key number
    #print "all_hashes_seq_dic=",all_hashes_seq_dic
    total_feature_numb = len(all_hashes_seq_dic)
    print "INFO: Total feature count=", len(all_hashes_seq_dic)

    # featured_rdd (list):    [meta-data1,meta-data2,..., hash_cnt_dic, hash_str_dic]
    # seq_featured_rdd(list): [meta-data1,meta-data2,..., hash_cnthsh_dict, hash_str_dic] (feat id in sorted sequence)
    # hash_cnt_dic: {hash: count}  hash_str_dic: {hash: 'str1,str2...' }
    #     set binary_flag to True, all feature:value will be 1
    broadcast_dic = sc.broadcast(all_hashes_seq_dic)
    seq_featured_rdd = featured_rdd.map(lambda x: convert2seq(
        x, label_idx, data_idx, broadcast_dic.value, binary_flag=binary_flag)
                                        ).cache()

    # get hash_cnthsh_dict then flatMap and reduce to (feat id, count)
    ct_rdd = seq_featured_rdd.flatMap(lambda x: [(i[0], i[1]) for i in x[
        data_idx].iteritems()]).reduceByKey(lambda a, b: a + b)
    # sorted by feature id as int
    feat_sample_count_arr = ct_rdd.sortBy(lambda x: int(x[0])).map(
        lambda x: x[1]).collect()
    # sort after collect may fail when rdd is huge
    #feat_sample_count_arr=[]
    #for i in sorted(ct_rdd.collect(), key=lambda t: int(t[0])):
    #    feat_sample_count_arr.append(i[1])
    print "INFO: feat_sample_count_arr len=", len(feat_sample_count_arr)

    # save feat_sample_count_arr data ==================================== ============
    filter = '{"rid":' + row_id_str + ',"key":"feat_sample_count_arr"}'
    upsert_flag = True
    jo_insert = {}
    jo_insert["rid"] = eval(row_id_str)
    jo_insert["key"] = "feat_sample_count_arr"
    jo_insert["value"] = feat_sample_count_arr
    jstr_insert = json.dumps(jo_insert)
    ret = query_mongo.upsert_doc_t(mongo_tuples, filter, jstr_insert,
                                   upsert_flag)
    print "INFO: Upsert count for feat_sample_count_arr=", ret
    # insert failed, save to local
    if ret == 0:
        # drop old record in mongo
        ret = query_mongo.delete_many(mongo_tuples, None, filter)
        if not os.path.exists(local_out_dir):
            os.makedirs(local_out_dir)
        fsca_hs = os.path.join(local_out_dir, row_id_str,
                               row_id_str + "_feat_sample_count_arr.pkl")
        print "WARNING: save feat_sample_count_arr to local"
        ml_util.ml_pickle_save(feat_sample_count_arr, fsca_hs)

    # save feature data; TBD. not used. ==================================== ============

    #libsvm_rdd=seq_featured_rdd.map(lambda x: convert2libsvm(x,label_idx,data_idx,label_arr))
    # put hash to the front of each row, assume hash is after label
    libsvm_rdd = seq_featured_rdd.map(
        lambda x: x[label_idx + 1] + " " + convert2libsvm(
            x, label_idx, data_idx, label_arr))
    # debug only
    #print "libsvm_rdd="
    #for i in libsvm_rdd.collect():
    #    print i

    # get rdd statistics info
    stats = featured_rdd.map(lambda p: len(p[metadata_count])).stats()
    feat_count_max = stats.max()
    feat_count_stdev = stats.stdev()
    feat_count_mean = stats.mean()
    sample_count = stats.count()
    print "INFO: libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
    print "INFO:   ,max feature count=", feat_count_max
    # find sample count
    lbl_arr = featured_rdd.map(lambda x: (x[label_idx], 1)).reduceByKey(
        add).collect()
    print "INFO: Sample count by label=", lbl_arr

    # remove duplicated libsvm string; only keep the first duplicated item, assume space following key_idx
    if remove_duplicated == "Y":
        libsvm_rdd=libsvm_rdd \
            .map(lambda x: ( ','.join(x.split(' ')[metadata_count:]), x)) \
            .groupByKey().map(lambda x: list(x[1])[0] ) \
            .cache()
        cnt_list = libsvm_rdd.map(lambda x: (x.split(' ')[1], 1)).reduceByKey(
            add).collect()
        stats = libsvm_rdd.map(
            lambda x: len(x.split(' ')[metadata_count:])).stats()
        feat_count_max = stats.max()
        feat_count_stdev = stats.stdev()
        feat_count_mean = stats.mean()
        sample_count = stats.count()
        print "INFO: Non-Duplicated libsvm data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
        print "INFO:   ,max feature count=", feat_count_max
        print "INFO: Non-Duplicated Label count list=", cnt_list

    # clean up libsvm data ==================================== ============
    libsvm_data_file = os.path.join(hdfs_folder,
                                    libsvm_alldata_filename)  #"libsvm_data"
    print "INFO: libsvm_data_file=", libsvm_data_file
    try:
        #hdfs.ls(save_dir)
        #print "find hdfs folder"
        hdfs.rmr(libsvm_data_file)
        #if num_gram == 1:
        #   hdfs.rmr(dnn_data_file)
        #print "all files removed"
    except IOError as e:
        print "WARNING: I/O error({0}): {1} at libsvm_data_file clean up".format(
            e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info(
        )[0]

    #codec = "org.apache.hadoop.io.compress.GzipCodec"
    #libsvm_rdd.saveAsTextFile(libsvm_data_file, codec)
    libsvm_rdd.saveAsTextFile(libsvm_data_file)  # TBD encrypted

    feat_count_file = libsvm_data_file + "_feat_count"
    print "INFO: feat_count_file=", feat_count_file
    try:
        hdfs.rmr(feat_count_file)
    except IOError as e:
        print "WARNING: I/O error({0}): {1} at feat_count clean up".format(
            e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm feature count clean up:", sys.exc_info(
        )[0]
    sc.parallelize([total_feature_numb], 1).saveAsTextFile(feat_count_file)

    label_dic = {}
    # assign label a number
    for idx, label in enumerate(sorted(label_arr)):
        if not label in label_dic:
            label_dic[
                label] = idx  #starting from 0, value = idx, e.g., clean:0, dirty:1

    # output text for DNN:[meta-data1,meta-data2,..., [feature tokens]] ================= DNN ===========
    if num_gram == 1:  # special flag to tokenize and keep input orders
        print "INFO: processing data for DNN..."
        # create token dict
        # str_hash_dict: string to hash
        # all_hashes_seq_dic: hash to seq id
        if token_dict is None or len(token_dict) == 0:
            token_dict = {}
            str_hash_dict = {v: k for k, v in all_hash_str_dic.iteritems()}
            for k, v in str_hash_dict.iteritems():
                token_dict[k] = int(all_hashes_seq_dic[str(v)])
            #print "token_dict=",len(token_dict),token_dict

        dnn_rdd = tmp_rdd \
            .map(lambda x: tokenize_by_dict(x, data_idx, token_dict,label_idx, label_dic)) \
            .filter(lambda x: len(x) > metadata_count) \
            .filter(lambda x: type(x[metadata_count]) is list)
        #.cache()
        # filter duplication here
        #print dnn_rdd.take(3)

        dnn_data_file = os.path.join(hdfs_folder,
                                     dnn_alldata_filename)  #"dnn_data"
        print "INFO: dnn_data_file=", dnn_data_file
        try:
            hdfs.rmr(dnn_data_file)
        except IOError as e:
            print "WARNING: I/O error({0}): {1} at dnn_data_file clean up".format(
                e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at libsvm file clean up:", sys.exc_info(
            )[0]

        # clean up data
        dnn_npy_gz_file = os.path.join(hdfs_folder, row_id_str + "_dnn_")
        print "INFO: dnn_npy_gz_file=", dnn_npy_gz_file
        try:
            hdfs.rmr(dnn_npy_gz_file + "data.npy.gz")
            hdfs.rmr(dnn_npy_gz_file + "label.npy.gz")
            hdfs.rmr(dnn_npy_gz_file + "info.npy.gz")
        except IOError as e:
            print "WARNING: I/O error({0}): {1} at dnn_npy clean up".format(
                e.errno, e.strerror)
        except:
            print "WARNING: Unexpected error at dnn_npy file clean up:", sys.exc_info(
            )[0]
        # save new data
        try:
            dnn_rdd.saveAsTextFile(dnn_data_file)
        except:
            print "WARNING: Unexpected error at saving dnn data:", sys.exc_info(
            )[0]
        # show data statistics
        try:
            stats = dnn_rdd.map(lambda p: len(p[metadata_count])).stats()
            feat_count_max = stats.max()
            feat_count_stdev = stats.stdev()
            feat_count_mean = stats.mean()
            sample_count = stats.count()
            print "INFO: DNN data: sample count=", sample_count, ",Feat count mean=", feat_count_mean, ",Stdev=", feat_count_stdev
            print "INFO:   ,max feature count=", feat_count_max
        except:
            print "WARNING: Unexpected error at getting stats of dnn_rdd:", sys.exc_info(
            )[0]

    # clean up pca data in hdfs ============ ========================
    pca_files = '*' + libsvm_alldata_filename + "_pca_*"
    #print "INFO: pca_files=", pca_files
    try:
        f_list = hdfs.ls(hdfs_folder)
        if len(f_list) > 0:
            df_list = fnmatch.filter(f_list, pca_files)
            for f in df_list:
                print "INFO: rm ", f
                hdfs.rmr(f)
    except IOError as e:
        print "WARNING: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "WARNING: Unexpected error at libsvm pca file clean up:", sys.exc_info(
        )[0]

    # clean up pca data in web local ============ ========================
    pca_fname = os.path.join(model_data_folder, row_id_str + '_pca_*.pkl*')
    print "INFO: pca_fname=", pca_fname

    try:
        for fl in glob.glob(pca_fname):
            print "INFO: remove ", fl
            os.remove(fl)
    except OSError, e:
        print("Error: %s - %s." % (e.pca_fname, e.strerror))