Example #1
def train(train_docs, main_save_path,
          config_name, model_name, train_name, param_name, svm_param, ratio, delete,
          param_select, global_fun, local_fun, stopword_filename=""):
    '''
    Automated training pipeline: segment the text, run feature selection,
    rebuild the dictionary, then search for the optimal SVM parameters under
    the new dictionary. Train the SVM classifier with those parameters and
    save the resulting model.
    Files saved (a main save path must be given):
        model files: dictionary (.key) + model (.model)
        temp files:  SVM training data file (.train)
    stopword_filename: path to a stop-word file; "" (the default) disables
    stop words.
    '''

    print "-----------------创建模型文件保存的路径-----------------"
    if os.path.exists(main_save_path):
        if os.path.exists(os.path.join(main_save_path, "model")) is False:
            os.makedirs(os.path.join(main_save_path, "model"))
    if os.path.exists(main_save_path):
        if os.path.exists(os.path.join(main_save_path, "temp")) is False:
            os.makedirs(os.path.join(main_save_path, "temp"))

    # Read the stop-word file; an empty stopword_filename disables stop words
    if stopword_filename == "":
        stop_words_dic = dict()
    else:
        stop_words_dic = utils.read_dic(stopword_filename)

    print "-----------------现在正在进行特征选择---------------"
    dic_path = os.path.join(main_save_path, "model", "dic.key")
    feature_select.feature_select(train_docs, global_fun, dic_path, ratio, stop_words_dic)

    print "-----------------再根据特征选择后的词典构造新的SVM分类所需的训练样本------------------- "
    problem_save_path = os.path.join(main_save_path, "temp", train_name)
    label = cons_train_sample_for_cla(train_docs, measure.local_f(local_fun), dic_path, problem_save_path, delete)

    print"--------------------选择最优的c,g------------------------------"
    if param_select is True:
        search_result_save_path = os.path.join(main_save_path, "temp", param_name)

        coarse_c_range = (-5, 7, 2)
        coarse_g_range = (1, 1, 1)
        fine_c_step = 0.5
        fine_g_step = 0
        c, g = grid_search_param.grid(problem_save_path, search_result_save_path, coarse_c_range,
                                      coarse_g_range, fine_c_step, fine_g_step)
        svm_param = " -c " + str(c)

    print "-----------------训练模型,并将模型进行保存----------"
    model_save_path = os.path.join(main_save_path, "model", model_name)
    ctm_train_model(problem_save_path, svm_param, model_save_path)

    print "-----------------保存模型配置-----------------"
    f_config = file(os.path.join(main_save_path, "model", config_name), 'w')
    save_config(f_config, model_name, local_fun, global_fun, svm_param, label)
    f_config.close()

    print "-----------------训练结束---------------------"
Example #2
def ctm_feature_select(filename, indexes, global_fun, main_save_path, dic_name,
                       ratio, stopword_filename, str_splitTag, tc_splitTag):
    # If the directory for the model files does not exist, create it
    dic_path = main_save_path + "model/" + dic_name
    if os.path.exists(main_save_path):
        if os.path.exists(main_save_path + "model/") is False:
            os.makedirs(main_save_path + "model/")
    # If no stop-word filename is given, stop words are not used
    if stopword_filename == "":
        stop_words_dic = dict()
    else:
        stop_words_dic = fileutil.read_dic(stopword_filename)
    feature_select(filename, indexes, global_fun, dic_path, ratio,
                   stop_words_dic, str_splitTag, tc_splitTag)
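A hedged call sketch for ctm_feature_select; the paths, field index, and split
tags are hypothetical, while "idf" and 0.4 mirror the examples in the
ctm_train docstring below.

ctm_feature_select("data/train.txt", [1], "idf", "save_dir/", "dic.key",
                   0.4, "", "^", "\t")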
Example #3
def estimate_lift(model_data, item_id, talk=True):
    """ Estimates the lift of the given item. """

    y_normalization = 10000  # Used to fix units of the y variables
    pd.options.mode.chained_assignment = None  # Stops printing a warning that is not relevant

    # Get the category
    category = 0
    for i in range(1, 7):
        if model_data.loc[((model_data['item_id'] == item_id) &
                           (model_data['is_cat_' + str(i)] == 1))].empty:
            continue
        else:
            category = i
            break

    # Get the rows for this category (all items)
    X, y = feature_select.feature_select(
        prep_data.get_category(model_data, category), category)

    # Get the promo period range
    start_week = X.loc[((X['on_promo'] == 1) & (X['item_id'] == item_id)),
                       'week'].min()
    end_week = X.loc[((X['on_promo'] == 1) & (X['item_id'] == item_id)),
                     'week'].max()

    # Get the total normalized sales during the promotion
    promotion_sales = model_data.loc[((model_data['item_id'] == item_id) &
                                      (model_data['week'] >= start_week) &
                                      (model_data['week'] <= end_week)),
                                     'normalized_sales'].sum()

    # Estimate the sales during the same period if there was no promotion
    X_item = X.loc[((X['item_id'] == item_id) & (X['week'] >= start_week) &
                    (X['week'] <= end_week))]
    X_item['on_promo'] = 0
    y_no_promo = model.main_model(X, y, X_item) / y_normalization

    if talk:
        # A promotion running from start_week through end_week inclusive spans
        # (end_week - start_week + 1) weeks; using that count also avoids a
        # division by zero for single-week promotions.
        n_weeks = end_week - start_week + 1
        print("Item", item_id)
        print("Promo period:", n_weeks, "weeks")
        print("Available data points were:", X_item.shape[0])
        print("Estimated lift per week: ",
              round(100 * (promotion_sales - y_no_promo.sum()) / n_weeks, 2),
              "%\n",
              sep='')
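A hedged usage sketch: the CSV path and item id are hypothetical, and
model_data is assumed to carry the item_id, week, is_cat_1..is_cat_6,
on_promo, and normalized_sales columns referenced above.

model_data = pd.read_csv("model_data.csv")
estimate_lift(model_data, item_id=12345)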
Example #4
# Imports assumed for this snippet; fetch_mldata was removed from newer
# scikit-learn releases (fetch_openml is the modern replacement).
import numpy as np
from sklearn import svm
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split, cross_val_score
from tqdm import tqdm

import depmeas
import feature_select

RANDOM_SEED = 123  # assumed value; not shown in the original snippet
NUM_CV = 10        # assumed number of cross-validation folds
MAX_ITER = 1000

leuk = fetch_mldata('leukemia', transpose_data=True)
X = leuk['data']
y = leuk['target']

# split the data for testing
(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_SEED)

# perform feature selection
num_features_to_select = 25
K_MAX = 1000
estimator = depmeas.mi_tau
n_jobs = -1
feature_ranking_idxs = feature_select.feature_select(
    X_train, y_train,
    num_features_to_select=num_features_to_select, K_MAX=K_MAX,
    estimator=estimator, n_jobs=n_jobs)
num_selected_features = len(feature_ranking_idxs)

# for each feature, compute the accuracy on the test data as we add features
mean_acc = np.empty((num_selected_features,))
std_acc = np.empty((num_selected_features,))
for ii in tqdm(range(num_selected_features),
               desc='Computing Classifier Performance...'):
    classifier = svm.SVC(random_state=RANDOM_SEED, max_iter=MAX_ITER)
    X_test_in = X_test[:, feature_ranking_idxs[0:ii + 1]]
    scores = cross_val_score(classifier, X_test_in, y_test,
                             cv=NUM_CV, n_jobs=-1)
    mean_acc[ii] = scores.mean()
    std_acc[ii] = scores.std()
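A short follow-up sketch (matplotlib assumed; it is not used in the original)
that plots the accuracy curve computed above:

import matplotlib.pyplot as plt

x = np.arange(1, num_selected_features + 1)
plt.errorbar(x, mean_acc, yerr=std_acc, fmt='-o', capsize=3)
plt.xlabel('Number of selected features')
plt.ylabel('Cross-validated accuracy')
plt.title('Accuracy vs. number of ranked features')
plt.show()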
Example #5
def ctm_train(filename, indexes, main_save_path, stopword_filename, svm_param,
              config_name, dic_name, model_name, train_name, svm_type,
              param_name, ratio, delete, str_splitTag, tc_splitTag, seg,
              param_select, global_fun, local_fun, label_file):
    '''Automated training pipeline: segment the text, run feature selection,
    rebuild the dictionary, then search for the optimal SVM parameters under
    the new dictionary. Train the SVM classifier with those parameters and
    save the resulting model.
    Files saved (a main save path must be given):
        model files: dictionary (.key) + model (.model)
        temp files:  SVM training data file (.train)
    filename           file containing the training text
    indexes            the field indexes to train on
    main_save_path     path where the model is saved
    stopword_filename  name and path of the stop-word file; by default stop
                       words are not used
    svm_type           SVM type: libsvm or liblinear
    svm_param          user-supplied SVM parameters; note the different option
                       restrictions of libsvm and liblinear, e.g. "-s 0 -t 2 -c 0.2"
    dic_name           user-defined dictionary name, e.g. "dic.key"
    model_name         user-defined model name, e.g. "svm.model"
    train_name         user-defined training-sample name, e.g. "svm.train"
    param_name         user-defined parameter file name, e.g. "svm.param"
    ratio              fraction of words kept after feature selection, e.g. 0.4
    delete             whether to drop samples whose feature values are all
                       zero, True or False
    str_splitTag       delimiter used for word segmentation, e.g. "^"
    tc_splitTag        delimiter between fields of a training sample, e.g. "\t"
    seg                segmentation choice: 0 = none, 1 = mmseg, 2 = aliws
    param_select       whether to grid-search the SVM parameters: True runs
                       the grid search, False skips it
    local_fun          local weighting used when computing feature weights,
                       x(i,j) = local(i,j) * global(i); options: tf, logtf
    global_fun         global weighting: "one", "idf", or "rf"
    label_file         file explaining the class labels
    '''

    print "-----------------创建模型文件保存的路径-----------------"
    if os.path.exists(main_save_path):
        if os.path.exists(os.path.join(main_save_path, "model")) is False:
            os.makedirs(os.path.join(main_save_path, "model"))
    if os.path.exists(main_save_path):
        if os.path.exists(os.path.join(main_save_path, "temp")) is False:
            os.makedirs(os.path.join(main_save_path, "temp"))

    # Set the SVM type

    tms_svm.set_svm_type(svm_type)

    # If no stop-word filename is given, stop words are not used
    if stopword_filename == "":
        stop_words_dic = dict()
    else:
        stop_words_dic = fileutil.read_dic(stopword_filename)

    # Segment the source file if segmentation is requested
    if seg != 0:
        print "-----------------Segmenting the source text-------------------"
        segment_file = os.path.join(os.path.dirname(filename), "segmented")
        segment.file_seg(filename, indexes, segment_file, str_splitTag,
                         tc_splitTag, seg)
        filename = segment_file

    # Stem the original training samples
    print "-----------------Stemming the source text-------------------"
    stem.stemFile(filename, str_splitTag, tc_splitTag)

    print "-----------------Running feature selection---------------"
    dic_path = os.path.join(main_save_path, "model", dic_name)
    feature_select(filename,
                   indexes,
                   global_fun,
                   dic_path,
                   ratio,
                   stop_words_dic,
                   str_splitTag=str_splitTag,
                   tc_splitTag=tc_splitTag)

    print "-----------------再根据特征选择后的词典构造新的SVM分类所需的训练样本------------------- "
    problem_save_path = os.path.join(main_save_path, "temp", train_name)
    local_fun_str = local_fun
    local_fun = measure.local_f(local_fun)
    label = cons_train_sample_for_cla(filename, indexes, local_fun, dic_path,
                                      problem_save_path, delete, str_splitTag,
                                      tc_splitTag)

    if param_select:
        print "--------------------Searching for the optimal c and g------------------------------"
        search_result_save_path = os.path.join(main_save_path, "temp", param_name)
        if svm_type == "libsvm":
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (3, -10, -2)
            fine_c_step = 0.5
            fine_g_step = 0.5
            c, g = grid_search_param.grid(problem_save_path,
                                          search_result_save_path, svm_type,
                                          coarse_c_range, coarse_g_range,
                                          fine_c_step, fine_g_step)
            svm_param = svm_param + " -c " + str(c) + " -g " + str(g)
        if svm_type == "liblinear" or (svm_type == "libsvm" and
                                       is_linear_kernal(svm_param) is True):
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (1, 1, 1)
            fine_c_step = 0.5
            fine_g_step = 0
            c, g = grid_search_param.grid(problem_save_path,
                                          search_result_save_path, svm_type,
                                          coarse_c_range, coarse_g_range,
                                          fine_c_step, fine_g_step)
            svm_param = svm_param + " -c " + str(c)

    print "-----------------训练模型,并将模型进行保存----------"
    model_save_path = main_save_path + "model/" + model_name
    ctm_train_model(problem_save_path, svm_type, svm_param, model_save_path)

    print "-----------------保存模型配置-----------------"
    f_config = file(os.path.join(main_save_path, "model", config_name), 'w')
    save_config(f_config, dic_name, model_name, local_fun_str, global_fun, seg,
                svm_type, svm_param, label_file, label)
    f_config.close()
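A hedged call sketch for ctm_train; every path and file name is hypothetical,
while the svm_param string, ratio, weighting functions, and split tags reuse
the examples from the docstring.

ctm_train("data/train.txt", [1], "save_dir/", "", "-s 0 -t 2 -c 0.2",
          "svm.config", "dic.key", "svm.model", "svm.train", "libsvm",
          "svm.param", 0.4, True, "^", "\t", 1, True, "idf", "tf",
          "labels.txt")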
Example #7
import os

import pandas as pd
from tensorflow.keras import models  # assumed import; the original may use keras directly

import feature_select
import predict_lstm

pred_index = "11-13-2020"
model_path = "D:/Trend_reporter/saved_model/"
data_path = "D:/Trend_reporter/data/catgwise/생활+건강/"
s = 60          # look-back window length for the LSTM (assumed)
pred_time = 30  # number of future steps to predict

temp = pd.DataFrame()

for d in os.listdir(data_path):
    orig = pd.read_csv(data_path + d)
    orig["date"] = pd.to_datetime(orig["date"])
    orig_dat = orig.set_index("date")
    #print(orig_dat.iloc[:,0:1])
    for m in os.listdir(model_path):
        if "{}".format(d.replace(".csv", "")) in m and "best" in m:
            feature = feature_select.feature_select(m)
            model = models.load_model(model_path + m)
            pred_df = predict_lstm.predict_lstm(orig_dat, model, s, pred_index,
                                                pred_time, "clicks_ma_ratio",
                                                feature)
            del pred_df["clicks_ma_ratio"]
            # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
            hap = pd.concat([orig_dat.iloc[:, 0:1], pred_df.iloc[1:, 0:1]])
            hap.columns = [d]
            if temp.empty:
                temp = hap
            else:
                temp = temp.join(hap, how="left")

temp.to_csv("output2.csv", encoding="cp949")
Example #8
        print(feature_reducer.upper() + ' - %s features' %
              (str(component_num)))
        dimension_model = fre_.feature_reduce(feature_reducer, X_train,
                                              y_train, component_num)
        # print(len(dimension_model))
        estimators.append((feature_reducer, dimension_model))

################################################
##	    	   Feature selection              ##
################################################
if select_features:
    import feature_select as fse_
    for i in range(len(default_selectors)):
        feature_selector = default_selectors[i]
        print(feature_selector.upper() + ' - %s features' % (str(feature_num)))
        selection_model = fse_.feature_select(feature_selector, X_train,
                                              y_train, feature_num)
        estimators.append((feature_selector, selection_model))

print(estimators)
model = Pipeline(estimators)

# make all train and test data into binary labels
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
if train_type == 'c':
    le = preprocessing.LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
'''
>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
'''
Example #9
        sys.exit()
train_file = args[0]
test_file = args[1]

train_X, train_y = load_svmlight_file(train_file)
train_x = train_X.toarray()
test_X, test_y = load_svmlight_file(test_file)
test_x = test_X.toarray()
number = len(train_x[0])

corrcoef = pcorr.corrcoef_result(train_x, train_y)
infogain = info.infogain_result(train_x, train_y)

n_i = len(train_x[0])
if feature_number == 'best':
    del_feature = fs.feature_select(train_x, train_y, test_x, test_y, infogain)
    print '\nNumber of features after feature selection:\n'
    remain_feature = n_i - del_feature
    print(remain_feature)
    info_s = infogain[:]
    x_s = train_x[:]
    y_s = test_x[:]
    del_n = fs.del_feature_select(info_s, del_feature)
    x_train_array = fs.del_data(x_s, del_n)
    x_test_array = fs.del_data(y_s, del_n)
    np.savetxt('train.txt',
               x_train_array,
               fmt=('%f\t' * remain_feature),
               newline='\n')
    np.savetxt('test.txt',
               x_test_array,
               fmt=('%f\t' * remain_feature),
               newline='\n')