def cons_train_sample_for_cla(filename,indexs,local_fun,dic_path,sample_save_path,delete,str_splitTag,tc_splitTag):
    '''Build SVM training samples from selected fields of a text file and save them.

    The dictionary and global weights are read from dic_path, the file is
    handed to stem.stemFile, and every line is then split on tc_splitTag.
    Field 0 is passed to ctmutil.cons_pro_for_svm as the label; the fields
    listed in indexs are concatenated and split on str_splitTag into tokens.

    filename: path of the training text file
    indexs: field positions to use; the last entry is treated as the largest
    local_fun: local weighting function, or its name (resolved via measure.local_f)
    dic_path: path of the dictionary file
    sample_save_path: path the generated samples are written to
    delete: when True, samples whose x[0] is empty are skipped
    str_splitTag: separator between tokens inside a field
    tc_splitTag: separator between the fields of a line
    Returns the set of labels (y[0]) of the samples that were written.
    '''
    dic_list,global_weight = fileutil.read_dic_ex(dic_path,dtype=str)
    # Accept either a ready-made weighting function or its name.
    if isinstance(local_fun,types.StringType):
        local_fun = measure.local_f(local_fun)
    label = set()
    # Stem the source text before it is read below.
    print("-----------------正在对源文本进行词干化处理-------------------")
    stem.stemFile(filename,str_splitTag,tc_splitTag)

    # Context managers close both files even if a line fails to convert.
    with open(filename,'r') as f:
        with open(sample_save_path,'w') as fs:
            for line in f:
                text = line.strip().split(tc_splitTag)
                # Skip lines that do not contain every requested field.
                if len(text)<indexs[-1]+1:
                    continue
                # Same string as the original "+=" loop: each field is prefixed with str_splitTag.
                text_temp = "".join([str_splitTag+text[i] for i in indexs])
                y,x = ctmutil.cons_pro_for_svm(text[0],text_temp.strip().split(str_splitTag),dic_list,local_fun,global_weight)
                if delete == True and len(x[0])==0:
                    continue
                save_dic_train_sample(fs,y,x)
                label.add(y[0])
    return label
Ejemplo n.º 2
0
def to_svm(tids, global_weight_dic, local_fun, class_id=None):
    """Turn a sequence of term ids into a weighted, length-normalised feature dict.

    :param tids: iterable of term ids; ids not in global_weight_dic are ignored
    :param global_weight_dic: mapping from term id to its global weight
    :param local_fun: value passed to measure.local_f to obtain the local weighting function
    :param class_id: optional class label, returned alongside the features when not None
    :return: feat, or the tuple (feat, class_id) when class_id is not None
    """
    weigh = measure.local_f(local_fun)

    # Count how often each known term id occurs.
    feat = {}
    for tid in tids:
        if tid not in global_weight_dic:
            continue
        feat[tid] = feat.get(tid, 0.0) + 1.0

    # Replace each count with local(count) * global weight.
    for tid in feat:
        feat[tid] = 1.0 * weigh(feat[tid]) * global_weight_dic[tid]

    # Divide by the Euclidean length unless the vector is all zeros.
    norm = math.sqrt(sum([value**2.0 for value in feat.values()]))
    if norm != 0:
        for tid in feat:
            feat[tid] = 1.0 * feat[tid] / norm

    if class_id is None:
        return feat
    return feat, class_id
Ejemplo n.º 3
0
def cons_train_sample_for_cla(filename, indexs, local_fun, dic_path,
                              sample_save_path, delete, str_splitTag,
                              tc_splitTag):
    '''Build SVM training samples from selected fields of a text file and save them.

    The dictionary and global weights are read from dic_path, the file is
    handed to stem.stemFile, and every line is then split on tc_splitTag.
    Field 0 is passed to ctmutil.cons_pro_for_svm as the label; the fields
    listed in indexs are concatenated and split on str_splitTag into tokens.

    filename: path of the training text file
    indexs: field positions to use; the last entry is treated as the largest
    local_fun: local weighting function, or its name (resolved via measure.local_f)
    dic_path: path of the dictionary file
    sample_save_path: path the generated samples are written to
    delete: when True, samples whose x[0] is empty are skipped
    str_splitTag: separator between tokens inside a field
    tc_splitTag: separator between the fields of a line
    Returns the set of labels (y[0]) of the samples that were written.
    '''
    dic_list, global_weight = fileutil.read_dic_ex(dic_path, dtype=str)
    # Accept either a ready-made weighting function or its name.
    if isinstance(local_fun, types.StringType):
        local_fun = measure.local_f(local_fun)
    label = set()
    # Stem the source text before it is read below.
    print("-----------------正在对源文本进行词干化处理-------------------")
    stem.stemFile(filename, str_splitTag, tc_splitTag)

    # Context managers close both files even if a line fails to convert.
    with open(filename, 'r') as f:
        with open(sample_save_path, 'w') as fs:
            for line in f:
                text = line.strip().split(tc_splitTag)
                # Skip lines that do not contain every requested field.
                if len(text) < indexs[-1] + 1:
                    continue
                # Same string as the original "+=" loop: each field is
                # prefixed with str_splitTag.
                text_temp = "".join([str_splitTag + text[i] for i in indexs])
                y, x = ctmutil.cons_pro_for_svm(
                    text[0], text_temp.strip().split(str_splitTag),
                    dic_list, local_fun, global_weight)
                if delete == True and len(x[0]) == 0:
                    continue
                save_dic_train_sample(fs, y, x)
                label.add(y[0])
    return label
Ejemplo n.º 4
0
def to_svm(tids, global_weight_dic, local_fun, class_id=None):
    """Build the weighted, length-normalised feature dict used as SVM input.

    :param tids: iterable of term ids; ids not in global_weight_dic are ignored
    :param global_weight_dic: mapping from term id to its global weight
    :param local_fun: value passed to measure.local_f to obtain the local weighting function
    :param class_id: optional class label, returned alongside the features when not None
    :return: feat, or the tuple (feat, class_id) when class_id is not None
    """
    local_weight = measure.local_f(local_fun)
    feat = {}

    # Term frequency of every id that has a global weight.
    for term in tids:
        if term not in global_weight_dic:
            continue
        feat[term] = feat.get(term, 0.0) + 1.0

    # Feature weight = local(term frequency) * global weight.
    for term in feat:
        feat[term] = 1.0 * local_weight(feat[term]) * global_weight_dic[term]

    # Normalise to unit Euclidean length; an all-zero vector is left as is.
    squared_total = sum([value**2.0 for value in feat.values()])
    length = math.sqrt(squared_total)
    if length != 0:
        for term in feat:
            feat[term] = 1.0 * feat[term] / length

    if class_id is None:
        return feat
    return feat, class_id
Ejemplo n.º 5
0
def load_conf(model_dir, conf_file):
    """Return the local weighting function named in a model configuration file.

    The file is read as "key:value" lines. For every line whose key is
    "LocalFun" the value is resolved with measure.local_f; if several such
    lines exist, the last one wins.

    :param model_dir: directory that contains the configuration file
    :param conf_file: file name of the configuration file inside model_dir
    :return: the result of measure.local_f for the configured name
    :raises ValueError: if the file contains no "LocalFun:<name>" line
    """
    conf_path = os.path.join(model_dir, conf_file)
    found = False
    local_fun = None
    # The context manager closes the file, which the plain open never did.
    with open(conf_path, 'r') as f:
        for line in f:
            text = line.split(":")
            # A key without a value part cannot name a function; skip it.
            if len(text) > 1 and text[0].strip() == "LocalFun":
                local_fun = measure.local_f(text[1].strip())
                found = True
    if not found:
        # Previously this surfaced as an UnboundLocalError on return.
        raise ValueError("no LocalFun entry found in " + conf_path)
    return local_fun
Ejemplo n.º 6
0
def cal_sc_optim(lab,m,text,dic_list,local_fun,global_weight,str_splitTag):
    '''Predict the label and score of one text with model m.

    The text is stripped and split on str_splitTag, converted with
    cons_pro_for_svm using the dictionary, the local weighting function
    resolved from local_fun and the global weights, and then passed to
    tms_svm.predict.
    Returns (first predicted label, tms_svm.classer_value of the first score).
    '''
    weight_fun = measure.local_f(local_fun)
    tokens = text.strip().split(str_splitTag)
    y,x = cons_pro_for_svm(lab,tokens,dic_list,weight_fun,global_weight)
    labels,_accuracy,scores = tms_svm.predict(y,x,m)
    top_label = labels[0]
    top_score = tms_svm.classer_value(scores[0])
    return top_label,top_score
def cal_sc_optim(lab,m,text,dic_list,local_fun,global_weight,str_splitTag):
    '''Predict the label and score of one text with model m.

    The text is stripped and split on str_splitTag, converted with
    ctmutil.cons_pro_for_svm using the dictionary, the local weighting
    function resolved from local_fun and the global weights, and then passed
    to tms_svm.predict.
    Returns (first predicted label, tms_svm.classer_value of the first score).
    Per the original author's note, the score is meant to be the raw score
    for binary problems and a combined score for multi-class problems.
    '''
    weight_fun = measure.local_f(local_fun)
    tokens = text.strip().split(str_splitTag)
    y,x = ctmutil.cons_pro_for_svm(lab,tokens,dic_list,weight_fun,global_weight)
    labels,_accuracy,scores = tms_svm.predict(y,x,m)
    # TODO (from the original author): decide here between binary and
    # multi-class; return the score for binary, the predicted label otherwise.
    top_label = labels[0]
    top_score = tms_svm.classer_value(scores[0])
    return top_label,top_score
Ejemplo n.º 8
0
def train(train_docs, main_save_path,
          config_name, model_name, train_name, param_name, svm_param, ratio, delete,
          param_select, global_fun, local_fun):
    '''
    Automated training pipeline: feature selection, dictionary construction,
    optional SVM parameter search, model training and configuration saving.

    Files are written below main_save_path:
        model/dic.key, model/<model_name>, model/<config_name>
        temp/<train_name> (SVM training samples), temp/<param_name> (search results)

    train_docs: training documents, passed to feature selection and sample construction
    main_save_path: root directory for the model/ and temp/ sub-directories
    config_name, model_name, train_name, param_name: output file names
    svm_param: SVM parameter string; replaced by " -c <c>" when param_select is True
    ratio: passed to feature_select.feature_select
    delete: passed to cons_train_sample_for_cla
    param_select: when True, run grid_search_param.grid to choose c
    global_fun: global weighting passed to feature selection and saved in the config
    local_fun: name of the local weighting function, resolved via measure.local_f
    '''

    print("-----------------创建模型文件保存的路径-----------------")
    # os.makedirs also creates main_save_path itself, so the sub-directories
    # are created even when the root does not exist yet.
    model_dir = os.path.join(main_save_path, "model")
    temp_dir = os.path.join(main_save_path, "temp")
    for directory in (model_dir, temp_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Load the stop-word file.
    # NOTE(review): stopword_filename is not a parameter of this function; it
    # is looked up as a module-level name. Confirm that it is defined.
    if stopword_filename == "":
        stop_words_dic = dict()
    else:
        stop_words_dic = utils.read_dic(stopword_filename)

    print("-----------------现在正在进行特征选择---------------")
    dic_path = os.path.join(model_dir, "dic.key")
    feature_select.feature_select(train_docs, global_fun, dic_path, ratio, stop_words_dic)

    print("-----------------再根据特征选择后的词典构造新的SVM分类所需的训练样本------------------- ")
    problem_save_path = os.path.join(temp_dir, train_name)
    label = cons_train_sample_for_cla(train_docs, measure.local_f(local_fun), dic_path, problem_save_path, delete)

    print("--------------------选择最优的c,g------------------------------")
    if param_select is True:
        search_result_save_path = os.path.join(temp_dir, param_name)

        coarse_c_range = (-5, 7, 2)
        coarse_g_range = (1, 1, 1)
        fine_c_step = 0.5
        fine_g_step = 0
        c, g = grid_search_param.grid(problem_save_path, search_result_save_path, coarse_c_range,
                                      coarse_g_range, fine_c_step, fine_g_step)
        # Only c is used; the caller-supplied svm_param is replaced.
        svm_param = " -c " + str(c)

    print("-----------------训练模型,并将模型进行保存----------")
    model_save_path = os.path.join(model_dir, model_name)
    ctm_train_model(problem_save_path, svm_param, model_save_path)

    print("-----------------保存模型配置-----------------")
    # The context manager closes the config file even if save_config raises.
    with open(os.path.join(model_dir, config_name), 'w') as f_config:
        save_config(f_config, model_name, local_fun, global_fun, svm_param, label)

    print("-----------------训练结束---------------------")
Ejemplo n.º 9
0
def cal_sc_optim(lab, m, text, dic_list, local_fun, global_weight,
                 str_splitTag):
    '''Predict the label and score of one text with model m.

    The text is stripped and split on str_splitTag, converted with
    cons_pro_for_svm using the dictionary, the local weighting function
    resolved from local_fun and the global weights, and then passed to
    tms_svm.predict.
    Returns (first predicted label, tms_svm.classer_value of the first score).
    '''
    weight_fun = measure.local_f(local_fun)
    tokens = text.strip().split(str_splitTag)
    y, x = cons_pro_for_svm(lab, tokens, dic_list, weight_fun, global_weight)
    labels, _accuracy, scores = tms_svm.predict(y, x, m)
    top_label = labels[0]
    top_score = tms_svm.classer_value(scores[0])
    return top_label, top_score
def load_tms_model(config_file):
    '''Load the dictionary, global weights, local weighting function, SVM model
    and word-segmentation setting described by a model configuration file.

    The file is read as "key:value" lines. DicName and ModelName are file
    names relative to the directory of config_file. Loading the model also
    sets the SVM type detected from the model file via tms_svm.set_svm_type.

    config_file: path of the model configuration file
    Returns (local_fun, dic, global_weight, model, seg).
    Raises ValueError if DicName, ModelName, LocalFun or WordSeg is missing.
    '''
    model_main_path = os.path.dirname(config_file)
    required = ("DicName","ModelName","LocalFun","WordSeg")
    found = set()
    # The context manager closes the file, which the plain open never did.
    with open(config_file,'r') as f:
        for line in f:
            text = line.split(":")
            # Lines without a value part cannot configure anything.
            if len(text)<2:
                continue
            key = text[0].strip()
            value = text[1].strip()
            if key=="DicName":
                dic,global_weight = fileutil.read_dic_ex(os.path.join(model_main_path,value),dtype=str)
            elif key=="ModelName":
                model_path = os.path.join(model_main_path,value)
                tms_svm.set_svm_type(tms_svm.detect_svm_type(model_path))
                model = tms_svm.load_model(model_path)
            elif key=="LocalFun":
                local_fun = measure.local_f(value)
            elif key=="WordSeg":
                seg = int(float(value))
            else:
                continue
            found.add(key)
    missing = [key for key in required if key not in found]
    if missing:
        # Previously this surfaced as an UnboundLocalError on return.
        raise ValueError("missing entries in %s: %s" % (config_file,", ".join(missing)))
    return local_fun,dic,global_weight,model,seg
Ejemplo n.º 11
0
def cons_train_sample_for_cla(train_docs, local_fun, dic_path, sample_save_path, delete):
    '''Build SVM training samples from labelled documents and save them.

    Each element of train_docs is a line of the form "<label>\\t<tokens>",
    with the tokens separated by single spaces. The tokens are converted with
    utils.cons_pro_for_svm using the dictionary read from dic_path.

    train_docs: iterable of "<label>\\t<tokens>" strings
    local_fun: local weighting function, or its name (resolved via measure.local_f)
    dic_path: path of the dictionary file
    sample_save_path: path the generated samples are written to
    delete: when True, samples whose x[0] is empty are skipped
    Returns the set of labels (as floats) of the samples that were written.
    '''
    dic_list, global_weight = utils.read_dic_ex(dic_path, dtype=str)
    # Callers may pass an already resolved function (train does); only a
    # name needs to go through measure.local_f.
    if not callable(local_fun):
        local_fun = measure.local_f(local_fun)
    label = set()

    # The context manager closes the output even if a line fails to parse.
    with open(sample_save_path, 'w') as fs:
        for line in train_docs:
            y, string = line.strip().split("\t")
            x = utils.cons_pro_for_svm(string.strip().split(" "), dic_list, local_fun, global_weight)
            y = [float(y)]
            if delete is True and len(x[0]) == 0:
                continue
            save_dic_train_sample(fs, y, x)
            label.add(y[0])
    return label
Ejemplo n.º 12
0
def ctm_train(filename, indexes, main_save_path, stopword_filename, svm_param,
              config_name, dic_name, model_name, train_name, svm_type,
              param_name, ratio, delete, str_splitTag, tc_splitTag, seg,
              param_select, global_fun, local_fun, label_file):
    '''Automated training pipeline: optional word segmentation, stemming,
    feature selection, dictionary construction, optional SVM parameter
    search, model training and configuration saving.

    Files are written below main_save_path:
        model/<dic_name>, model/<model_name>, model/<config_name>
        temp/<train_name> (SVM training samples), temp/<param_name> (search results)

    filename: path of the file holding the training text
    indexes: field positions to train on
    main_save_path: root directory for the model/ and temp/ sub-directories
    stopword_filename: path of the stop-word file; "" means no stop words
    svm_param: user-supplied SVM parameters (libsvm and liblinear accept
        different options), e.g. "-s 0 -t 2 -c 0.2 "
    config_name: file name of the saved model configuration
    dic_name: file name of the dictionary, e.g. "dic.key"
    model_name: file name of the model, e.g. "svm.model"
    train_name: file name of the training samples, e.g. "svm.train"
    svm_type: "libsvm" or "liblinear"
    param_name: file name of the parameter search results, e.g. "svm.param"
    ratio: share of terms kept by feature selection, e.g. 0.4
    delete: whether samples whose features are all zero are dropped
    str_splitTag: separator between tokens, e.g. "^"
    tc_splitTag: separator between the fields of a line, e.g. a tab
    seg: word segmentation: 0 none, 1 mmseg, 2 aliws
    param_select: True to run the grid search for SVM parameters
    global_fun: global weighting: "one", "idf" or "rf"
    local_fun: name of the local weighting used in
        x(i,j) = local(i,j) * global(i); options include tf and logtf
    label_file: file describing the class labels
    '''

    print("-----------------创建模型文件保存的路径-----------------")
    # os.makedirs also creates main_save_path itself, so the sub-directories
    # are created even when the root does not exist yet.
    model_dir = os.path.join(main_save_path, "model")
    temp_dir = os.path.join(main_save_path, "temp")
    for directory in (model_dir, temp_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Select the SVM implementation.
    tms_svm.set_svm_type(svm_type)

    # Without a stop-word file name, no stop words are used.
    if stopword_filename == "":
        stop_words_dic = dict()
    else:
        stop_words_dic = fileutil.read_dic(stopword_filename)

    # Segment the source file when requested.
    if seg != 0:
        print("-----------------正在对源文本进行分词-------------------")
        # os.path.join keeps the file next to the input; plain "+" produced
        # "/segmented" when filename had no directory part.
        segment_file = os.path.join(os.path.dirname(filename), "segmented")
        segment.file_seg(filename, indexes, segment_file, str_splitTag,
                         tc_splitTag, seg)
        filename = segment_file

    # Stem the training text.
    # NOTE(review): the cons_train_sample_for_cla variants taking these
    # arguments also call stem.stemFile, so the file may be stemmed twice;
    # confirm that this is intended.
    print("-----------------正在对源文本进行词干化处理-------------------")
    stem.stemFile(filename, str_splitTag, tc_splitTag)

    print("-----------------现在正在进行特征选择---------------")
    dic_path = os.path.join(model_dir, dic_name)
    feature_select(filename,
                   indexes,
                   global_fun,
                   dic_path,
                   ratio,
                   stop_words_dic,
                   str_splitTag=str_splitTag,
                   tc_splitTag=tc_splitTag)

    print("-----------------再根据特征选择后的词典构造新的SVM分类所需的训练样本------------------- ")
    problem_save_path = os.path.join(temp_dir, train_name)
    # Keep the name for the config; the resolved function is used for weighting.
    local_fun_str = local_fun
    local_fun = measure.local_f(local_fun)
    label = cons_train_sample_for_cla(filename, indexes, local_fun, dic_path,
                                      problem_save_path, delete, str_splitTag,
                                      tc_splitTag)

    if param_select == True:
        print("--------------------选择最优的c,g------------------------------")
        search_result_save_path = os.path.join(temp_dir, param_name)
        # Decide once, so libsvm with a linear kernel runs a single search
        # and gets a single " -c" option.
        linear = svm_type == "liblinear" or (
            svm_type == "libsvm" and is_linear_kernal(svm_param) is True)
        if linear:
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (1, 1, 1)
            fine_c_step = 0.5
            fine_g_step = 0
            c, g = grid_search_param.grid(problem_save_path,
                                          search_result_save_path, svm_type,
                                          coarse_c_range, coarse_g_range,
                                          fine_c_step, fine_g_step)
            svm_param = svm_param + " -c " + str(c)
        elif svm_type == "libsvm":
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (3, -10, -2)
            fine_c_step = 0.5
            fine_g_step = 0.5
            c, g = grid_search_param.grid(problem_save_path,
                                          search_result_save_path, svm_type,
                                          coarse_c_range, coarse_g_range,
                                          fine_c_step, fine_g_step)
            svm_param = svm_param + " -c " + str(c) + " -g " + str(g)

    print("-----------------训练模型,并将模型进行保存----------")
    # Same directory as the dictionary and config, with or without a
    # trailing separator on main_save_path.
    model_save_path = os.path.join(model_dir, model_name)
    ctm_train_model(problem_save_path, svm_type, svm_param, model_save_path)

    print("-----------------保存模型配置-----------------")
    # The context manager closes the config file even if save_config raises.
    with open(os.path.join(model_dir, config_name), 'w') as f_config:
        save_config(f_config, dic_name, model_name, local_fun_str, global_fun,
                    seg, svm_type, svm_param, label_file, label)
def ctm_train(filename,indexes,main_save_path,stopword_filename,svm_param,config_name,dic_name,model_name,train_name,svm_type,param_name,ratio,delete,str_splitTag,tc_splitTag,seg,param_select,global_fun,local_fun,label_file):
    '''Automated training pipeline: optional word segmentation, stemming,
    feature selection, dictionary construction, optional SVM parameter
    search, model training and configuration saving.

    Files are written below main_save_path:
        model/<dic_name>, model/<model_name>, model/<config_name>
        temp/<train_name> (SVM training samples), temp/<param_name> (search results)

    filename: path of the file holding the training text
    indexes: field positions to train on
    main_save_path: root directory for the model/ and temp/ sub-directories
    stopword_filename: path of the stop-word file; "" means no stop words
    svm_param: user-supplied SVM parameters (libsvm and liblinear accept
        different options), e.g. "-s 0 -t 2 -c 0.2 "
    config_name: file name of the saved model configuration
    dic_name: file name of the dictionary, e.g. "dic.key"
    model_name: file name of the model, e.g. "svm.model"
    train_name: file name of the training samples, e.g. "svm.train"
    svm_type: "libsvm" or "liblinear"
    param_name: file name of the parameter search results, e.g. "svm.param"
    ratio: share of terms kept by feature selection, e.g. 0.4
    delete: whether samples whose features are all zero are dropped
    str_splitTag: separator between tokens, e.g. "^"
    tc_splitTag: separator between the fields of a line, e.g. a tab
    seg: word segmentation: 0 none, 1 mmseg, 2 aliws
    param_select: True to run the grid search for SVM parameters
    global_fun: global weighting: "one", "idf" or "rf"
    local_fun: name of the local weighting used in
        x(i,j) = local(i,j) * global(i); options include tf and logtf
    label_file: file describing the class labels
    '''

    print("-----------------创建模型文件保存的路径-----------------")
    # os.makedirs also creates main_save_path itself, so the sub-directories
    # are created even when the root does not exist yet.
    model_dir = os.path.join(main_save_path,"model")
    temp_dir = os.path.join(main_save_path,"temp")
    for directory in (model_dir,temp_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Select the SVM implementation.
    tms_svm.set_svm_type(svm_type)

    # Without a stop-word file name, no stop words are used.
    if stopword_filename =="":
        stop_words_dic=dict()
    else:
        stop_words_dic = fileutil.read_dic(stopword_filename)

    # Segment the source file when requested.
    if seg!=0:
        print("-----------------正在对源文本进行分词-------------------")
        # os.path.join keeps the file next to the input; plain "+" produced
        # "/segmented" when filename had no directory part.
        segment_file = os.path.join(os.path.dirname(filename),"segmented")
        segment.file_seg(filename,indexes,segment_file,str_splitTag,tc_splitTag,seg)
        filename = segment_file

    # Stem the training text.
    # NOTE(review): the cons_train_sample_for_cla variants taking these
    # arguments also call stem.stemFile, so the file may be stemmed twice;
    # confirm that this is intended.
    print("-----------------正在对源文本进行词干化处理-------------------")
    stem.stemFile(filename,str_splitTag,tc_splitTag)

    print("-----------------现在正在进行特征选择---------------")
    dic_path= os.path.join(model_dir,dic_name)
    feature_select(filename,indexes,global_fun,dic_path,ratio,stop_words_dic,str_splitTag=str_splitTag,tc_splitTag=tc_splitTag)

    print("-----------------再根据特征选择后的词典构造新的SVM分类所需的训练样本------------------- ")
    problem_save_path  = os.path.join(temp_dir,train_name)
    # Keep the name for the config; the resolved function is used for weighting.
    local_fun_str = local_fun
    local_fun = measure.local_f(local_fun)
    label = cons_train_sample_for_cla(filename,indexes,local_fun,dic_path,problem_save_path,delete,str_splitTag,tc_splitTag)

    if param_select ==True:
        print("--------------------选择最优的c,g------------------------------")
        search_result_save_path  = os.path.join(temp_dir,param_name)
        # Decide once, so libsvm with a linear kernel runs a single search
        # and gets a single " -c" option.
        linear = svm_type=="liblinear" or (svm_type=="libsvm" and is_linear_kernal(svm_param) is True)
        if linear:
            coarse_c_range=(-5,7,2)
            coarse_g_range=(1,1,1)
            fine_c_step=0.5
            fine_g_step=0
            c,g=grid_search_param.grid(problem_save_path,search_result_save_path,svm_type,coarse_c_range,coarse_g_range,fine_c_step,fine_g_step)
            svm_param = svm_param + " -c "+str(c)
        elif svm_type=="libsvm":
            coarse_c_range=(-5,7,2)
            coarse_g_range=(3,-10,-2)
            fine_c_step=0.5
            fine_g_step=0.5
            c,g=grid_search_param.grid(problem_save_path,search_result_save_path,svm_type,coarse_c_range,coarse_g_range,fine_c_step,fine_g_step)
            svm_param = svm_param + " -c "+str(c)+" -g "+str(g)

    print("-----------------训练模型,并将模型进行保存----------")
    # Same directory as the dictionary and config, with or without a
    # trailing separator on main_save_path.
    model_save_path  = os.path.join(model_dir,model_name)
    ctm_train_model(problem_save_path,svm_type,svm_param,model_save_path)

    print("-----------------保存模型配置-----------------")
    # The context manager closes the config file even if save_config raises.
    with open(os.path.join(model_dir,config_name),'w') as f_config:
        save_config(f_config,dic_name,model_name,local_fun_str,global_fun,seg,svm_type,svm_param,label_file,label)