Beispiel #1
0
def tms_feature_select(filename,
                       indexes=None,
                       global_fun="one",
                       main_save_path="../",
                       dic_name="dic.key",
                       ratio=0.4,
                       stopword_filename="",
                       str_splitTag="^",
                       tc_splitTag="\t"):
    '''Run feature selection on a training file.

    Thin wrapper that forwards every argument, positionally and in the
    same order, to train_model.ctm_feature_select. According to the
    original author's notes, that routine builds the dictionary from the
    input file and selects features with the chi-square formula; the
    resulting file is the dictionary.

    Required argument:
        filename: name of the file holding the training text; by default
            the text is expected to be word-segmented already.

    Optional arguments:
        indexes: the field indexes to train on. Defaults to [1]; a fresh
            list is created on every call, so the default can never be
            shared (and mutated) across calls.
        global_fun: how the global weight is computed, one of "one",
            "idf" or "rf". Defaults to "one".
        main_save_path: directory where the results are saved.
            Defaults to "../".
        dic_name: name of the user-defined dictionary file.
            Defaults to "dic.key".
        ratio: proportion of words kept by the feature selection.
            Defaults to 0.4.
        stopword_filename: name and path of the stop-word file. The
            default, an empty string, means no stop words are used.
        str_splitTag: separator used between segmented words.
            Defaults to "^".
        tc_splitTag: separator between the fields of a training sample.
            Defaults to a tab character.
    '''
    # None is a sentinel for "use the documented default": a literal list in
    # the signature would be one shared object handed to external code.
    if indexes is None:
        indexes = [1]
    train_model.ctm_feature_select(filename, indexes, global_fun,
                                   main_save_path, dic_name, ratio,
                                   stopword_filename, str_splitTag,
                                   tc_splitTag)
Beispiel #2
0
def tms_feature_select(filename, indexes=None, global_fun="one", main_save_path="../",
                       dic_name="dic.key", ratio=0.4, stopword_filename="",
                       str_splitTag="^", tc_splitTag="\t"):
    '''Run feature selection on a training file.

    Thin wrapper that forwards every argument, positionally and in the
    same order, to train_model.ctm_feature_select. According to the
    original author's notes, that routine builds the dictionary from the
    input file and selects features with the chi-square formula; the
    resulting file is the dictionary.

    Required argument:
        filename: name of the file holding the training text; by default
            the text is expected to be word-segmented already.

    Optional arguments:
        indexes: the field indexes to train on. Defaults to [1]; a fresh
            list is created on every call, so the default can never be
            shared (and mutated) across calls.
        global_fun: how the global weight is computed, one of "one",
            "idf" or "rf". Defaults to "one".
        main_save_path: directory where the results are saved.
            Defaults to "../".
        dic_name: name of the user-defined dictionary file.
            Defaults to "dic.key".
        ratio: proportion of words kept by the feature selection.
            Defaults to 0.4.
        stopword_filename: name and path of the stop-word file. The
            default, an empty string, means no stop words are used.
        str_splitTag: separator used between segmented words.
            Defaults to "^".
        tc_splitTag: separator between the fields of a training sample.
            Defaults to a tab character.
    '''
    # None is a sentinel for "use the documented default": a literal list in
    # the signature would be one shared object handed to external code.
    if indexes is None:
        indexes = [1]
    train_model.ctm_feature_select(filename, indexes, global_fun, main_save_path,
                                   dic_name, ratio, stopword_filename,
                                   str_splitTag, tc_splitTag)
def main():
    '''Command-line entry point: parse the options and run one step.

    The step is selected with -s/--step, which is required:
        1  train_model.ctm_train on the input file (first positional
           argument).
        2  train_model.ctm_feature_select on the input file.
        3  train_model.cons_train_sample_for_cla on the input file; the
           sample is written to <path>/temp/svm.train.
        4  grid_search_param.grid on the input file; the search result
           goes to <path>/temp/svm.param and the best c and g are
           printed.
        5  train_model.ctm_train_model on the file given with
           -P/--problem_path; the model goes to
           <path>/model/<model_name>.

    <path> is the value of -p/--path. Steps 1-4 need the input file as a
    positional argument and steps 3-5 need -p/--path; a missing
    requirement is reported through parser.error (usage message, exit
    status 2) instead of surfacing later as NameError, IndexError or
    TypeError.

    Unless -w/--stopword is passed, steps 1 and 2 use a stop-word file
    named "stopwords.txt" located next to the input file.
    '''
    usage = "usage:%prog [options] version=%prog 1.0"
    parser = OptionParser(usage=usage)
    parser.add_option("-s", "--step", type="choice",
                      choices=["1", "2", "3", "4", "5"], dest="step",
                      help="step1 is auto training the svm model")
    parser.add_option("-p", "--path", dest="save_main_path")
    parser.add_option("-P", "--problem_path", dest="problem_save_path")
    parser.add_option("-i", "--indexes", dest="indexes", action="callback",
                      type="string", default=[1], callback=list_callback)
    # store_false: passing -w switches the stop-word file OFF.
    parser.add_option("-w", "--stopword", action="store_false",
                      dest="stopword", default=True)
    parser.add_option("-n", "--config_name", dest="config_name",
                      default="tms.config")
    parser.add_option("-d", "--dic_name", dest="dic_name", default="dic.key")
    parser.add_option("-D", "--dic_path", dest="dic_path")
    parser.add_option("-m", "--model_name", dest="model_name",
                      default="tms.model")
    parser.add_option("-t", "--train_name", dest="train_name",
                      default="tms.train")
    parser.add_option("-a", "--param_name", dest="param_name",
                      default="tms.param")
    parser.add_option("-r", "--ratio", dest="ratio", type="float", default=0.4)
    parser.add_option("-A", "--svm_param", dest="svm_param",
                      default="'-s 0 -t 2 -c 1.0 -g 0.25'")
    parser.add_option("-T", "--tc_splitTag", dest="tc_splitTag",
                      type="string", default="\t")
    parser.add_option("-S", "--str_splitTag", dest="str_splitTag",
                      type="string", default="^")
    parser.add_option("-v", "--svm_type", dest="svm_type", default="libsvm",
                      type="choice", choices=["libsvm", "liblinear"])
    # optparse compares the raw command-line string against `choices`, so the
    # choices must be strings; integer choices would reject every -e value.
    # The value is converted back to int after parsing.
    parser.add_option("-e", "--segment", type="choice", dest="segment",
                      default="0", choices=["0", "1", "2"])
    # store_false: passing -c switches the parameter selection OFF.
    parser.add_option("-c", "--param_select", action="store_false",
                      dest="param_select", default=True)
    parser.add_option("-g", "--global_fun", dest="global_fun", default="one",
                      type="choice", choices=["one", "idf", "rf"])
    parser.add_option("-l", "--local_fun", dest="local_fun", default="tf",
                      type="choice", choices=["tf"])
    parser.add_option("-b", "--label_file", dest="label_file", type="string",
                      default="")
    options, args = parser.parse_args()

    # Validate up front so that a bad invocation ends with a usage message.
    if not options.step:
        parser.error("option -s/--step is required (one of 1, 2, 3, 4, 5)")
    step = int(options.step)
    if step != 5 and not args:
        parser.error("step %d needs the input file as a positional argument"
                     % step)
    if step >= 3 and not options.save_main_path:
        parser.error("step %d needs the option -p/--path" % step)

    # Always bind these names; an empty -i value falls back to the default.
    indexes = [int(i) for i in options.indexes] if options.indexes else [1]
    # The default value carries literal single quotes; strip them.
    svm_param = (options.svm_param or "").replace("'", "")
    segment = int(options.segment)

    # The stop-word file lives next to the input file. os.path.join keeps a
    # bare file name relative ("stopwords.txt") instead of producing
    # "/stopwords.txt" at the filesystem root.
    stopword_filename = ""
    if options.stopword and args:
        stopword_filename = os.path.join(os.path.dirname(args[0]),
                                         "stopwords.txt")

    if step == 1:
        train_model.ctm_train(args[0],
                              indexes,
                              options.save_main_path,
                              stopword_filename,
                              config_name=options.config_name,
                              svm_type=options.svm_type,
                              segment=segment,
                              param_select=options.param_select,
                              global_fun=options.global_fun,
                              local_fun=options.local_fun,
                              svm_param=svm_param,
                              dic_name=options.dic_name,
                              model_name=options.model_name,
                              train_name=options.train_name,
                              param_name=options.param_name,
                              ratio=options.ratio,
                              delete=True,
                              str_splitTag=options.str_splitTag,
                              tc_splitTag=options.tc_splitTag,
                              label_file=options.label_file)
    elif step == 2:
        train_model.ctm_feature_select(args[0],
                                       indexes,
                                       options.global_fun,
                                       options.save_main_path,
                                       options.dic_name,
                                       options.ratio,
                                       stopword_filename,
                                       str_splitTag=options.str_splitTag,
                                       tc_splitTag=options.tc_splitTag)
    elif step == 3:
        # makedirs also creates missing parent directories, so the temp
        # directory is created whether or not <path> itself exists yet.
        temp_dir = os.path.join(options.save_main_path, "temp")
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        sample_save_path = os.path.join(temp_dir, "svm.train")
        train_model.cons_train_sample_for_cla(
            args[0],
            indexes,
            options.local_fun,
            options.dic_path,
            sample_save_path,
            delete=True,
            str_splitTag=options.str_splitTag,
            tc_splitTag=options.tc_splitTag)
    elif step == 4:
        search_result_save_path = os.path.join(options.save_main_path,
                                               "temp", "svm.param")
        tms_svm.set_svm_type(options.svm_type)
        # The c grid is the same for both back ends; only the g grid differs.
        coarse_c_range = (-5, 7, 2)
        fine_c_step = 0.5
        if options.svm_type == "libsvm":
            coarse_g_range = (3, -10, -2)
            fine_g_step = 0.5
        else:
            # "liblinear", the only other value the parser accepts: the g
            # range is a single value and the fine g step is 0.
            coarse_g_range = (1, 1, 1)
            fine_g_step = 0
        c, g = grid_search_param.grid(args[0], search_result_save_path,
                                      options.svm_type, coarse_c_range,
                                      coarse_g_range, fine_c_step, fine_g_step)
        # Single-argument form prints the same text on Python 2 and 3.
        print("best c = %s\t g = %s\n" % (c, g))
    elif step == 5:
        model_save_path = os.path.join(options.save_main_path, "model",
                                       options.model_name)
        train_model.ctm_train_model(options.problem_save_path, svm_param,
                                    model_save_path)
Beispiel #4
0
def main():
    '''Command-line entry point: parse the options and run one step.

    The step is selected with -s/--step, which is required:
        1  train_model.ctm_train on the input file (first positional
           argument).
        2  train_model.ctm_feature_select on the input file.
        3  train_model.cons_train_sample_for_cla on the input file; the
           sample is written to <path>/temp/svm.train.
        4  grid_search_param.grid on the input file; the search result
           goes to <path>/temp/svm.param and the best c and g are
           printed.
        5  train_model.ctm_train_model on the file given with
           -P/--problem_path; the model goes to
           <path>/model/<model_name>.

    <path> is the value of -p/--path. Steps 1-4 need the input file as a
    positional argument and steps 3-5 need -p/--path; a missing
    requirement is reported through parser.error (usage message, exit
    status 2) instead of surfacing later as NameError, IndexError or
    TypeError.

    Unless -w/--stopword is passed, steps 1 and 2 use a stop-word file
    named "stopwords.txt" located next to the input file.
    '''
    usage = "usage:%prog [options] version=%prog 1.0"
    parser = OptionParser(usage=usage)
    parser.add_option("-s", "--step", type="choice",
                      choices=["1", "2", "3", "4", "5"], dest="step",
                      help="step1 is auto training the svm model")
    parser.add_option("-p", "--path", dest="save_main_path")
    parser.add_option("-P", "--problem_path", dest="problem_save_path")
    parser.add_option("-i", "--indexes", dest="indexes", action="callback",
                      type="string", default=[1], callback=list_callback)
    # store_false: passing -w switches the stop-word file OFF.
    parser.add_option("-w", "--stopword", action="store_false",
                      dest="stopword", default=True)
    parser.add_option("-n", "--config_name", dest="config_name",
                      default="tms.config")
    parser.add_option("-d", "--dic_name", dest="dic_name", default="dic.key")
    parser.add_option("-D", "--dic_path", dest="dic_path")
    parser.add_option("-m", "--model_name", dest="model_name",
                      default="tms.model")
    parser.add_option("-t", "--train_name", dest="train_name",
                      default="tms.train")
    parser.add_option("-a", "--param_name", dest="param_name",
                      default="tms.param")
    parser.add_option("-r", "--ratio", dest="ratio", type="float", default=0.4)
    parser.add_option("-A", "--svm_param", dest="svm_param",
                      default="'-s 0 -t 2 -c 1.0 -g 0.25'")
    parser.add_option("-T", "--tc_splitTag", dest="tc_splitTag",
                      type="string", default="\t")
    parser.add_option("-S", "--str_splitTag", dest="str_splitTag",
                      type="string", default="^")
    parser.add_option("-v", "--svm_type", dest="svm_type", default="libsvm",
                      type="choice", choices=["libsvm", "liblinear"])
    # optparse compares the raw command-line string against `choices`, so the
    # choices must be strings; integer choices would reject every -e value.
    # The value is converted back to int after parsing.
    parser.add_option("-e", "--segment", type="choice", dest="segment",
                      default="0", choices=["0", "1", "2"])
    # store_false: passing -c switches the parameter selection OFF.
    parser.add_option("-c", "--param_select", action="store_false",
                      dest="param_select", default=True)
    parser.add_option("-g", "--global_fun", dest="global_fun", default="one",
                      type="choice", choices=["one", "idf", "rf"])
    parser.add_option("-l", "--local_fun", dest="local_fun", default="tf",
                      type="choice", choices=["tf"])
    parser.add_option("-b", "--label_file", dest="label_file", type="string",
                      default="")
    options, args = parser.parse_args()

    # Validate up front so that a bad invocation ends with a usage message.
    if not options.step:
        parser.error("option -s/--step is required (one of 1, 2, 3, 4, 5)")
    step = int(options.step)
    if step != 5 and not args:
        parser.error("step %d needs the input file as a positional argument"
                     % step)
    if step >= 3 and not options.save_main_path:
        parser.error("step %d needs the option -p/--path" % step)

    # Always bind these names; an empty -i value falls back to the default.
    indexes = [int(i) for i in options.indexes] if options.indexes else [1]
    # The default value carries literal single quotes; strip them.
    svm_param = (options.svm_param or "").replace("'", "")
    segment = int(options.segment)

    # The stop-word file lives next to the input file. os.path.join keeps a
    # bare file name relative ("stopwords.txt") instead of producing
    # "/stopwords.txt" at the filesystem root.
    stopword_filename = ""
    if options.stopword and args:
        stopword_filename = os.path.join(os.path.dirname(args[0]),
                                         "stopwords.txt")

    if step == 1:
        train_model.ctm_train(args[0],
                              indexes,
                              options.save_main_path,
                              stopword_filename,
                              config_name=options.config_name,
                              svm_type=options.svm_type,
                              segment=segment,
                              param_select=options.param_select,
                              global_fun=options.global_fun,
                              local_fun=options.local_fun,
                              svm_param=svm_param,
                              dic_name=options.dic_name,
                              model_name=options.model_name,
                              train_name=options.train_name,
                              param_name=options.param_name,
                              ratio=options.ratio,
                              delete=True,
                              str_splitTag=options.str_splitTag,
                              tc_splitTag=options.tc_splitTag,
                              label_file=options.label_file)
    elif step == 2:
        train_model.ctm_feature_select(args[0],
                                       indexes,
                                       options.global_fun,
                                       options.save_main_path,
                                       options.dic_name,
                                       options.ratio,
                                       stopword_filename,
                                       str_splitTag=options.str_splitTag,
                                       tc_splitTag=options.tc_splitTag)
    elif step == 3:
        # makedirs also creates missing parent directories, so the temp
        # directory is created whether or not <path> itself exists yet.
        temp_dir = os.path.join(options.save_main_path, "temp")
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        sample_save_path = os.path.join(temp_dir, "svm.train")
        train_model.cons_train_sample_for_cla(
            args[0],
            indexes,
            options.local_fun,
            options.dic_path,
            sample_save_path,
            delete=True,
            str_splitTag=options.str_splitTag,
            tc_splitTag=options.tc_splitTag)
    elif step == 4:
        search_result_save_path = os.path.join(options.save_main_path,
                                               "temp", "svm.param")
        tms_svm.set_svm_type(options.svm_type)
        # The c grid is the same for both back ends; only the g grid differs.
        coarse_c_range = (-5, 7, 2)
        fine_c_step = 0.5
        if options.svm_type == "libsvm":
            coarse_g_range = (3, -10, -2)
            fine_g_step = 0.5
        else:
            # "liblinear", the only other value the parser accepts: the g
            # range is a single value and the fine g step is 0.
            coarse_g_range = (1, 1, 1)
            fine_g_step = 0
        c, g = grid_search_param.grid(args[0], search_result_save_path,
                                      options.svm_type, coarse_c_range,
                                      coarse_g_range, fine_c_step, fine_g_step)
        # Single-argument form prints the same text on Python 2 and 3.
        print("best c = %s\t g = %s\n" % (c, g))
    elif step == 5:
        model_save_path = os.path.join(options.save_main_path, "model",
                                       options.model_name)
        train_model.ctm_train_model(options.problem_save_path, svm_param,
                                    model_save_path)