def ctm_predict(filename, config_file, indexes, result_save_path, result_indexes,
                str_splitTag, tc_splitTag, seg, delete=False, change_decode=False,
                in_decode="UTF-8", out_encode="GBK"):
    '''Prediction in the common case: a single model.'''
    local_fun, dic, global_weight, model, seg_ori = load_tms_model(config_file)
    if seg != 0:
        if seg_ori != seg:
            print "The segmenter chosen for prediction differs from the one used in training"
        print "-----------------Segmenting the source text-------------------"
        segment_file = os.path.dirname(filename) + "/segmented"
        segment.file_seg(filename, indexes, segment_file, str_splitTag, tc_splitTag, seg)
        filename = segment_file
    print "-----------------Stemming the source text-------------------"
    stem.stemFile(filename, str_splitTag, tc_splitTag)
    f = open(filename, 'r')
    fs = open(result_save_path, 'w')
    print "-----------------Predicting the samples-------------------"
    for line in f:
        if change_decode:
            line = line.decode(in_decode).encode(out_encode, 'ignore')
        text = line.strip().split(tc_splitTag)
        # Skip lines too short to contain all the required fields.
        if len(text) < indexes[-1] + 1 or len(text) < result_indexes[-1] + 1:
            continue
        text_temp = ""
        for i in indexes:
            text_temp += str_splitTag + text[i]
        label, sc = cal_sc_optim(1, model, text_temp, dic, local_fun,
                                 global_weight, str_splitTag)
        fs.write(str(label) + "\t" + str(sc) + "\t")
        for index in result_indexes:
            fs.write(text[index] + "\t")
        fs.write("\n")
    f.close()
    fs.close()
    print "-----------------Prediction finished-------------------"
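# A minimal usage sketch for ctm_predict. The paths and column indexes below
# are hypothetical; it assumes a config saved by ctm_train and a tab-separated
# input file whose column 1 holds the text to classify:
#
#   ctm_predict("data/test.txt", "model_dir/model/tms.config",
#               indexes=[1], result_save_path="data/test.result",
#               result_indexes=[0, 1], str_splitTag="^", tc_splitTag="\t",
#               seg=1)
#
# Each output line is "label<TAB>score<TAB>" followed by the echoed
# result_indexes fields.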
def file_seg(filename, indexes=[1], out_filename="", str_splitTag="^", tc_splitTag="\t", seg=1):
    '''Segment the columns listed in indexes, writing the result to
    out_filename (defaults to a "segmented" file next to the input).'''
    if out_filename == "":
        out_filename = os.path.dirname(filename) + "/segmented"
    segment.file_seg(filename, indexes, out_filename, str_splitTag, tc_splitTag, seg)
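# Example call, assuming a hypothetical tab-separated file "data/train.txt"
# with the text in column 1 and mmseg selected (seg=1):
#
#   file_seg("data/train.txt", indexes=[1], seg=1)
#   # writes data/segmented, with tokens joined by "^" inside each field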
def ctm_predict_multi(filename, config_files, indexes_lists, result_save_path,
                      result_indexes, str_splitTag, tc_splitTag, seg, delete=False,
                      change_decode=False, in_decode="UTF-8", out_encode="GBK"):
    '''Prediction with several models, e.g. when one text must be scored by
    multiple models. title_indexes, dic_path and model_path are two-dimensional.
    '''
    if seg != 0:
        print "-----------------Segmenting the source text-------------------"
        all_index = list()
        for index in indexes_lists:
            all_index.extend(index)
        segment_file = os.path.dirname(filename) + "/segmented"
        segment.file_seg(filename, all_index, segment_file, str_splitTag, tc_splitTag, seg)
        filename = segment_file
    k = len(config_files)  # number of prediction models
    dic_list = []
    local_fun_list = []
    model_list = []
    global_weight_list = []
    for i in range(k):
        local_fun, dic, global_weight, model, seg_ori = load_tms_model(config_files[i])
        dic_list.append(dic)
        local_fun_list.append(local_fun)
        model_list.append(model)
        global_weight_list.append(global_weight)
    print "-----------------Stemming the source text-------------------"
    stem.stemFile(filename, str_splitTag, tc_splitTag)
    f = open(filename, 'r')
    fs = open(result_save_path, 'w')
    print "-----------------Predicting the samples-------------------"
    for line in f:
        if len(line.strip()) < 1:
            continue
        if change_decode:
            line = line.decode(in_decode).encode(out_encode, 'ignore')
        text = line.strip().split(tc_splitTag)
        for j in range(k):
            indexes = indexes_lists[j]
            model = model_list[j]
            dic = dic_list[j]
            local_fun = local_fun_list[j]
            global_weight = global_weight_list[j]
            if len(text) < indexes[-1] + 1 or len(text) < result_indexes[-1] + 1:
                label = 0
                sc = 0
            else:
                text_temp = ""
                for index in indexes:
                    text_temp += str_splitTag + text[index]
                # Switch the SVM backend to match the loaded model's type.
                if dir(model).count("get_svm_type") == 1:
                    tms_svm.set_svm_type("libsvm")
                if dir(model).count("get_nr_feature") == 1:
                    tms_svm.set_svm_type("liblinear")
                label, sc = cal_sc_optim(1, model, text_temp, dic, local_fun,
                                         global_weight, str_splitTag)
            fs.write(str(label) + "\t" + str(sc) + "\t")
        for index in result_indexes:
            if index > len(text) - 1:
                break
            fs.write(text[index] + "\t")
        fs.write("\n")
    f.close()
    fs.close()
    print "-----------------Prediction finished-------------------"
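# A minimal two-model sketch; the file names and column indexes are
# hypothetical and assume two configs saved by ctm_train:
#
#   ctm_predict_multi("data/test.txt",
#                     config_files=["model_a/tms.config", "model_b/tms.config"],
#                     indexes_lists=[[1], [2]],  # columns fed to each model
#                     result_save_path="data/test.result",
#                     result_indexes=[0], str_splitTag="^", tc_splitTag="\t",
#                     seg=1)
#
# Each output line carries one "label<TAB>score" pair per model, followed by
# the echoed result_indexes fields.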
def ctm_train(filename, indexes, main_save_path, stopword_filename, svm_param,
              config_name, dic_name, model_name, train_name, svm_type, param_name,
              ratio, delete, str_splitTag, tc_splitTag, seg, param_select,
              global_fun, local_fun, label_file):
    '''Automated training pipeline: segment the text, run feature selection to
    rebuild the dictionary, search for the best SVM parameters under the new
    dictionary, train the SVM with those parameters, and save the resulting model.

    Files written (under a main save path):
        model files: dictionary (.key) + model (.model)
        temp files:  the SVM-formatted training data (.train)

    filename            file containing the training text
    indexes             columns (fields) to train on
    main_save_path      directory the model is saved under
    stopword_filename   path of the stop-word file; "" means no stop words
    svm_type            SVM flavour: "libsvm" or "liblinear"
    svm_param           user-supplied SVM parameters; note that libsvm and
                        liblinear accept different options, e.g. "-s 0 -t 2 -c 0.2"
    dic_name            user-defined dictionary name, e.g. "dic.key"
    model_name          user-defined model name, e.g. "svm.model"
    train_name          user-defined training-sample name, e.g. "svm.train"
    param_name          user-defined parameter-file name, e.g. "svm.param"
    ratio               fraction of words kept by feature selection, e.g. 0.4
    delete              whether to drop samples whose features are all zero, True or False
    str_splitTag        separator used by the segmenter, e.g. "^"
    tc_splitTag         separator between fields of the training file, e.g. "\t"
    seg                 segmenter choice: 0 = none, 1 = mmseg, 2 = aliws
    param_select        whether to grid-search the SVM parameters; True runs the
                        grid search, False skips it
    local_fun           local weighting in x(i,j) = local(i,j) * global(i);
                        one of "tf", "logtf"
    global_fun          global weighting: "one", "idf" or "rf"
    label_file          file explaining the class labels
    '''
    print "-----------------Creating the model save directories-----------------"
    for sub_dir in ("model", "temp"):
        path = os.path.join(main_save_path, sub_dir)
        if os.path.exists(path) is False:
            os.makedirs(path)
    # Select the SVM backend.
    tms_svm.set_svm_type(svm_type)
    # With no stop-word file given, default to using no stop words.
    if stopword_filename == "":
        stop_words_dic = dict()
    else:
        stop_words_dic = fileutil.read_dic(stopword_filename)
    # Segment the source file if requested.
    if seg != 0:
        print "-----------------Segmenting the source text-------------------"
        segment_file = os.path.dirname(filename) + "/segmented"
        segment.file_seg(filename, indexes, segment_file, str_splitTag, tc_splitTag, seg)
        filename = segment_file
    # Stem the training samples.
    print "-----------------Stemming the source text-------------------"
    stem.stemFile(filename, str_splitTag, tc_splitTag)
    print "-----------------Running feature selection---------------"
    dic_path = os.path.join(main_save_path, "model", dic_name)
    feature_select(filename, indexes, global_fun, dic_path, ratio, stop_words_dic,
                   str_splitTag=str_splitTag, tc_splitTag=tc_splitTag)
    print "-----------------Building the SVM training samples from the selected dictionary-------------------"
    problem_save_path = os.path.join(main_save_path, "temp", train_name)
    local_fun_str = local_fun
    local_fun = measure.local_f(local_fun)
    label = cons_train_sample_for_cla(filename, indexes, local_fun, dic_path,
                                      problem_save_path, delete, str_splitTag, tc_splitTag)
    if param_select:
        print "--------------------Selecting the best c and g------------------------------"
        search_result_save_path = os.path.join(main_save_path, "temp", param_name)
        if svm_type == "libsvm":
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (3, -10, -2)
            fine_c_step = 0.5
            fine_g_step = 0.5
            c, g = grid_search_param.grid(problem_save_path, search_result_save_path,
                                          svm_type, coarse_c_range, coarse_g_range,
                                          fine_c_step, fine_g_step)
            svm_param = svm_param + " -c " + str(c) + " -g " + str(g)
        if svm_type == "liblinear" or (svm_type == "libsvm" and is_linear_kernal(svm_param) is True):
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (1, 1, 1)
            fine_c_step = 0.5
            fine_g_step = 0
            c, g = grid_search_param.grid(problem_save_path, search_result_save_path,
                                          svm_type, coarse_c_range, coarse_g_range,
                                          fine_c_step, fine_g_step)
            svm_param = svm_param + " -c " + str(c)
    print "-----------------Training the model and saving it----------"
    model_save_path = os.path.join(main_save_path, "model", model_name)
    ctm_train_model(problem_save_path, svm_type, svm_param, model_save_path)
    print "-----------------Saving the model configuration-----------------"
    f_config = open(os.path.join(main_save_path, "model", config_name), 'w')
    save_config(f_config, dic_name, model_name, local_fun_str, global_fun, seg,
                svm_type, svm_param, label_file, label)
    f_config.close()
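# A minimal training sketch; the paths and settings are hypothetical and follow
# the parameter examples given in the docstring above:
#
#   ctm_train("data/train.txt", indexes=[1], main_save_path="model_dir/",
#             stopword_filename="", svm_param="-s 0 -t 2",
#             config_name="tms.config", dic_name="dic.key",
#             model_name="svm.model", train_name="svm.train",
#             svm_type="libsvm", param_name="svm.param", ratio=0.4,
#             delete=True, str_splitTag="^", tc_splitTag="\t", seg=1,
#             param_select=True, global_fun="idf", local_fun="tf",
#             label_file="label.txt")
#
# With param_select=True the grid search appends the best "-c" (and, for an
# RBF kernel, "-g") to svm_param before the model is trained.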