def cons_train_sample_for_cla(filename, indexs, local_fun, dic_path,
                              sample_save_path, delete, str_splitTag,
                              tc_splitTag):
    '''Build SVM-format training samples from selected columns of a file.

    Using the dictionary stored at ``dic_path``, the content found at the
    requested column positions of every line in ``filename`` is converted
    into the problem format required by the SVM and written to
    ``sample_save_path``.

    NOTE(review): in the visible source this function is defined twice in a
    row; the later definition replaces this one when the module is loaded.

    Args:
        filename: Path of the training file. It is passed to
            ``stem.stemFile`` first and then read line by line.
        indexs: Non-empty sequence of column indexes (after splitting a line
            on ``tc_splitTag``) whose content is used to build the sample.
            The order does not need to be sorted.
        local_fun: Either a local-weight function or its name; a string is
            resolved through ``measure.local_f``.
        dic_path: Path of the dictionary read by ``fileutil.read_dic_ex``.
        sample_save_path: Path of the output file (truncated on open).
        delete: When equal to ``True``, samples whose ``x[0]`` is empty are
            not written.
        str_splitTag: Separator between terms inside a column.
        tc_splitTag: Separator between columns of a line.

    Returns:
        The set of ``y[0]`` values of all samples that were written.

    Raises:
        ValueError: If ``indexs`` is empty.
    '''
    dic_list, global_weight = fileutil.read_dic_ex(dic_path, dtype=str)
    if isinstance(local_fun, types.StringType):
        local_fun = measure.local_f(local_fun)

    # A line is usable only if it has every requested column. Using the
    # largest index (not the last one) keeps the guard correct when
    # ``indexs`` is not sorted in ascending order.
    min_fields = max(indexs) + 1
    label = set()

    # Stem the original training samples before they are read back.
    print("-----------------正在对源文本进行词干化处理-------------------")
    stem.stemFile(filename, str_splitTag, tc_splitTag)

    # Context managers guarantee both handles are closed even when sample
    # construction or saving raises part-way through the file.
    with open(filename, 'r') as f:
        with open(sample_save_path, 'w') as fs:
            for line in f:
                text = line.strip().split(tc_splitTag)
                if len(text) < min_fields:
                    continue
                # Every selected column is prefixed with the term separator,
                # exactly as the incremental concatenation did.
                text_temp = "".join([str_splitTag + text[i] for i in indexs])
                # text[0] is handed over as the first argument; presumably
                # it is the sample's class label -- TODO confirm.
                y, x = ctmutil.cons_pro_for_svm(
                    text[0], text_temp.strip().split(str_splitTag),
                    dic_list, local_fun, global_weight)
                # Kept as an equality test so that only True/1 enable the
                # filtering, as callers may rely on that.
                if delete == True and len(x[0]) == 0:
                    continue
                save_dic_train_sample(fs, y, x)
                label.add(y[0])
    return label
def cons_train_sample_for_cla(filename, indexs, local_fun, dic_path,
                              sample_save_path, delete, str_splitTag,
                              tc_splitTag):
    '''Convert selected columns of a text file into SVM training samples.

    Using the dictionary stored at ``dic_path``, the content found at the
    requested column positions of every line in ``filename`` is converted
    into the problem format required by the SVM and saved to
    ``sample_save_path``.

    NOTE(review): in the visible source an earlier definition with the same
    name precedes this one; this later definition is the one that stays
    bound after the module is loaded.

    Args:
        filename: Path of the training file. It is handed to
            ``stem.stemFile`` first and then read line by line.
        indexs: Non-empty sequence of column indexes (after splitting a line
            on ``tc_splitTag``) that make up the sample text. It may be in
            any order.
        local_fun: A local-weight function, or its name as a string, in
            which case it is looked up with ``measure.local_f``.
        dic_path: Path of the dictionary loaded by ``fileutil.read_dic_ex``.
        sample_save_path: Path of the output file (truncated on open).
        delete: When equal to ``True``, samples whose ``x[0]`` is empty are
            skipped instead of saved.
        str_splitTag: Separator between terms inside one column.
        tc_splitTag: Separator between the columns of a line.

    Returns:
        The set of ``y[0]`` values of every sample that was saved.

    Raises:
        ValueError: If ``indexs`` is empty.
    '''
    dic_list, global_weight = fileutil.read_dic_ex(dic_path, dtype=str)
    if isinstance(local_fun, types.StringType):
        local_fun = measure.local_f(local_fun)

    # The guard below must cover the largest requested column; relying on
    # the last element of ``indexs`` is only right for sorted input.
    required_columns = max(indexs) + 1
    label = set()

    # Stem the original training samples before they are read back.
    print("-----------------正在对源文本进行词干化处理-------------------")
    stem.stemFile(filename, str_splitTag, tc_splitTag)

    # ``with`` closes both files on every exit path, including exceptions
    # raised while a sample is being built or written.
    with open(filename, 'r') as source:
        with open(sample_save_path, 'w') as fs:
            for line in source:
                text = line.strip().split(tc_splitTag)
                if len(text) < required_columns:
                    continue  # line lacks at least one requested column
                # Each chosen column keeps its leading separator, matching
                # what repeated ``+=`` concatenation produced.
                pieces = [str_splitTag + text[i] for i in indexs]
                terms = "".join(pieces).strip().split(str_splitTag)
                # text[0] goes in as the first argument; presumably the
                # class label of the line -- TODO confirm.
                y, x = ctmutil.cons_pro_for_svm(
                    text[0], terms, dic_list, local_fun, global_weight)
                # Equality with True is kept on purpose so the set of
                # values that enable filtering stays the same.
                if delete == True and len(x[0]) == 0:
                    continue
                save_dic_train_sample(fs, y, x)
                label.add(y[0])
    return label
def load_tms_model(config_file):
    '''Load dictionary, global weights, local function and SVM model.

    The model configuration file is read line by line. Each line is split on
    ":" and the text before the first colon selects what is loaded:

    * ``DicName``   -- dictionary file, read with ``fileutil.read_dic_ex``.
    * ``ModelName`` -- SVM model file; its type is detected and registered
      with ``tms_svm`` before the model itself is loaded.
    * ``LocalFun``  -- name resolved through ``measure.local_f``.
    * ``WordSeg``   -- numeric flag, converted with ``int(float(...))``.

    File names are resolved relative to the directory of ``config_file``.
    Lines with any other key are ignored.

    Args:
        config_file: Path of the model configuration file.

    Returns:
        The tuple ``(local_fun, dic, global_weight, model, seg)``.

    Raises:
        UnboundLocalError: If any of the four keys never appears in the
            file, because the matching result is then never assigned.
        IndexError: If a recognised key appears on a line without a ":".
    '''
    model_main_path = os.path.dirname(config_file)
    # The context manager closes the configuration file on every exit path;
    # previously the handle was never closed.
    with open(config_file, 'r') as f:
        for line in f:
            text = line.split(":")
            key = text[0].strip()
            # The keys are mutually exclusive, so one elif chain matches
            # exactly the branches that independent ifs would.
            if key == "DicName":
                dic, global_weight = fileutil.read_dic_ex(
                    os.path.join(model_main_path, text[1].strip()),
                    dtype=str)
            elif key == "ModelName":
                model_path = os.path.join(model_main_path, text[1].strip())
                # The SVM backend type has to be set before loading.
                tms_svm.set_svm_type(tms_svm.detect_svm_type(model_path))
                model = tms_svm.load_model(model_path)
            elif key == "LocalFun":
                local_fun = measure.local_f(text[1].strip())
            elif key == "WordSeg":
                # Going through float() accepts values such as "1.0".
                seg = int(float(text[1]))
    return local_fun, dic, global_weight, model, seg