def main(args):
    init_logger()
    tokenizer = load_tokenizer(args)
    train_dataset = None if args.do_predict else load_and_cache_examples(
        args, tokenizer, mode="train")
    dev_dataset = None
    test_dataset = None if args.do_predict else load_and_cache_examples(
        args, tokenizer, mode="test")

    if args.do_train:
        trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
        trainer.train()

    if args.do_eval:
        trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
        trainer.load_model()
        trainer.evaluate("test")

    if args.do_predict:
        predict = Predict(args, tokenizer)
        predict.load_model()
        sentences = [args.sentence]
        result_json = dict()
        result_json['result'] = int(predict.predict(sentences))
        print(json.dumps(result_json, ensure_ascii=False))
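
# Illustrative sketch (not from the original source): a hypothetical CLI entry
# point for main() above. The flag names mirror the attributes main() reads
# (do_train, do_eval, do_predict, sentence); everything else main() needs from
# args (model paths, hyperparameters, etc.) would have to be added as well.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--do_train", action="store_true", help="train a model")
    parser.add_argument("--do_eval", action="store_true", help="evaluate the saved model on the test set")
    parser.add_argument("--do_predict", action="store_true", help="predict a single sentence")
    parser.add_argument("--sentence", type=str, default="", help="sentence to classify when --do_predict is set")
    main(parser.parse_args())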
def modelsLoad(self):
    filelist = os.listdir(self.modeldir)
    for f in filelist:
        path = os.path.join(self.modeldir, f)
        id = f.split(".")[0]
        policy = f.split(".")[1]
        if policy == 'model':
            # initialize a Predict object for model files
            pred = Predict()
            pred.load_model(path)
            self.indexclass[id] = pred
        else:
            # initialize a Rule object for rule files
            rule = Rule()
            rule.load_rule(path)
            self.indexclass[id] = rule
            logging.debug("load rule file " + id)
        self.predictPolicy[id] = policy
    logging.info('[Done] load all models')
def evaluation(self, model_path, eval_file, predict_file, badcase_file):
    print "%d samples are used to train, %d samples are used to test" % (
        len(self.train_features), len(self.test_features))
    _predict = Predict()
    _predict.load_model(model_path)
    #report = classification_report(self.test_labels, self._predict.predict(self.test_features))
    efile = open(eval_file, 'w+')
    pfile = open(predict_file, 'w')
    bfile = open(badcase_file, 'w')
    cate_cnt = dict()
    accuracy_cnt = 0
    all_cnt = 0
    print "******************"
    for features, label, line in zip(self.test_features, self.test_labels, self.test_lines):
        result = _predict.predict(features)
        #print result.encode('u8'), label.encode('u8'), name.encode('u8')
        all_cnt += 1
        if all_cnt % 1000 == 0:
            print >> sys.stderr, "test %d samples ... " % all_cnt
        cate_key = label
        pred_key = result
        cate_key = cate_key.replace('#', '')
        pred_key = pred_key.replace('#', '')
        if cate_key not in cate_cnt:
            cate_cnt[cate_key] = {'cate': 0, 'pred': 0, 'corr': 0}
        if pred_key not in cate_cnt:
            cate_cnt[pred_key] = {'cate': 0, 'pred': 0, 'corr': 0}
        cate_cnt[cate_key]['cate'] += 1
        cate_cnt[pred_key]['pred'] += 1
        print >> pfile, "%s\t%s" % (result.encode('u8'), line)
        if cate_key == pred_key:
            cate_cnt[cate_key]['corr'] += 1
            accuracy_cnt += 1
        else:
            print >> bfile, "%s\t%s\t%s" % (
                result.encode('u8'), line,
                json.dumps(features, ensure_ascii=False).encode('u8'))
    print >> efile, "key\tcate_num\tpred_num\tcorrect_num\tprecision\trecall\tF_score"
    F_score_cnt = 0
    for key, vdic in cate_cnt.iteritems():
        cate_num = vdic['cate']
        pred_num = vdic['pred']
        corr_num = vdic['corr']
        if pred_num == 0 or cate_num == 0:
            print >> efile, "%s\t%d\t%d\t%d" % (
                key.encode('utf-8'), cate_num, pred_num, corr_num)
        else:
            precision = corr_num * 1.0 / pred_num
            recall = corr_num * 1.0 / cate_num
            F_score = 0.0
            if (precision + recall) > 1e-3:
                F_score = 2 * precision * recall / (precision + recall)
            print >> efile, "%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f" % (
                key.encode('utf-8'), cate_num, pred_num, corr_num,
                precision, recall, F_score)
    print >> efile, "accuracy: ", accuracy_cnt * 1.0 / all_cnt
    print >> efile, "%d samples are used to train, %d samples are used to test" % (
        len(self.train_features), len(self.test_features))
    efile.close()
    pfile.close()
    bfile.close()
class ClassifierRun():
    def __init__(self, cfg_file_name):
        self.config = ConfigParser.ConfigParser()
        self.cur_dir = os.path.dirname(os.path.abspath(cfg_file_name))
        self.cfg_parser(cfg_file_name)
        self.preprocess = Preprocess(cfg_file_name)
        self.cnt = 0
        self.train_features = []
        self.train_labels = []
        self.test_features = []
        self.test_labels = []
        self.test_names = []
        self._train = Train(self.space, self.params)
        self._predict = Predict()
        self._rule = Rule()
        self._tree = ClassTreePedict('./resource/cate_id.cfg', './model')

    def cfg_parser(self, cfg_file_name):
        self.config.read(cfg_file_name)
        section = 'model'
        if self.config.has_option(section, 'model_file'):
            self.model_path = self.config.get(section, 'model_file')
        else:
            self.model_path = './model/testmodel'
        if self.config.has_option(section, 'model_dir'):
            self.model_dir = self.config.get(section, 'model_dir')
        else:
            self.model_dir = './model'
        if self.config.has_option(section, 'vec_space') and self.config.get(section, 'vec_space') == 'topic':
            self.space = 'topic'
        else:
            self.space = 'word'
        self.params = {}  # default when no parameter section is present
        if self.space == 'topic':
            if self.config.has_section('topic_param'):
                self.params = dict(self.config.items('topic_param'))
        elif self.space == 'word':
            if self.config.has_section('word_param'):
                self.params = dict(self.config.items('word_param'))

        section = 'evaluation'
        self.test_size_prob = 1.0
        if self.config.has_option(section, 'test_size'):
            self.test_size_prob = self.config.getfloat(section, 'test_size')
        if self.config.has_option(section, 'random_state'):
            seed = self.config.getint(section, 'random_state')
            random.seed(seed)

        self.level = 0
        section = 'default'
        if self.config.has_option(section, 'level'):
            self.level = self.config.getint(section, 'level')
        if self.config.has_option(section, 'cate_id_file'):
            self.cate_id_file = self.config.get(section, 'cate_id_file')
        else:
            self.cate_id_file = "resource/cate_id.cfg"
        logging.info('[Done] config parsing')
        logging.info('use %s space, params=%s' % (self.space, json.dumps(self.params)))

    def train(self):
        self._train.train(self.train_features, self.train_labels)
        self._train.dump_model(self.model_path)

    def test(self):
        if self.model_path.endswith('rule'):
            self._rule.load_rule(self.model_path)
            is_rule = True
        else:
            self._predict.load_model(self.model_path)
            is_rule = False
        print len(self.test_features)
        for (features, label, name) in zip(self.test_features, self.test_labels, self.test_names):
            if is_rule:
                result = self._rule.predict(name, 0)
            else:
                result = self._predict.predict(features)
            print result.encode('u8'), '\t', label.encode('u8'), '\t', name.encode('u8'), '\t', \
                json.dumps(features, ensure_ascii=False).encode('u8')

    def testone(self, name, cat_name, brand, price):
        tree = ClassTreePedict(self.cate_id_file, self.model_dir)
        features = self.preprocess.process(name, cat_name, brand, price, level=0)
        # hard-coded sample features, used here as a quick sanity check
        features = json.loads('{"Eden": 1, "Botkier": 1, "Satchel": 1, "马毛": 1, "女士": 1, "柏柯尔": 1, "拼接": 1, "手提包": 1, "Small": 1 }')
        result = tree.predict(name, features, indexclass=u"root")
        print result.encode('u8'), name.encode('u8'), json.dumps(features, ensure_ascii=False).encode('u8')

    # map_cfg: category-to-ID mapping file; model_dir: directory holding model files;
    # data_file_name: input data file
    def predict(self, map_cfg, model_dir, data_file_name):
        tree = ClassTreePedict(map_cfg, model_dir)
        data_file = open(data_file_name, 'r')
        for line in data_file:
            line = line.strip()
            try:
                old_cate, cid_cate, name, brand, price = line.decode('u8').split(u'\t')
            except Exception, e:
                print >> sys.stderr, "Error:", line
                print >> sys.stderr, e
                sys.exit()
            cat_name = json.dumps(cid_cate.split(','))
            price = float(price)
            features = self.preprocess.process(name, cat_name, brand, price, level=0)
            #result = tree.predict(name, features, indexclass=u"root")
            indexclass = u'root'
            result = tree.predict(name, features, indexclass, price, cat_name)
            print "%s\t%s\t%s" % (result.encode('u8'), old_cate.encode('u8'), name.encode('u8'))
        data_file.close()
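
# Illustrative sketch (not from the original source): one way to drive
# ClassifierRun for batch prediction. 'classifier.cfg' and 'data.txt' are
# placeholder paths; the real config and data files depend on the deployment.
if __name__ == "__main__":
    runner = ClassifierRun("classifier.cfg")
    # batch prediction over a TSV data file, walking the category tree from 'root'
    runner.predict("./resource/cate_id.cfg", "./model", "data.txt")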
class Core_Process(object):
    """
    Func: training and predict process.
    """
    def __init__(self, model_name='root', opt='train'):
        self.model_name = model_name
        self.preprocess = Preprocess(CONFIG_FILE)
        self.d_model_map = self.preprocess.d_models
        if opt == 'predict':
            self.predict_obj = Predict()
            if self.d_model_map.get(self.model_name, None):
                self.predict_obj.load_model(MODEL_PATH + 'model/' + self.model_name + '.model')
            else:
                self.predict_obj.load_model(MODEL_PATH + 'model/' + 'root.model')
                print "\nNote: using the default model--root.model to predict.\n"
        self.train_features = []
        self.train_labels = []
        self.predict_features = []
        self.predict_labels = []
        self.predict_data_id = []
        self.predict_result = []

    def load_data_path(self, data_path):
        """
        Input:
            data_path <string>: the input file path.
        Output:
            None
        """
        print data_path
        fp = open(data_path, 'r')
        for json_line in fp.readlines():
            d_line = json.loads(json_line)
            data_id = d_line['id']
            desc_text = ' '.join(d_line['description'].replace('.', ' ').split())
            labels = d_line['label']
            features = self.preprocess.process(title='', content=desc_text, model_name=self.model_name)
            self.train_features.append(features)
            self.train_labels.append(labels)
            self.predict_data_id.append(data_id)
        fp.close()
        if len(self.train_features) == len(self.train_labels):
            pass
            #print '=========', len(self.train_features), len(self.train_labels)
        else:
            print 'ERROR: len(train_features) != len(train_labels)'

    def train_all(self, train_data_dir, model_name='root'):
        """
        train model with all training dataset, use model 'root' by default
        """
        self.load_data_path(train_data_dir)
        print >> sys.stderr, "train the model", train_data_dir
        space = 'word'
        #space = 'topic'  # There are some problems ?
        _train = Train(space, {})
        _train.train(self.train_features, self.train_labels)
        if not os.path.exists(os.path.join(MODEL_PATH, 'model')):
            os.makedirs(os.path.join(MODEL_PATH, 'model'))
        if not os.path.exists(os.path.join(MODEL_PATH, 'report')):
            os.makedirs(os.path.join(MODEL_PATH, 'report'))
        if not os.path.exists(os.path.join(MODEL_PATH, 'feature')):
            os.makedirs(os.path.join(MODEL_PATH, 'feature'))
        model_path = MODEL_PATH + 'model/' + model_name + ".model"
        print >> sys.stderr, "dump the model", model_path
        _train.dump_model(model_path)
        feature_file = os.path.join(MODEL_PATH, 'feature/' + model_name + ".feature")
        # dump the selected features and their coefficients
        ffile = open(feature_file, 'w')
        feature_coef = _train.get_feature_coef()
        print "----------len feature coef:", len(feature_coef)
        feature_len = 0
        for cate in feature_coef:
            print "-------------", cate
            print >> ffile, "%s" % (cate.encode('u8'))
            features = sorted(feature_coef[cate].items(), key=lambda x: x[1], reverse=True)
            feature_len = len(features)
            for f_item in features:
                print >> ffile, "\t%s\t%f" % (f_item[0].encode('u8'), f_item[1])
        ffile.close()
        print >> sys.stderr, "%d features have been selected!" % feature_len

    def evaluation(self, predict_file):
        """
        Func: evaluation of batch data.
        Input:
            predict_file <string>: input file path.
        Output:
            precision <float>: the precision of the prediction.
        """
        d_eval = {'corr': 0}
        all_cnt = 0
        precision = 0.0
        self.load_data_path(predict_file)
        self.predict_features = self.train_features
        self.predict_labels = self.train_labels
        all_cnt = len(self.predict_labels)
        for features, label in zip(self.predict_features, self.predict_labels):
            result = self.predict_obj.predict(features)
            if result == label:
                d_eval['corr'] += 1
            self.predict_result.append(result)
        if all_cnt == 0:
            print 'ERROR: all_cnt of predict_file: 0 !'
        else:
            precision = d_eval['corr'] * 1.0 / all_cnt
            print '========== all_cnt: ', all_cnt
            print '========== precision: ', precision
        return precision

    def run(self, opt, file_path):
        """
        opt: to determine train or predict
        file_path: training data.
        """
        if opt == 'train':
            for mod_name, values in self.d_model_map.items():
                self.train_all(file_path, mod_name)
        elif opt == 'predict':
            predict_file = file_path
            result = self.evaluation(predict_file)
            report_file = os.path.join(MODEL_PATH, 'report/' + self.model_name + ".report")
            rfile = open(report_file, 'a')
            rfile.write(str(file_path + ' precision: ') + str(result) + '\n')
            rfile.close()
            with open(report_file + '.rep', 'w') as rf:
                for tid, res in zip(self.predict_data_id, self.predict_result):
                    rf.write(tid + '\t' + res + '\n')
        else:
            print 'Nothing to do, please input train or predict.'

    def predict_one(self, desc_text):
        """
        Func: predict single data.
        Input:
            desc_text <string>: description text of the single data.
        Output:
            result <string>: the label of the input text.
        """
        features = self.preprocess.process(title='', content=desc_text, model_name=self.model_name)
        result = self.predict_obj.predict(features)
        return str(result)
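
# Illustrative sketch (not from the original source): driving Core_Process for
# training and for single-text prediction. The file paths and the sample
# description are placeholders.
if __name__ == "__main__":
    # train one model per entry in the preprocess model map
    Core_Process(model_name="root", opt="train").run("train", "train_data.json")

    # reload the dumped model and classify a single description
    clf = Core_Process(model_name="root", opt="predict")
    print clf.predict_one("Botkier Eden Small Satchel handbag")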