Example #1
def main(args):
    init_logger()
    tokenizer = load_tokenizer(args)
    train_dataset = None if args.do_predict else load_and_cache_examples(
        args, tokenizer, mode="train")
    dev_dataset = None
    test_dataset = None if args.do_predict else load_and_cache_examples(
        args, tokenizer, mode="test")

    if args.do_train:
        trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
        trainer.train()

    if args.do_eval:
        trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
        trainer.load_model()
        trainer.evaluate("test")

    if args.do_predict:
        predict = Predict(args, tokenizer)
        predict.load_model()

        sentences = [args.sentence]
        result_json = dict()
        result_json['result'] = int(predict.predict(sentences))
        print(json.dumps(result_json, ensure_ascii=False))
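A minimal sketch of how this entry point might be invoked; the argparse flags below are assumptions inferred only from how args is used above (the helper functions clearly read further attributes from args that are not declared here).

import argparse

if __name__ == "__main__":
    # Hypothetical CLI wiring for main(args); only the flags visible above are declared.
    parser = argparse.ArgumentParser()
    parser.add_argument("--do_train", action="store_true", help="train a model")
    parser.add_argument("--do_eval", action="store_true", help="evaluate the saved model on the test set")
    parser.add_argument("--do_predict", action="store_true", help="predict a single sentence")
    parser.add_argument("--sentence", type=str, default="", help="input sentence for --do_predict")
    main(parser.parse_args())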
Example #2
    def modelsLoad(self):
        filelist = os.listdir(self.modeldir)
        for f in filelist:
            path = os.path.join(self.modeldir, f)
            id = f.split(".")[0]
            policy = f.split(".")[1]
            if policy == 'model':
                # Initialize a Predict object for model files
                pred = Predict()
                pred.load_model(path)
                self.indexclass[id] = pred
            else:
                # Initialize a Rule object for rule files
                rule = Rule()
                rule.load_rule(path)
                self.indexclass[id] = rule
                logging.debug("load rule file " + id)
            self.predictPolicy[id] = policy
        logging.info('[Done] load all models')
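The loader above keys each Predict or Rule object by file id and records which policy applies to it. A minimal sketch of how a caller on the same class might dispatch on that policy; the method name predict_by_id is hypothetical, and the Rule.predict(name, 0) call shape is borrowed from the ClassifierRun.test() method shown further down.

    # Hypothetical helper: route a prediction through either the statistical model
    # or the rule object that modelsLoad() registered for this id.
    def predict_by_id(self, id, features, name):
        if self.predictPolicy.get(id) == 'model':
            return self.indexclass[id].predict(features)
        # Rule objects match on the raw name; (name, 0) mirrors the usage below.
        return self.indexclass[id].predict(name, 0)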
Example #3
    def evaluation(self, model_path, eval_file, predict_file, badcase_file):
        print "%d samples are used to train, %d samples are used to test" % (
            len(self.train_features), len(self.test_features))
        _predict = Predict()
        _predict.load_model(model_path)
        #report = classification_report(self.test_labels, self._predict.predict(self.test_features))
        efile = open(eval_file, 'w+')
        pfile = open(predict_file, 'w')
        bfile = open(badcase_file, 'w')
        cate_cnt = dict()
        accuracy_cnt = 0
        all_cnt = 0
        print "******************"
        for features, label, line in zip(self.test_features, self.test_labels,
                                         self.test_lines):
            result = _predict.predict(features)
            #print result.encode('u8'), label.encode('u8'), name.encode('u8')
            all_cnt += 1
            if all_cnt % 1000 == 0:
                print >> sys.stderr, "test %d samples ... " % all_cnt
            cate_key = label
            pred_key = result
            cate_key = cate_key.replace('#', '')
            pred_key = pred_key.replace('#', '')

            if cate_key not in cate_cnt:
                cate_cnt[cate_key] = {'cate': 0, 'pred': 0, 'corr': 0}

            if pred_key not in cate_cnt:
                cate_cnt[pred_key] = {'cate': 0, 'pred': 0, 'corr': 0}

            cate_cnt[cate_key]['cate'] += 1
            cate_cnt[pred_key]['pred'] += 1
            print >> pfile, "%s\t%s" % (result.encode('u8'), line)

            if cate_key == pred_key:
                cate_cnt[cate_key]['corr'] += 1
                accuracy_cnt += 1
            else:
                print >> bfile, "%s\t%s\t%s" % (
                    result.encode('u8'), line,
                    json.dumps(features, ensure_ascii=False).encode('u8'))

        print >> efile, "key\tcate_num\tpred_num\tcorrect_num\tprecision\trecall\tF_score"
        F_score_cnt = 0
        for key, vdic in cate_cnt.iteritems():
            cate_num = vdic['cate']
            pred_num = vdic['pred']
            corr_num = vdic['corr']
            if pred_num == 0 or cate_num == 0:
                print >> efile, "%s\t%d\t%d\t%d" % (
                    key.encode('utf-8'), cate_num, pred_num, corr_num)
            else:
                precision = corr_num * 1.0 / pred_num
                recall = corr_num * 1.0 / cate_num
                F_score = 0.0
                if (precision + recall) > 1e-3:
                    F_score = 2 * precision * recall / (precision + recall)
                print >> efile, "%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f" % (
                    key.encode('utf-8'), cate_num, pred_num, corr_num,
                    precision, recall, F_score)
        print >> efile, "accuracy: ", accuracy_cnt * 1.0 / all_cnt
        print >> efile, "%d samples are used to train, %d samples are used to test" % (
            len(self.train_features), len(self.test_features))
        efile.close()
        pfile.close()
        bfile.close()
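The commented-out classification_report call near the top of evaluation() hints at a scikit-learn shortcut for the same per-class table. A minimal sketch, assuming scikit-learn is installed and that both gold labels and predictions are plain strings; the sample lists below are illustrative placeholders for self.test_labels and the collected _predict.predict(...) results.

from sklearn.metrics import classification_report

# Per-class precision / recall / F-score, comparable to the table written to eval_file.
y_true = ['bag', 'bag', 'shoe']   # gold labels
y_pred = ['bag', 'shoe', 'shoe']  # model predictions
print(classification_report(y_true, y_pred, digits=3))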
Example #4
class ClassifierRun():
    def __init__(self, cfg_file_name):
        self.config = ConfigParser.ConfigParser()
        self.cur_dir = os.path.dirname(os.path.abspath(cfg_file_name))
        self.cfg_parser(cfg_file_name)
        self.preprocess = Preprocess(cfg_file_name)

        self.cnt = 0
        self.train_features = []
        self.train_labels = []

        self.test_features = []
        self.test_labels = []
        self.test_names = []

        self._train = Train(self.space, self.params)
        self._predict = Predict()
        self._rule = Rule()
        
        self._tree = ClassTreePedict('./resource/cate_id.cfg', './model')

    def cfg_parser(self, cfg_file_name):
        self.config.read(cfg_file_name)
        section = 'model'
        if self.config.has_option(section, 'model_file'):
            self.model_path = self.config.get(section, 'model_file')
        else:
            self.model_path = './model/testmodel'

        if self.config.has_option(section, 'model_dir'):
            self.model_dir = self.config.get(section, 'model_dir')
        else:
            self.model_dir = './model'
       
        if self.config.has_option(section, 'vec_space') and self.config.get(section, 'vec_space') == 'topic':
            self.space = 'topic'
        else:
            self.space = 'word'

        # Default to an empty param dict so self.params always exists,
        # even when the corresponding section is missing from the config.
        self.params = {}
        if self.space == 'topic':
            if self.config.has_section('topic_param'):
                self.params = dict(self.config.items('topic_param'))
        elif self.space == 'word':
            if self.config.has_section('word_param'):
                self.params = dict(self.config.items('word_param'))

        section = 'evaluation'
        self.test_size_prob = 1.0
        if self.config.has_option(section, 'test_size'):
            self.test_size_prob = self.config.getfloat(section, 'test_size')
        if self.config.has_option(section, 'random_state'):
            seed = self.config.getint(section, 'random_state')
            random.seed(seed)
            
        self.level = 0
        section = 'default'
        if self.config.has_option(section, 'level'):
            self.level = self.config.getint(section, 'level')
        if self.config.has_option(section, 'cate_id_file'):
            self.cate_id_file = self.config.get(section, 'cate_id_file')
        else:
            self.cate_id_file = "resource/cate_id.cfg"

        logging.info('[Done] config parsing')
        logging.info('use %s space, params=%s' % (self.space, json.dumps(self.params)))

    def train(self):
        self._train.train(self.train_features, self.train_labels)
        self._train.dump_model(self.model_path)

    def test(self):
        if self.model_path.endswith('rule'):
            self._rule.load_rule(self.model_path)
            is_rule = True
        else:
            self._predict.load_model(self.model_path)
            is_rule = False
        print len(self.test_features)
        for (features, label, name) in zip(self.test_features, self.test_labels, self.test_names):
            if is_rule:
                result = self._rule.predict(name, 0)
            else:
                result = self._predict.predict(features)
            print result.encode('u8'), '\t', label.encode('u8'), '\t', name.encode('u8'), '\t', json.dumps(features, ensure_ascii=False).encode('u8')

    def testone(self, name, cat_name, brand, price):
        tree = ClassTreePedict(self.cate_id_file, self.model_dir)
        features = self.preprocess.process(name, cat_name, brand, price, level=0)
        # Hardcoded sample features override the computed ones (debugging leftover).
        features = json.loads('{"Eden": 1, "Botkier": 1, "Satchel": 1, "马毛": 1, "女士": 1, "柏柯尔": 1, "拼接": 1, "手提包": 1, "Small": 1 }')
        result = tree.predict(name, features, indexclass=u"root")
        print result.encode('u8'), name.encode('u8'), json.dumps(features, ensure_ascii=False).encode('u8')

    # map_cfg: mapping file between category names and IDs; model_dir: directory holding the model files; data_file: the input data file
    def predict(self, map_cfg, model_dir, data_file_name):
        tree = ClassTreePedict(map_cfg, model_dir)
        data_file = open(data_file_name, 'r')
        for line in data_file:
            line = line.strip()
            try:
                old_cate, cid_cate, name, brand, price = line.decode('u8').split(u'\t')
            except Exception, e:
                print >> sys.stderr, "Error:", line
                print >> sys.stderr, e
                sys.exit()
            cat_name = json.dumps(cid_cate.split(','))
            price = float(price)
            features = self.preprocess.process(name, cat_name, brand, price, level=0)
            #result = tree.predict(name, features, indexclass=u"root")
            indexclass = u'root'
            result = tree.predict(name, features, indexclass, price, cat_name)
            print "%s\t%s\t%s" %(result.encode('u8'), old_cate.encode('u8'), name.encode('u8'))
        data_file.close()
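A hedged sketch of how ClassifierRun might be driven end to end; the config file name and data paths are placeholders, and the step that fills train_features / test_features is not part of this example, so the call order is an assumption.

if __name__ == '__main__':
    # Hypothetical driver; 'classifier.cfg' and the file paths are placeholders.
    runner = ClassifierRun('classifier.cfg')
    # train_features / test_features must be populated by a loading step not shown
    # here before train() and test() can do anything useful.
    runner.train()
    runner.test()
    runner.predict('./resource/cate_id.cfg', './model', 'labeled_products.tsv')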
Example #5
class Core_Process(object):
    """
    Func: training and prediction process.
    """
    def __init__(self, model_name='root', opt='train'):

        self.model_name = model_name
        self.preprocess = Preprocess(CONFIG_FILE)
        self.d_model_map = self.preprocess.d_models

        if opt == 'predict':
            self.predict_obj = Predict()
            if self.d_model_map.get(self.model_name, None):
                self.predict_obj.load_model(MODEL_PATH + 'model/' +
                                            self.model_name + '.model')
            else:
                self.predict_obj.load_model(MODEL_PATH + 'model/' +
                                            'root.model')
                print "\nNote: using the default model--root.model to predict.\n"

        self.train_features = []
        self.train_labels = []

        self.predict_features = []
        self.predict_labels = []

        self.predict_data_id = []
        self.predict_result = []

    def load_data_path(self, data_path):
        """
        Input: 
            data_path <string>: the input file path.
        Output:
            None
        """
        print data_path
        fp = open(data_path, 'r')
        for json_line in fp.readlines():
            d_line = json.loads(json_line)

            data_id = d_line['id']
            desc_text = ' '.join(d_line['description'].replace('.',
                                                               ' ').split())
            labels = d_line['label']
            features = self.preprocess.process(title='',
                                               content=desc_text,
                                               model_name=self.model_name)

            self.train_features.append(features)
            self.train_labels.append(labels)

            self.predict_data_id.append(data_id)

        fp.close()

        if len(self.train_features) == len(self.train_labels):
            pass
            #print '=========', len(self.train_features), len(self.train_labels)
        else:
            print 'ERROR: len(train_features) != len(train_labels)'

    def train_all(self, train_data_dir, model_name='root'):
        """
        train model with all training dataset, use model 'root' by default
        """
        self.load_data_path(train_data_dir)

        print >> sys.stderr, "train the model", train_data_dir

        space = 'word'
        # space = 'topic'  # the topic space still has unresolved problems, so it is disabled here
        _train = Train(space, {})
        _train.train(self.train_features, self.train_labels)

        if not os.path.exists(os.path.join(MODEL_PATH, 'model')):
            os.makedirs(os.path.join(MODEL_PATH, 'model'))
        if not os.path.exists(os.path.join(MODEL_PATH, 'report')):
            os.makedirs(os.path.join(MODEL_PATH, 'report'))
        if not os.path.exists(os.path.join(MODEL_PATH, 'feature')):
            os.makedirs(os.path.join(MODEL_PATH, 'feature'))

        model_path = MODEL_PATH + 'model/' + model_name + ".model"
        print >> sys.stderr, "dump the model", model_path
        _train.dump_model(model_path)

        feature_file = os.path.join(MODEL_PATH,
                                    'feature/' + model_name + ".feature")

        # Write out the selected features and their coefficients
        ffile = open(feature_file, 'w')
        feature_coef = _train.get_feature_coef()
        print "----------len feature coef:", len(feature_coef)
        feature_len = 0
        for cate in feature_coef:
            print "-------------", cate
            print >> ffile, "%s" % (cate.encode('u8'))
            features = sorted(feature_coef[cate].items(),
                              key=lambda x: x[1],
                              reverse=True)
            feature_len = len(features)
            for f_item in features:
                print >> ffile, "\t%s\t%f" % (f_item[0].encode('u8'),
                                              f_item[1])
        ffile.close()
        print >> sys.stderr, "%d features has been selected!" % feature_len

    def evaluation(self, predict_file):
        """
        Func: evaluation of batch data.
        Input:
            predict_file <string>: input file path.
        Output:
            precision <float>: the precision of the prediction.
        """
        d_eval = {'corr': 0}
        all_cnt = 0
        precision = 0.0

        self.load_data_path(predict_file)
        self.predict_features = self.train_features
        self.predict_labels = self.train_labels
        all_cnt = len(self.predict_labels)

        for features, label in zip(self.predict_features, self.predict_labels):
            result = self.predict_obj.predict(features)
            if result == label:
                d_eval['corr'] += 1
            self.predict_result.append(result)
        if all_cnt == 0:
            print 'ERROR: predict_file contains no samples (all_cnt == 0)!'
        else:
            precision = d_eval['corr'] * 1.0 / all_cnt

        print '========== all_cnt: ', all_cnt
        print '========== precision: ', precision

        return precision

    def run(self, opt, file_path):
        """
        opt: selects 'train' or 'predict'.
        file_path: path to the training or prediction data.
        """
        if opt == 'train':
            for mod_name, values in self.d_model_map.items():
                self.train_all(file_path, mod_name)

        elif opt == 'predict':
            predict_file = file_path
            result = self.evaluation(predict_file)

            report_file = os.path.join(MODEL_PATH,
                                       'report/' + self.model_name + ".report")
            rfile = open(report_file, 'a')
            rfile.write(str(file_path + '  precision: ') + str(result) + '\n')
            rfile.close()
            with open(report_file + '.rep', 'w') as rf:
                for tid, res in zip(self.predict_data_id, self.predict_result):
                    rf.write(tid + '\t' + res + '\n')

        else:
            print 'Nothing to do, please input train or predict.'

    def predict_one(self, desc_text):
        """
        Func: predict single data.
        Input:
            desc_text <string>: description text of the single data.
        Output:
            result <string>: the label of the input text.
        """
        features = self.preprocess.process(title='',
                                           content=desc_text,
                                           model_name=self.model_name)
        result = self.predict_obj.predict(features)

        return str(result)
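A hedged sketch of the expected call sequence for Core_Process, based only on run() and predict_one() above; the data paths are placeholders, and CONFIG_FILE / MODEL_PATH are assumed to be module-level constants configured elsewhere.

if __name__ == '__main__':
    # Train one model per entry in the preprocess model map (path is a placeholder).
    Core_Process(model_name='root', opt='train').run('train', 'data/train.json')

    # Reload the dumped 'root' model and score a single description string.
    predictor = Core_Process(model_name='root', opt='predict')
    print(predictor.predict_one('lightweight waterproof hiking backpack'))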