Example #1
import codecs
import datetime
import os
import random


def load_train_data():
    starttime = datetime.datetime.now()
    print("Building the sample set...")
    root_dir = get_classifier_train_samples()
    val_label_list = []
    for lists in os.listdir(root_dir):
        path = os.path.join(root_dir, lists)
        label = lists[:lists.find('-')]  # the label is the file-name prefix before the first '-'
        with codecs.open(path, "r", encoding="utf-8") as f:
            line = f.readline()
            line = line.strip("\n\r")
            while line != "":
                if line.startswith("Q "):  # 仅提取问题做为训练集
                    val_label_list.append((line, label))
                # val_label_list.append((line, label))
                line = f.readline()
                line = line.strip("\n\r")
    # shuffle so the training set is evenly mixed across labels
    print("Shuffling the feature set...")
    random.shuffle(val_label_list)
    features_list = []
    labels_list = []
    inb = IntentClassifierNB()
    w_i_dict = inb.load_word_index()
    for sentence, label in val_label_list:
        features_list.append(inb.build_feature(sentence, w_i_dict))
        labels_list.append(label)
    endtime = datetime.datetime.now()
    print("===========构造训练样本集耗时: %s" % (endtime - starttime).seconds)
    # 存储类别标签集合
    dump_labels_set(sorted(set(labels_list)))
    # return np.array(features_list), np.array(labels_list)
    return features_list, labels_list
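
A minimal usage sketch for the features and labels returned above, training scikit-learn's MultinomialNB as a stand-in for whatever IntentClassifierNB does internally (the sklearn dependency and the assumption that build_feature yields fixed-length numeric count vectors are mine, not the source's):

from sklearn.naive_bayes import MultinomialNB

features_list, labels_list = load_train_data()
clf = MultinomialNB()
clf.fit(features_list, labels_list)  # assumes each feature is a fixed-length numeric vector
print(clf.predict([features_list[0]]))  # should usually echo the first sample's label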
Example #2
def load_test_data(self):
    samples = self.__get_samples(
        path_configer.get_classifier_train_samples())
    # invert the label -> texts mapping into text -> label
    examples_dict = {}
    for lab, texts in samples.items():
        for t in texts:
            examples_dict[t] = lab
    return examples_dict
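
Note that using the sentence text as the dict key silently deduplicates any sentence that appears under more than one label. A hedged sketch of scoring a classifier against the returned mapping (loader and classifier.predict are hypothetical stand-ins, not names from the source):

examples_dict = loader.load_test_data()  # 'loader' is a hypothetical instance of the class above
correct = sum(1 for text, label in examples_dict.items()
              if classifier.predict(text) == label)  # 'classifier.predict' is assumed
print("accuracy: %.3f" % (correct / len(examples_dict)))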
Example #3
def load_train_data(self):
    """
    Load the training data.
    :return: {'label': {'examples': [sentence samples], 'centroid': centroid vector}}
    """
    rs = {}
    samples = self.__get_samples(
        path_configer.get_classifier_train_samples())
    for lab, texts in samples.items():
        # the centroid is filled in later; only the raw examples are stored here
        rs[lab] = {"examples": list(texts), "centroid": None}
    return rs
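
The centroid slot is deliberately left as None; a minimal sketch of filling it by averaging sentence vectors, assuming a sum_vecs_avg helper like the one CentroidsVecClassifier exposes in Example #5 (numpy and the helper's return type are assumptions):

import numpy as np

def fill_centroids(rs, sum_vecs_avg):
    # average each label's sentence vectors into a single centroid vector
    for lab, d in rs.items():
        vecs = [sum_vecs_avg(s) for s in d["examples"]]
        d["centroid"] = np.mean(vecs, axis=0)
    return rs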
Example #4
import codecs
import os


def build_corpus_vocabulary():
    data_root_dir = path_configer.get_classifier_train_samples()
    w_path = "%s/question.txt" % path_configer.get_resources_corpus()
    if os.path.exists(w_path):
        os.remove(w_path)
    for file in os.listdir(data_root_dir):
        path = os.path.join(data_root_dir, file)
        q_list = []
        print("开始读取文件:%s" % file)
        with codecs.open(path, "r", encoding="utf-8") as f:
            line = f.readline()
            line = line.strip("\n\r")
            while line != "":
                # keep everything after the first space, dropping the leading tag (e.g. "Q ")
                q_list.append(line[line.find(" ") + 1:])
                line = f.readline()
                line = line.strip("\n\r")
        print("开始写入文本%s" % w_path)
        with codecs.open(w_path, "a", encoding="utf-8") as f:
            for item in q_list:
                if len(item.strip()) > 0:
                    f.write('%s\n' % item)
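
The function above only flattens the questions into question.txt; the word-to-index dictionary that load_word_index consumes in Example #1 still has to be built from it. A hedged sketch, assuming whitespace-tokenized text and a plain dict as the output format:

import codecs

def build_word_index(corpus_path):
    # assign a stable integer index to every distinct token in the corpus
    w_i_dict = {}
    with codecs.open(corpus_path, "r", encoding="utf-8") as f:
        for line in f:
            for tok in line.strip().split():  # whitespace tokenization is an assumption
                if tok not in w_i_dict:
                    w_i_dict[tok] = len(w_i_dict)
    return w_i_dict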
Example #5
def build_text_vec_indx(self):
    text_vec_indx = {}
    data_root_dir = path_configer.get_classifier_train_samples()
    classifier = CentroidsVecClassifier()  # reuse one instance instead of rebuilding it per line
    logging.info("Building the text-to-vector index...")
    for file in os.listdir(data_root_dir):
        vocabulary_path = os.path.join(data_root_dir, file)
        with codecs.open(vocabulary_path, "r", encoding="utf-8") as f:
            line = f.readline()
            line = line.strip("\n\r")
            while line != "":
                if line.startswith('Q '):
                    # map each question line to its averaged word vector
                    text_vec_indx[line] = classifier.sum_vecs_avg(line)
                line = f.readline()
                line = line.strip("\n\r")
    logging.info("Text-to-vector index built.")
    logging.info("Saving the text-to-vector index...")
    text_vec_path = "%s/text_vec.index" % path_configer.get_resources()
    if os.path.exists(text_vec_path):
        os.remove(text_vec_path)
    joblib.dump(text_vec_indx, text_vec_path)
    logging.info("Text-to-vector index saved.")
    return text_vec_indx
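
A small sketch of consuming the dumped index: reload it with joblib and return the stored question closest to a query vector by cosine similarity (numpy and the pre-vectorized query are assumptions):

import joblib
import numpy as np

def nearest_question(query_vec, text_vec_path):
    # reload the text-to-vector index and find the most similar stored question
    index = joblib.load(text_vec_path)
    def cos(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
    return max(index, key=lambda text: cos(query_vec, index[text]))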
Example #6
def train_sf_chatbot():
    data_root_dir = path_configer.get_classifier_train_samples()
    for file_name in os.listdir(data_root_dir):
        if file_name.startswith("QA_sf_"):
            # the matching corpus file lives under the chatter corpus dir;
            # the label is the part of the file name before the first '-'
            __train(('%s/%s' % (get_chatter_corpus(), file_name)),
                    file_name[:file_name.find('-')])