def predict(self, texts):
    """
    Predict the class of each given text with the fitted models.

    Every text is cleaned with ``clean_en_text``, vectorised by
    ``self.tf_idf_model``, reduced by ``self.chi_model`` and finally
    classified by ``self.clf_model``. The prediction is also printed.

    :param texts: iterable of texts to classify
    :return: the value returned by ``self.clf_model.predict``
    """
    cleaned_texts = list(map(clean_en_text, texts))
    # Apply the same two-stage transformation chain in order:
    # TF-IDF vectorisation first, then chi-square feature selection.
    selected_features = self.chi_model.transform(
        self.tf_idf_model.transform(cleaned_texts)
    )
    predictions = self.clf_model.predict(selected_features)
    print('----------推理结果------:', predictions)
    return predictions
def clean(file_path):
    """
    Clean the text part of every labelled line of a file, in place.

    Each line is expected to look like ``<text>__label__<label...>``. The
    text in front of the first ``__label__`` marker is passed through
    ``clean_en_text`` and the line is rewritten as
    ``<cleaned text> __label__<label...>``. The file is overwritten with the
    result so it can be used for training afterwards.

    Lines that contain no ``__label__`` marker (for example blank lines) are
    written back unchanged. Everything after the first marker, including any
    further labels and the trailing newline, is preserved verbatim.

    :param file_path: path of the UTF-8 text file to clean and overwrite
    """
    marker = '__label__'

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    lines_clean = []
    for line in lines:
        # partition() splits on the FIRST marker only, so additional labels
        # and the line's newline stay attached to the label part.
        text, found, label_part = line.partition(marker)
        if not found:
            # Nothing to separate: keep the line as-is rather than failing.
            lines_clean.append(line)
            continue
        lines_clean.append(clean_en_text(text) + ' ' + marker + label_part)

    # The file is only rewritten once every line was processed successfully.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(lines_clean)
def __select_features(data_set):
    """
    Fit a TF-IDF vectoriser and a chi-square feature selector on a data set.

    :param data_set: mutable indexable where ``data_set[0]`` holds the raw
        texts and ``data_set[1]`` the matching labels. NOTE: ``data_set[0]``
        is replaced in place by the cleaned texts.
    :return: tuple ``(chi_features, tf_idf_model, chi_model)`` - the selected
        feature matrix plus the two fitted transformers
    """
    data_set[0] = [clean_en_text(data) for data in data_set[0]]

    tf_idf_model = TfidfVectorizer(ngram_range=(1, 1), binary=True, sublinear_tf=True)
    tf_vectors = tf_idf_model.fit_transform(data_set[0])

    # Keep the best fifth of the vocabulary, but never fewer than one term:
    # with a vocabulary smaller than 5 the plain division would yield k == 0
    # and the selector would keep no features at all.
    k = max(1, tf_vectors.shape[1] // 5)
    chi_model = SelectKBest(chi2, k=k)
    chi_features = chi_model.fit_transform(tf_vectors, data_set[1])

    print('tf-idf:\t\t' + str(tf_vectors.shape[1]))
    print('chi:\t\t' + str(chi_features.shape[1]))
    return chi_features, tf_idf_model, chi_model
def _read_stripped_lines(file_path):
    """Return all lines of a UTF-8 text file with surrounding whitespace stripped."""
    # The context manager guarantees the file handle is closed again.
    with open(file_path, "r", encoding='utf-8') as f:
        return [line.strip() for line in f]


def load_data_and_labels(pos_file, neg_file):
    """
    Load positive and negative examples and build their labels.

    Every line of both files is stripped, cleaned with ``clean_en_text`` and
    split on single spaces. Positive examples come first in the result.

    :param pos_file: path of the UTF-8 file with one positive example per line
    :param neg_file: path of the UTF-8 file with one negative example per line
    :return: tuple ``(x_text, y)`` where ``x_text`` is a list of token lists
        and ``y`` is the array produced by ``np.concatenate`` holding ``1``
        for each positive and ``0`` for each negative example
    """
    # Read the files
    positive_examples = _read_stripped_lines(pos_file)
    negative_examples = _read_stripped_lines(neg_file)

    # Tokenise: clean each sentence, then split it on single spaces
    x_text = [
        clean_en_text(sent).split(' ')
        for sent in positive_examples + negative_examples
    ]

    # Generate labels
    positive_labels = [1] * len(positive_examples)
    negative_labels = [0] * len(negative_examples)
    y = np.concatenate([positive_labels, negative_labels], 0)
    return x_text, y
def select_features(self, data_set):
    """
    Fit a TF-IDF vectoriser and a chi-square feature selector on a data set.

    :param data_set: mapping-like object whose ``'text'`` entry holds the raw
        texts and whose ``'label'`` entry holds the matching labels
    :return: tuple ``(chi_features, tf_idf_model, chi_model)`` - the selected
        feature matrix plus the two fitted transformers
    """
    dataset = [clean_en_text(data) for data in data_set['text']]

    tf_idf_model = TfidfVectorizer(ngram_range=(1, 1), binary=True, sublinear_tf=True)
    tf_vectors = tf_idf_model.fit_transform(dataset)

    # Use the best fifth of the vocabulary as features (at least one term).
    # The selector is sized from the vocabulary instead of a fixed count so
    # it neither fails on small vocabularies nor ignores the computed k.
    k = max(1, tf_vectors.shape[1] // 5)
    chi_model = SelectKBest(chi2, k=k)
    chi_features = chi_model.fit_transform(tf_vectors, data_set['label'])

    print('tf-idf:\t\t' + str(tf_vectors.shape[1]))
    print('chi:\t\t' + str(chi_features.shape[1]))
    return chi_features, tf_idf_model, chi_model
from nlp.utils.clean_text import clean_en_text

if __name__ == '__main__':
    # Quick manual check of the text cleaner.
    sentence = 'This is a good time\' , please be happy'
    print(clean_en_text(sentence))

    path = 'data/imdb/aclImdb.txt'
    path2 = 'data/imdb/aclImdb_a.txt'

    # Every input line holds "##"-separated fields; write the first two back
    # in swapped order (second field, "##", first field), each stripped.
    with open(path, 'r', encoding='utf-8') as src:
        out_lines = [
            fields[1].strip() + '##' + fields[0].strip() + '\n'
            for fields in (row.split("##") for row in src)
        ]

    with open(path2, 'w', encoding='utf-8') as dst:
        dst.writelines(out_lines)