Beispiel #1
0
 def predict(self, texts):
     """
     Classify a batch of texts with the fitted pipeline.

     Every text is normalised with ``clean_en_text``, vectorised by
     ``self.tf_idf_model``, reduced by ``self.chi_model`` and then handed to
     ``self.clf_model`` for prediction. The prediction is also printed.

     :param texts: iterable of raw texts to classify
     :return: the value returned by ``self.clf_model.predict`` for the batch
     """
     cleaned = [clean_en_text(raw) for raw in texts]
     # Same two-stage feature pipeline that was fitted: TF-IDF, then chi-square selection.
     tfidf_matrix = self.tf_idf_model.transform(cleaned)
     selected = self.chi_model.transform(tfidf_matrix)
     predictions = self.clf_model.predict(selected)
     print('----------推理结果------:', predictions)
     return predictions
Beispiel #2
0
def clean(file_path):
    """
    Clean the text part of every line of a labelled file, in place.

    Each line is expected to look like ``<text>__label__<label>``. The text
    before the first ``__label__`` marker is passed through ``clean_en_text``
    and the line is rewritten as ``<cleaned text> __label__<label>``.
    Everything after the first marker (further markers and the line
    terminator included) is kept verbatim. Lines without a marker, such as
    blank lines, are written back unchanged instead of raising.

    :param file_path: path of the UTF-8 encoded file to rewrite
    """
    marker = '__label__'
    lines_clean = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # partition splits on the first marker only, so additional labels
            # and the trailing newline stay attached to the label part.
            text, sep, label = line.partition(marker)
            if not sep:
                # No label marker on this line: nothing to clean safely.
                lines_clean.append(line)
                continue
            lines_clean.append(clean_en_text(text) + ' ' + marker + label)

    # The file is only reopened for writing after it was read completely.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(lines_clean)
Beispiel #3
0
    def __select_features(data_set):
        """
        Fit TF-IDF vectorisation and chi-square feature selection.

        ``data_set[0]`` (the texts) is overwritten in place with the cleaned
        texts; ``data_set[1]`` supplies the labels for the chi-square test.
        The selector keeps one fifth of the TF-IDF vocabulary (rounded down).
        The feature counts before and after selection are printed.

        :param data_set: mutable container with texts at index 0 and labels
            at index 1
        :return: tuple ``(chi_features, tf_idf_model, chi_model)``
        """
        cleaned_texts = [clean_en_text(text) for text in data_set[0]]
        data_set[0] = cleaned_texts

        vectorizer = TfidfVectorizer(ngram_range=(1, 1),
                                     binary=True,
                                     sublinear_tf=True)
        tfidf_matrix = vectorizer.fit_transform(data_set[0])
        vocab_size = tfidf_matrix.shape[1]

        # Keep the best fifth of the vocabulary according to chi-square.
        selector = SelectKBest(chi2, k=int(vocab_size / 5))
        reduced = selector.fit_transform(tfidf_matrix, data_set[1])

        print('tf-idf:\t\t', vocab_size, sep='')
        print('chi:\t\t', reduced.shape[1], sep='')

        return reduced, vectorizer, selector
Beispiel #4
0
def load_data_and_labels(pos_file, neg_file):
    """
    Load positive and negative examples and build token lists plus labels.

    Every line of each file is one example. Lines are stripped, cleaned with
    ``clean_en_text`` and split on single spaces. Positive examples come
    first in the result and are labelled 1; negative examples follow and are
    labelled 0.

    :param pos_file: path of the UTF-8 file holding the positive examples
    :param neg_file: path of the UTF-8 file holding the negative examples
    :return: tuple ``(x_text, y)`` where ``x_text`` is a list of token lists
        and ``y`` is the array of labels built by ``np.concatenate``
    """
    # Read both files inside ``with`` blocks so the handles are closed
    # deterministically instead of being left to the garbage collector.
    with open(pos_file, "r", encoding='utf-8') as pos_handle:
        positive_examples = [s.strip() for s in pos_handle]
    with open(neg_file, "r", encoding='utf-8') as neg_handle:
        negative_examples = [s.strip() for s in neg_handle]

    # Clean, then tokenise on single spaces.
    x_text = [clean_en_text(sent).split(' ')
              for sent in positive_examples + negative_examples]

    # Build the labels in the same order as the examples.
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return x_text, y
    def select_features(self, data_set):
        """
        Fit TF-IDF vectorisation and chi-square feature selection.

        The texts in ``data_set['text']`` are cleaned with ``clean_en_text``
        and vectorised; the chi-square selector is then fitted against
        ``data_set['label']``. The feature counts before and after selection
        are printed.

        :param data_set: mapping-like object with ``'text'`` and ``'label'``
            entries
        :return: tuple ``(chi_features, tf_idf_model, chi_model)``
        """
        dataset = [clean_en_text(data) for data in data_set['text']]
        tf_idf_model = TfidfVectorizer(ngram_range=(1, 1),
                                       binary=True,
                                       sublinear_tf=True)
        tf_vectors = tf_idf_model.fit_transform(dataset)

        # Keep the 10 best terms according to chi-square. The count is capped
        # at the vocabulary size because SelectKBest rejects a k larger than
        # the number of input features.
        # NOTE(review): k is a fixed 10 rather than a share of the
        # vocabulary - confirm this is intended.
        n_terms = tf_vectors.shape[1]
        chi_model = SelectKBest(chi2, k=min(10, n_terms))
        chi_features = chi_model.fit_transform(tf_vectors, data_set['label'])
        print('tf-idf:\t\t' + str(n_terms))
        print('chi:\t\t' + str(chi_features.shape[1]))

        return chi_features, tf_idf_model, chi_model
Beispiel #6
0
from nlp.utils.clean_text import clean_en_text

if __name__ == '__main__':
    # Quick manual check of the cleaner on a sample sentence.
    sentence = 'This is a good time\' , please be happy'
    print(clean_en_text(sentence))

    # Rewrite the data file with the first two "##"-separated fields of each
    # line swapped, writing the result to a second file.
    path = 'data/imdb/aclImdb.txt'
    path2 = 'data/imdb/aclImdb_a.txt'
    out_lines = []
    with open(path, 'r', encoding='utf-8') as src:
        for raw_line in src:
            fields = raw_line.split("##")
            second = fields[1].strip()
            first = fields[0].strip()
            out_lines.append('##'.join((second, first)) + '\n')

    with open(path2, 'w', encoding='utf-8') as dst:
        dst.writelines(out_lines)