Esempio n. 1
0
def predict_document(file_name, model):
    """Segment every line of *file_name* and classify the resulting document.

    Args:
        file_name: Path to a UTF-8 text file, one raw line per record.
        model: Trained naive-Bayes model, passed through to
            ``predict_naive_bayes`` unchanged.

    Returns:
        Whatever ``predict_naive_bayes`` returns for the segmented document.
    """
    # `with` guarantees the handle is closed even if segmentation raises;
    # the original open()/close() pair leaked the file on error.
    with open(file_name, 'r', encoding='utf-8') as f:
        # Iterate the file object directly instead of readlines():
        # same line order, but no full materialization of the file.
        document = [cut_sentence.segment(line, type='arr') for line in f]
    return predict_naive_bayes(document, model)
Esempio n. 2
0
def test_naive_bayes(test_file):
    """Load a JSONL test file and return its fourth segmented document.

    Args:
        test_file: Path to a UTF-8 file with one JSON object per line;
            each object must contain a 'text' field.

    Returns:
        The segmented form of the fourth record (index 3).

    Raises:
        IndexError: If the file holds fewer than four records.
    """
    # `with` closes the handle even if a line fails to parse; the
    # original open()/close() pair leaked the file on error.
    with open(test_file, 'r', encoding='utf-8') as test_set:
        print("Load ", test_file)
        document = [
            cut_sentence.segment(json.loads(line)['text'], type='arr')
            for line in test_set
        ]
    # NOTE(review): the hard-coded index 3 looks like a debugging
    # leftover — confirm the caller really wants only the fourth record.
    return document[3]
Esempio n. 3
0
def save_sentence_from_json(rootdir):
    """Segment the 'text' field of every JSONL file under *rootdir*.

    Walks one level of class sub-directories, converts each record's text
    to simplified Chinese, segments it, and appends the result to a single
    'sentences' file inside *rootdir*.

    Args:
        rootdir: Directory whose immediate sub-directories hold JSONL
            files (one JSON object with a 'text' field per line).
    """
    class_dirs = os.listdir(rootdir)
    # os.path.join instead of string concatenation: correct whether or
    # not rootdir carries a trailing separator (the original silently
    # wrote to a sibling file such as 'datasentences' when it did not).
    with open(os.path.join(rootdir, 'sentences'), 'a', encoding='utf-8') as fw:
        for class_dir in class_dirs:  # renamed: `dir` shadowed the builtin
            class_path = os.path.join(rootdir, class_dir)
            for file_name in os.listdir(class_path):
                # `with` closes the reader even if json.loads raises;
                # the original leaked the handle on a parse error.
                with open(os.path.join(class_path, file_name), 'r',
                          encoding='utf-8') as fr:
                    for line in fr:
                        datum = json.loads(line)
                        text = zhconv.convert(datum['text'], 'zh-cn')
                        fw.write(cut_sentence.segment(text))
                print(file_name + ' finished')
    print('save_sentence_from_json ok')
Esempio n. 4
0
def load_dataset(rootdir):
    """Load segmented training documents for every class under *rootdir*.

    Each immediate sub-directory of *rootdir* names a class and must
    contain a 'train_set' JSONL file (one JSON object with a 'text'
    field per line).

    Args:
        rootdir: Root directory of the dataset.

    Returns:
        dict mapping class name -> list of segmented documents
        (at most ``max_sample + 1`` per class — see note below).
    """
    rootdir = os.path.abspath(rootdir) + '/'
    # The original referenced `documents` without ever defining it, which
    # raises NameError unless a module-level dict happens to exist;
    # initialize it here so the function is self-contained.
    documents = {}
    for clazz in os.listdir(rootdir):
        print("Load ", clazz)
        document = []
        # `with` closes the handle even if a record fails to parse;
        # enumerate replaces the hand-rolled sample counter.
        with open(rootdir + clazz + '/train_set', 'r',
                  encoding='utf-8') as train_set:
            for sample_num, line in enumerate(train_set, start=1):
                data = json.loads(line)
                document.append(cut_sentence.segment(data['text'], type='arr'))
                # NOTE(review): `>` admits max_sample + 1 documents before
                # breaking — confirm whether the intended cap is
                # max_sample (then this should be `>=`).
                if sample_num > max_sample:
                    break
        documents[clazz] = document
    return documents