def test(method, train_data, test_data):
    config = function.read_json_file(NET_CONFIG_FOLDER + "/" + method +
                                     ".json")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader, test_loader = torch_data.load_data(
        train_data, test_data, config['sentence_len'], config['batch_size'],
        device)

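    # instantiate the network named by `method`; anything unrecognised falls back to CNN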
    if method == 'RNN':
        net = RNN(config)
    elif method == 'MLP':
        net = MLP(config)
    elif method == 'RCNN':
        net = RCNN(config)
    elif method == 'FastText':
        net = FastText(config)
    else:
        net = CNN(config)
    net.to(device)

    if 'load' in config:
        # read pre-trained net
        net.load_state_dict(torch.load(NET_FOLDER + "/" + method))
    else:
        net = train(config, net, train_loader, val_loader)
        if 'save' in config:
            torch.save(net.state_dict(), NET_FOLDER + "/" + method)
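    # evaluate on the test split; only the coefficient and the per-class report are used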
    _, coefficient, report = test_stat(test_loader, net)
    print("Test result:")
    print(report)
    print(f"coefficient = {coefficient:.4f}")
Example #2
def gen_test_set(file_path, test_count=500, only_long_sentence=True):
    homo_dic = function.read_json_file(HOMO_DIC_PATH)
    all_sentences = function.read_json_file(file_path)
    all_length = len(all_sentences)
    test_index = random.sample(range(0, all_length), test_count)
    answers = []
    inputs = []
    char_count = 0
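    # each stored sentence alternates characters and homophone-id digits: char, id, char, id, ...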
    for index in test_index:
        sentence = all_sentences[index]
        length = len(sentence)
        if only_long_sentence and length < 10:  # (char, id) pairs: fewer than 5 characters
            continue
        chars = [sentence[i] for i in range(0, length, 2)]
        pinyin_ids = [int(sentence[i]) for i in range(1, length, 2)]
        pinyins = []
        for char, pinyin_id in zip(chars, pinyin_ids):
            # find the pinyin string whose id matches; stop at the first hit
            for dic_pinyin, dic_id in homo_dic[char].items():
                if dic_id == pinyin_id:
                    pinyins.append(dic_pinyin)
                    break
        answers.append(''.join(chars) + '\n')
        inputs.append(' '.join(pinyins) + '\n')
        char_count += len(chars)
    # delete the sampled test sentences from the training file
    test_index_set = set(test_index)
    new_all_sentences = [
        sentence for index, sentence in enumerate(all_sentences)
        if index not in test_index_set
    ]

    function.write_json_file(file_path, new_all_sentences)
    with open(TEST_INPUT, "a") as file:
        file.writelines(inputs)
    with open(TEST_ANSWER, "a", encoding='gbk') as file:
        file.writelines(answers)
    print(
        f"Generate a test set with {len(inputs)} sentences and {char_count} characters. "
        f"Test input added at {TEST_INPUT}. Answer added at {TEST_ANSWER}")
Example #3
def data2tensor_vocab(self, news_data):
    word2id = function.read_json_file(WORD2ID_PATH)
    sentences, labels, emotions = function.data2vec(news_data)
    texts_id = []
    for sentence in sentences:
        word_list = sentence.split(' ')
        words_id = []
        for word in word_list:
            words_id.append(word2id.get(word, word2id[UNKNOWN]))
        if len(words_id) < self.sentence_len:
            # padding
            words_id.extend([word2id[PADDING]] *
                            (self.sentence_len - len(words_id)))
        texts_id.append(words_id[:self.sentence_len])
    return (torch.tensor(texts_id).to(self.device),
            torch.tensor(labels).to(self.device),
            torch.tensor(emotions).to(self.device))
Example #4
def __init__(self, model_path):
    model_dic = function.read_json_file(model_path)
    self.model = Model.input_dict(model_dic)
    self.pinyin2char = function.read_json_file(PINYIN2CHAR_PATH)
    print("Model loaded.")
Example #5
import argparse
import sys

import torch
from constants import *
import function
import baseline_ml
import deep_learning
import preprocess
import numpy as np


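# command-line front end: `preprocess` builds the training corpus, `test` evaluates the chosen method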
def parse(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--command", required=True, choices=['preprocess', 'test'])
    parser.add_argument("--method", choices=['naive_bayes', 'SVM', 'CNN', 'RNN', 'MLP', 'RCNN', 'FastText'], default='naive_bayes')
    parser.add_argument("--preprocess_folder", default=SINA_FOLDER)
    parser.add_argument("--train_path", default=TRAIN_PATH)
    parser.add_argument("--test_path", default=TEST_PATH)
    parser.add_argument("--ori_embedding_path", default=ORI_EMB_PATH)
    parser.add_argument("--embedding_path", default=EMBEDDING_PATH)
    return parser.parse_args(argv)


if __name__ == '__main__':
    args = parse(sys.argv[1:])
    if args.command == 'test':
        test_data = function.read_json_file(args.test_path)
        train_data = function.read_json_file(args.train_path)
        if args.method in ('naive_bayes', 'SVM'):
            baseline_ml.test(args.method, train_data, test_data)
        else:
            deep_learning.test(args.method, train_data, test_data)
    elif args.command == 'preprocess':
        preprocess.preprocess(args.preprocess_folder, args.ori_embedding_path)
Example #6
def preprocess(folder_path, name):
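    # turn raw news corpora into training sentences in which every character is followed by its homophone id from homo_dic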
    char_table = function.read_json_file(CHAR_TABLE_PATH)
    homo_dic = function.read_json_file(HOMO_DIC_PATH)

    # cut a line into Chinese sentences; if ignore_number is set, drop any sentence that contains a digit
    def cut_sentences(inp_line, ignore_number=False):
        pro_str = ""
        sentences = []
        valid_str = True
        for char in inp_line:
            if char in char_table:
                pro_str += char
            elif char.encode("utf-8").isdigit():
                if not ignore_number:
                    pro_str += char
                else:
                    valid_str = False
            elif char in SEPARATOR:
                if valid_str:
                    sentences.append(pro_str)
                pro_str = ""
                valid_str = True
            # skip English letters and other punctuation such as ' ' or '/'
            else:
                continue
        ret_sentences = []
        for sentence in sentences:
            if len(sentence) < 2:  # sentence too short, ignore
                continue
            try:
                ret_sentences.append(cn2an.transform(sentence, mode="an2cn"))
            except ValueError:  # number too long. ignore the sentence.
                continue
        return ret_sentences

    # append each character's homophone id from homo_dic after the character
    def label_homo(sentence):
        pinyins = pypinyin.lazy_pinyin(sentence)
        ret_sentence = ""
        for char, pinyin in zip(sentence, pinyins):
            ret_sentence += char
            pinyin = function.pinyin_fix(pinyin)
            try:
                ret_sentence += str(homo_dic[char][pinyin])
            except KeyError:  # pinyin not in the dict: pypinyin conflicts with the "拼音汉字表" table
                print(char, pinyin)
                ret_sentence += "0"
        return ret_sentence

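    # parse one corpus file (sina or weixin layout), cut it into sentences, label homophones and save the result as JSON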
    def process_file(file_path, cnt, batch_name):
        all_sentences = []
        if batch_name == 'sina':
            with open(file_path, encoding="gbk") as file:
                lines = file.readlines()
            for line in lines:
                news_piece = json.loads(line)
                title = news_piece["title"]
                content = news_piece["html"]
                all_sentences += cut_sentences(title)
                all_sentences += cut_sentences(content)
        if batch_name == 'weixin':
            with open(file_path) as file:
                lines = file.readlines()
            length = len(lines)
            for line_index in range(0, length, 3):  # get 1/3 of weixin corpus
                content = json.loads(lines[line_index])['content']
                all_sentences += cut_sentences(content, ignore_number=True)
        sentences_with_pinyin = []
        for sentence in all_sentences:
            sentences_with_pinyin.append(label_homo(sentence))
        save_path = TRAINING_DATA_PATH + f"/{name}-{cnt}.json"
        function.write_json_file(save_path, sentences_with_pinyin)
        print(f"{file_path} processed. Saved as {save_path}")

    function.pypinyin_fix()
    all_files_paths = os.listdir(folder_path)
    for index, rel_path in enumerate(all_files_paths):
        path = folder_path + "/" + rel_path
        try:
            print(f"Begin processing {path}")
            process_file(path, index, name)
        except UnicodeDecodeError:
            print("Illegal file, continue.")
Example #7
def train_file(file_path):
    data = function.read_json_file(file_path)
    for sentence in list(data):
        # add (n_gram - 1) 'bb' to the beginning of the sentence and 'ee' to the end
        sentence = ('bb' * (n - 1)) + sentence + 'ee'
        model.train(sentence)
Example #8
def get_embedding():
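    # the embedding JSON stores the vector matrix under the 'list' key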
    embedding = function.read_json_file(EMBEDDING_PATH)
    return torch.tensor(embedding['list'])
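A minimal usage sketch for the returned matrix, assuming it is meant to seed a torch nn.Embedding layer; the freeze flag below is an illustrative choice, not project code.

import torch.nn as nn

# hypothetical wiring: initialise an embedding layer from the saved matrix
embedding_layer = nn.Embedding.from_pretrained(get_embedding(), freeze=False)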