Ejemplo n.º 1
0
 def test_predict(self):
     """Load the saved FastText model and classify one sample headline.

     The vocabulary is read from ``config.vocab_path`` when that file
     exists; otherwise it is built from ``config.train_path`` and cached
     there.  The weights at ``config.save_path`` are loaded onto the CPU,
     the sample sentence is run through the model, and the predicted class
     index of the first sample is printed.
     """
     use_word = False
     if use_word:
         def tokenizer(text):
             return text.split(' ')  # space-separated, word-level
     else:
         def tokenizer(text):
             return list(text)  # char-level

     if os.path.exists(config.vocab_path):
         # NOTE: pickle must only be used on trusted, locally produced files.
         with open(config.vocab_path, 'rb') as vocab_file:
             vocab = pkl.load(vocab_file)
     else:
         vocab = build_vocab(config.train_path,
                             tokenizer=tokenizer,
                             max_size=MAX_VOCAB_SIZE,
                             min_freq=1)
         with open(config.vocab_path, 'wb') as vocab_file:
             pkl.dump(vocab, vocab_file)
     config.n_vocab = len(vocab)

     model = FastText.Model(config)
     state_dict = torch.load(config.save_path,
                             map_location=torch.device('cpu'))
     model.load_state_dict(state_dict)
     # Inference mode: disable dropout / use running batch-norm statistics
     # so the prediction is deterministic.
     model.eval()

     samples = sentance2ids(['2岁男童爬窗台不慎7楼坠下获救(图)'], config)
     batches = build_iterator(samples, config)

     # Collect predictions from every batch; starting from an empty list
     # keeps `result` defined even if the iterator yields nothing.
     result = []
     with torch.no_grad():
         for X, y in batches:
             logits = model(X)
             result.extend(torch.max(logits, 1)[1].cpu().tolist())

     self.assertIsInstance(result, list)
     self.assertTrue(result, 'model produced no predictions')
     print(result[0])
Ejemplo n.º 2
0
        from utils_fasttext import build_dataset, build_iterator, get_time_dif

        embedding = 'random'
    else:
        from utils import build_dataset, build_iterator, get_time_dif

    # Dynamically import the module 'models.<model_name>' and build its config.
    x = import_module('models.' + model_name)
    config = x.Config(dataset, embedding)
    # Fix every random seed (NumPy, torch CPU, all CUDA devices).
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make results identical on every run

    # Build the vocabulary and the three data splits, timing the whole step.
    start_time = time.time()
    print("Loading data...")
    vocab, train_data, dev_data, test_data = build_dataset(config, word)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    # The vocabulary size is only known after the dataset has been built.
    config.n_vocab = len(vocab)
    # `.to(...)` places the model on config.device
    model = x.Model(config).to(config.device)
    # Every model except 'Transformer' goes through init_network first.
    if model_name != 'Transformer':
        init_network(model)
    # NOTE: `model.parameters` is not called here, so this prints the bound
    # method's repr rather than iterating the parameters.
    print(model.parameters)
    train(config, model, train_iter, dev_iter, test_iter)
Ejemplo n.º 3
0
    # Rebuild the model from its config and restore the saved weights.
    config = x.Config(dataset, embedding)
    config.batch_size = 1  # a single sentence is predicted at a time
    model = x.Model(config).to(config.device)
    # NOTE(review): no map_location is passed, so the checkpoint is restored
    # onto the device it was saved from - confirm this is intended on
    # CPU-only hosts.
    model.load_state_dict(torch.load(config.save_path))
    model.eval()  # switch to evaluation mode before predicting

    def sentence_to_index(sentence, pad_size=32):
        """Convert one ``"<text>\\t<label>"`` line into model-ready ids.

        The text is tokenized per character, padded with ``PAD`` or
        truncated to ``pad_size`` (skipped when ``pad_size`` is falsy), and
        mapped to ids through the pickled vocabulary at
        ``config.vocab_path``; characters missing from the vocabulary map to
        the id of ``UNK``.

        Returns a one-element list holding ``(ids, label, seq_len)`` where
        ``seq_len`` is the unpadded length capped at ``pad_size``.

        Raises ``ValueError`` if the line has no tab separator or the label
        is not an integer.
        """
        # Close the vocabulary file deterministically instead of leaking it.
        with open(config.vocab_path, 'rb') as vocab_file:
            vocab = pkl.load(vocab_file)

        # Split on the last tab only, so a tab inside the text itself cannot
        # break the two-value unpacking.
        content, label = sentence.strip().rsplit('\t', 1)

        token = list(content)  # char-level tokens
        seq_len = len(token)
        if pad_size:
            if seq_len < pad_size:
                token.extend([PAD] * (pad_size - seq_len))
            else:
                token = token[:pad_size]
                seq_len = pad_size

        # word to id; the UNK fallback is looked up once, not per token
        unk_id = vocab.get(UNK)
        words_line = [vocab.get(word, unk_id) for word in token]
        return [(words_line, int(label), seq_len)]


    # Encode one hard-coded "<text>\t<label>" sample, wrap it in the batch
    # iterator, and print the predicted class index for each batch.
    for i, (trains, labels) in enumerate(build_iterator(sentence_to_index('学校有哪些体育设施\t0', config.pad_size), config)):
        # max over dim 1 returns (values, indices); [1] keeps the indices.
        predict = model(trains).data.max(1, keepdim=True)[1]
        print(predict)
Ejemplo n.º 4
0
# Inference script: classify one news headline with the saved FastText model
# and print the predicted category name.
ues_word = False  # (sic) name kept as-is; True selects word-level tokens
if ues_word:
    def tokenizer(x):
        return x.split(' ')  # space-separated, word-level
else:
    def tokenizer(x):
        return list(x)  # char-level

# Reuse the cached vocabulary when present; otherwise build and cache it.
if os.path.exists(config.vocab_path):
    # NOTE: pickle must only be used on trusted, locally produced files.
    with open(config.vocab_path, 'rb') as vocab_file:
        vocab = pkl.load(vocab_file)
else:
    vocab = build_vocab(config.train_path,
                        tokenizer=tokenizer,
                        max_size=MAX_VOCAB_SIZE,
                        min_freq=1)
    with open(config.vocab_path, 'wb') as vocab_file:
        pkl.dump(vocab, vocab_file)
config.n_vocab = len(vocab)

model = FastText.Model(config)
map_location = torch.device('cpu')
model.load_state_dict(torch.load(config.save_path, map_location=map_location))
# Inference mode: disable dropout / use running batch-norm statistics so the
# prediction is deterministic.
model.eval()

_test_sentance = sentance2ids(['北上资金卷土重来:热捧业绩预增 大幅增仓农业股'], config)
_test_sentance = build_iterator(_test_sentance, config)

# Collect predictions from every batch; starting from an empty list keeps
# `result` defined even if the iterator yields nothing.
result = []
with torch.no_grad():
    for X, y in _test_sentance:
        logits = model(X)
        result.extend(torch.max(logits, 1)[1].cpu().tolist())
if not result:
    raise RuntimeError('model produced no predictions')

# Category names indexed by the model's output class id.
classify = [
    'finance', 'realty', 'stocks', 'education', 'science', 'society',
    'politics', 'sports', 'game', 'entertainment'
]

print(classify[result[0]])
Ejemplo n.º 5
0
    model_name = args.model  # 'TextRCNN'  # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
    # FastText uses its own data utilities and forces a random embedding.
    if model_name == 'FastText':
        from utils_fasttext import build_dataset, build_iterator, get_time_dif
        embedding = 'random'
    else:
        from utils import build_dataset, build_iterator, get_time_dif

    # Dynamically import the module 'models.<model_name>' and build its config.
    x = import_module('models.' + model_name)
    config = x.Config(dataset, embedding)
    # Fix every random seed (NumPy, torch CPU, all CUDA devices).
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make results identical on every run

    # Build the data splits, timing the whole step; here build_dataset's first
    # return value is also handed to every iterator.
    start_time = time.time()
    print("Loading data...")
    w2v_vec, train_data, dev_data, test_data = build_dataset(config, args.word)
    train_iter = build_iterator(train_data, config, w2v_vec)
    dev_iter = build_iterator(dev_data, config, w2v_vec)
    test_iter = build_iterator(test_data, config, w2v_vec)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    # NOTE(review): presumably w2v_vec is a gensim Word2Vec model; `wv.vocab`
    # is not available in gensim >= 4 - confirm the pinned version.
    config.n_vocab = len(w2v_vec.wv.vocab)
    model = x.Model(config).to(config.device)
    # Every model except 'Transformer' goes through init_network first.
    if model_name != 'Transformer':
        init_network(model)
    # NOTE: `model.parameters` is not called here, so this prints the bound
    # method's repr rather than iterating the parameters.
    print(model.parameters)
    train(config, model, train_iter, dev_iter, test_iter)