Example #1
0
def import_models(dataset):
    """Load every saved CNN checkpoint for *dataset*, keyed by checkpoint filename."""
    loaded = {}
    for checkpoint_path in glob.glob('checkpoints/cnn_{}_*'.format(dataset)):
        checkpoint_name = os.path.split(checkpoint_path)[1]
        embedding_type = get_embedding_type(checkpoint_name)

        # Training data fixes the input width; the vocab fixes the embedding table size.
        X_train, y_train = load('{}_train'.format(dataset))
        vocab = load('{}_vocab'.format(dataset)).vocab

        net = TextCNN(dataset=dataset,
                      input_size=X_train.shape[1],
                      vocab_size=len(vocab) + 1,
                      embedding_dims=300,
                      embedding_type=embedding_type)
        net.load_state_dict(torch.load(checkpoint_path))
        net.eval()  # inference mode: disable dropout / use running BN stats
        loaded[checkpoint_name] = net

    return loaded
Example #2
0
def main():
    """Evaluate a saved text classifier on the test split.

    Loads the test set, restores the model selected by ``--model`` from its
    checkpoint, and reports F1 and correlation-coefficient metrics.
    """
    test_set = SinaDataset(path.join(args.source, 'test.json'), input_dim)
    test_loader = DataLoader(test_set,
                             batch_size=args.bs,
                             shuffle=False,
                             drop_last=True)

    if args.model == 'textcnn':
        model = TextCNN(input_dim, 200)
        model.load_state_dict(torch.load('./saved_models/textcnn.pkl'))
    elif args.model == 'lstm':
        model = MyLSTM(input_dim, hidden_dim=8)
        model.load_state_dict(torch.load('./saved_models/lstm.pkl'))
    else:
        print('"--model" argument only accepts "textcnn" or "lstm"')
        # Bug fix: an unrecognized model name is an error, so exit with a
        # non-zero status. The old exit(0) signalled success to the shell.
        raise SystemExit(1)

    model = model.to(device)

    pred, ans, pred_dists, true_dists = test(model, test_loader, device,
                                             args.bs)
    calc_f1_score(pred, ans)
    calc_coef(pred_dists, true_dists)
Example #3
0
class Classify:
    """Single-sentence text classifier wrapping a trained TextCNN checkpoint."""

    def __init__(self, features='word', device='gpu'):
        # features: 'word' -> jieba word segmentation; anything else -> char-level.
        self.features = features
        self.sentence_length = TextCNNConfig.sequence_length
        self.device = device
        self.__device()
        self.load_vocab()
        self.__load_model()

    def __device(self):
        """Resolve the requested device string to the device actually used."""
        if torch.cuda.is_available() and self.device == 'gpu':
            self.device = torch.device('cuda')
        else:
            self.device = 'cpu'

    def __load_model(self):
        """Restore the TextCNN weights and put the model in inference mode."""
        self.model = TextCNN(TextCNNConfig)
        # Bug fix: map_location lets a checkpoint saved on GPU load on a
        # CPU-only host instead of raising a CUDA deserialization error.
        self.model.load_state_dict(
            torch.load("./ckpts/cnn_model.pth", map_location=self.device))
        self.model.to(self.device)
        self.model.eval()

    def load_vocab(self):
        """Read the token->id and id->label lookup tables from the checkpoint dir."""
        with open('./ckpts/vocab.txt', 'r', encoding='utf-8') as f:
            vocab = f.read().strip().split('\n')
        self.vocab = {token: idx for idx, token in enumerate(vocab)}

        with open('./ckpts/target.txt', 'r', encoding='utf-8') as f:
            target = f.read().strip().split('\n')
        self.target = {idx: label for idx, label in enumerate(target)}

    def cut_words(self, sentence: str) -> list:
        """Tokenize: jieba words when features == 'word', characters otherwise."""
        if self.features == 'word':
            return jieba.lcut(sentence)
        return list(sentence)

    def sentence_cut(self, sentence):
        """Convert one sentence to a fixed-length list of token ids.

        Truncates to ``sentence_length`` or right-pads with ``<PAD>``;
        out-of-vocabulary tokens map to the ``<UNK>`` id.
        """
        words = self.cut_words(sentence)
        if len(words) >= self.sentence_length:
            fixed = words[:self.sentence_length]
        else:
            fixed = words + ["<PAD>"] * (self.sentence_length - len(words))
        return [self.vocab[w] if w in self.vocab else self.vocab["<UNK>"]
                for w in fixed]

    def predict(self, content):
        """Classify one sentence and return its predicted label string."""
        with torch.no_grad():
            content_id = [self.sentence_cut(content)]
            start_time = time.time()
            content_id = torch.LongTensor(content_id)
            one_batch_input = content_id.to(self.device)
            outputs = self.model(one_batch_input)
            # dim=1: highest-scoring class per batch row ('axis' is a numpy
            # alias; 'dim' is the documented torch keyword).
            max_value, max_index = torch.max(outputs, dim=1)
            predict = max_index.cpu().numpy()
            print(time.time() - start_time)
        return self.target[predict[0]]