def __init__(self, args):
    """Wire up the full training setup from parsed CLI args.

    Builds the vocabulary and tensorized datasets, constructs the requested
    model (CNN or LSTM) on ``args.device``, and prepares the loss function
    and Adam optimizer.
    """
    self.args = args

    # Build vocab first, then tensorize the corpora and embedding table.
    data = Data(args.train_path, args.val_path, args.glove_path)
    data.build_vocab()
    train_data, val_data = data.input2tensor()
    embedding_matrix = data.build_embedding_matrix(args.embed_type, args.embed_dim)

    # Only the training loader is shuffled; validation keeps corpus order.
    self.train_dataloader = DataLoader(
        MyDataset(train_data, data.max_len),
        batch_size=args.batch_size,
        shuffle=True,
    )
    self.val_dataloader = DataLoader(
        MyDataset(val_data, data.max_len),
        batch_size=args.batch_size,
        shuffle=False,
    )

    # Any model_type other than 'CNN' falls back to the LSTM network.
    model_cls = CNNModel if args.model_type == 'CNN' else LSTMNet
    self.model = model_cls(args, data.vocab_size, embedding_matrix).to(args.device)

    self.loss_func = nn.CrossEntropyLoss()
    self.optim = torch.optim.Adam(self.model.parameters(), lr=args.learning_rate)

    # NOTE(review): assumes args.device is a torch.device with an .index
    # attribute when CUDA is available — confirm for CPU-only configs.
    if torch.cuda.is_available():
        print('cuda memory allocated:', torch.cuda.memory_allocated(device=args.device.index))
def load_data(train_path, val_path, glove_path):
    """Load corpora, build the vocab, and compute the negative-sampling distribution.

    Args:
        train_path: path to the training corpus file.
        val_path: path to the validation corpus file.
        glove_path: path to the GloVe embedding file.

    Returns:
        Tuple ``(train_data, vocab, vocab_size, dist)`` where ``dist`` is the
        word-frequency distribution raised to the 3/4 power and normalized —
        the standard word2vec negative-sampling distribution.
    """
    data = Data(train_path, val_path, glove_path)
    train_x_list, _, val_x_list, _ = data.split_sentence()
    data.build_vocab()

    # Training instances are drawn from both splits' raw sentences.
    orig_data = train_x_list + val_x_list
    train_data = get_train_data(data.vocab, orig_data)
    print("数据实例个数: {}".format(len(train_data)))

    # +1 reserves one extra index (presumably padding/UNK — TODO confirm).
    vocab_size = len(data.vocab) + 1
    print("词表长度为:", vocab_size)

    # Fix: iterate .values() directly instead of .items() with an unused key.
    # Smooth counts with the 3/4 power, then normalize to a probability dist.
    dist = np.array(list(data.word_freq.values()))
    dist = np.power(dist, 0.75)
    dist = dist / dist.sum()

    return train_data, data.vocab, vocab_size, dist