def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings, fine_tune):
    """Creates a classification model."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Use the output of BERT's last layer as the input to the text-cnn model.
    embedding = model.get_sequence_output()
    if not fine_tune:
        embedding = tf.stop_gradient(embedding)
    tf.logging.info("bert embedding size: {}".format(embedding.get_shape()))

    text_cnn = TextCNN(embedded_chars=embedding,
                       filter_sizes=FLAGS.filter_sizes,
                       num_filter=FLAGS.num_filter,
                       labels=labels,
                       num_label=num_labels,
                       dropout_rate=FLAGS.dropout_rate,
                       max_len=FLAGS.max_seq_length,
                       is_training=is_training)
    result = text_cnn.gen_result()
    return result
def main():
    train_set = SinaDataset(path.join(args.source, 'train.json'), input_dim)
    test_set = SinaDataset(path.join(args.source, 'test.json'), input_dim)
    train_loader = DataLoader(train_set, batch_size=args.bs, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_set, batch_size=args.bs, shuffle=True, drop_last=True)

    model = TextCNN(input_dim, 200)
    # model = MyLSTM(input_dim, hidden_dim=8)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=args.wd)

    epoch = 0
    train_loss = []
    train_accu = []
    valid_loss = []
    valid_accu = []
    while True:
        epoch += 1
        epoch_loss, epoch_accu = train_one_epoch(epoch, model, optimizer, train_loader, device, args.bs)
        val_loss, val_accu = validate(model, test_loader, device, args.bs)
        train_loss += epoch_loss
        train_accu += epoch_accu
        valid_loss += val_loss
        valid_accu += val_accu

        print('saving...')
        torch.save(model.state_dict(), './saved_models/epoch' + str(epoch) + '.pkl')
        print()

        if args.max_epoch and epoch >= args.max_epoch:
            train_result = {
                'batch-size': args.bs,
                'train-loss': train_loss,
                'train-accu': train_accu,
                'valid-loss': valid_loss,
                'valid-accu': valid_accu
            }
            with open('train-result.json', 'w', encoding='utf-8') as f:
                json.dump(train_result, f)
            break
def main():
    # create the experiment dirs
    create_dirs(config)
    # create tensorflow session
    sess = tf.Session()
    # build preprocessor
    preprocessor = Preprocessor(config)
    # load data, preprocess and generate batches
    data = DataGenerator(preprocessor, config)
    # create an instance of the model you want
    model = TextCNN.TextCNN(preprocessor, config)
    # create tensorboard logger
    logger = Logger(sess, config)
    # create trainer and pass all the previous components to it
    trainer = Trainer(sess, model, data, config, logger)
    # load model if it exists
    model.load(sess)
    # here you train your model
    trainer.train()
def main():
    test_set = SinaDataset(path.join(args.source, 'test.json'), input_dim)
    test_loader = DataLoader(test_set, batch_size=args.bs, shuffle=False, drop_last=True)

    if args.model == 'textcnn':
        model = TextCNN(input_dim, 200)
        model.load_state_dict(torch.load('./saved_models/textcnn.pkl'))
    elif args.model == 'lstm':
        model = MyLSTM(input_dim, hidden_dim=8)
        model.load_state_dict(torch.load('./saved_models/lstm.pkl'))
    else:
        print('"--model" argument only accepts "textcnn" or "lstm"')
        exit(0)

    model = model.to(device)
    pred, ans, pred_dists, true_dists = test(model, test_loader, device, args.bs)
    calc_f1_score(pred, ans)
    calc_coef(pred_dists, true_dists)
def import_models(dataset):
    models = {}
    for f in glob.glob('checkpoints/cnn_{}_*'.format(dataset)):
        fname = os.path.split(f)[1]
        embedding_dims = 300
        embedding_type = get_embedding_type(fname)

        X_train, y_train = load('{}_train'.format(dataset))
        vocab = load('{}_vocab'.format(dataset)).vocab

        model = TextCNN(dataset=dataset,
                        input_size=X_train.shape[1],
                        vocab_size=len(vocab) + 1,
                        embedding_dims=embedding_dims,
                        embedding_type=embedding_type)
        model.load_state_dict(torch.load(f))
        model.eval()
        models[fname] = model
    return models
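# A minimal, hypothetical usage sketch for import_models(); the dataset name 'imdb'
# and the load() helper are assumptions carried over from the snippet above, and the
# 0.5 threshold matches the BCEWithLogitsLoss objective used elsewhere in this code.
models = import_models('imdb')
X_valid, y_valid = load('imdb_valid')
X_valid = torch.as_tensor(X_valid, dtype=torch.long)
with torch.no_grad():
    for fname, model in models.items():
        logits = model(X_valid[:32])                   # score the first 32 examples
        preds = (torch.sigmoid(logits) > 0.5).float()  # binary predictions
        print(fname, preds.mean().item())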
vocab_size = 5000
seq_length = 20
best_score = 1000

target_params = pickle.load(open('save/target_params.pkl', 'rb'), encoding='bytes')
target_lstm = TARGET_LSTM(vocab_size, 64, 32, 32, 20, 0, target_params)

start_token = 0

# generator
generator = SeqGAN(seq_length, vocab_size, gen_emb_dim, gen_hidden_dim,
                   start_token, oracle=True).to_gpu()
if args.gen:
    print(args.gen)
    serializers.load_hdf5(args.gen, generator)

# discriminator
discriminator = TextCNN(num_classes=2, vocab_size=vocab_size,
                        embedding_size=dis_embedding_dim,
                        filter_sizes=dis_filter_sizes,
                        num_filters=dis_num_filters).to_gpu()
if args.dis:
    serializers.load_hdf5(args.dis, discriminator)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

gen_data_loader.create_batches(positive_file)
generate_samples_pos(sess, target_lstm, 64, 10000, positive_file)

# summaries
summary_dir = os.path.join(out_dir, "summaries")
loss_ = tf.placeholder(tf.float32)
target_loss_summary = tf.summary.scalar('target_loss', loss_)
dis_loss_summary = tf.summary.scalar('dis_loss', loss_)
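# A brief, hypothetical sketch of how the scalar summaries defined above might be
# written out during training; the FileWriter and the placeholder loss value are
# assumptions, not part of the original snippet.
summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)
example_dis_loss = 0.0  # in practice, the discriminator loss computed for this step
summary_str = sess.run(dis_loss_summary, feed_dict={loss_: example_dis_loss})
summary_writer.add_summary(summary_str, global_step=0)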
print('train_num = {}'.format(train_num))
print('test_num = {}'.format(test_num))

vocab_size = 2000
seq_length = 40
start_token = 0

# generator
generator = SeqGAN(vocab_size=vocab_size,
                   emb_dim=args.gen_emb_dim,
                   hidden_dim=args.gen_hidden_dim,
                   sequence_length=seq_length,
                   start_token=start_token,
                   lstm_layer=args.num_lstm_layer,
                   dropout=True).to_gpu()
if args.gen:
    serializers.load_hdf5(args.gen, generator)

# discriminator
discriminator = TextCNN(num_classes=2, vocab_size=vocab_size,
                        embedding_size=args.dis_embedding_dim,
                        filter_sizes=[int(n) for n in args.dis_filter_sizes.split(',')],
                        num_filters=[int(n) for n in args.dis_num_filters.split(',')]
                        ).to_gpu()
if args.dis:
    serializers.load_hdf5(args.dis, discriminator)

# set up optimizers
gen_optimizer = optimizers.Adam(alpha=args.gen_lr)
gen_optimizer.setup(generator)
gen_optimizer.add_hook(chainer.optimizer.GradientClipping(args.gen_grad_clip))

dis_optimizer = optimizers.Adam(alpha=args.dis_lr)
dis_optimizer.setup(discriminator)
dis_optimizer.add_hook(NamedWeightDecay(args.dis_l2_reg_lambda, '/out/'))

# summaries
sess = tf.Session()
def train(name, dataset, epochs, batch_size, learning_rate, regularization,
          embedding_dims, embedding_type):
    dirname, _ = os.path.split(os.path.abspath(__file__))
    run_uid = datetime.datetime.today().strftime('%Y-%m-%dT%H:%M:%S')
    logger = StatsLogger(dirname, 'stats', name, run_uid)

    print('Loading data')
    X_train, y_train = load('{}_train'.format(dataset))
    X_valid, y_valid = load('{}_valid'.format(dataset))
    vocab = load('{}_vocab'.format(dataset)).vocab

    X_train = torch.as_tensor(X_train, dtype=torch.long)
    y_train = torch.as_tensor(y_train, dtype=torch.float)
    X_valid = torch.as_tensor(X_valid, dtype=torch.long)
    y_valid = torch.as_tensor(y_valid, dtype=torch.float)

    prev_acc = 0
    model = TextCNN(dataset=dataset,
                    input_size=X_train.size()[1],
                    vocab_size=len(vocab) + 1,
                    embedding_dims=embedding_dims,
                    embedding_type=embedding_type)
    print(model)
    print('Parameters: {}'.format(sum(p.numel() for p in model.parameters()
                                      if p.requires_grad)))
    print('Training samples: {}'.format(len(X_train)))

    if torch.cuda.is_available():
        X_train = X_train.cuda()
        y_train = y_train.cuda()
        X_valid = X_valid.cuda()
        y_valid = y_valid.cuda()
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=regularization)
    criterion = nn.BCEWithLogitsLoss()

    print('Starting training')
    for epoch in range(epochs):
        epoch_loss = []
        epoch_acc = []
        iters = 0
        total_iters = num_batches(len(X_train), batch_size)
        for i, batch in enumerate(minibatch_iter(len(X_train), batch_size)):
            model.train()
            X_train_batch = X_train[batch]
            y_train_batch = y_train[batch]
            if torch.cuda.is_available():
                X_train_batch = X_train_batch.cuda()
                y_train_batch = y_train_batch.cuda()

            optimizer.zero_grad()
            output = model(X_train_batch)
            train_loss = criterion(output, y_train_batch)
            train_acc = accuracy(output, y_train_batch)
            epoch_loss.append(train_loss.item())
            epoch_acc.append(train_acc.item())
            train_loss.backward()
            optimizer.step()

        model.eval()
        train_loss, train_acc = np.mean(epoch_loss), np.mean(epoch_acc)
        valid_loss, valid_acc, _ = compute_dataset_stats(
            X_valid, y_valid, model, nn.BCEWithLogitsLoss(), 256)

        stats = [epoch + 1, train_loss, train_acc, valid_loss, valid_acc]
        epoch_string = ('* Epoch {}: t_loss={:.3f}, t_acc={:.3f}, '
                        'v_loss={:.3f}, v_acc={:.3f}')
        print(epoch_string.format(*stats))
        logger.write(stats)

        # checkpoint the model whenever validation accuracy improves
        if prev_acc < valid_acc:
            prev_acc = valid_acc
            model_path = os.path.join(dirname, 'checkpoints', name)
            torch.save(model.state_dict(), model_path)

    logger.close()
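# The training loop above relies on a few helpers that are not shown here. Below is a
# minimal sketch of plausible implementations, assuming binary labels and raw logit
# outputs; the names match the calls above, but the bodies are assumptions rather than
# the repository's actual code.
import numpy as np
import torch


def num_batches(n_examples, batch_size):
    # Number of minibatches needed to cover the dataset once.
    return int(np.ceil(n_examples / batch_size))


def minibatch_iter(n_examples, batch_size, shuffle=True):
    # Yield index arrays over a (shuffled) permutation of the dataset.
    order = np.random.permutation(n_examples) if shuffle else np.arange(n_examples)
    for start in range(0, n_examples, batch_size):
        yield order[start:start + batch_size]


def accuracy(logits, targets):
    # Fraction of correct binary predictions; thresholding logits at 0 is equivalent
    # to thresholding sigmoid(logits) at 0.5, matching nn.BCEWithLogitsLoss.
    preds = (logits > 0).float()
    return (preds == targets).float().mean()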
#print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) # Training # ================================================== with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): cnn = TextCNN(sequence_length=x_train.shape[1], num_classes=y_train.shape[1], embedding_matrix=embedding_matrix, embedding_size=FLAGS.embedding_dim, filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, l2_reg_lambda=FLAGS.l2_reg_lambda) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-3) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None:
if load_data_by_torchtext:
    train_iter, val_iter, word_vectors = dataset2dataloader(batch_size=100, debug=True)
else:
    train_iter, val_iter, word_vectors, X_lang = make_dataloader(batch_size=100, debug=True)

for model_name in model_names[-1:]:
    if model_name == "RNN":
        model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50,
                        hidden_size=128, num_of_class=num_of_class,
                        weights=word_vectors)
    elif model_name == "CNN":
        model = TextCNN(vocab_size=len(word_vectors), embedding_dim=50,
                        num_of_class=num_of_class,
                        embedding_vectors=word_vectors)
    elif model_name == "LSTM":
        model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50,
                        hidden_size=128, num_of_class=num_of_class,
                        weights=word_vectors, rnn_type="LSTM")

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fun = torch.nn.CrossEntropyLoss()

    for epoch in range(epoch_num):
        model.train()  # models with dropout or batch norm must be set to train mode explicitly
        for i, batch in enumerate(train_iter):
            if load_data_by_torchtext:
def cv_score(dataset, embedding_type, epochs, batch_size=32,
             learning_rate=1e-4, regularization=0):
    kf = KFold(10)
    X, y = load('{}_train'.format(dataset))
    vocab = load('{}_vocab'.format(dataset)).vocab

    cv_acc = []
    cv_std = []
    for ci, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        X_train = torch.as_tensor(X_train, dtype=torch.long).cuda()
        y_train = torch.as_tensor(y_train, dtype=torch.float).cuda()
        X_test = torch.as_tensor(X_test, dtype=torch.long).cuda()
        y_test = torch.as_tensor(y_test, dtype=torch.float).cuda()

        model = TextCNN(dataset=dataset,
                        input_size=X_train.shape[1],
                        vocab_size=len(vocab) + 1,
                        embedding_dims=300,
                        embedding_type=embedding_type).cuda()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                               weight_decay=regularization)
        criterion = nn.BCEWithLogitsLoss()

        model.train()
        for epoch in range(epochs):
            for i, batch in enumerate(minibatch_iter(len(X_train), batch_size)):
                X_train_batch = X_train[batch].cuda()
                y_train_batch = y_train[batch].cuda()

                optimizer.zero_grad()
                output = model(X_train_batch)
                train_loss = criterion(output, y_train_batch)
                train_loss.backward()
                optimizer.step()

        model.eval()
        _, test_acc, test_std = compute_dataset_stats(X_test, y_test, model,
                                                      nn.BCEWithLogitsLoss(), 256)
        cv_acc.append(test_acc)
        cv_std.append(test_std)
        print('  [{}] acc={}, std={}'.format(ci + 1, test_acc, test_std))

    print('{} - {}'.format(dataset, embedding_type))
    print('Mean acc - {}'.format(np.mean(cv_acc)))
    print('Min acc - {}'.format(np.min(cv_acc)))
    print('Max acc - {}'.format(np.max(cv_acc)))
    print('Mean std - {}'.format(np.mean(cv_std)))
class Classify:
    def __init__(self, features='word', device='gpu'):
        self.features = features
        self.sentence_length = TextCNNConfig.sequence_length
        self.device = device
        self.__device()
        self.load_vocab()
        self.__load_model()

    def __device(self):
        if torch.cuda.is_available() and self.device == 'gpu':
            self.device = torch.device('cuda')
        else:
            self.device = 'cpu'

    def __load_model(self):
        self.model = TextCNN(TextCNNConfig)
        self.model.load_state_dict(torch.load("./ckpts/cnn_model.pth"))
        self.model.to(self.device)
        self.model.eval()

    def load_vocab(self):
        with open('./ckpts/vocab.txt', 'r', encoding='utf-8') as f:
            vocab = f.read().strip().split('\n')
            self.vocab = {k: v for k, v in zip(vocab, range(len(vocab)))}
        with open('./ckpts/target.txt', 'r', encoding='utf-8') as f:
            target = f.read().strip().split('\n')
            self.target = {v: k for k, v in zip(target, range(len(target)))}

    def cut_words(self, sentence: str) -> list:
        if self.features == 'word':
            return jieba.lcut(sentence)
        else:
            return list(sentence)

    def sentence_cut(self, sentence):
        """Convert one sentence's tokens to IDs, truncating or padding to a fixed
        length, and return the fixed-length list of token IDs."""
        words = self.cut_words(sentence)
        if len(words) >= self.sentence_length:
            sentence_cutted = words[:self.sentence_length]
        else:
            sentence_cutted = words + ["<PAD>"] * (self.sentence_length - len(words))
        sentence_id = [self.vocab[w] if w in self.vocab else self.vocab["<UNK>"]
                       for w in sentence_cutted]
        return sentence_id

    def predict(self, content):
        """Take a single sentence and predict its class."""
        with torch.no_grad():
            content_id = [self.sentence_cut(content)]
            start_time = time.time()
            content_id = torch.LongTensor(content_id)
            one_batch_input = content_id.to(self.device)
            outputs = self.model(one_batch_input)
            max_value, max_index = torch.max(outputs, axis=1)
            predict = max_index.cpu().numpy()
            print(time.time() - start_time)
            return self.target[predict[0]]
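# A minimal, hypothetical usage sketch for the Classify class above; the example
# sentence is illustrative only, and the vocab/target/model files are assumed to
# exist under ./ckpts/ as referenced in the class.
clf = Classify(features='word', device='gpu')
label = clf.predict("今天股市大幅上涨")  # example sentence: "the stock market rose sharply today"
print(label)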
    shuffle=False, num_workers=16, pin_memory=True)

glove_file = "GloVe/glove.6B.300d.txt"

if not args.flat:
    emb_dim = 300  # document and label embedding length
else:
    emb_dim = trainvalset.n_labels
word_embed_dim = 300

# Model
doc_model = TextCNN(
    trainvalset.text_dataset.vocab,
    glove_file=glove_file,
    emb_dim=emb_dim,
    dropout_p=0.1,
    word_embed_dim=word_embed_dim,
)
doc_lr = 0.001

label_model = LabelEmbedModel(trainvalset.n_labels,
                              emb_dim=emb_dim,
                              dropout_p=0.6,
                              eye=args.flat)

if args.cascaded_step2:
    label_model_pretrained = torch.load(
        args.pretrained_label_model)['label_model']
    label_model.load_state_dict(label_model_pretrained)

if args.flat or args.cascaded_step2: