Example 1
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings, fine_tune):
    """Creates a classification model."""
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # Use the output of BERT's last layer as the input to the TextCNN model
    embedding = model.get_sequence_output()
    if not fine_tune:
        embedding = tf.stop_gradient(embedding)
    tf.logging.info("bert embedding size: {}".format(embedding.get_shape()))
    text_cnn = TextCNN(embedded_chars=embedding,
                       filter_sizes=FLAGS.filter_sizes,
                       num_filter=FLAGS.num_filter,
                       labels=labels,
                       num_label=num_labels,
                       dropout_rate=FLAGS.dropout_rate,
                       max_len=FLAGS.max_seq_length,
                       is_training=is_training)  # follow the outer training flag so dropout is disabled at eval time

    result = text_cnn.gen_result()
    return result
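For context, a minimal sketch of how create_model might be invoked; the config and tensor names below are illustrative assumptions, not part of the original snippet.

# Hypothetical call site (a sketch): bert_config and the input tensors are
# assumed to come from the usual BERT input pipeline; all names are illustrative.
result = create_model(bert_config=bert_config,
                      is_training=True,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      segment_ids=segment_ids,
                      labels=label_ids,
                      num_labels=num_labels,
                      use_one_hot_embeddings=False,
                      fine_tune=True)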
Example 2
def main():
    train_set = SinaDataset(path.join(args.source, 'train.json'), input_dim)
    test_set = SinaDataset(path.join(args.source, 'test.json'), input_dim)
    train_loader = DataLoader(train_set,
                              batch_size=args.bs,
                              shuffle=True,
                              drop_last=True)
    test_loader = DataLoader(test_set,
                             batch_size=args.bs,
                             shuffle=True,
                             drop_last=True)

    model = TextCNN(input_dim, 200)
    # model = MyLSTM(input_dim, hidden_dim=8)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=args.wd)

    epoch = 0
    train_loss = []
    train_accu = []
    valid_loss = []
    valid_accu = []
    while True:
        epoch += 1
        epoch_loss, epoch_accu = train_one_epoch(epoch, model, optimizer,
                                                 train_loader, device, args.bs)
        val_loss, val_accu = validate(model, test_loader, device, args.bs)
        train_loss += epoch_loss
        train_accu += epoch_accu
        valid_loss += val_loss
        valid_accu += val_accu

        print('saving...')
        torch.save(model.state_dict(),
                   './saved_models/epoch' + str(epoch) + '.pkl')
        print()

        if args.max_epoch and epoch >= args.max_epoch:
            train_result = {
                'batch-size': args.bs,
                'train-loss': train_loss,
                'train-accu': train_accu,
                'valid-loss': valid_loss,
                'valid-accu': valid_accu
            }
            with open('train-result.json', 'w', encoding='utf-8') as f:
                json.dump(train_result, f)

            break
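The snippet above reads args.source, args.bs, args.lr, args.wd, and args.max_epoch without showing how they are parsed; a minimal argparse sketch that would provide them (flag names match the attributes used above, defaults are illustrative guesses) is:

# Minimal argparse sketch assumed by the snippet above; defaults are illustrative.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--source', default='data')        # directory containing train.json / test.json
parser.add_argument('--bs', type=int, default=64)      # batch size
parser.add_argument('--lr', type=float, default=1e-3)  # learning rate
parser.add_argument('--wd', type=float, default=0.0)   # weight decay
parser.add_argument('--max-epoch', dest='max_epoch', type=int, default=20)
args = parser.parse_args()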
Example 3
def main():
    # create the experiments dirs
    create_dirs(config)

    # create tensorflow session
    sess = tf.Session()

    # build preprocessor
    preprocessor = Preprocessor(config)

    # load data, preprocess and generate data
    data = DataGenerator(preprocessor, config)

    # create an instance of the model you want
    model = TextCNN.TextCNN(preprocessor, config)

    # create tensorboard logger
    logger = Logger(sess, config)

    # create trainer and pass all the previous components to it
    trainer = Trainer(sess, model, data, config, logger)

    # load model if exists
    model.load(sess)

    # here you train your model
    trainer.train()
Example 4
def main():
    test_set = SinaDataset(path.join(args.source, 'test.json'), input_dim)
    test_loader = DataLoader(test_set,
                             batch_size=args.bs,
                             shuffle=False,
                             drop_last=True)

    if args.model == 'textcnn':
        model = TextCNN(input_dim, 200)
        model.load_state_dict(torch.load('./saved_models/textcnn.pkl'))
    elif args.model == 'lstm':
        model = MyLSTM(input_dim, hidden_dim=8)
        model.load_state_dict(torch.load('./saved_models/lstm.pkl'))
    else:
        print('"--model" argument only accepts "textcnn" or "lstm"')
        exit(0)

    model = model.to(device)

    pred, ans, pred_dists, true_dists = test(model, test_loader, device,
                                             args.bs)
    calc_f1_score(pred, ans)
    calc_coef(pred_dists, true_dists)
Example 5
def import_models(dataset):
    models = {}
    for f in glob.glob('checkpoints/cnn_{}_*'.format(dataset)):
        fname = os.path.split(f)[1]
        embedding_dims = 300
        embedding_type = get_embedding_type(fname)

        X_train, y_train = load('{}_train'.format(dataset))
        vocab = load('{}_vocab'.format(dataset)).vocab

        model = TextCNN(dataset=dataset,
                        input_size=X_train.shape[1],
                        vocab_size=len(vocab) + 1,
                        embedding_dims=embedding_dims,
                        embedding_type=embedding_type)
        model.load_state_dict(torch.load(f))
        model.eval()
        models[fname] = model

    return models
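A hypothetical way to call import_models; the dataset name and how the returned dict is consumed are illustrative only.

# Hypothetical usage; 'imdb' is a placeholder dataset name.
models = import_models('imdb')
for fname, model in models.items():
    n_params = sum(p.numel() for p in model.parameters())
    print('{}: {} parameters'.format(fname, n_params))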
Example 6
vocab_size = 5000
seq_length = 20
best_score = 1000
target_params = pickle.load(open('save/target_params.pkl', 'rb'), encoding='bytes')
target_lstm = TARGET_LSTM(vocab_size, 64, 32, 32, 20, 0, target_params)
start_token = 0

# generator
generator = SeqGAN(seq_length, vocab_size, gen_emb_dim, gen_hidden_dim, start_token, oracle=True).to_gpu()
if args.gen:
    print(args.gen)
    serializers.load_hdf5(args.gen, generator)

# discriminator
discriminator = TextCNN(num_classes=2, vocab_size=vocab_size,
                        embedding_size=dis_embedding_dim,
                        filter_sizes=dis_filter_sizes, num_filters=dis_num_filters).to_gpu()
if args.dis:
    serializers.load_hdf5(args.dis, discriminator)

sess = tf.Session()
sess.run(tf.initialize_all_variables())
gen_data_loader.create_batches(positive_file)
generate_samples_pos(sess, target_lstm, 64, 10000, positive_file)

# summaries
summary_dir = os.path.join(out_dir, "summaries")

loss_ = tf.placeholder(tf.float32)
target_loss_summary = tf.scalar_summary('target_loss', loss_)
dis_loss_summary = tf.scalar_summary('dis_loss', loss_)
Example 7
print('train_num = {}'.format(train_num))
print('test_num = {}'.format(test_num))
vocab_size = 2000
seq_length = 40
start_token = 0

# generator
generator = SeqGAN(vocab_size=vocab_size, emb_dim=args.gen_emb_dim, hidden_dim=args.gen_hidden_dim,
                   sequence_length=seq_length, start_token=start_token, lstm_layer=args.num_lstm_layer,
                   dropout=True).to_gpu()
if args.gen:
    serializers.load_hdf5(args.gen, generator)

# discriminator
discriminator = TextCNN(num_classes=2, vocab_size=vocab_size, embedding_size=args.dis_embedding_dim,
                        filter_sizes=[int(n) for n in args.dis_filter_sizes.split(',')],
                        num_filters=[int(n) for n in args.dis_num_filters.split(',')]
                        ).to_gpu()
if args.dis:
    serializers.load_hdf5(args.dis, discriminator)

# set optimizer
gen_optimizer = optimizers.Adam(alpha=args.gen_lr)
gen_optimizer.setup(generator)
gen_optimizer.add_hook(chainer.optimizer.GradientClipping(args.gen_grad_clip))

dis_optimizer = optimizers.Adam(alpha=args.dis_lr)
dis_optimizer.setup(discriminator)
dis_optimizer.add_hook(NamedWeightDecay(args.dis_l2_reg_lambda, '/out/'))

# summaries
sess = tf.Session()
Example 8
def train(name, dataset, epochs, batch_size, learning_rate, regularization,
          embedding_dims, embedding_type):

    dirname, _ = os.path.split(os.path.abspath(__file__))
    run_uid = datetime.datetime.today().strftime('%Y-%m-%dT%H:%M:%S')
    logger = StatsLogger(dirname, 'stats', name, run_uid)

    print('Loading data')
    X_train, y_train = load('{}_train'.format(dataset))
    X_valid, y_valid = load('{}_valid'.format(dataset))
    vocab = load('{}_vocab'.format(dataset)).vocab

    X_train = torch.as_tensor(X_train, dtype=torch.long)
    y_train = torch.as_tensor(y_train, dtype=torch.float)
    X_valid = torch.as_tensor(X_valid, dtype=torch.long)
    y_valid = torch.as_tensor(y_valid, dtype=torch.float)

    prev_acc = 0

    model = TextCNN(dataset=dataset,
                    input_size=X_train.size()[1],
                    vocab_size=len(vocab) + 1,
                    embedding_dims=embedding_dims,
                    embedding_type=embedding_type)
    print(model)
    print('Parameters: {}'.format(sum([p.numel() for p in \
                                  model.parameters() if p.requires_grad])))
    print('Training samples: {}'.format(len(X_train)))

    if torch.cuda.is_available():
        X_train = X_train.cuda()
        y_train = y_train.cuda()
        X_valid = X_valid.cuda()
        y_valid = y_valid.cuda()
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=regularization)
    criterion = nn.BCEWithLogitsLoss()

    print('Starting training')
    for epoch in range(epochs):
        epoch_loss = []
        epoch_acc = []

        iters = 0
        total_iters = num_batches(len(X_train), batch_size)

        for i, batch in enumerate(minibatch_iter(len(X_train), batch_size)):
            model.train()

            X_train_batch = X_train[batch]
            y_train_batch = y_train[batch]

            if torch.cuda.is_available():
                X_train_batch = X_train_batch.cuda()
                y_train_batch = y_train_batch.cuda()

            optimizer.zero_grad()

            output = model(X_train_batch)
            train_loss = criterion(output, y_train_batch)
            train_acc = accuracy(output, y_train_batch)

            epoch_loss.append(train_loss.item())
            epoch_acc.append(train_acc.item())

            train_loss.backward()
            optimizer.step()

        model.eval()
        train_loss, train_acc = np.mean(epoch_loss), np.mean(epoch_acc)
        valid_loss, valid_acc, _ = compute_dataset_stats(
            X_valid, y_valid, model, nn.BCEWithLogitsLoss(), 256)

        stats = [epoch + 1, train_loss, train_acc, valid_loss, valid_acc]
        epoch_string = '* Epoch {}: t_loss={:.3f}, t_acc={:.3f}, ' + \
                       'v_loss={:.3f}, v_acc={:.3f}'
        print(epoch_string.format(*stats))
        logger.write(stats)

        # checkpoint model
        if prev_acc < valid_acc:
            prev_acc = valid_acc
            model_path = os.path.join(dirname, 'checkpoints', name)
            torch.save(model.state_dict(), model_path)

    logger.close()
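A hypothetical invocation of train(); every argument value below is an illustrative placeholder, not taken from the original script.

# Hypothetical invocation; argument values are illustrative placeholders.
train(name='cnn_imdb_glove',
      dataset='imdb',
      epochs=10,
      batch_size=32,
      learning_rate=1e-4,
      regularization=0.0,
      embedding_dims=300,
      embedding_type='glove')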
#print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(sequence_length=x_train.shape[1],
                      num_classes=y_train.shape[1],
                      embedding_matrix=embedding_matrix,
                      embedding_size=FLAGS.embedding_dim,
                      filter_sizes=list(map(int,
                                            FLAGS.filter_sizes.split(","))),
                      num_filters=FLAGS.num_filters,
                      l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
Example 10
        train_iter, val_iter, word_vectors = dataset2dataloader(batch_size=100,
                                                                debug=True)
    else:
        train_iter, val_iter, word_vectors, X_lang = make_dataloader(
            batch_size=100, debug=True)

    for model_name in model_names[-1:]:
        if model_name == "RNN":
            model = TextRNN(vocab_size=len(word_vectors),
                            embedding_dim=50,
                            hidden_size=128,
                            num_of_class=num_of_class,
                            weights=word_vectors)
        elif model_name == "CNN":
            model = TextCNN(vocab_size=len(word_vectors),
                            embedding_dim=50,
                            num_of_class=num_of_class,
                            embedding_vectors=word_vectors)
        elif model_name == "LSTM":
            model = TextRNN(vocab_size=len(word_vectors),
                            embedding_dim=50,
                            hidden_size=128,
                            num_of_class=num_of_class,
                            weights=word_vectors,
                            rnn_type="LSTM")
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        loss_fun = torch.nn.CrossEntropyLoss()

        for epoch in range(epoch_num):
            model.train()  # models containing dropout or batch norm must be switched to training mode explicitly
            for i, batch in enumerate(train_iter):
                if load_data_by_torchtext:
Example 11
def cv_score(dataset,
             embedding_type,
             epochs,
             batch_size=32,
             learning_rate=1e-4,
             regularization=0):
    kf = KFold(10)
    X, y = load('{}_train'.format(dataset))
    vocab = load('{}_vocab'.format(dataset)).vocab

    cv_acc = []
    cv_std = []

    for ci, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        X_train = torch.as_tensor(X_train, dtype=torch.long).cuda()
        y_train = torch.as_tensor(y_train, dtype=torch.float).cuda()
        X_test = torch.as_tensor(X_test, dtype=torch.long).cuda()
        y_test = torch.as_tensor(y_test, dtype=torch.float).cuda()

        model = TextCNN(dataset=dataset,
                        input_size=X_train.shape[1],
                        vocab_size=len(vocab) + 1,
                        embedding_dims=300,
                        embedding_type=embedding_type).cuda()

        optimizer = optim.Adam(model.parameters(),
                               lr=learning_rate,
                               weight_decay=regularization)
        criterion = nn.BCEWithLogitsLoss()

        model.train()

        for epoch in range(epochs):
            for i, batch in enumerate(minibatch_iter(len(X_train),
                                                     batch_size)):
                X_train_batch = X_train[batch].cuda()
                y_train_batch = y_train[batch].cuda()

                optimizer.zero_grad()

                output = model(X_train_batch)
                train_loss = criterion(output, y_train_batch)

                train_loss.backward()
                optimizer.step()

        model.eval()
        _, test_acc, test_std = compute_dataset_stats(X_test, y_test, model,
                                                      nn.BCEWithLogitsLoss(),
                                                      256)

        cv_acc.append(test_acc)
        cv_std.append(test_std)
        print('  [{}] acc={}, std={}'.format(ci + 1, test_acc, test_std))

    print('{} - {}'.format(dataset, embedding_type))
    print('Mean acc - {}'.format(np.mean(cv_acc)))
    print('Min acc - {}'.format(np.min(cv_acc)))
    print('Max acc - {}'.format(np.max(cv_acc)))
    print('Mean std - {}'.format(np.mean(cv_std)))
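A hypothetical call to cv_score(); the dataset and embedding names are placeholders.

# Hypothetical call; 'imdb' and 'glove' are placeholder names.
cv_score('imdb', embedding_type='glove', epochs=5, batch_size=32)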
Example 12
def __load_model(self):
    self.model = TextCNN(TextCNNConfig)
    self.model.load_state_dict(torch.load("./ckpts/cnn_model.pth"))
    self.model.to(self.device)
    self.model.eval()
Example 13
class Classify:

    def __init__(self, features='word', device='gpu'):
        self.features = features
        self.sentence_length = TextCNNConfig.sequence_length
        self.device = device
        self.__device()
        self.load_vocab()
        self.__load_model()

    def __device(self):
        if torch.cuda.is_available() and self.device=='gpu':
            self.device = torch.device('cuda')
        else:
            self.device = 'cpu'

    def __load_model(self):
        self.model = TextCNN(TextCNNConfig)
        self.model.load_state_dict(torch.load("./ckpts/cnn_model.pth"))
        self.model.to(self.device)
        self.model.eval()

    def load_vocab(self):
        with open('./ckpts/vocab.txt','r',encoding='utf-8') as f:
            vocab = f.read().strip().split('\n')
        self.vocab = {k: v for k, v in zip(vocab, range(len(vocab)))}

        with open('./ckpts/target.txt','r',encoding='utf-8') as f:
            target = f.read().strip().split('\n')
        self.target = {v: k for k, v in zip(target, range(len(target)))}        

    def cut_words(self, sentence : str) -> list:
        if self.features == 'word':
            return jieba.lcut(sentence)
        else:
            return list(sentence)

    def sentence_cut(self, sentence):
        """针对一个句子的字符转ID,并截取到固定长度,返回定长的字符代号。"""
        words = self.cut_words(sentence)
        if len(words) >= self.sentence_length:
            sentence_cutted = words[:self.sentence_length]
        else:
            sentence_cutted = words + ["<PAD>"] * (self.sentence_length - len(words))
        sentence_id = [self.vocab[w] if w in self.vocab else self.vocab["<UNK>"] for w in sentence_cutted]
        return sentence_id

    def predict(self, content):
        """
        传入一个句子,测试单个类别
        """
        with torch.no_grad():
            content_id = [self.sentence_cut(content)]
            start_time = time.time()
            content_id = torch.LongTensor(content_id)
            one_batch_input = content_id.to(self.device)
            outputs = self.model(one_batch_input)
            max_value, max_index = torch.max(outputs, dim=1)
            predict = max_index.cpu().numpy()
            print(time.time()-start_time)
        return self.target[predict[0]]
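A hypothetical usage of the Classify wrapper above; the input sentence is an arbitrary example.

# Hypothetical usage; the sentence passed to predict() is an arbitrary example.
clf = Classify(features='word', device='cpu')
print(clf.predict('今天天气很好'))  # prints the predicted label name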
Example 14
                            shuffle=False,
                            num_workers=16,
                            pin_memory=True)

    glove_file = "GloVe/glove.6B.300d.txt"
    if not args.flat:
        emb_dim = 300  # Document and label embed length
    else:
        emb_dim = trainvalset.n_labels
    word_embed_dim = 300

    # Model
    doc_model = TextCNN(
        trainvalset.text_dataset.vocab,
        glove_file=glove_file,
        emb_dim=emb_dim,
        dropout_p=0.1,
        word_embed_dim=word_embed_dim,
    )
    doc_lr = 0.001
    label_model = LabelEmbedModel(trainvalset.n_labels,
                                  emb_dim=emb_dim,
                                  dropout_p=0.6,
                                  eye=args.flat)

    if args.cascaded_step2:
        label_model_pretrained = torch.load(
            args.pretrained_label_model)['label_model']
        label_model.load_state_dict(label_model_pretrained)

    if args.flat or args.cascaded_step2: