Example #1
def run(config, output_dir, num_splits=5, patience=0):
    use_cuda = torch.cuda.is_available() and config.cuda_device >= 0
    vocab_file = 'data/twitter_hashtag/1kthashtag.vocab'
    dataset_file = 'data/DataSetsEraldo/dataSetSupernatural.txt'

    # The returned embedding tensor is kept unchanged and reused to initialize the model for each split.
    emb_ = load_glove_embedding('data/twitter_hashtag/1kthashtag.glove')

    criterion = nn.CrossEntropyLoss()

    corpus = CorpusTE(train_file=dataset_file, vocab_file=vocab_file)

    metrics = {
        'accuracy': skmetrics.accuracy_score,
        'fscore_class1': skmetrics.f1_score
    }

    if config.stratified:

        def fun_split(vs):
            return corpus.stratified_split(vs)
    else:

        def fun_split(vs):
            return corpus.split(vs)

    mean = 0.0
    for split in range(1, num_splits + 1):
        # Create a copy of the embedding tensor to avoid information leaking between splits.
        # detach() matters here: clone() alone is recorded in the computation graph, so
        #   gradients flowing into the cloned tensor would also propagate to the original.
        emb = emb_.clone().detach()

        model = TextCNN(config=config, pre_trained_emb=emb)
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

        train_corpus, valid_corpus = fun_split(config.valid_split)

        output_dir_split = os.path.join(output_dir, "split{}".format(split))

        t = Trainer(train_corpus=train_corpus,
                    valid_corpus=valid_corpus,
                    test_corpus=None,
                    model=model,
                    config=config,
                    criterion=criterion,
                    optimizer=optimizer,
                    verbose=False,
                    output_dir=output_dir_split,
                    train_metrics=metrics,
                    val_metrics=metrics,
                    selection_metric='fscore_class1',
                    use_cuda=use_cuda)
        res = t.train(tqdm_prefix="Split {}/{}".format(split, num_splits),
                      patience=patience,
                      init_res_dict={"split": split})
        pprint(res["best"])
        mean = mean + res['best']['selection_metric']
    mean = mean / num_splits
    print(mean)
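The clone-and-detach comment above is what keeps the splits independent. A minimal standalone sketch (plain PyTorch, nothing from the example assumed) of why detach() matters:

import torch

emb_ = torch.randn(10, 4, requires_grad=True)  # stand-in for the GloVe matrix

# clone() alone stays in the autograd graph, so gradients reach the original.
leaked = emb_.clone()
leaked.sum().backward()
print(emb_.grad is not None)  # True -> training one split would touch emb_

emb_.grad = None
# clone().detach() produces an independent copy; the original stays untouched.
safe = emb_.clone().detach().requires_grad_(True)
safe.sum().backward()
print(emb_.grad)  # None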
Example #2
def train():
    vocab_file = 'data/twitter_hashtag/1kthashtag.vocab'
    dataset_file = 'data/DataSetsEraldo/dataSetSupernatural.txt'

    config = TextCNNConfig()

    config.batch_size = 128
    config.stratified = False
    config.balanced = True
    config.stratified_batch = False

    corpus = CorpusTE(train_file=dataset_file, vocab_file=vocab_file)
    if config.stratified:
        train_corpus, valid_corpus = corpus.stratified_split(
            valid_split=config.valid_split)
    else:
        train_corpus, valid_corpus = corpus.split(
            valid_split=config.valid_split)

    num_epochs = 12
    num_iter = num_epochs * ceil(len(train_corpus.y_data) / config.batch_size)
    lr_min = 1e-5
    lr_max = 1

    config.learning_rate = lr_min
    config.num_epochs = num_epochs

    emb = load_glove_embedding('data/twitter_hashtag/1kthashtag.glove')
    model = TextCNN(config=config, pre_trained_emb=emb)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    output_dir = "results/out_train_{}".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S-%f"))

    metrics = {
        'accuracy': skmetrics.accuracy_score,
        'fscore_class1': skmetrics.f1_score
    }

    lr_scheduler = LambdaLR(
        optimizer, lambda it: (lr_max / lr_min)**(it / (num_iter - 1)))

    t = Trainer(train_corpus=train_corpus,
                valid_corpus=valid_corpus,
                test_corpus=None,
                model=model,
                config=config,
                criterion=criterion,
                optimizer=optimizer,
                verbose=True,
                output_dir=output_dir,
                train_metrics=metrics,
                val_metrics=metrics,
                selection_metric='fscore_class1',
                lr_scheduler=lr_scheduler)
    res = t.train(patience=0)
    pprint(res["best"])
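The LambdaLR above implements an exponential learning-rate range test: the multiplier is 1 at iteration 0 and lr_max / lr_min at iteration num_iter - 1, so the effective rate sweeps from lr_min to lr_max. A quick standalone check of the endpoints (same formula, dummy parameter):

import torch
from torch.optim.lr_scheduler import LambdaLR

lr_min, lr_max, num_iter = 1e-5, 1.0, 1000
opt = torch.optim.Adam([torch.zeros(1, requires_grad=True)], lr=lr_min)
sched = LambdaLR(opt, lambda it: (lr_max / lr_min) ** (it / (num_iter - 1)))

lrs = []
for _ in range(num_iter):
    lrs.append(opt.param_groups[0]["lr"])
    opt.step()
    sched.step()
print(lrs[0], lrs[-1])  # ~1e-05 and ~1.0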
Example #3
def run(config, output_dir, num_rep=5, valid_split=0.2, patience=0):
    use_cuda = torch.cuda.is_available()

    mean = 0.0
    vocab_file = 'data/twitter_hashtag/1kthashtag.vocab'
    dataset_file = 'data/twitter_hashtag/multiple.txt'
    emb = load_glove_embedding('data/twitter_hashtag/1kthashtag.glove')

    criterion = nn.CrossEntropyLoss()

    corpus = TwitterHashtagCorpus(train_file=dataset_file,
                                  vocab_file=vocab_file)
    config.vocab_size = corpus.vocab_size
    train_corpus = Corpus()
    train_corpus.x_data = corpus.x_train[:1000]
    train_corpus.y_data = corpus.y_train[:1000]
    valid_corpus = Corpus()
    valid_corpus.x_data = corpus.x_validation[:1000]
    valid_corpus.y_data = corpus.y_validation[:1000]

    metrics = {'accuracy': skmetrics.accuracy_score}

    for rep in range(1, num_rep + 1):
        model = TextCNN(config=config, pre_trained_emb=emb)
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

        #train_corpus, valid_corpus = corpus.split(valid_split=valid_split)

        output_dir_rep = os.path.join(output_dir, "rep{}".format(rep))

        t = Trainer(train_corpus=train_corpus,
                    valid_corpus=valid_corpus,
                    test_corpus=None,
                    model=model,
                    config=config,
                    criterion=criterion,
                    optimizer=optimizer,
                    verbose=False,
                    output_dir=output_dir_rep,
                    train_metrics=metrics,
                    val_metrics=metrics,
                    selection_metric='accuracy',
                    use_cuda=use_cuda)
        res = t.train(tqdm_prefix="Rep {}/{}".format(rep, num_rep),
                      patience=patience,
                      init_res_dict={"rep": rep})

        pprint(res["best"])
        mean = mean + res['best']['selection_metric']
    mean = mean / num_rep
    print(mean)
Example #4
 def __init__(self, emb_dim, filter_num, filter_sizes, dropout_p=0.5):
     super(Discriminator, self).__init__()
     # TODO: add dropout
     self.query_cnn = TextCNN(emb_dim, filter_num, filter_sizes)
     self.response_cnn = TextCNN(emb_dim, filter_num, filter_sizes)
     self.dropout = nn.Dropout(p=dropout_p)
     self.judger = nn.Sequential(
         nn.Linear(2 * filter_num * len(filter_sizes), 128),
         #nn.ReLU(),
         #nn.Linear(256, 128),
         nn.ReLU(),
         self.dropout,
         nn.Linear(128, 1),
         nn.Sigmoid())
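Only the constructor is shown; a plausible forward pass for this Discriminator (an assumption, not part of the snippet) would encode query and response separately, concatenate the pooled features, and score the pair with the judger:

 def forward(self, query_emb, response_emb):
     # Each TextCNN is assumed to return pooled features of size
     # filter_num * len(filter_sizes), matching the judger's input width.
     q = self.query_cnn(query_emb)
     r = self.response_cnn(response_emb)
     pair = torch.cat([q, r], dim=-1)
     return self.judger(pair)  # probability that the (query, response) pair is real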
Example #5
 def __init__(self,
              vocab_size,
              emb_dim,
              filter_num,
              filter_sizes,
              dropout=0.0):
     super(Discriminator, self).__init__()
     self.query_cnn = TextCNN(emb_dim, filter_num, filter_sizes)
     self.response_cnn = TextCNN(emb_dim, filter_num, filter_sizes)
     self.dropout = nn.Dropout(p=dropout)
     self.embeddings = nn.Embedding(vocab_size, emb_dim)
     #
     self.judger = nn.Sequential(
         nn.Linear(2 * filter_num * len(filter_sizes), 128), nn.ReLU(),
         self.dropout, nn.Linear(128, 2), nn.Softmax(dim=1))
Example #6
def test():

    with tf.Session() as sess:
        
        vocab = load_vocab(TRAIN_VOCAB_FILENAME)
        cnn = TextCNN(SEQUENCE_LENGTH, NUM_CLASS, len(vocab), 128, [3,4,5], 128)
        saver = tf.train.Saver()
        saver.restore(sess, './textcnn.ckpt')
        print('model restored')
		
        # 'str' arrives in the body of an HTTP POST request
        input_text = request.form['str']
        masterName = request.form['masterName']
		
        tokens = tokenize(input_text)
        print('Input sentence tokenized into:')
        print(tokens)

        sequence = [get_token_id(t, vocab) for t in tokens]
        x = []
        while len(sequence) > 0:
            seq_seg = sequence[:SEQUENCE_LENGTH]
            sequence = sequence[SEQUENCE_LENGTH:]

            padding = [1] * (SEQUENCE_LENGTH - len(seq_seg))
            seq_seg = seq_seg + padding

            x.append(seq_seg)
        
        feed_dict = {
            cnn.input: x,
            cnn.dropout_keep_prob: 1.0
        }

        predict = sess.run([cnn.predictions], feed_dict)
        

        result = np.mean(predict) 
        if (result > 0.75):
            print('recommended')
        elif (result < 0.25):
            print('not recommended')
        else:
            print('cannot be evaluated')

    
    MyMentorDB.update_item(
        Key={
            'Username' : masterName
        },
        UpdateExpression='ADD grade :val',
        ExpressionAttributeValues = {
        ':val' : int(result)
        }
    )

    
    tf.reset_default_graph()
    
    return (str(result))
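The while-loop above (and in the variants below) splits the token-id sequence into fixed-length segments and pads the tail with id 1, presumably the pad token. The same logic as a small standalone helper (hypothetical name):

def chunk_and_pad(sequence, seq_len, pad_id=1):
    """Split a list of token ids into seq_len-sized segments, padding the last one."""
    segments = []
    for start in range(0, len(sequence), seq_len):
        seg = sequence[start:start + seq_len]
        segments.append(seg + [pad_id] * (seq_len - len(seg)))
    return segments

# chunk_and_pad([5, 8, 2, 9, 4], seq_len=3) -> [[5, 8, 2], [9, 4, 1]]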
Example #7
def test():
    with tf.Session() as sess:
        vocab = load_vocab(TRAIN_VOCAB_FILENAME)
        cnn = TextCNN(SEQUENCE_LENGTH, NUM_CLASS, len(vocab), 128, [3, 4, 5],
                      128)
        saver = tf.train.Saver()
        saver.restore(sess, './textcnn.ckpt')
        print('model restored')

        input_text = input('Enter a user review as a sentence: ')
        tokens = tokenize(input_text)
        print('Input sentence tokenized into:')
        print(tokens)

        sequence = [get_token_id(t, vocab) for t in tokens]
        x = []
        while len(sequence) > 0:
            seq_seg = sequence[:SEQUENCE_LENGTH]
            sequence = sequence[SEQUENCE_LENGTH:]

            padding = [1] * (SEQUENCE_LENGTH - len(seq_seg))
            seq_seg = seq_seg + padding

            x.append(seq_seg)

        feed_dict = {cnn.input: x, cnn.dropout_keep_prob: 1.0}

        predict = sess.run([cnn.predictions], feed_dict)
        result = np.mean(predict)
        if (result > 0.75):
            print('recommended')
        elif (result < 0.25):
            print('not recommended')
        else:
            print('cannot be evaluated')
Example #8
def test(x_train, y_train, tokenizer, x_dev, y_dev, batch_size=64):
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(tokenizer.vocab),
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)
            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)
            out_dir = FLAGS.out_dir
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)
            ckpt_file = tf.train.latest_checkpoint(checkpoint_dir)
            saver.restore(sess, ckpt_file)

            def test_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                loss, accuracy, predict = sess.run(
                    [cnn.loss, cnn.accuracy, cnn.predictions], feed_dict)
                auc = calAUC(predict, y_batch)
                time_str = datetime.datetime.now().isoformat()
                print("{}: loss {:g}, acc {:g}, auc {:g}".format(
                    time_str, loss, accuracy, auc))

            test_step(x_dev, y_dev)
            test_step(x_train[:batch_size], y_train[:batch_size])
Example #9
def test():
    with tf.Session() as sess:
        vocab = load_vocab(TRAIN_VOCAB_FILENAME)
        cnn = TextCNN(SEQUENCE_LENGTH, NUM_CLASS, len(vocab), 128, [3, 4, 5],
                      128)
        saver = tf.train.Saver()
        saver.restore(sess, './textcnn.ckpt')
        print('model restored')

        while 1:
            input_text = input('Enter a user review as a sentence (enter Z to quit): ')
            if input_text in ['z', 'Z']:
                break
            tokens = tokenize(input_text)
            print('Input sentence tokenized into:')
            print(tokens)

            sequence = [get_token_id(t, vocab) for t in tokens]
            x = []
            while len(sequence) > 0:
                seq_seg = sequence[:SEQUENCE_LENGTH]
                sequence = sequence[SEQUENCE_LENGTH:]

                padding = [1] * (SEQUENCE_LENGTH - len(seq_seg))
                seq_seg = seq_seg + padding

                x.append(seq_seg)

            feed_dict = {cnn.input: x, cnn.dropout_keep_prob: 1.0}

            # predict the rating
            predict = sess.run([cnn.predictions], feed_dict)
            result = np.array(predict)
            result = result[0][0]
            print("========================= Result =========================")
            print("Rating: ", result)

            if result in [0]:
                print("dissatisfied")
            elif result in [1]:
                print("average")
            elif result in [2]:
                print("satisfied")
Example #10
def train(inputs, labels):
    """

    :param input:
    :return:
    """
    with tf.Session() as sess:
        cnn = TextCNN(flag.setence, flag.num_classes, flag.vocab_size, flag.embedding_size, flag.filter_sizes,
                      flag.num_filters, flag.keep_prob)

        output = cnn(inputs)
        # reduce to a scalar so the loss can be logged with tf.summary.scalar below
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=output, labels=labels))
        total_loss = loss + flag.decay_rate * tf.nn.l2_loss(cnn.final_weight + cnn.final_bias)
        global_step = tf.train.get_or_create_global_step()

        optimizer = tf.train.AdamOptimizer(flag.learning_rate)

        gradients_vars = optimizer.compute_gradients(total_loss)

        for i, (grad, var) in enumerate(gradients_vars):
            if grad is not None:
                gradients_vars[i] = (tf.clip_by_value(grad, -10, 10), var)
                tf.summary.histogram(var.name + '/grad', grad)  # tf.histogram_summary
        tf.summary.scalar('loss', total_loss)
        sum_merge = tf.summary.merge_all()
        train_op = optimizer.apply_gradients(gradients_vars, global_step=global_step)

        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(flag.model_saved_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('reloading model parameters..')
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('create model from scratch..')
            sess.run(tf.global_variables_initializer())

        summarizer = tf.summary.FileWriter(flag.model_saved_dir, sess.graph)

        for i in range(flag.num_loop):
            step_loss, summary, step, _ = sess.run(
                [total_loss, sum_merge, global_step, train_op])
            if i % 1000 == 0:
                print('checkpoint {}'.format(i))
                saver.save(sess, flag.model_saved_path, global_step=step)
                summarizer.add_summary(summary, global_step=step)
Example #11
def train():
    # sample files
    positive_data_file = "./rt-polaritydata/rt-polarity.pos"
    negative_data_file = "./rt-polaritydata/rt-polarity.neg"
    # training parameters
    num_steps = 50  # number of training epochs
    SaveFileName = "text_cnn_model"  # folder name for the saved model
    # model parameters
    num_classes = 2  # number of classes
    l2_reg_lambda = 0.1  # L2 regularization coefficient
    filter_sizes = "3,4,5"  # kernel sizes for the multi-channel convolutions
    num_filters = 64  # number of filters per kernel size

    # load the dataset
    data, vocab_processor, max_len = dataset(positive_data_file,
                                             negative_data_file)
    # build the model
    text_cnn = TextCNN(seq_length=max_len,
                       num_classes=num_classes,
                       vocab_size=len(vocab_processor.vocabulary_),
                       embeding_size=128,
                       filter_sizes=list(map(int, filter_sizes.split(','))),
                       num_filters=num_filters)

    def l2_loss(y_true, y_pred):
        l2_loss = tf.constant(0.0)
        for tf_var in text_cnn.trainable_weights:
            if tf_var.name == "fully_connecred":
                l2_loss += tf.reduce_mean(tf.nn.l2_loss(tf_var))

        loss = tf.nn.softmax_cross_entropy_with_logits(logits=y_pred,
                                                       labels=y_true)
        return loss + l2_reg_lambda * l2_loss

    text_cnn.compile(loss=l2_loss,
                     optimizer=tf.keras.optimizers.Adam(lr=1e-3),
                     metrics=['acc'])
    text_cnn.fit(data, epochs=num_steps)

    text_cnn.save("textcnn.h5")
Example #12
def train(train_dir, val_dir, labels_file, word2vec_path, batch_size,
          max_steps, log_step, val_step, snapshot, out_dir):
    '''
    Training.
    :param train_dir: training data directory
    :param val_dir:   validation data directory
    :param labels_file:  path to the labels file
    :param word2vec_path: word2vec model file
    :param batch_size: batch size
    :param max_steps:  maximum number of training steps
    :param log_step:  logging interval
    :param val_step:  validation interval
    :param snapshot:  checkpoint-saving interval
    :param out_dir:   output directory for model ckpts and summaries
    :return:
    '''

    max_sentence_length = 300
    embedding_dim = 50
    filter_sizes = [3, 4, 5, 6]
    num_filters = 200  # number of filters per filter size
    base_lr = 0.001  # learning rate
    dropout_keep_prob = 0.5
    l2_reg_lambda = 0.0  # L2 regularization lambda (default: 0.0)
    allow_soft_placement = True  # if the requested device does not exist, let TF pick one automatically
    log_device_placement = False  # whether to log device placement

    print("Loading data...")

    w2vModel = create_word2vec.load_wordVectors(word2vec_path)

    labels_set = files_processing.read_txt(labels_file)

    labels_nums = len(labels_set)

    train_file_list = create_batch_data.get_file_list(file_dir=train_dir,
                                                      postfix='*.npy')

    train_batch = create_batch_data.get_data_batch(train_file_list,
                                                   labels_nums=labels_nums,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   one_hot=True)

    val_file_list = create_batch_data.get_file_list(file_dir=val_dir,
                                                    postfix='*.npy')

    val_batch = create_batch_data.get_data_batch(val_file_list,
                                                 labels_nums=labels_nums,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 one_hot=True)

    print("train data info *****************************")

    train_nums = create_word2vec.info_npy(train_file_list)

    print("val data   info *****************************")

    val_nums = create_word2vec.info_npy(val_file_list)

    print("labels_set info *****************************")

    files_processing.info_labels_set(labels_set)

    # Training

    with tf.Graph().as_default():

        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)

        sess = tf.Session(config=session_conf)

        with sess.as_default():

            cnn = TextCNN(sequence_length=max_sentence_length,
                          num_classes=labels_nums,
                          embedding_size=embedding_dim,
                          filter_sizes=filter_sizes,
                          num_filters=num_filters,
                          l2_reg_lambda=l2_reg_lambda)

            # Define Training procedure

            global_step = tf.Variable(0, name="global_step", trainable=False)

            optimizer = tf.train.AdamOptimizer(learning_rate=base_lr)

            # optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)

            grads_and_vars = optimizer.compute_gradients(cnn.loss)

            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)

            grad_summaries = []

            for g, v in grads_and_vars:

                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)

                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))

                    grad_summaries.append(grad_hist_summary)

                    grad_summaries.append(sparsity_summary)

            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries

            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy

            loss_summary = tf.summary.scalar("loss", cnn.loss)

            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries

            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])

            train_summary_dir = os.path.join(out_dir, "summaries", "train")

            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries

            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])

            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")

            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it

            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))

            checkpoint_prefix = os.path.join(checkpoint_dir, "model")

            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)

            # Initialize all variables

            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """

                A single training step

                """

                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: dropout_keep_prob
                }

                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)

                if step % log_step == 0:
                    print("training: step {}, loss {:g}, acc {:g}".format(
                        step, loss, accuracy))

                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """

                Evaluates model on a dev set

                """

                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }

                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)

                if writer:
                    writer.add_summary(summaries, step)

                return loss, accuracy

            for i in range(max_steps):

                train_batch_data, train_batch_label = create_batch_data.get_next_batch(
                    train_batch)

                train_batch_data = create_word2vec.indexMat2vector_lookup(
                    w2vModel, train_batch_data)

                train_step(train_batch_data, train_batch_label)

                current_step = tf.train.global_step(sess, global_step)

                if current_step % val_step == 0:

                    val_losses = []

                    val_accs = []

                    # for k in range(int(val_nums/batch_size)):

                    for k in range(100):
                        val_batch_data, val_batch_label = create_batch_data.get_next_batch(
                            val_batch)

                        val_batch_data = create_word2vec.indexMat2vector_lookup(
                            w2vModel, val_batch_data)

                        val_loss, val_acc = dev_step(val_batch_data,
                                                     val_batch_label,
                                                     writer=dev_summary_writer)

                        val_losses.append(val_loss)

                        val_accs.append(val_acc)

                    mean_loss = np.array(val_losses, dtype=np.float32).mean()

                    mean_acc = np.array(val_accs, dtype=np.float32).mean()

                    print("--------Evaluation:step {}, loss {:g}, acc {:g}".
                          format(current_step, mean_loss, mean_acc))

                if current_step % snapshot == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)

                    print("Saved model checkpoint to {}\n".format(path))
Example #13
print('Loading data...')
# get data
x_train, y_train, x_test, y_test, word2index = data_helpers.preprocess()
max_features = len(word2index)

max_len = max(len(x) for x in x_train)
print(max_len)

print('Pad sequences...')
x_train = sequence.pad_sequences(x_train, maxlen=max_len, value=0)
x_test = sequence.pad_sequences(x_test, maxlen=max_len, value=0)

print('Build model...')
model = TextCNN(max_len,
                embedding_dim,
                batch_size=batch_size,
                class_num=2,
                max_features=max_features,
                epochs=epochs)

print('Train...')
model.fit(x_train, x_test, y_train, y_test)

print('Test...')
result = model.predict(x_test)
result = np.argmax(np.array(result), axis=1)
y_test = np.argmax(np.array(y_test), axis=1)

print('f1:', f1_score(y_test, result, average='macro'))
print('accuracy:', accuracy_score(y_test, result))
print('classification report:\n', classification_report(y_test, result))
print('confusion matrix:\n', confusion_matrix(y_test, result))
Example #14
print('Pad sequences (samples x time)...')
x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

train_ds = tf.data.Dataset.from_tensor_slices(
    (x_train, y_train)).batch(batch_size)
test_ds = tf.data.Dataset.from_tensor_slices(
    (x_test, y_test)).batch(batch_size)

print('Build model...')
model = TextCNN(maxlen=maxlen,
                max_features=max_features,
                embedding_dims=embedding_dims,
                class_num=class_num,
                kernel_sizes=[2, 3, 5],
                kernel_regularizer=None,
                last_activation='softmax')

# choose the optimizer and loss function for training
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

# Metrics to track the model's loss and accuracy; they accumulate over each epoch and are then reported as a whole.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='test_accuracy')
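The snippet ends before the training loop; a typical tf.function-based step that wires together the model, loss, optimizer, and metrics defined above would look roughly like this (a sketch, not the original code):

@tf.function
def train_step(texts, labels):
    with tf.GradientTape() as tape:
        predictions = model(texts, training=True)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, predictions)

@tf.function
def test_step(texts, labels):
    predictions = model(texts, training=False)
    test_loss(loss_object(labels, predictions))
    test_accuracy(labels, predictions)

# for each epoch: iterate train_ds through train_step, then test_ds through test_step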
Example #15
    def build_graph(self):
        """Build graph."""

        self.tf_graph = TextCNN(self.flags, self.embedding)
        self.tf_graph.build(self.word_ids, self.word_label)
Example #16
# training hyperparameter
BATCH_SIZE = 128
EPOCHS = 10

# load data
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=MAX_WORD_NUM)

# padding sequence
x_train = sequence.pad_sequences(x_train, maxlen=MAX_SENT_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_SENT_LEN)

# build model
model = TextCNN(max_sent_len=MAX_SENT_LEN,
                max_word_num=MAX_WORD_NUM,
                embedding_dims=EMBEDDING_DIMS,
                class_num=CLASS_NUM,
                last_activation=LAST_ACTIVATION).build_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

# train
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
model.fit(x_train,
          y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

# save model
# model.save('textcnn_model.h5')
Example #17
def train(x_train, y_train, vocab_processor, x_dev, y_dev, x_test, y_test):
    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement
        )  # session config: picks GPU/CPU automatically according to the flags above
        sess = tf.Session(config=session_conf)
        print("")
        with sess.as_default():
            print("vocab_size:", len(vocab_processor.vocabulary_))
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)
            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)
            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)
            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))
            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                #time_str = datetime.datetime.now().isoformat()
                #print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                num = 20
                x_batch = x_batch.tolist()
                y_batch = y_batch.tolist()
                l = len(y_batch)
                l_20 = int(l / num)
                x_set = []
                y_set = []
                for i in range(num - 1):
                    x_temp = x_batch[i * l_20:(i + 1) * l_20]
                    x_set.append(x_temp)
                    y_temp = y_batch[i * l_20:(i + 1) * l_20]
                    y_set.append(y_temp)
                x_temp = x_batch[(num - 1) * l_20:]
                x_set.append(x_temp)
                y_temp = y_batch[(num - 1) * l_20:]
                y_set.append(y_temp)

                # compute loss/accuracy on each validation chunk, then average over the num chunks
                lis_loss = []
                lis_accu = []
                for i in range(num):
                    feed_dict = {
                        cnn.input_x: np.array(x_set[i]),
                        cnn.input_y: np.array(y_set[i]),
                        cnn.dropout_keep_prob: 1.0
                    }
                    step, summaries, loss, accuracy = sess.run(
                        [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                        feed_dict)
                    lis_loss.append(loss)
                    lis_accu.append(accuracy)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))
                print("test_loss and test_acc" + "\t\t" +
                      str(sum(lis_loss) / num) + "\t\t" +
                      str(sum(lis_accu) / num))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                             FLAGS.batch_size,
                                             FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))

            print("testing:......")
            list_acc = []
            count = 0
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  FLAGS.batch_size,
                                                  num_epochs=1,
                                                  shuffle=False)
            for batch in test_batches:
                x_batch, y_batch = zip(*batch)

                count += 1
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                accuracy = sess.run(cnn.accuracy, feed_dict)

                list_acc.append(accuracy)
            print("test acc:", str(sum(list_acc) / count))
Example #18
 def predict(self):
     """Predict line."""
     word_id_list = tf.placeholder(tf.int32, shape=[None, None])
     model = TextCNN(self.flags, self.embedding)
     model.build_predictor(word_id_list)
     return model, word_id_list
Example #19
def train(x_train, y_train, tokenizer, x_dev, y_dev):
    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(tokenizer.vocab),
                          embedding_size=FLAGS.embedding_dim,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            #out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            out_dir = FLAGS.out_dir
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            #vocab_processor.save(os.path.join(out_dir, "vocab"))
            ckpt_file = tf.train.latest_checkpoint(checkpoint_dir)
            if ckpt_file:
                saver.restore(sess, ckpt_file)
                print('restoring model from %s' % ckpt_file)
            else:
                # Initialize all variables
                sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                if step % FLAGS.log_every == 0:
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))
                    train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy, predict = sess.run([
                    global_step, dev_summary_op, cnn.loss, cnn.accuracy,
                    cnn.predictions
                ], feed_dict)
                auc = calAUC(predict, y_batch)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}, auc {:g}".format(
                    time_str, step, loss, accuracy, auc))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            iter = iterData(x_train,
                            y_train,
                            batch_size=FLAGS.train_batch_size,
                            epoch=FLAGS.num_epochs)
            # Training loop. For each batch...
            data = next(iter)
            step = 0
            epoch = 0
            print('training begin')
            while data != '__RETURN__':
                if data == '__STOP__':
                    data = next(iter)
                    epoch += 1
                    continue
                x_batch, y_batch = data
                train_step(x_batch, y_batch)
                data = next(iter)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
Example #20
def main():
    for i in range(10):
        # load the configuration
        config = Config()
        if torch.cuda.is_available():
            torch.cuda.set_device(0)
        # load the datasets
        early_stopping = EarlyStopping(patience=10, verbose=True, cv_index=i)
        kwargs = {'num_workers': 2, 'pin_memory': True}
        dataset_train = MR_dataset(config=config,
                                   state="train",
                                   k=i,
                                   embedding_state=True)
        train_data_batch = DataLoader(dataset_train,
                                      batch_size=config.batch_size,
                                      shuffle=False,
                                      drop_last=False,
                                      **kwargs)
        dataset_valid = MR_dataset(config=config,
                                   state="valid",
                                   k=i,
                                   embedding_state=False)
        valid_data_batch = DataLoader(dataset_valid,
                                      batch_size=config.batch_size,
                                      shuffle=False,
                                      drop_last=False,
                                      **kwargs)
        dataset_test = MR_dataset(config=config,
                                  state="test",
                                  k=i,
                                  embedding_state=False)
        test_data_batch = DataLoader(dataset_test,
                                     batch_size=config.batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     **kwargs)
        print(len(dataset_train), len(dataset_valid), len(dataset_test))

        if config.use_pretrained_embed:
            config.embedding_pretrained = torch.from_numpy(
                dataset_train.weight).float().cuda()
            print("load pretrained models.")
        else:
            config.embedding_pretrained = None

        config.vocab_size = dataset_train.vocab_size

        model = TextCNN(config)
        print(model)

        if config.use_cuda and torch.cuda.is_available():
            # print("load data to CUDA")
            model.cuda()
            # config.embedding_pretrained.cuda()

        criterion = nn.CrossEntropyLoss()  # cross-entropy loss
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
        count = 0
        loss_sum = 0.0
        for epoch in range(config.epoch):
            # start training
            model.train()
            for data, label in train_data_batch:
                if config.use_cuda and torch.cuda.is_available():
                    data = data.to(torch.int64).cuda()
                    label = label.cuda()
                else:
                    data = data.to(torch.int64)
                # data = torch.autograd.Variable(data).long().cuda()
                # label = torch.autograd.Variable(label).squeeze()
                out = model(data)
                l2_loss = config.l2_weight * torch.sum(
                    torch.pow(list(model.parameters())[1], 2))
                loss = criterion(out, autograd.Variable(
                    label.long())) + l2_loss
                loss_sum += loss.data.item()
                count += 1
                if count % 100 == 0:
                    print("epoch", epoch, end='  ')
                    print("The loss is: %.5f" % (loss_sum / 100))
                    loss_sum = 0
                    count = 0
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # after each epoch, evaluate on the validation set
            valid_loss, valid_acc = get_test_result(model, valid_data_batch,
                                                    dataset_valid, config,
                                                    criterion)
            early_stopping(valid_loss, model, config)
            print("The valid acc is: %.5f" % valid_acc)
            if early_stopping.early_stop:
                print("Early stopping")
                break
        # results for this fold
        model.load_state_dict(
            torch.load(
                os.path.abspath(
                    os.path.join(config.checkpoint_path,
                                 'checkpoint%d.pt' % i))))
        test_loss, test_acc = get_test_result(model, test_data_batch,
                                              dataset_test, config, criterion)
        print("The test acc is: %.5f" % test_acc)
Example #21
def train():
    config = KDConfig()

    logger = get_logger(config.log_path, "train_KD")

    device = config.device

    # load the BERT model as the teacher
    logger.info("load bert .....")
    bert = Bert(config.bert_config)
    bert.load_state_dict(torch.load(config.bert_config.model_path))
    bert.to(device)
    bert.eval()

    # freeze the BERT parameters
    for name, p in bert.named_parameters():
        p.requires_grad = False

    # load the TextCNN model as the student
    textcnn = TextCNN(config.textcnn_config)
    textcnn.to(device)
    textcnn.train()

    # load the train/dev datasets
    logger.info("load train/dev data .....")
    train_loader = DataLoader(KDdataset(config.base_config.train_data_path),
                              batch_size=config.batch_size,
                              shuffle=True)
    dev_loader = DataLoader(KDdataset(config.base_config.dev_data_path),
                            batch_size=config.batch_size,
                            shuffle=False)

    optimizer = Adam(textcnn.parameters(), lr=config.lr)

    # start training
    logger.info("start training .....")
    best_acc = 0.
    for epoch in range(config.epochs):
        for i, batch in enumerate(train_loader):
            cnn_ids, labels, input_ids, token_type_ids, attention_mask = batch[0].to(device), batch[1].to(device), \
                                                                         batch[2].to(device), batch[3].to(device), \
                                                                         batch[4].to(device)
            optimizer.zero_grad()
            students_output = textcnn(cnn_ids)
            teacher_output = bert(input_ids, token_type_ids, attention_mask)
            loss = loss_fn_kd(students_output, labels, teacher_output,
                              config.T, config.alpha)
            loss.backward()
            optimizer.step()

            # log training metrics
            if i % 100 == 0:
                labels = labels.data.cpu().numpy()
                preds = torch.argmax(students_output, dim=1)
                preds = preds.data.cpu().numpy()
                acc = np.sum(preds == labels) * 1. / len(preds)
                logger.info(
                    "TRAIN: epoch: {} step: {} acc: {} loss: {} ".format(
                        epoch + 1, i, acc, loss.item()))

        acc, table = dev(textcnn, dev_loader, config)

        logger.info("DEV: acc: {} ".format(acc))
        logger.info("DEV classification report: \n{}".format(table))

        if acc > best_acc:
            torch.save(textcnn.state_dict(), config.model_path)
            best_acc = acc

    logger.info("start testing ......")
    test_loader = DataLoader(KDdataset(config.base_config.test_data_path),
                             batch_size=config.batch_size,
                             shuffle=False)
    best_model = TextCNN(config.textcnn_config)
    best_model.load_state_dict(torch.load(config.model_path))
    acc, table = dev(best_model, test_loader, config)

    logger.info("TEST acc: {}".format(acc))
    logger.info("TEST classification report:\n{}".format(table))
Example #22
def train():
    config = TextCNNConfig()
    logger = get_logger(config.log_path, "train_textcnn")
    model = TextCNN(config)
    train_loader = DataLoader(CnnDataSet(config.base_config.train_data_path), batch_size=config.batch_size, shuffle=True)
    dev_loader = DataLoader(CnnDataSet(config.base_config.dev_data_path), batch_size=config.batch_size, shuffle=False)
    model.train()
    model.to(config.device)

    optimizer = Adam(model.parameters(), lr=config.learning_rate)
    best_acc = 0.

    for epoch in range(config.num_epochs):
        for i, (texts, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            texts = texts.to(config.device)
            labels = labels.to(config.device)
            logits = model(texts)
            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                labels = labels.data.cpu().numpy()
                preds = torch.argmax(logits, dim=1)
                preds = preds.data.cpu().numpy()
                acc = np.sum(preds == labels) * 1. / len(preds)
                logger.info("TRAIN: epoch: {} step: {} acc: {} loss: {} ".format(epoch + 1, i, acc, loss.item()))

        acc, table = dev(model, dev_loader, config)

        logger.info("DEV: acc: {} ".format(acc))
        logger.info("DEV classification report: \n{}".format(table))

        if acc > best_acc:
            torch.save(model.state_dict(), config.model_path)
            best_acc = acc

    test_loader = DataLoader(CnnDataSet(config.base_config.test_data_path), batch_size=config.batch_size, shuffle=False)
    best_model = TextCNN(config)
    best_model.load_state_dict(torch.load(config.model_path))
    acc, table = dev(best_model, test_loader, config)

    logger.info("TEST acc: {}".format(acc))
    logger.info("TEST classification report:\n{}".format(table))
Example #23
    def trainModel(self):

        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement)
            sess = tf.Session(config=session_conf)

            with tf.name_scope("readfile"):
                processing = Processing.Processing()
                articles, tags = processing.loadPracticeFile("data/train_all.txt")
                self.data_embedding_new, self.tags_new = processing.embedding(articles, tags)
                X_train, X_val, y_train, y_val = train_test_split(
                    self.data_embedding_new, self.tags_new, test_size=0.2, random_state=0)
            # load the vocabulary
            vocab = learn.preprocessing.VocabularyProcessor.restore('model/vocab.pickle')

            with sess.as_default():
                textcnn = TextCNN.TextCNN(
                    max_length=len(self.data_embedding_new[0]),
                    num_classes=len(y_train[0]),
                    vocab_size=len(vocab.vocabulary_),
                    embedding_size=FLAGS.embedding_dim,
                    filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                    num_filters=FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda)

                # Define Training procedure
                global_step = tf.Variable(0, name="global_step", trainable=False)
                optimizer = tf.train.AdamOptimizer(1e-3)
                # compute the gradients of the loss w.r.t. the variables; returns a list of (gradient, variable) tuples
                grads_and_vars = optimizer.compute_gradients(textcnn.loss)
                # apply the computed gradients to the variables
                train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

                # Initialize all variables
                sess.run(tf.global_variables_initializer())
                saver = tf.train.Saver()
                best_f1 = 0.0

                for time in range(config.epoch):
                    batch_size = config.Batch_Size
                    for trainX_batch, trainY_batch in self.get_batches(X_train, y_train, batch_size):
                        feed_dict = {
                            textcnn.input_x: np.array(trainX_batch),
                            textcnn.input_y: np.array(trainY_batch),
                            textcnn.drop_keep_prob: FLAGS.dropout_keep_prob
                        }
                        _, loss, train_accuracy = sess.run([train_op, textcnn.loss, textcnn.accuracy], feed_dict)

                    print("Train set: epoch " + str(time + 1) + " loss: " + str(loss) + "; accuracy: " + str(train_accuracy))

                    all_dev = []
                    for devX_batch, devY_batch in self.get_batches(X_val, y_val, batch_size):
                        feed_dict = {
                            textcnn.input_x: np.array(devX_batch),
                            textcnn.input_y: np.array(devY_batch),
                            textcnn.drop_keep_prob: 1.0
                        }
                        dev_loss, dev_predictions = sess.run([textcnn.loss, textcnn.predictions], feed_dict)
                        all_dev.extend(dev_predictions.tolist())

                    # F1 / recall / accuracy on the validation set
                    y_true = []
                    for x in y_val:
                        if x[0] == 1:
                            y_true.append(0)
                        else:
                            y_true.append(1)
                    dev_f1 = f1_score(np.array(y_true), np.array(all_dev))
                    dev_recall = recall_score(np.array(y_true), np.array(all_dev))
                    dev_acc = accuracy_score(np.array(y_true), np.array(all_dev))
                    print("Dev set: f1: {}, recall: {}, acc: {}\n".format(dev_f1, dev_recall, dev_acc))
                    if dev_f1 > best_f1:
                        best_f1 = dev_f1
                        saver.save(sess, "model/TextCNNModel.ckpt")
                        print("saved\n")
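
trainModel relies on a get_batches helper that is not included in the example. A minimal sketch, assuming it simply slices the already-embedded arrays into consecutive mini-batches:

    def get_batches(self, x, y, batch_size):
        # Hypothetical batching helper: yield successive (x, y) slices of
        # size batch_size; the final, shorter slice is included as well.
        for start in range(0, len(x), batch_size):
            end = start + batch_size
            yield x[start:end], y[start:end]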
Beispiel #24
0
def train():

    if (os.path.exists(TRAIN_DATA_FILENAME)
            and os.path.exists(TRAIN_VOCAB_FILENAME)):
        print('load prebuilt train data & vocab file')
        input = load_data(TRAIN_DATA_FILENAME)
        vocab = load_vocab(TRAIN_VOCAB_FILENAME)
    else:
        print('build train data & vocab from raw text')
        data, newscore = read_raw_data(TRAIN_FILENAME)
        tokens = [t for d in data for t in d[0]]
        vocab = build_vocab(tokens)
        input = build_input(data, vocab, newscore)
        print('save train data & vocab file')
        save_data(TRAIN_DATA_FILENAME, input)
        save_vocab(TRAIN_VOCAB_FILENAME, vocab)

    if (os.path.exists(TEST_DATA_FILENAME)
            and os.path.exists(TEST_VOCAB_FILENAME)):
        print('load prebuilt test data & vocab file')
        test_input = load_data(TEST_DATA_FILENAME)
        test_vocab = load_vocab(TEST_VOCAB_FILENAME)
    else:
        print('build test data & vocab from raw text')
        data, newscore = read_raw_data(TEST_FILENAME)
        tokens = [t for d in data for t in d[0]]
        # Note: the test split builds its own vocabulary here, so its word ids
        # are not guaranteed to match the train vocab the model is sized with.
        test_vocab = build_vocab(tokens)
        test_input = build_input(data, test_vocab, newscore)
        print('save test data & vocab file')
        save_data(TEST_DATA_FILENAME, test_input)
        save_vocab(TEST_VOCAB_FILENAME, test_vocab)

    with tf.Session() as sess:
        seq_length = np.shape(input[0][0])[0]
        print("SS", seq_length)
        num_class = np.shape(input[0][1])[0]
        print("NN", num_class)

        print('initialize cnn filter')
        print('sequence length %d, number of class %d, vocab size %d' %
              (seq_length, num_class, len(vocab)))

        cnn = TextCNN(seq_length, num_class, len(vocab), 128, [3, 4, 5], 128)

        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        def train_step(x_batch, y_batch):
            feed_dict = {
                cnn.input: x_batch,
                cnn.label: y_batch,
                cnn.dropout_keep_prob: 0.5
            }
            _, step, loss, accuracy = sess.run(
                [train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)

        def evaluate(x_batch, y_batch):
            feed_dict = {
                cnn.input: x_batch,
                cnn.label: y_batch,
                cnn.dropout_keep_prob: 1.0
            }
            step, loss, accuracy = sess.run(
                [global_step, cnn.loss, cnn.accuracy], feed_dict)
            print("step %d, loss %f, acc %f" % (step, loss, accuracy))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        for i in range(10000):
            try:
                batch = random.sample(input, 64)
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)

                current_step = tf.train.global_step(sess, global_step)
                if current_step % 100 == 0:
                    batch = random.sample(test_input, 64)
                    x_test, y_test = zip(*batch)
                    evaluate(x_test, y_test)
                if current_step % 1000 == 0:
                    save_path = saver.save(sess, './textcnn.ckpt')
                    print('model saved : %s' % save_path)
            except:
                print("Unexpected error:", sys.exc_info()[0])
                raise
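
This example assumes build_vocab and build_input helpers. A minimal sketch, assuming read_raw_data yields (token_list, score) pairs, that newscore is the ordered list of label classes, and that the fixed sequence length is a free parameter:

from collections import Counter
import numpy as np

def build_vocab(tokens, max_size=50000):
    # Hypothetical vocabulary builder: most frequent tokens first,
    # id 0 reserved for padding / unknown words.
    counts = Counter(tokens)
    return {w: i + 1 for i, (w, _) in enumerate(counts.most_common(max_size))}

def build_input(data, vocab, newscore, seq_length=100):
    # Hypothetical input builder: each example becomes a fixed-length id
    # vector paired with a one-hot label vector over the score classes.
    samples = []
    for tokens, score in data:
        ids = [vocab.get(t, 0) for t in tokens][:seq_length]
        ids += [0] * (seq_length - len(ids))
        label = np.zeros(len(newscore))
        label[newscore.index(score)] = 1.0
        samples.append((np.array(ids), label))
    return samples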
Beispiel #25
0
class ModelHandler():
    """Build train process."""
    def __init__(self, flags):
        """Init class."""
        self.flags = flags
        self.embedding, self.embedding_size = read_embedding(
            self.flags.model_dir + self.flags.embedding_path)

    def add_tensor(self):
        """Add data and embedding."""
        self.train_dat = DataSet(
            self.flags.train_file,
            self.flags.model_dir,
            self.flags.batch_size,
            self.flags.num_class,
            self.flags.seq_length)

        iterator = self.train_dat.init_iterator()
        self.word_ids, self.word_label = iterator.get_next()
        self.dev_dat = DataSet(
            self.flags.dev_file,
            self.flags.model_dir,
            self.flags.batch_size,
            self.flags.num_class,
            self.flags.seq_length)
        self.train_data_init = iterator.make_initializer(self.train_dat.dataset)
        self.dev_data_init = iterator.make_initializer(self.dev_dat.dataset)

        print('add_dev_tensor')

    def train(self, sess, saver):
        """Train process."""
        self.step = 0
        best_accuracy = 0
        patient_passes = 0

        # sess.run(self.train_graph.embedding_init)
        for epoch in range(self.flags.epoch):

            self.train_dat.sample_data()
            sess.run(self.train_data_init)
            tf.local_variables_initializer().run()
            self.current_epoch = epoch
            print("epoch is :", epoch+1)
            self.train_epoch(sess, self.tf_graph)

            self.dev_dat.read_text(self)
            sess.run(self.dev_data_init)
            accuracy, losses = self.evaluate(sess, self.tf_graph)
            if accuracy < best_accuracy:
                patient_passes += 1
                if patient_passes == self.flags.patient_passes:
                    print("without improvement, break")
                    break
                else:
                    print("without improvement")
            else:
                print("new best acc {}".format(accuracy))
                best_accuracy = accuracy
                patient_passes = 0
                saver.save(sess, os.path.join(self.flags.model_dir, "model"),
                           global_step=self.step)

    def build_graph(self):
        """Build graph."""

        self.tf_graph = TextCNN(self.flags, self.embedding)
        self.tf_graph.build(self.word_ids, self.word_label)

    def train_epoch(self, sess, graph):
        """Operation in one epoch."""
        while True:
            self.step += 1
            try:
                _, loss, pred, ids, labels = sess.run(
                    [graph.train_op, graph.loss, graph.pred, graph.word_ids, graph.labels])

                if self.step % 10 == 0:
                    print("training epoch:{}, step:{}, loss:{}"
                          .format(self.current_epoch + 1, self.step, loss))
            except tf.errors.OutOfRangeError:
                print('finish')
                break

    def evaluate(self, sess, graph):
        """Evaluate process."""
        correct_preds = 0
        total_preds = 0
        accuracy = 0
        losses = 0
        while True:
            try:
                batch_correct_pred, pred, batch_loss = sess.run(
                    [graph.correct_pred, graph.pred, graph.loss])
                correct_preds += batch_correct_pred
                total_preds += pred.shape[0]
                losses += batch_loss * pred.shape[0]
            except tf.errors.OutOfRangeError:
                break
        accuracy = float(correct_preds / (total_preds+0.1))
        losses = float(losses / (total_preds+0.1))
        return accuracy, losses

    def predict(self):
        """Predict line."""
        word_id_list = tf.placeholder(tf.int32, shape=[None, None])
        model = TextCNN(self.flags, self.embedding)
        model.build_predictor(word_id_list)
        return model, word_id_list
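
A minimal driver for ModelHandler, assuming a FLAGS-style namespace with the fields referenced above, might look like this:

def run_training(flags):
    # Hypothetical driver: wire the handler's pieces together in the order
    # its methods expect (data tensors first, then the graph, then training).
    handler = ModelHandler(flags)
    handler.add_tensor()
    handler.build_graph()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        handler.train(sess, saver)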
Beispiel #26
0
config['embedding_size'] = 300
config['keep_prob'] = 1.0
config['filter_sizes'] = [7, 8, 9]
config['num_filters'] = 300
config['sentence_length'] = 2500
# init data paths
train_data_path = '../../corpus/newdata.clean.dat'
test_data_path = '../../corpus/stdtestSet.dat'
channel2id_path = '../../corpus/channel2cid.yaml'
cid2channel_path = '../../corpus/cid2channel.yaml'
dict_path = '../../corpus/dict_texts'
# loading data
X_train = np.load()
y_train = np.load()
# build model
model = TextCNN(config)
model.build_graph()
# running model; name the session config separately so it does not
# overwrite the model config dict defined above
sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
sess_config.gpu_options.per_process_gpu_memory_fraction = 0.8
saver = tf.train.Saver(max_to_keep=5)
with tf.Session(config=sess_config) as sess:
    sess.run(tf.global_variables_initializer())
    check_restore_parameters(sess, saver)
    if mode == 'train':
        print('starting training...')
        train_model(sess, model, epochs=20)
    if mode == 'test':
        print('start testing...')
        test_model(sess, model)
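
The check_restore_parameters call above is not defined in the snippet. A minimal sketch, assuming a checkpoint directory (the 'checkpoints/' path is an assumption, not taken from the original):

def check_restore_parameters(sess, saver, ckpt_dir='checkpoints/'):
    # Hypothetical restore helper: reload the latest checkpoint if one
    # exists, otherwise keep the freshly initialized variables.
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
        print('loading parameters from', ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        print('no checkpoint found, starting with fresh parameters')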
Beispiel #27
0
def main(_):
    X_train, X_val, y_train, y_val, n_classes = train_test_loader(
        FLAGS.just_train)
    with open('data/vocab.dic', 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab) + 1
    print('size of vocabulary: {}'.format(vocab_size))

    # padding sentences
    X_train = pad_sequences(X_train,
                            maxlen=FLAGS.sentence_len,
                            value=float(vocab_size - 1))
    if not FLAGS.just_train:
        X_val = pad_sequences(X_val,
                              maxlen=FLAGS.sentence_len,
                              value=float(vocab_size - 1))
    # convert label to one-hot encode
    # to_categorical(y_train, n_classes)
    # to_categorical(y_val, n_classes)

    # create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textcnn = TextCNN(filter_sizes,
                          FLAGS.num_filters,
                          FLAGS.num_classes,
                          FLAGS.learning_rate,
                          FLAGS.batch_size,
                          FLAGS.decay_steps,
                          FLAGS.decay_rate,
                          FLAGS.sentence_len,
                          vocab_size,
                          FLAGS.embed_size,
                          FLAGS.is_training,
                          multi_label_flag=False)
        # Initialize save
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
            print('restoring variables from checkpoint')
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:
                assign_pretrained_word_embedding(sess, vocab, vocab_size,
                                                 textcnn)
        curr_epoch = sess.run(textcnn.epoch_step)

        # feed data and training
        number_of_training_data = len(X_train)
        batch_size = FLAGS.batch_size
        best_val_acc = 0.0

        total_epochs = 0
        if not FLAGS.just_train:
            total_epochs = FLAGS.num_epochs
        else:
            total_epochs = 20

        for epoch in range(curr_epoch, total_epochs):
            loss, acc, counter = .0, .0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 or counter == 0:
                    pass
                    # print('X_train[start:end]: {}'.format(X_train[start:end]))
                feed_dict = {
                    textcnn.input_x: X_train[start:end],
                    textcnn.dropout_keep_prob: 0.5
                }
                if not FLAGS.multi_label_flag:
                    feed_dict[textcnn.input_y] = y_train[start:end]
                else:
                    feed_dict[textcnn.input_y_multilabel] = y_train[start:end]
                curr_loss, curr_acc, _ = sess.run(
                    [textcnn.loss_val, textcnn.accuracy, textcnn.train_op],
                    feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc

                if counter % 50 == 0:
                    print(
                        'Epoch {}\tBatch {}\tTrain Loss {}\tTrain Accuracy {}'.
                        format(epoch, counter, loss / float(counter),
                               acc / float(counter)))
            print('going to increment epoch counter ...')
            sess.run(textcnn.epoch_increment)

            # validation
            if not FLAGS.just_train and epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textcnn, X_val, y_val,
                                              batch_size)
                unmatched_sample(sess, textcnn, X_val, y_val, batch_size)

                print("Epoch {} Validation Loss: {}\tValidation Accuracy: {}".\
                        format(epoch, eval_loss, eval_acc))
                if eval_acc > best_val_acc:
                    if os.path.exists(FLAGS.ckpt_dir):
                        shutil.rmtree(FLAGS.ckpt_dir)
                    best_val_acc = eval_acc
                    # save model to checkpoint
                    save_path = FLAGS.ckpt_dir + "model.ckpt"
                    saver.save(sess, save_path, global_step=epoch)
                else:
                    break

        # report result
        if not FLAGS.just_train:
            test_loss, test_acc = do_eval(sess, textcnn, X_val, y_val,
                                          batch_size)
            unmatched_sample(sess, textcnn, X_val, y_val, batch_size)
        else:
            save_path = FLAGS.ckpt_dir + "model.ckpt"
            saver.save(sess, save_path, global_step=20)
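
The do_eval helper used for validation and testing is defined elsewhere. A minimal sketch, assuming it averages the graph's loss and accuracy over the evaluation set with dropout disabled:

def do_eval(sess, textcnn, eval_x, eval_y, batch_size):
    # Hypothetical evaluation helper: run full batches through the graph
    # and average loss / accuracy across them.
    eval_loss, eval_acc, eval_counter = 0.0, 0.0, 0
    for start, end in zip(range(0, len(eval_x), batch_size),
                          range(batch_size, len(eval_x), batch_size)):
        feed_dict = {
            textcnn.input_x: eval_x[start:end],
            textcnn.input_y: eval_y[start:end],
            textcnn.dropout_keep_prob: 1.0
        }
        curr_loss, curr_acc = sess.run([textcnn.loss_val, textcnn.accuracy],
                                       feed_dict)
        eval_loss += curr_loss
        eval_acc += curr_acc
        eval_counter += 1
    eval_counter = max(eval_counter, 1)
    return eval_loss / eval_counter, eval_acc / eval_counter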
Beispiel #28
0
padding_x, max_document_length = padding(x, maxlen=FLAGS.pad_seq_len)
int_y = [int(_y) for _y in y]
encoded_y = one_hot_encode(int_y)
train_x, test_x, train_y, test_y = train_test_data_split(padding_x, encoded_y)

# 3. define session
with tf.Graph().as_default():
    # session_config=tf.ConfigProto(allow_soft_placement=True,log_device_placement=False)
    # sess=tf.Session(config=session_config)
    session_config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                              log_device_placement=False)
    sess = tf.compat.v1.Session(config=session_config)
    with sess.as_default():
        model = TextCNN(FLAGS.pad_seq_len, FLAGS.num_classes,
                        len(data_helper.token2idx), FLAGS.embedding_dim,
                        FLAGS.learning_rate, FLAGS.filter_sizes,
                        FLAGS.num_filters, FLAGS.random_embedding,
                        FLAGS.l2_reg_lambda)

        # 4. define important variable
        global_step = tf.Variable(initial_value=0,
                                  trainable=False,
                                  name="global_step")
        optimizer = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate)
        grads_and_vars = optimizer.compute_gradients(model.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)

        # 5. record summaries, e.g. scalars, graph, histograms
        ## I. keep track of gradient values and sparsity
        grad_summaries = []
        for g, v in grads_and_vars:
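            # (assumed completion; the original listing breaks off inside this
            # loop) record a histogram and a sparsity scalar per gradient,
            # then merge them into one summary op.
            if g is not None:
                grad_hist = tf.compat.v1.summary.histogram(
                    "{}/grad/hist".format(v.name), g)
                sparsity = tf.compat.v1.summary.scalar(
                    "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.extend([grad_hist, sparsity])
        grad_summaries_merged = tf.compat.v1.summary.merge(grad_summaries)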
Beispiel #29
0
logger.info('loading data...')
try:
    (x_train, y_train), (x_test,
                         y_test) = imdb.load_data(num_words=max_features)
except Exception:
    logger.info('numpy compatibility issue, falling back to local load_data...')
    (x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
logger.info('train data length: {}'.format(len(x_train)))
logger.info('test data length: {}'.format(len(x_test)))

logger.info('padding data...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

logger.info('build model...')
model = TextCNN(max_features=max_features, maxlen=maxlen,
                emb_dim=emb_dim).build_model()

logger.info('training...')
earlystop = EarlyStopping(patience=3, mode='max', monitor='val_acc')
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[earlystop],
          validation_data=(x_test, y_test))

logger.info('test...')
pred = model.predict(x_test[:10])
logger.info(list(zip(pred, y_test[:10])))
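
The TextCNN class used here only exposes build_model(). A minimal tf.keras sketch with the same constructor arguments (the three kernel sizes and 128 filters per branch are assumptions, not taken from the original):

from tensorflow.keras.layers import (Input, Embedding, Conv1D,
                                     GlobalMaxPooling1D, Concatenate, Dense)
from tensorflow.keras.models import Model

class TextCNN:
    # Hypothetical Keras variant matching the calls above: parallel Conv1D
    # branches over a shared embedding, max-pooled and merged into a
    # single sigmoid output for binary sentiment.
    def __init__(self, max_features, maxlen, emb_dim, kernel_sizes=(3, 4, 5)):
        self.max_features = max_features
        self.maxlen = maxlen
        self.emb_dim = emb_dim
        self.kernel_sizes = kernel_sizes

    def build_model(self):
        inputs = Input(shape=(self.maxlen,))
        emb = Embedding(self.max_features, self.emb_dim)(inputs)
        pooled = [GlobalMaxPooling1D()(Conv1D(128, k, activation='relu')(emb))
                  for k in self.kernel_sizes]
        outputs = Dense(1, activation='sigmoid')(Concatenate()(pooled))
        return Model(inputs, outputs)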
Beispiel #30
0
import os
import data_preparation
from data_preparation import THCNewsDataSet, batch_iter
import torch
import torch.optim as optim
from torch.utils.data.dataloader import DataLoader
from config import Config
from textcnn import TextCNN
import torch.nn as nn
import torch.nn.functional as F

device = torch.device(
    "cuda:0") if torch.cuda.is_available() else torch.device("cpu")


model = TextCNN()

model = model.to(device)

opt = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()


def save_model(model, model_name="best_model_sofa.pkl", model_save_dir="./trained_models/"):
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)
    path = os.path.join(model_save_dir, model_name)

    torch.save(model.state_dict(), path)
    print("saved model state dict at: " + path)
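
A matching loader is not part of the snippet; a minimal companion sketch that restores the saved state dict onto an already constructed TextCNN:

def load_model(model, model_name="best_model_sofa.pkl", model_save_dir="./trained_models/"):
    # Hypothetical counterpart to save_model: load the state dict from disk
    # and move the model to the selected device.
    path = os.path.join(model_save_dir, model_name)
    model.load_state_dict(torch.load(path, map_location=device))
    return model.to(device)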