# Beispiel #1 (score: 0)
def main():
    """Train, test, or semi-supervised-train a simpleRNN regression model.

    Relies on module-level globals set up elsewhere in this file:
    ``args`` (parsed CLI options), ``train_path``, ``semi_path``,
    ``test_path``, plus imported ``tf``, ``K``, ``np``, ``DataManager``,
    ``simpleRNN``, ``mean_squared_error`` and ``f1_score``.
    """
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('train_data', train_path, True)
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action except for train, semi, and test')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initial model
    print('initial model...')
    model = simpleRNN(args)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : load a exist model and keep training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        #print ('Warning : testing without loading any model')
        print('args.action is %s' % (args.action))
        # BUGFIX: load_path is only bound when --load_model is given, but this
        # branch runs exactly when it is NOT given, so the original raised
        # NameError here.  Fall back to this model's save directory instead.
        path = os.path.join(save_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        #earlystopping = EarlyStopping(monitor='val_loss', patience = 3, verbose=1, mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        """
        checkpoint = ModelCheckpoint(filepath=save_path, 
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='max' )
        """
        # The model takes three parallel inputs (tweet text, snippet text,
        # target entity); X is assumed to stack them along axis 0.
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]
        print("tweets's shape = ", tweets.shape)
        print("snippets's shape = ", snippets.shape)
        print("targets's shape = ", targets.shape)
        print("Y's shape = ", Y.shape)
        #model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output, auxiliary_output])
        history = model.fit(
            [tweets, snippets, targets],
            Y,
            validation_data=([X_val[0, :], X_val[1, :], X_val[2, :]], Y_val),
            epochs=args.nb_epoch,
            batch_size=args.batch_size)  #,
        #callbacks=[checkpoint, earlystopping] )
        predictions = model.predict([tweets, snippets, targets])
        #print(predictions.shape)
        #print(predictions)

        model.save(save_path)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]
        #print("tweets.shape =", tweets.shape)
        #print("snippets.shape =", snippets.shape)
        #print("targets.shape =", targets.shape)
        predictions = model.predict([tweets, snippets, targets])
        # BUGFIX: was assigned to misspelled 'preidctions', so the flatten
        # never took effect downstream.
        predictions = predictions.reshape(-1)
        #print(predictions)
        #print(Y.shape)
        #scores = np.sum((predictions - Y)**2)/len(Y)
        scores = model.evaluate([tweets, snippets, targets], Y)
        print("test data mse by keras = %f" % scores[1])
        print("test data mse by sklearn = %f" %
              mean_squared_error(Y, predictions))
        # Quantize regression outputs to {-1, 0, 1} sign classes so that
        # classification F1 can be computed against the labels.
        for idx, value in enumerate(predictions):
            if value > 0:
                predictions[idx] = 1
            elif value == 0:
                predictions[idx] = 0
            elif value < 0:
                predictions[idx] = -1

        for idx, value in enumerate(Y):
            if value > 0:
                Y[idx] = 1
            elif value == 0:
                Y[idx] = 0
            elif value < 0:
                Y[idx] = -1

        print("test data micro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='micro'))
        print("test data macro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='macro'))
        #print("test data scores[1](loss = mse) = %f" % scores[1])
        #raise Exception ('Implement your testing function')
        # Repeat the same evaluation on the training split for comparison.
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        tweets = X[0, :]
        snippets = X[1, :]
        targets = X[2, :]
        predictions = model.predict([tweets, snippets, targets])
        # BUGFIX: same 'preidctions' typo as above.
        predictions = predictions.reshape(-1)
        #scores = np.sum((predictions - Y)**2)/len(Y)
        scores = model.evaluate([tweets, snippets, targets], Y)
        print("train data mse by keras = %f" % scores[1])
        print("train data mse by sklearn = %f" %
              mean_squared_error(Y, predictions))
        for idx, value in enumerate(predictions):
            if value > 0:
                predictions[idx] = 1
            elif value == 0:
                predictions[idx] = 0
            elif value < 0:
                predictions[idx] = -1

        for idx, value in enumerate(Y):
            if value > 0:
                Y[idx] = 1
            elif value == 0:
                Y[idx] = 0
            elif value < 0:
                Y[idx] = -1

        print("train data micro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='micro'))
        print("train data macro f1 score by sklearn = %f" %
              f1_score(Y, predictions, average='macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        # BUGFIX: val_loss must be minimized -- mode='max' would stop/keep the
        # WORST model.
        earlystopping = EarlyStopping(monitor='val_loss',
                                      patience=3,
                                      verbose=1,
                                      mode='min')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-data with the current model (self-training)
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # NOTE(review): the train branch feeds three separate inputs; this
            # branch feeds a single array -- verify the semi data layout
            # matches the model's expected inputs.
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                # BUGFIX: 'path' is unbound in this branch; report the file we
                # actually looked for.
                raise ValueError("Can't find the file %s" % save_path)
# Beispiel #2 (score: 0)
def main():
    """Train, test, or semi-supervised-train a simpleRNN sentiment classifier.

    Relies on module-level globals defined elsewhere in this file: ``args``
    (parsed CLI options), ``train_path``, ``semi_path``, ``test_path``, plus
    imported ``tf``, ``K``, ``np``, ``pd``, ``zipfile``, ``DataManager``,
    ``simpleRNN`` and ``LossHistory``.
    """
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

# process data
#####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    else:
        dm.add_data('test_data', test_path, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initialize model
    print('initial model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : load a exist model and keep training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')

# training phase
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        # NOTE(review): 'val_acc' is the Keras 1.x/2.x metric name; newer
        # Keras uses 'val_accuracy' -- confirm against the installed version.
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # create a LossHistory instance to record metrics per epoch
        history = LossHistory()
        hist = model.fit(X,
                         Y,
                         validation_data=(X_val, Y_val),
                         epochs=args.nb_epoch,
                         batch_size=args.batch_size,
                         callbacks=[checkpoint, earlystopping, history])
        # plot the acc/loss curves
        history.loss_plot('epoch')
# testing phase
    elif args.action == 'test':
        # 'id' shadowed the builtin in the original; renamed to 'ids'.
        ids = dm.data['test_data'][1]
        out = model.predict(dm.data['test_data'][0])
        out = np.squeeze(out)
        # binarize sigmoid outputs at the 0.5 decision threshold
        out[out <= 0.5] = 0
        out[out > 0.5] = 1
        out = out.astype(int)
        print("pred shape:", np.array(out).shape)
        print("id shape:", np.array(ids).shape)
        result = pd.concat(
            [pd.DataFrame({'id': ids}),
             pd.DataFrame({'sentiment': out})],
            axis=1)
        wd = pd.DataFrame(result)
        wd.to_csv("submission.csv", index=None)
        newZip = zipfile.ZipFile('submission.zip', 'w')
        newZip.write('submission.csv', compress_type=zipfile.ZIP_DEFLATED)
        newZip.close()


# semi-supervised training phase
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # repeat 10 times
        for i in range(10):
            # label the semi-data with the current model (self-training)
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            history = LossHistory()

            # train
            hist = model.fit(semi_X,
                             semi_Y,
                             validation_data=(X_val, Y_val),
                             epochs=2,
                             batch_size=args.batch_size,
                             callbacks=[checkpoint, earlystopping, history])
            history.loss_plot('epoch')

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                # BUGFIX: 'path' is unbound in this branch; report the file we
                # actually looked for.
                raise ValueError("Can't find the file %s" % save_path)
def main():
    """Train or test a TensorFlow LSTM/GRU text (OHCA) recognizer.

    Parses its own CLI arguments, builds the graph from ``simpleRNN``
    (project class), then runs a manual training loop with periodic
    validation, checkpointing and early stopping, or a test pass that
    writes "<label>\\t<prediction>" lines to ``args.testout``.
    Depends on module-level ``tf``, ``np``, ``sys``, ``DataManager``,
    ``simpleRNN`` and ``get_batches``.
    """
    parser = argparse.ArgumentParser(description='Text OHCA recognition')
    parser.add_argument('model')
    parser.add_argument('action', choices=['train', 'test'])

    # training argument
    # BUGFIX: batch_size must be an int -- it is used as a placeholder shape
    # dimension and on the right side of '//'; type=float broke both.
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--nb_epoch', default=2000, type=int)
    parser.add_argument('--val_ratio', default=0.1, type=float)
    parser.add_argument('--gpu_fraction', default=0.6, type=float)
    parser.add_argument('--vocab_size', default=50000, type=int)
    parser.add_argument('--max_length', default=400, type=int)
    parser.add_argument('--patience', default=30, type=int)

    # model parameter
    parser.add_argument('--loss_function', default='binary_crossentropy')
    parser.add_argument('--cell', default='LSTM', choices=['LSTM', 'GRU'])
    parser.add_argument('-num_lay', '--num_layers', default=2, type=int)
    parser.add_argument('-emb_dim', '--embedding_dim', default=256, type=int)
    parser.add_argument('-hid_siz', '--hidden_size', default=400, type=int)
    # NOTE(review): argparse type=bool is a known pitfall -- any non-empty
    # string (including "False") parses as True.  Left as-is to preserve the
    # CLI contract; consider a str-to-bool converter.
    parser.add_argument('--pretrain_emb', default=True, type=bool)
    parser.add_argument('--emb_matrix', default='cbowemb.npz')
    #    parser.add_argument('--dropout_rate', default=0.3, type=float)
    parser.add_argument('--keep_prob', default=1.0, type=float)
    parser.add_argument('-lr', '--learning_rate', default=0.013, type=float)
    parser.add_argument('--threshold', default=0.5, type=float)
    # output path for your prediction
    parser.add_argument(
        '--result_path',
        default='result.csv',
    )

    # put model in the same directory
    parser.add_argument('--load_model', default=None)
    # NOTE(review): same type=bool pitfall as --pretrain_emb above.
    parser.add_argument('--load_token', default=True, type=bool)
    parser.add_argument('--save_dir', default='model/')
    # log dir for tensorboard
    parser.add_argument('--log_dir', default='log_dir/')
    # testing output
    parser.add_argument('--testfile', default='data/ohca_scripts.txt')
    parser.add_argument('--testout', default='data/script_test.txt')

    args = parser.parse_args()

    train_path = 'data/ohca_scripts.txt'
    test_path = args.testfile

    save_path = 'token/'
    #load token path
    if args.load_token is not None:
        load_path = os.path.join(save_path)

    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    sess = get_session(args.gpu_fraction)

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, with_label=True)
    else:
        dm.add_data('test_data', test_path,
                    with_label=True)  # now the test will have label

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_token is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # Create the graph object
    tf.reset_default_graph()
    # initial model
    print('initial model...')
    rnnmodel = simpleRNN(args)
    #print (model.summary())

    with tf.name_scope('inputs'):
        #create placeholder for training (testing) data
        X_ = tf.placeholder(tf.int32, [None, args.max_length], name='X')
        y_ = tf.placeholder(tf.int32, [
            args.batch_size,
        ], name='y_')
        keep_prob = tf.placeholder_with_default(1.0,
                                                shape=(),
                                                name="keep_prob")

    y_predict = rnnmodel.model(args, X_, keep_prob)

    #prepare for saving model to evaluate
    train_var = [X_, y_, keep_prob, y_predict]
    tf.add_to_collection('train_var', train_var[0])
    tf.add_to_collection('train_var', train_var[1])
    tf.add_to_collection('train_var', train_var[2])
    tf.add_to_collection('train_var', train_var[3])

    #loss (MSE)
    mse = rnnmodel.loss(y_, y_predict)

    #optimizers
    train_op = rnnmodel.optimizer(args, mse)

    #accuracy for validation
    accuracy = rnnmodel.accuracy(y_, y_predict)

    #initial state of LSTM
    init_state = rnnmodel.initial_state

    # merge the write out histogram plots (tensorboard)
    merged = tf.summary.merge_all()

    #check outputs of LSTM
    routputs = rnnmodel.outputs

    if args.load_model is not None:
        load_path = os.path.join(args.save_dir)
        if args.action == 'train':
            print('Warning : load a exist model variables and keep training')
        path = os.path.join(load_path, 'Sentimen_rnn_final')
        if os.path.exists(path + ".meta"):
            print('load model from %s' % path)
            #model.load_weights(path) change to tensorflow model
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        print('Warning : testing without loading any model')
        raise Exception('Not loading model for testing...')

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        print("Shape of X is {}, and y is {}".format(
            np.array(X).shape,
            np.array(Y).shape))

    elif args.action == 'test':
        (X, Y) = dm.get_labeldata('test_data')
        print("Load test data (shape {})".format(X.shape))
        #raise Exception ('Implement your testing function')

    init = tf.global_variables_initializer()

    #prepare to save model
    save_vars = tf.trainable_variables()
    saver = tf.train.Saver(save_vars,
                           max_to_keep=7,
                           keep_checkpoint_every_n_hours=1)

    # best validation loss seen so far (for early stopping)
    last_loss = 1000000.0

    with tf.Session() as sess:
        init.run()

        #if pre-trained, load embedding matrix
        if (args.pretrain_emb == True):
            emb_npfn = save_path + args.emb_matrix
            emb_matrix = np.load(emb_npfn)['embed_m']
            if (emb_matrix.shape[0] != args.vocab_size
                    or emb_matrix.shape[1] != args.embedding_dim):
                print(
                    "Import embedding matrix shape {} does not match shape of ({},{})..."
                    .format(emb_matrix.shape, args.vocab_size,
                            args.embedding_dim))
                exit(1)
            else:
                print("Loading embedding matrix.....")
                sess.run(rnnmodel.embedding_mat.assign(emb_matrix))

        train_writer = tf.summary.FileWriter(args.log_dir + 'train',
                                             sess.graph)
        valid_writer = tf.summary.FileWriter(args.log_dir + 'valid',
                                             sess.graph)
        # load variables in graphs if assigned
        if args.load_model is not None:
            saver.restore(sess, path)

        #if semi-learning, first apply model to semi-learning data
        if (args.action == 'train'):
            #training
            early_stop_counter = 0
            generation_num = 0
            # repeat nb_epoch times
            for e in range(args.nb_epoch):
                # reset the LSTM initial state at the start of each epoch
                state = sess.run([init_state])
                semi_preds = []

                if (e == 0):
                    # hard copy
                    X_train = X.copy()
                    Y_train = Y.copy()

                #elif ( args.action='train'):
                #reset initial LSTM state every epochs
                n_batches = len(X) // args.batch_size
                for ix, (X_batch, y_batch) in enumerate(
                        get_batches(X_train, Y_train, args.batch_size), 1):

                    generation_num += 1
                    train_dict = {
                        X_: X_batch,
                        y_: y_batch,
                        keep_prob: args.keep_prob,
                        init_state: state
                    }
                    #for each traing generation, reload zero initial states

                    _, summary, mse_train, accu_train = sess.run(
                        [train_op, merged, mse, accuracy],
                        feed_dict=train_dict)

                    train_writer.add_summary(summary, generation_num)
                    outputs_ = routputs.eval(feed_dict=train_dict)
                    if (ix == 1):
                        print(X_batch.shape)
                        #print("shape of outputs is {}".format(outputs_[:,-1].shape))

                    if (generation_num % 10 == 0):
                        print("Epoch: {}/{}".format(e, args.nb_epoch),
                              "Iteration: {}".format(generation_num),
                              "Train loss: {:.3f}".format(mse_train))

                    #validation for each 50 generations or end of each epoch
                    if (generation_num % 50 == 0 or ix == n_batches):
                        val_acc = []
                        val_loss = []
                        val_state = sess.run([init_state])
                        for iv, (X_batch, y_batch) in enumerate(
                                get_batches(X_val, Y_val, args.batch_size), 1):
                            val_dict = {
                                X_: X_batch,
                                y_: y_batch,
                                keep_prob: 1,
                                init_state: val_state
                            }

                            summary, batch_acc, batch_loss = sess.run(
                                [merged, accuracy, mse], feed_dict=val_dict)
                            #print out some answer for checking
                            val_predict = sess.run(y_predict,
                                                   feed_dict=val_dict)
                            #print("shape of val_predict is {}".format(np.array(val_predict).shape))
                            #last ten elements of each batch

                            for y_true, y_pre in zip(y_batch[-9:],
                                                     val_predict[-9:]):
                                print("y_true: {}, y_predict: {}".format(
                                    y_true, y_pre))

                            val_loss.append(batch_loss)
                            val_acc.append(batch_acc)

                            sys.stdout.flush()

                        print("Iteration: {}".format(generation_num),
                              "Val acc: {:.3f}".format(np.mean(val_acc)),
                              "Val mse: {:.3f}".format(np.mean(val_loss)))

                        valid_writer.add_summary(summary, generation_num)
                        loss_val_avg = np.mean(val_loss)
                        #save variables every 50 generations
                        saver.save(sess,
                                   os.path.join(args.save_dir, "Sentimen_rnn"),
                                   global_step=generation_num)

                        if (ix == n_batches):
                            #early stop count here
                            if (last_loss > loss_val_avg):
                                last_loss = loss_val_avg
                                early_stop_counter = 0
                            else:
                                early_stop_counter += 1

                if (early_stop_counter >= args.patience
                        or e == (args.nb_epoch - 1)):
                    #save model
                    saver.save(
                        sess, os.path.join(args.save_dir,
                                           "Sentimen_rnn_final"))
                    saver.export_meta_graph(os.path.join(
                        args.save_dir, "Sentimen_rnn_final.meta"),
                                            collection_list=['train_var'])
                    break

            print("End of training.....")

        #testing
        elif (args.action == 'test'):
            # hard copy
            X_test = X.copy()
            Y_test = Y.copy()
            state = sess.run([init_state])
            with open(args.testout, 'w+') as outfile:

                for ix, (X_batch, y_batch) in enumerate(
                        get_batches(X_test, Y_test, args.batch_size), 1):

                    test_dict = {
                        X_: X_batch,
                        y_: y_batch,
                        keep_prob: args.keep_prob,
                        init_state: state
                    }
                    # NOTE(review): running train_op here updates weights
                    # during "testing" -- likely unintended; confirm before
                    # removing, as it changes runtime behavior.

                    _, y_prebatch, accu_train = sess.run(
                        [train_op, y_predict, accuracy], feed_dict=test_dict)

                    for y_true, y_pre in zip(y_batch, y_prebatch):
                        strout = "%d\t%f\n" % (y_true, y_pre)
                        outfile.write(strout)
            print("Testing finish, write out file {}".format(args.testout))
            #raise Exception ('Implement your testing function')

    return
# Beispiel #4 (score: 0)
def main():
    """Train, test, or semi-supervised-train a simpleRNN with GloVe-style
    pre-trained embeddings.

    Relies on module-level globals defined elsewhere in this file: ``args``
    (parsed CLI options), ``train_path``, ``semi_path``, ``test_path``, plus
    imported ``tf``, ``K``, ``np``, ``DataManager``, ``simpleRNN``, ``preEB``
    and ``evaluation``.
    """
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
    elif args.action == 'test':
        dm.add_data('test_data', test_path, True)
    else:
        raise Exception('Action except for train, semi, and test')

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # prepare glove embedding
    embedding_matrix = preEB(dm)

    # initial model
    print('initial model...')
    model = simpleRNN(args, embedding_matrix, dm.tokenizer.word_index)
    model.summary()

    print("args.load_model =", args.load_model)
    if args.load_model is not None:
        if args.action == 'train':
            print('Warning : load a exist model and keep training')
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)
    elif args.action == 'test':
        #print ('Warning : testing without loading any model')
        print('args.action is %s' % (args.action))
        # BUGFIX: load_path is only bound when --load_model is given, but this
        # branch runs exactly when it is NOT given, so the original raised
        # NameError here.  Fall back to this model's save directory instead.
        path = os.path.join(save_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        #earlystopping = EarlyStopping(monitor='val_loss', patience = 3, verbose=1, mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        """
        checkpoint = ModelCheckpoint(filepath=save_path, 
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='max' )
        """
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size)  #,
        #callbacks=[checkpoint, earlystopping] )

        model.save(save_path)

    # testing
    elif args.action == 'test':
        args.val_ratio = 0
        (X, Y), (X_val, Y_val) = dm.split_data('test_data', args.val_ratio)
        pred = model.predict(X)
        scores = model.evaluate(X, Y)
        print("test data scores(loss = mse) = %f" % scores[1])
        print("mse: ", evaluation(pred, Y, 'mse'))
        print("micro: ", evaluation(pred, Y, 'f1_micro'))
        print("macro: ", evaluation(pred, Y, 'f1_macro'))

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        # BUGFIX: val_loss must be minimized -- mode='max' would stop/keep the
        # WORST model.
        earlystopping = EarlyStopping(monitor='val_loss',
                                      patience=3,
                                      verbose=1,
                                      mode='min')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     mode='min')
        # repeat 10 times
        for i in range(10):
            # label the semi-data with the current model (self-training)
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                # BUGFIX: 'path' is unbound in this branch; report the file we
                # actually looked for.
                raise ValueError("Can't find the file %s" % save_path)
# Beispiel #5 (score: 0)
def cbow_main():
    """Train CBOW word embeddings with TF1 NCE loss and save the matrix.

    Pipeline:
      1. Load four text corpora through ``DataManager`` and fit (or load)
         a tokenizer.
      2. Build a TF1 graph: sum the embeddings of ``num_skips`` context
         words and train against the center word with sampled NCE loss.
      3. Every 10k steps, print the nearest neighbours of a fixed list of
         validation words (cosine similarity on the normalized matrix).
      4. Save a TF checkpoint and the raw embedding matrix as ``.npz``.

    Side effects: writes ``token/token.pk`` (tokenizer), TensorBoard
    summaries + checkpoint under ``--log_dir``, and the embedding ``.npz``
    under ``token/``.  Returns None.
    """
    parser = argparse.ArgumentParser(description='CBOW word embedding')
    # training arguments
    parser.add_argument('--vocab_size', default=50000, type=int)
    parser.add_argument('-emb_dim', '--embedding_dim', default=256, type=int)
    parser.add_argument('--gpu_fraction', default=0.8, type=float)
    parser.add_argument('--skip_window', default=2, type=int)
    parser.add_argument('--num_skips', default=4, type=int)
    parser.add_argument('--batch_size', default=512, type=int)
    parser.add_argument('--learning_rate', default=0.01, type=float)
    parser.add_argument('--log_dir', default='log_embdir/')
    parser.add_argument('--nsteps', default=5000000, type=int)

    # put model in the same directory
    parser.add_argument('--load_model', default=None)
    # NOTE(review): argparse `type=bool` turns every non-empty string into
    # True (e.g. "--load_token False" -> True).  Harmless here because the
    # code below only tests `is not None`, but confirm before relying on
    # the parsed value itself.
    parser.add_argument('--load_token', default=None, type=bool)
    parser.add_argument('--save_embed', default='cbowemb.npz')

    args = parser.parse_args()

    mlclass_path = 'data/all_sents.txt'
    script_path = 'data/simu_script.txt'
    pylady_path = 'data/corpusclean_news_pylady.txt'
    pttgossi_path = 'data/ptt_gossiping_201611_post_cleanf.csv'
    #semi_path = 'data/training_nolabel.txt'
    save_path = 'token/'

    # tokenizer is loaded from / saved to the same directory
    if args.load_token is not None:
        load_path = os.path.join(save_path)

    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    # read all data for tokenizer (train, semi, test)
    dm = DataManager()
    print('Loading training data...')
    dm.add_data('ml_data', mlclass_path, False, False)
    dm.add_data('script_data', script_path, False, False)
    dm.add_data('pylady_data', pylady_path, False, False)
    dm.add_data('pttgossi_data', pttgossi_path, False, False)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_token is not None:
        # read existing tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # id -> word dict for printing nearest neighbours
    reverse_word_dict = dict(map(reversed, dm.tokenizer.word_index.items()))

    # CBOW embedding [skip_window target skip_window]

    # convert to sequences without pre-padding (list, not np.array)
    dm.to_sequence_nopad()

    # fill all sequence data into one flat list of sentences
    seq_data = []
    seq_data.extend(dm.get_data('ml_data')[0])
    seq_data.extend(dm.get_data('script_data')[0])
    seq_data.extend(dm.get_data('pylady_data')[0])
    seq_data.extend(dm.get_data('pttgossi_data')[0])

    # Create the graph object.  Everything below must be built on this
    # fresh default graph, so the session is opened only AFTER this point
    # (the original opened a session before the reset, whose graph was
    # stale and never used for training).
    tf.reset_default_graph()

    # Fixed validation words (frequent, domain-relevant) used only for
    # printing nearest neighbours; they do not affect training.
    valid_text = [
        "喘", "呼吸", "白沫", "沒有", "意識", "倒下", "電話", "臉色", "起伏", "睡著", "昏倒", "溺水",
        "清醒", "不", "微弱", "很"
    ]
    # keep only words the tokenizer knows; may be SHORTER than valid_text
    valid_examples = np.array([
        words[0] for words in (dm.tokenizer.texts_to_sequences(valid_text))
        if len(words) > 0
    ])

    with tf.name_scope('inputs'):
        # placeholders: a batch of num_skips context-word ids and the
        # corresponding center-word id
        X_ = tf.placeholder(tf.int32, [args.batch_size, args.num_skips],
                            name='X_')
        y_ = tf.placeholder(tf.int32, [args.batch_size, 1], name='y_')
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # embedding: CBOW sums the embeddings of all context words
    with tf.name_scope("embeddings"):
        embedding_mat = tf.get_variable('embedding_mat',
                                        [args.vocab_size, args.embedding_dim],
                                        tf.float32,
                                        tf.random_normal_initializer())
        embedding = tf.zeros([args.batch_size, args.embedding_dim])
        for j in range(args.num_skips):
            embedding += tf.nn.embedding_lookup(embedding_mat, X_[:, j])

    # output weights for the sampled-softmax/NCE objective
    with tf.name_scope("softmax"):
        soft_weights = tf.get_variable('soft_weights',
                                       [args.vocab_size, args.embedding_dim],
                                       tf.float32,
                                       tf.random_normal_initializer())
        soft_biases = tf.get_variable('soft_biases', [args.vocab_size],
                                      tf.float32, tf.constant_initializer(0.0))

    num_sampled = 64  # negative samples per batch for NCE
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=soft_weights,
                           biases=soft_biases,
                           labels=y_,
                           inputs=embedding,
                           num_sampled=num_sampled,
                           num_classes=args.vocab_size))
        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdagradOptimizer(
            args.learning_rate).minimize(loss)

    # Cosine similarity between the validation words and all embeddings:
    # L2-normalize each row, then one matmul gives all similarities.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_mat), 1, keep_dims=True))
    normalized_embeddings = embedding_mat / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(valid_embeddings,
                           tf.transpose(normalized_embeddings))

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # variable initializer (tf.initialize_all_variables is deprecated)
    init = tf.global_variables_initializer()

    # tensorflow model saver
    saver = tf.train.Saver(tf.global_variables())

    average_loss = 0.0
    data_index = 0
    seq_index = 0
    # Open the training session through get_session so the GPU memory
    # fraction actually applies to training (the original trained in a
    # plain tf.Session() without the gpu_options).
    with get_session(args.gpu_fraction) as sess:
        # the writer must see the CURRENT (post-reset) graph
        writer = tf.summary.FileWriter(args.log_dir, sess.graph)
        sess.run(init)
        for step in range(args.nsteps):
            batch_X, batch_y, data_index, seq_index = generate_batch_cbow(
                seq_data, data_index, seq_index, args.batch_size,
                args.num_skips, args.skip_window)
            feed_dict = {X_: batch_X, y_: batch_y}
            op, lo = sess.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += lo
            if (step % 2000 == 0):
                if (step > 0):
                    average_loss = average_loss / 2000
                    # The average loss is an estimate of the loss over the last 2000 batches.
                    print('Average loss at step %d: %f' % (step, average_loss))
                    average_loss = 0

            # note that this is expensive (~20% slowdown if computed every 500 steps)
            if (step % 10000 == 0):
                sim = similarity.eval()
                # iterate over the words that actually survived the OOV
                # filter (may be fewer than the nominal valid_size=16)
                for i in range(len(valid_examples)):
                    try:
                        valid_word = reverse_word_dict[valid_examples[i]]
                    except KeyError:
                        # without this `continue` the loop would reuse a
                        # stale (or undefined) valid_word below
                        print("Skip word...")
                        continue

                    top_k = 8  # number of nearest neighbors
                    # index 0 is the word itself, so take ranks 1..top_k
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        try:
                            close_word = reverse_word_dict[nearest[k]]
                            log = '%s %s,' % (log, close_word)
                        except KeyError:
                            print("Skip nearest {}-th word".format(k))
                    # print once for each word
                    print(log)

        # save the RAW (unnormalized) embedding matrix, as the original did
        final_embeddings = embedding_mat.eval()
        # Save the model for checkpoints.
        saver.save(sess, os.path.join(args.log_dir, 'embmodel.ckpt'))

        writer.close()

    # save the embedding mapping matrix ('token/' + filename)
    save_fn = save_path + args.save_embed
    np.savez(save_fn, embed_m=final_embeddings)

    return