Example 1
# Assumed imports for this snippet; hier_lstm, prepare_sentence_data and
# Evaluator are project modules from the surrounding repository.
import argparse
import logging
import os
import time

import numpy as np

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser(
        description="sentence Hi_LSTM-attent-pooling model")
    parser.add_argument('--model',
                        choices=[
                            'build_model', 'build_bidirectional_model',
                            'build_attention_model', 'build_attention2_model'
                        ],
                        required=True)

    parser.add_argument(
        '--embedding',
        type=str,
        default='glove',
        help='Word embedding type: word2vec, senna, glove or random')
    parser.add_argument('--embedding_dict',
                        type=str,
                        default=None,
                        help='Pretrained embedding path')
    parser.add_argument(
        '--embedding_dim',
        type=int,
        default=50,
        help='Only useful when embedding is randomly initialised')
    parser.add_argument('--fine_tune',
                        action='store_true',
                        help='Fine tune word embeddings')

    parser.add_argument('--num_epochs',
                        type=int,
                        default=20,
                        help='number of epochs for training')
    parser.add_argument('--batch_size',
                        type=int,
                        default=10,
                        help='Number of texts in each batch')
    parser.add_argument("-v",
                        "--vocab-size",
                        dest="vocab_size",
                        type=int,
                        metavar='<int>',
                        default=4000,
                        help="Vocab size (default=4000)")

    parser.add_argument('--lstm_units',
                        type=int,
                        default=100,
                        help='Num of hidden units in recurrent layer')

    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Initial learning rate')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='Dropout rate for layers')
    parser.add_argument('--l2_value',
                        type=float,
                        default=0.001,
                        help='l2 regularizer value')
    parser.add_argument('--checkpoint_path',
                        type=str,
                        help='checkpoint directory')

    parser.add_argument(
        '--train', default='../data/train.tsv')  # "data/word-level/*.train"
    parser.add_argument('--dev', default='../data/dev.tsv')
    parser.add_argument('--test', default='../data/test.tsv')
    parser.add_argument('--prompt_id',
                        type=int,
                        default=1,
                        help='prompt id of essay set')

    # to support init bias of output layer of the network
    parser.add_argument('--init_bias',
                        action='store_true',
                        help='initialise the bias of the output '
                        'layer with the mean score of the training data')

    # parser
    args = parser.parse_args()
    model = args.model
    checkpoint_dir = args.checkpoint_path

    # model name
    modelname = "%s.prompt%s.%sunits.bs%s.hdf5" % (
        model, args.prompt_id, args.lstm_units, args.batch_size)
    modelpath = os.path.join(checkpoint_dir, modelname)

    # load data
    datapaths = [args.train]  #, args.dev, args.test]
    embedding_path = args.embedding_dict
    embedding = args.embedding
    embedd_dim = args.embedding_dim
    prompt_id = args.prompt_id
    (X_t, Y_t, mask_train), vocab, vocab_size, embed_table, max_sentlen, max_sentnum, init_mean_value = \
        prepare_sentence_data(datapaths, '../data/vocab_essay_set%d.pk'%args.prompt_id,\
            embedding_path, embedding, embedd_dim, prompt_id, args.vocab_size, tokenize_text=True, \
            to_lower=True, sort_by_len=False)

    # load pretrained embedding
    if embed_table is not None:
        embedd_dim = embed_table.shape[1]
        embed_table = [embed_table]
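        # Keras takes pretrained weights as a list of arrays
        # (weights=[table]), hence the wrapping in a list.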

    nn1 = int(np.ceil(len(X_t) * 0.7))
    nn2 = int(np.ceil(len(X_t) * 0.9))
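    # nn1/nn2 cut the single loaded file into roughly 70/20/10
    # train/dev/test portions.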

    Y_train = Y_t[0:nn1]
    Y_dev = Y_t[nn1:nn2]
    Y_test = Y_t[nn2:]

    X_train = X_t.reshape((X_t.shape[0], X_t.shape[1] * X_t.shape[2]))[0:nn1]
    X_dev = X_t.reshape((X_t.shape[0], X_t.shape[1] * X_t.shape[2]))[nn1:nn2]
    X_test = X_t.reshape((X_t.shape[0], X_t.shape[1] * X_t.shape[2]))[nn2:]
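    # Each essay becomes one flat row of max_sentnum * max_sentlen token ids.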

    # create log
    fh = logging.FileHandler('%s.log' % modelpath)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.ERROR)
    logger.addHandler(fh)
    logger.addHandler(ch)

    # Build model
    model = getattr(hier_lstm,
                    model)(args, vocab_size, max_sentnum, max_sentlen,
                           embedd_dim, embed_table, True, init_mean_value)
    logger.info("X_train shape: %s" % str(X_train.shape))

    # Evaluation
    evl = Evaluator(args.prompt_id, False, checkpoint_dir, modelname, X_train,
                    X_dev, X_test, Y_train, Y_dev, Y_test)

    # Initial evaluation
    logger.info("Initial evaluation: ")
    evl.evaluate(model, -1, logger, print_info=True)
    logger.info("Train model")
    for ii in range(args.num_epochs):
        logger.info('Epoch %s/%s' % (str(ii + 1), args.num_epochs))
        start_time = time.time()
        history = model.fit(X_train,
                            Y_train,
                            batch_size=args.batch_size,
                            epochs=1,
                            verbose=0,
                            shuffle=True)
        tt_time = time.time() - start_time
        logger.info("Training one epoch in %.3f s" % tt_time)
        evl.evaluate(model, ii + 1, logger)
        evl.print_info(logger)
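
The --model choice above is the name of a builder function, which getattr(hier_lstm, model)(...) resolves and calls at runtime. A minimal sketch of that dispatch pattern, using a hypothetical stand-in for the project's hier_lstm module:

import types

# Hypothetical stand-in for the project's hier_lstm module.
hier_lstm = types.SimpleNamespace(
    build_model=lambda *args, **kwargs: 'plain hierarchical LSTM',
    build_attention_model=lambda *args, **kwargs: 'attention-pooling LSTM',
)

def build(name, *args, **kwargs):
    # Resolve the builder by the same name the CLI accepts, then call it.
    return getattr(hier_lstm, name)(*args, **kwargs)

print(build('build_attention_model'))  # -> 'attention-pooling LSTM'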
Example 2
# Assumed imports for this snippet; data_prepare, build_hrcnn_model and
# Evaluator are project modules from the surrounding repository.
import argparse
import logging
import os
import time

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser(description="sentence Hi_CNN model")
    parser.add_argument('--train_flag',
                        action='store_true',
                        help='Train or eval')
    parser.add_argument('--fine_tune',
                        action='store_true',
                        help='Fine tune word embeddings')
    parser.add_argument('--embedding',
                        type=str,
                        default='word2vec',
                        help='Word embedding type, word2vec, senna or glove')
    parser.add_argument('--embedding_dict',
                        type=str,
                        default='glove/glove.6B.50d.txt',
                        help='Pretrained embedding path')
    parser.add_argument(
        '--embedding_dim',
        type=int,
        default=50,
        help='Only useful when embedding is randomly initialised')
    parser.add_argument(
        '--char_embedd_dim',
        type=int,
        default=30,
        help='char embedding dimension if using char embedding')

    # NOTE: with action='store_false', passing --use_char actually disables
    # char embeddings (the default is True, but see the override below).
    parser.add_argument('--use_char',
                        action='store_false',
                        help='Whether to use char embeddings')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=50,
                        help='number of epochs for training')
    parser.add_argument('--batch_size',
                        type=int,
                        default=1,
                        help='Number of texts in each batch')
    parser.add_argument("-v",
                        "--vocab-size",
                        dest="vocab_size",
                        type=int,
                        metavar='<int>',
                        default=4000,
                        help="Vocab size (default=4000)")

    parser.add_argument('--nbfilters',
                        type=int,
                        default=100,
                        help='Num of filters in conv layer')
    parser.add_argument('--char_nbfilters',
                        type=int,
                        default=20,
                        help='Num of char filters in conv layer')
    parser.add_argument('--filter1_len',
                        type=int,
                        default=5,
                        help='filter length in 1st conv layer')
    parser.add_argument(
        '--filter2_len',
        type=int,
        default=3,
        help='filter length in 2nd conv layer or char conv layer')
    parser.add_argument('--rnn_type',
                        type=str,
                        default='LSTM',
                        help='Recurrent type')
    parser.add_argument('--lstm_units',
                        type=int,
                        default=100,
                        help='Num of hidden units in recurrent layer')

    # parser.add_argument('--project_hiddensize', type=int, default=100, help='num of units in projection layer')
    parser.add_argument(
        '--optimizer',
        choices=['sgd', 'momentum', 'nesterov', 'adagrad', 'rmsprop'],
        help='updating algorithm',
        default='rmsprop')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='Initial learning rate')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='Dropout rate for layers')
    parser.add_argument('--oov',
                        choices=['random', 'embedding'],
                        default='embedding',
                        help="Embedding for oov word")
    parser.add_argument('--l2_value', type=float, help='l2 regularizer value')
    parser.add_argument('--checkpoint_path',
                        type=str,
                        help='checkpoint directory',
                        default='checkpoints')

    parser.add_argument('--train',
                        type=str,
                        help='train file',
                        default='data/fold_0/train.tsv'
                        )  # "data/word-level/*.train"
    parser.add_argument('--dev',
                        type=str,
                        help='dev file',
                        default='data/fold_0/dev.tsv')
    parser.add_argument('--test',
                        type=str,
                        help='test file',
                        default='data/fold_0/test.tsv')
    parser.add_argument('--prompt_id',
                        type=int,
                        default=1,
                        help='prompt id of essay set')
    parser.add_argument(
        '--init_bias',
        action='store_true',
        help='init the last layer bias with average score of training data')
    parser.add_argument('--mode', type=str, choices=['mot', 'att', 'merged'], default='att', \
                        help='Mean-over-Time pooling or attention-pooling, or two pooling merged')

    args = parser.parse_args()
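    # Hard-coded override: char embeddings are disabled no matter what
    # --use_char was passed above.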
    args.use_char = False
    train_flag = args.train_flag
    fine_tune = args.fine_tune
    USE_CHAR = args.use_char

    batch_size = args.batch_size
    checkpoint_dir = args.checkpoint_path
    num_epochs = args.num_epochs

    modelname = "attn-%s.prompt%s.%sfilters.bs%s.hdf5" % (
        args.mode, args.prompt_id, args.nbfilters, batch_size)
    imgname = "attn-%s.prompt%s.%sfilters.bs%s.png" % (
        args.mode, args.prompt_id, args.nbfilters, batch_size)

    if USE_CHAR:
        modelname = 'char_' + modelname
        imgname = 'char_' + imgname
    modelpath = os.path.join(checkpoint_dir, modelname)
    imgpath = os.path.join(checkpoint_dir, imgname)

    datapaths = [args.train, args.dev, args.test]
    embedding_path = args.embedding_dict
    oov = args.oov
    embedding = args.embedding
    embedd_dim = args.embedding_dim
    prompt_id = args.prompt_id

    # debug mode
    # debug = True
    # if debug:
    # 	nn_model = build_concat_model(args, args.vocab_size, 71, 20, embedd_dim, None, True)

    if args.use_char:
        (X_train, C_train, Y_train, mask_train), (X_dev, C_dev, Y_dev, mask_dev), (X_test, C_test, Y_test, mask_test), \
                vocab, vocab_size, char_vocab, char_vocabsize, embed_table, overal_maxlen, overal_maxnum, maxcharlen, init_mean_value = data_prepare.prepare_data(datapaths, \
                embedding_path, embedding, embedd_dim, prompt_id, args.vocab_size, tokenize_text=True, \
                to_lower=True, sort_by_len=False, vocab_path=None, score_index=6)
    else:
        (X_train, Y_train, mask_train), (X_dev, Y_dev, mask_dev), (X_test, Y_test, mask_test), \
                vocab, vocab_size, embed_table, overal_maxlen, overal_maxnum, init_mean_value = data_prepare.prepare_sentence_data(datapaths, \
                embedding_path, embedding, embedd_dim, prompt_id, args.vocab_size, tokenize_text=True, \
                to_lower=True, sort_by_len=False, vocab_path=None, score_index=6)
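    # score_index=6 presumably picks the domain1_score column of the
    # ASAP-format TSV files.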

    # print type(embed_table)
    if embed_table is not None:
        embedd_dim = embed_table.shape[1]
        embed_table = [embed_table]

    max_sentnum = overal_maxnum
    max_sentlen = overal_maxlen
    # print embed_table
    # print X_train[0, 0:10, :]
    # print Y_train[0:10]
    # print C_train[0, 0, 0, :], C_train[0, 0, 1, :], C_train[0, 0, -1, :]

    X_train = X_train.reshape(
        (X_train.shape[0], X_train.shape[1] * X_train.shape[2]))
    X_dev = X_dev.reshape((X_dev.shape[0], X_dev.shape[1] * X_dev.shape[2]))
    X_test = X_test.reshape(
        (X_test.shape[0], X_test.shape[1] * X_test.shape[2]))
    logger.info("X_train shape: %s" % str(X_train.shape))

    if not args.use_char:
        C_train, C_dev, C_test = None, None, None
        char_vocabsize = 0
        maxcharlen = 0
    else:
        C_train = C_train.reshape(
            (C_train.shape[0],
             C_train.shape[1] * C_train.shape[2] * C_train.shape[3]))
        C_dev = C_dev.reshape(
            (C_dev.shape[0], C_dev.shape[1] * C_dev.shape[2] * C_dev.shape[3]))
        C_test = C_test.reshape(
            (C_test.shape[0],
             C_test.shape[1] * C_test.shape[2] * C_test.shape[3]))

        logger.info("C_train shape: %s" % str(C_train.shape))

    model = build_hrcnn_model(args, vocab_size, char_vocabsize + 1,
                              max_sentnum, max_sentlen, maxcharlen, embedd_dim,
                              embed_table, True, init_mean_value)

    evl = Evaluator(args.prompt_id, args.use_char, checkpoint_dir, modelname,
                    X_train, X_dev, X_test, C_train, C_dev, C_test, Y_train,
                    Y_dev, Y_test)

    # Initial evaluation
    logger.info("Initial evaluation: ")
    evl.evaluate(model, -1, print_info=True)
    logger.info("Train model")
    for ii in range(args.num_epochs):
        logger.info('Epoch %s/%s' % (str(ii + 1), args.num_epochs))
        start_time = time.time()
        if args.use_char:
            model.fit([X_train, C_train],
                      Y_train,
                      batch_size=args.batch_size,
                      epochs=1,
                      verbose=0,
                      shuffle=True)
        else:
            model.fit(X_train,
                      Y_train,
                      batch_size=args.batch_size,
                      epochs=1,
                      verbose=0,
                      shuffle=True)
        tt_time = time.time() - start_time
        logger.info("Training one epoch in %.3f s" % tt_time)
        evl.evaluate(model, ii + 1)
        evl.print_info()

    evl.print_final_info()
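
Note the training-loop shape shared by these scripts: fit(..., epochs=1) is called once per epoch so a custom Evaluator can score the dev and test sets between epochs, rather than using Keras callbacks. A stripped-down sketch of the pattern, with hypothetical stand-ins for the model and evaluator:

import time

class DummyModel:
    def fit(self, x, y, batch_size=10, epochs=1, verbose=0, shuffle=True):
        pass  # stands in for one epoch of training

class DummyEvaluator:
    def evaluate(self, model, epoch, print_info=False):
        pass  # stands in for dev/test scoring and best-epoch bookkeeping

model, evl = DummyModel(), DummyEvaluator()
evl.evaluate(model, -1, print_info=True)  # initial evaluation before training
for epoch in range(1, 3):
    start = time.time()
    model.fit(None, None, epochs=1)       # exactly one epoch per call
    print('epoch %d trained in %.3f s' % (epoch, time.time() - start))
    evl.evaluate(model, epoch)            # evaluation hook between epochs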
Example 3
# Assumed imports for this snippet; prepare_sentence_data, build_hcnn_model
# and Evaluator are project modules from the surrounding repository.
import argparse
import logging
import os
import time

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser(description="sentence Hi_CNN model")
    parser.add_argument('--train_flag', action='store_true', help='Train or eval')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune word embeddings')
    parser.add_argument('--embedding', type=str, default='word2vec', help='Word embedding type, word2vec, senna or glove')
    parser.add_argument('--embedding_dict', type=str, default=None, help='Pretrained embedding path')
    parser.add_argument('--embedding_dim', type=int, default=64, help='Only useful when embedding is randomly initialised')

    parser.add_argument('--use_char', action='store_true', help='Whether to use char embeddings')
    parser.add_argument('--num_epochs', type=int, default=20, help='number of epochs for training')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of texts in each batch')
    parser.add_argument("-v", "--vocab-size", dest="vocab_size", type=int, metavar='<int>', default=4000, help="Vocab size (default=4000)")

    parser.add_argument('--nbfilters', type=int, default=100, help='Num of filters in conv layer')
    parser.add_argument('--filter1_len', type=int, default=5, help='filter length in 1st conv layer')
    parser.add_argument('--filter2_len', type=int, default=3, help='filter length in 2nd conv layer')

    # parser.add_argument('--project_hiddensize', type=int, default=100, help='num of units in projection layer')
    parser.add_argument('--optimizer', choices=['sgd', 'momentum', 'nesterov', 'adagrad', 'rmsprop'], help='updating algorithm', default='sgd')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Initial learning rate')
    parser.add_argument('--dropout', type=float, default=0.5, help='Dropout rate for layers')
    parser.add_argument('--oov', choices=['random', 'embedding'], help="Embedding for oov word", required=True)
    parser.add_argument('--l2_value', type=float, default=0.001, help='l2 regularizer value')
    parser.add_argument('--checkpoint_path', type=str, help='checkpoint directory')

    parser.add_argument('--train')
    parser.add_argument('--dev')
    parser.add_argument('--test')
    parser.add_argument('--prompt_id', type=int, default=1, help='prompt id of essay set')

    args = parser.parse_args()

    train_flag = args.train_flag
    fine_tune = args.fine_tune
    USE_CHAR = args.use_char

    batch_size = args.batch_size
    checkpoint_dir = args.checkpoint_path
    num_epochs = args.num_epochs

    modelname = "sent_hcnn-MoT.prompt%s.%sfilters.bs%s.hdf5" % (args.prompt_id, args.nbfilters, batch_size)
    imgname = "sent_hcnn-MoT.prompt%s.%sfilters.bs%s.png" % (args.prompt_id, args.nbfilters, batch_size)

    if USE_CHAR:
        modelname = 'char_' + modelname
        imgname = 'char_' + imgname
    modelpath = os.path.join(checkpoint_dir, modelname)
    imgpath = os.path.join(checkpoint_dir, imgname)

    datapaths = [args.train, args.dev, args.test]
    embedding_path = args.embedding_dict
    oov = args.oov
    embedding = args.embedding
    embedd_dim = args.embedding_dim
    prompt_id = args.prompt_id

    (X_train, Y_train, mask_train), (X_dev, Y_dev, mask_dev), (X_test, Y_test, mask_test), \
            vocab, vocab_size, embed_table, overal_maxlen, overal_maxnum, init_mean_value = prepare_sentence_data(datapaths, \
            embedding_path, embedding, embedd_dim, prompt_id, args.vocab_size, tokenize_text=True, \
            to_lower=True, sort_by_len=False, vocab_path=None, score_index=6)

    # print type(embed_table)
    if embed_table is not None:
        embedd_dim = embed_table.shape[1]
        embed_table = [embed_table]
        
    max_sentnum = overal_maxnum
    max_sentlen = overal_maxlen
    # print embed_table
    # print X_train[0, 0:10, :]
    # print Y_train[0:10]

    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1]*X_train.shape[2]))
    X_dev = X_dev.reshape((X_dev.shape[0], X_dev.shape[1]*X_dev.shape[2]))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1]*X_test.shape[2]))

    logger.info("X_train shape: %s" % str(X_train.shape))
    if USE_CHAR:
        raise NotImplementedError

    else:
        model = build_hcnn_model(args, vocab_size, max_sentnum, max_sentlen, embedd_dim, embed_table, True)

    C_train, C_test, C_dev = None, None, None
    evl = Evaluator(args.prompt_id, args.use_char, checkpoint_dir, modelname, X_train, X_dev, X_test, C_train, C_dev, C_test, Y_train, Y_dev, Y_test)

    # Initial evaluation
    logger.info("Initial evaluation: ")
    evl.evaluate(model, -1, print_info=True)
    logger.info("Train model")
    for ii in range(args.num_epochs):
        logger.info('Epoch %s/%s' % (str(ii+1), args.num_epochs))
        start_time = time.time()
        if args.use_char:
            model.fit([X_train, C_train], Y_train, batch_size=args.batch_size, epochs=1, verbose=0, shuffle=True)
        else:
            model.fit(X_train, Y_train, batch_size=args.batch_size, epochs=1, verbose=0, shuffle=True)
        tt_time = time.time() - start_time
        logger.info("Training one epoch in %.3f s" % tt_time)
        evl.evaluate(model, ii+1)
        evl.print_info()

    evl.print_final_info()
Example 4
# Assumed imports for this snippet; data_prepare, get_picture,
# build_model_fusion and Evaluator are project modules from the repository.
import argparse
import logging
import time

import numpy as np

logger = logging.getLogger(__name__)


def main(fold, p_id):
    parser = argparse.ArgumentParser(description="sentence Hi_CNN model")
    parser.add_argument('--embedding',
                        type=str,
                        default='glove',
                        help='Word embedding type, word2vec, senna or glove')
    parser.add_argument('--embedding_dict',
                        type=str,
                        default='glove.6B.100d.txt',
                        help='Pretrained embedding path')
    parser.add_argument(
        '--embedding_dim',
        type=int,
        default=100,
        help='Only useful when embedding is randomly initialised')

    parser.add_argument('--num_epochs',
                        type=int,
                        default=30,
                        help='number of epochs for training')
    parser.add_argument('--batch_size',
                        type=int,
                        default=10,
                        help='Number of texts in each batch')
    parser.add_argument("-v",
                        "--vocab-size",
                        dest="vocab_size",
                        type=int,
                        metavar='<int>',
                        default=4000,
                        help="Vocab size (default=4000)")

    parser.add_argument('--nbfilters',
                        type=int,
                        default=100,
                        help='Num of filters in conv layer')

    parser.add_argument('--filter1_len',
                        type=int,
                        default=3,
                        help='filter length in 1st conv layer')
    parser.add_argument('--rnn_type',
                        type=str,
                        default='LSTM',
                        help='Recurrent type')
    parser.add_argument('--lstm_units',
                        type=int,
                        default=100,
                        help='Num of hidden units in recurrent layer')

    # parser.add_argument('--project_hiddensize', type=int, default=100, help='num of units in projection layer')
    parser.add_argument(
        '--optimizer',
        choices=['sgd', 'momentum', 'nesterov', 'adagrad', 'rmsprop'],
        help='updating algorithm',
        default='rmsprop')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='Initial learning rate')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='Dropout rate for layers')
    parser.add_argument('--oov',
                        choices=['random', 'embedding'],
                        help="Embedding for oov word",
                        default='random',
                        required=False)
    parser.add_argument('--l2_value', type=float, help='l2 regularizer value')
    parser.add_argument('--checkpoint_path',
                        type=str,
                        help='checkpoint directory',
                        default='./checkpoint')

    parser.add_argument('--train',
                        default='prompt9_data/fold_' + str(fold) +
                        '/train.tsv')  # "data/word-level/*.train"
    parser.add_argument('--dev',
                        default='prompt9_data/fold_' + str(fold) + '/dev.tsv')
    parser.add_argument('--test',
                        default='prompt9_data/fold_' + str(fold) + '/test.tsv')
    parser.add_argument('--prompt_id',
                        type=int,
                        default=p_id,
                        help='prompt id of essay set')
    parser.add_argument(
        '--init_bias',
        action='store_true',
        help='init the last layer bias with average score of training data')
    parser.add_argument(
        '--mode',
        type=str,
        choices=['mot', 'att', 'merged'],
        default='mot',
        help=
        'Mean-over-Time pooling or attention-pooling, or two pooling merged')

    args = parser.parse_args()

    d_path = 'prompt9_info/en.txt'
    datapaths = [args.train, args.dev, args.test, d_path]
    embedding_path = args.embedding_dict
    embedding = args.embedding
    embedd_dim = args.embedding_dim
    prompt_id = args.prompt_id


    (X_train, Y_train, D_train, mask_train, train_ids), (X_dev, Y_dev, D_dev, mask_dev, dev_ids), (
        X_test, Y_test, D_test, mask_test, test_ids), \
    vocab, vocab_size, embed_table, overal_maxlen, overal_maxnum, max_sentnum_d, init_mean_value = data_prepare.prepare_sentence_data(
        datapaths, embedding_path, embedding, embedd_dim, prompt_id, args.vocab_size, tokenize_text=True, to_lower=True,
        sort_by_len=False, vocab_path=None, score_index=6)

    # picture = np.loadtxt('img_feature')
    picture = get_picture()
    # new_pictures = []
    # for picture in pictures:
    #     picture = np.array(picture) / 255.0
    #     new_pictures.append(picture)
    # picture = np.array(new_pictures)
    print(picture.shape)
    # picture = get_picture('./prompt9_info/prompt9.png')
    train_num = X_train.shape[0]
    dev_num = X_dev.shape[0]
    test_num = X_test.shape[0]

    # p_train = np.empty(shape=[train_num, 4, 2048])
    # p_dev = np.empty(shape=[dev_num, 4, 2048])
    # p_test = np.empty(shape=[test_num, 4, 2048])

    img_size = 256
    # p_train = np.empty(shape=[train_num, 4, img_size, img_size, 3])
    # p_dev = np.empty(shape=[dev_num, 4, img_size, img_size, 3])
    # p_test = np.empty(shape=[test_num, 4, img_size, img_size, 3])
    p_train = np.empty(shape=[train_num, img_size, img_size, 3])
    p_dev = np.empty(shape=[dev_num, img_size, img_size, 3])
    p_test = np.empty(shape=[test_num, img_size, img_size, 3])
    for i in range(train_num):
        p_train[i] = picture

    for i in range(dev_num):
        p_dev[i] = picture
    for i in range(test_num):
        p_test[i] = picture
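    # Every essay for this prompt shares the same image, so the picture
    # batches are just the single prompt image tiled once per sample.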

    embedd_dim = embed_table.shape[1]
    embed_table = [embed_table]

    max_sentnum = overal_maxnum
    max_sentlen = overal_maxlen

    X_train = X_train.reshape(
        (X_train.shape[0], X_train.shape[1] * X_train.shape[2]))
    X_dev = X_dev.reshape((X_dev.shape[0], X_dev.shape[1] * X_dev.shape[2]))
    X_test = X_test.reshape(
        (X_test.shape[0], X_test.shape[1] * X_test.shape[2]))

    D_train = D_train.reshape(
        (D_train.shape[0], D_train.shape[1] * D_train.shape[2]))
    D_dev = D_dev.reshape((D_dev.shape[0], D_dev.shape[1] * D_dev.shape[2]))
    D_test = D_test.reshape(
        (D_test.shape[0], D_test.shape[1] * D_test.shape[2]))
    logger.info("X_train shape: %s" % str(X_train.shape))

    model = build_model_fusion(args, vocab_size, max_sentnum, max_sentlen,
                               embedd_dim, embed_table, True, init_mean_value)
    # model = build_model_with_topic(args, vocab_size, max_sentnum, max_sentlen, max_sentnum_d, embedd_dim, embed_table, True,
    #                    init_mean_value)

    evl = Evaluator(args.prompt_id, fold, X_train, X_dev, X_test, Y_train,
                    Y_dev, Y_test, D_train, D_dev, D_test, p_train, p_dev,
                    p_test)

    # Initial evaluation
    # NOTE: is_training is presumably a module-level flag in the original file.
    if is_training:
        logger.info("Initial evaluation: ")
        # evl.evaluate(model, -1, print_info=True)
        logger.info("Train model")
        for ii in range(args.num_epochs):
            logger.info('Epoch %s/%s' % (str(ii + 1), args.num_epochs))
            start_time = time.time()
            # model.fit({'word_input': X_train, 'word_input_d': D_train}, Y_train, batch_size=args.batch_size, epochs=1, verbose=0, shuffle=True)
            # model.fit({'word_input': X_train, 'p_input1': p_train[:, 0, :], 'p_input2': p_train[:, 1, :],
            #            'p_input3': p_train[:, 2, :], 'p_input4': p_train[:, 3, :]}, Y_train, batch_size=args.batch_size, epochs=1, verbose=0,
            #           shuffle=True)
            model.fit({
                'word_input': X_train,
                'p': p_train
            },
                      Y_train,
                      batch_size=args.batch_size,
                      epochs=1,
                      verbose=0,
                      shuffle=True)
            tt_time = time.time() - start_time
            logger.info("Training one epoch in %.3f s" % tt_time)
            evl.evaluate(model, ii + 1)
            evl.print_info()

        evl.print_final_info()
Example 5
# Assumed imports for this snippet; data_prepare, build_hrcnn_model,
# build_shrcnn_model and Evaluator are project modules from the repository.
import argparse
import logging
import time

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser(description="sentence Hi_CNN_LSTM model")
    parser.add_argument(
        '--embedding',
        type=str,
        default='glove',
        help='Word embedding type, glove, word2vec, senna or random')
    parser.add_argument('--embedding_dict',
                        type=str,
                        default='glove/glove.6B.50d.txt',
                        help='Pretrained embedding path')
    parser.add_argument(
        '--embedding_dim',
        type=int,
        default=50,
        help='Only useful when embedding is randomly initialised')

    parser.add_argument('--num_epochs',
                        type=int,
                        default=50,
                        help='number of epochs for training')
    parser.add_argument('--batch_size',
                        type=int,
                        default=10,
                        help='Number of texts in each batch')
    parser.add_argument("-v",
                        "--vocab-size",
                        dest="vocab_size",
                        type=int,
                        metavar='<int>',
                        default=4000,
                        help="Vocab size (default=4000)")

    parser.add_argument('--nbfilters',
                        type=int,
                        default=100,
                        help='Num of filters in conv layer')
    parser.add_argument('--filter1_len',
                        type=int,
                        default=5,
                        help='filter length in 1st conv layer')
    parser.add_argument('--lstm_units',
                        type=int,
                        default=100,
                        help='Num of hidden units in recurrent layer')

    parser.add_argument('--optimizer',
                        choices=['sgd', 'adagrad', 'rmsprop'],
                        help='Optimizer',
                        default='rmsprop')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='Initial learning rate')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='Dropout rate for layers')
    parser.add_argument('--l2_value', type=float, help='l2 regularizer value')
    parser.add_argument('--checkpoint_path',
                        type=str,
                        help='checkpoint directory',
                        default='checkpoints')

    parser.add_argument('--train',
                        type=str,
                        help='train file',
                        default='data/fold_0/train.tsv')
    parser.add_argument('--dev',
                        type=str,
                        help='dev file',
                        default='data/fold_0/dev.tsv')
    parser.add_argument('--test',
                        type=str,
                        help='test file',
                        default='data/fold_0/test.tsv')
    parser.add_argument('--prompt_id',
                        type=int,
                        default=3,
                        help='prompt id of essay set')
    parser.add_argument(
        '--init_bias',
        action='store_true',
        help='init the last layer bias with average score of training data')
    parser.add_argument('--mode',
                        type=str,
                        choices=['att', 'co'],
                        default='co',
                        help='attention-pooling, or co-attention pooling')

    args = parser.parse_args()

    batch_size = args.batch_size
    checkpoint_dir = args.checkpoint_path
    num_epochs = args.num_epochs

    modelname = "%s.prompt%s.%sfilters.bs%s" % (args.mode, args.prompt_id,
                                                args.nbfilters, batch_size)

    datapaths = [args.train, args.dev, args.test]
    embedding_path = args.embedding_dict
    embedding = args.embedding
    emb_dim = args.embedding_dim
    prompt_id = args.prompt_id

    mode = args.mode
    need_context = mode in ['co']
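    # The essay's source/context text is only used by co-attention mode.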

    (X_train, Y_train, mask_train, train_context, text_train), \
    (X_dev, Y_dev, mask_dev, dev_context, text_dev), \
    (X_test, Y_test, mask_test, test_context, text_test), \
    vocab, vocab_size, emb_table, overall_maxlen, overall_maxnum, init_mean_value, context_len, context_num = \
        data_prepare.prepare_sentence_data(
            datapaths,
            embedding_path,
            embedding,
            emb_dim,
            prompt_id,
            args.vocab_size,
            tokenize_text=True,
            to_lower=True,
            vocab_path=None,
            score_index=6,
            need_context=need_context
        )

    if emb_table is not None:
        emb_dim = emb_table.shape[1]
        emb_table = [emb_table]

    max_sentnum = overall_maxnum
    max_sentlen = overall_maxlen

    X_train = X_train.reshape(
        (X_train.shape[0], X_train.shape[1] * X_train.shape[2]))
    X_dev = X_dev.reshape((X_dev.shape[0], X_dev.shape[1] * X_dev.shape[2]))
    X_test = X_test.reshape(
        (X_test.shape[0], X_test.shape[1] * X_test.shape[2]))

    train_context = train_context.reshape(
        (train_context.shape[0],
         train_context.shape[1] * train_context.shape[2]))
    dev_context = dev_context.reshape(
        (dev_context.shape[0], dev_context.shape[1] * dev_context.shape[2]))
    test_context = test_context.reshape(
        (test_context.shape[0], test_context.shape[1] * test_context.shape[2]))

    logger.info("X_train shape: %s" % str(X_train.shape))
    logger.info("X_dev shape: %s" % str(X_dev.shape))
    logger.info("X_test shape: %s" % str(X_test.shape))

    if mode == 'att':
        model = build_hrcnn_model(args, vocab_size, max_sentnum, max_sentlen,
                                  emb_dim, emb_table, True, init_mean_value)
        x_train = X_train
        y_train = Y_train
        x_dev = X_dev
        y_dev = Y_dev
        x_test = X_test
        y_test = Y_test
    elif mode == 'co':
        model = build_shrcnn_model(args, vocab_size, max_sentnum, max_sentlen,
                                   context_num, context_len, emb_dim,
                                   emb_table, True, init_mean_value)
        x_train = [X_train, train_context]
        y_train = Y_train
        x_dev = [X_dev, dev_context]
        y_dev = Y_dev
        x_test = [X_test, test_context]
        y_test = Y_test
    else:
        raise NotImplementedError

    evl = Evaluator(prompt_id, checkpoint_dir, modelname, x_train, x_dev,
                    x_test, y_train, y_dev, y_test)

    # Initial evaluation
    logger.info("Initial evaluation: ")
    evl.evaluate(model, -1, print_info=True)
    logger.info("Train model")
    for ii in range(num_epochs):
        logger.info('Epoch %s/%s' % (str(ii + 1), num_epochs))
        start_time = time.time()
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=1,
                  verbose=0,
                  shuffle=True)
        tt_time = time.time() - start_time
        logger.info("Training one epoch in %.3f s" % tt_time)
        evl.evaluate(model, ii + 1, print_info=True)

    evl.print_final_info()