Example #1
# (snippet truncated above: assumes `import os`, the project config module `MC`,
#  and an if-condition that selects CPU-only mode, e.g. a disabled GPU setting)
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # "-1" hides all GPUs -> run on CPU
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = MC.GPU  # pin TensorFlow to the configured GPU id

import tensorflow as tf
import keras.backend.tensorflow_backend as KTF

# TF1-style session setup: allow_growth makes TensorFlow claim GPU memory
# on demand instead of reserving it all upfront; the session is then
# registered as the Keras backend session.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
KTF.set_session(sess)
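# For reference (TF 2.x, where ConfigProto/Session were removed): the same
# on-demand GPU memory behaviour is enabled per physical device, e.g.
#
#   for gpu in tf.config.list_physical_devices("GPU"):
#       tf.config.experimental.set_memory_growth(gpu, True)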


# (assumes `import logging`, `import numpy as np`, and the project utils module `U`)
logger = logging.getLogger(__name__)

U.mkdir_p(MC.OUT_PATH + '/preds')  # output directory for predictions
U.set_logger(MC.OUT_PATH)
np.random.seed(MC.SEED)  # fixed seed for reproducibility

###############################################################################################################################
# Prepare data
#

# from keras.preprocessing import sequence
#
# # data_x is a list of lists
# (train_x, train_y), (dev_x, dev_y), (test_x, test_y), vocab, vocab_size, overal_maxlen, num_outputs = dataset.get_data(
#     MC.MAXLEN, to_lower=True, sort_by_len=False)
#
# # Dump vocab
# with open(MC.OUT_PATH + '/vocab.pkl', 'wb') as vocab_file:
#     pk.dump(vocab, vocab_file)
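# Counterpart sketch (hypothetical, mirrors the dump above): reloading the
# vocab at evaluation/prediction time:
#
#   with open(MC.OUT_PATH + '/vocab.pkl', 'rb') as vocab_file:
#       vocab = pk.load(vocab_file)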
Example #2
    "Maximum allowed number of words during training. '0' means no limit (default=0)"
)
parser.add_argument("--seed",
                    dest="seed",
                    type=int,
                    metavar='<int>',
                    default=1234,
                    help="Random seed (default=1234)")
args = parser.parse_args()
out_dir = args.out_dir_path

with open(out_dir + '/args.pkl', 'wb') as args_file:
    pk.dump(args, args_file)
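# Hypothetical counterpart to the dump above: a later run can restore the exact
# argparse.Namespace that produced this output directory:
#
#   with open(out_dir + '/args.pkl', 'rb') as args_file:
#       args = pk.load(args_file)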

U.mkdir_p(out_dir + '/preds')
U.set_logger(out_dir)
U.print_args(args)

assert args.model_type in {'reg', 'regp', 'breg', 'bregp'}
assert args.algorithm in {
    'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adam', 'adamax'
}
assert args.loss in {'mse', 'mae'}
assert args.recurrent_unit in {'lstm', 'gru', 'simple'}
assert args.aggregation in {'mot', 'attsum', 'attmean'}
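# The same whitelists could instead be enforced at parse time with argparse's
# built-in `choices` parameter, e.g. (sketch):
#
#   parser.add_argument("--loss", choices=['mse', 'mae'], default='mse')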

if args.seed > 0:
    np.random.seed(args.seed)

if not args.prompt_id:
    args.prompt_id = 0
Example #3
# (snippet truncated above: assumes `import numpy as np`, `import pickle as pk`,
#  `from keras.utils.np_utils import to_categorical`,
#  `from keras.callbacks import EarlyStopping`, the project utils module `U`,
#  and a module-level `logger`)
def train(args):
    out_dir = args.out_dir_path
    U.mkdir_p(out_dir + '/preds')
    timestr = U.set_logger(onscreen=args.onscreen, out_dir=out_dir)
    U.print_args(args)

    # 	assert args.model_type in {'mlp', 'cls', 'clsp', 'reg', 'regp', 'breg', 'bregp'}
    assert args.model_type in {'cls', 'reg'}
    assert args.algorithm in {
        'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adam', 'adamax'
    }
    assert args.loss in {'mse', 'mae', 'cnp', 'hng'}
    assert args.recurrent_unit in {'lstm', 'gru', 'simple'}
    assert args.aggregation in {'mot', 'att'}

    if args.seed > 0:
        np.random.seed(args.seed)

    from nea.asap_evaluator import Evaluator
    import nea.asap_reader as dataset

    ###############################################################################################################################
    ## Prepare data
    #

    from keras.preprocessing import sequence

    if args.valid_split > 0:
        (train_x, train_y,
         train_pmt), (test_x, test_y,
                      test_pmt), vocab, overal_maxlen = dataset.get_data(
                          (args.train_path, args.test_path),
                          args.prompt_id,
                          args.vocab_size,
                          args.maxlen,
                          tokenize_text=True,
                          to_lower=True,
                          sort_by_len=False,
                          vocab_path=args.vocab_path)
    else:
        # data_x is a list of lists
        (train_x, train_y, train_pmt), (dev_x, dev_y, dev_pmt), (
            test_x, test_y, test_pmt), vocab, overal_maxlen = dataset.get_data(
                (args.train_path, args.dev_path, args.test_path),
                args.prompt_id,
                args.vocab_size,
                args.maxlen,
                tokenize_text=True,
                to_lower=True,
                sort_by_len=False,
                vocab_path=args.vocab_path)

    if args.pre_train_path:
        if args.valid_split == 0:
            # fall back to a 20% validation split when pre-training is used
            args.valid_split = 0.2
        (pre_train_x, pre_train_y,
         pre_train_pmt), _, _, pre_overal_maxlen = dataset.get_data(
             (args.pre_train_path, args.test_path),
             args.prompt_id,
             args.vocab_size,
             args.maxlen,
             tokenize_text=True,
             to_lower=True,
             sort_by_len=False,
             vocab_path=args.vocab_path)
        overal_maxlen = max(overal_maxlen, pre_overal_maxlen)

    if args.tfidf > 0:
        train_pca, TfIdf, Pca = dataset.get_tfidf(args.train_path,
                                                  args.prompt_id,
                                                  pca_dim=args.tfidf,
                                                  training_material=True)
        if args.valid_split == 0:
            dev_pca, _, _ = dataset.get_tfidf(args.dev_path,
                                              args.prompt_id,
                                              pca_dim=args.tfidf,
                                              tfidf=TfIdf,
                                              pca=Pca,
                                              training_material=False)
        test_pca, _, _ = dataset.get_tfidf(args.test_path,
                                           args.prompt_id,
                                           pca_dim=args.tfidf,
                                           tfidf=TfIdf,
                                           pca=Pca,
                                           training_material=False)
    else:
        dev_pca = None
        test_pca = None

    if args.features:
        train_ftr = dataset.get_features(args.train_path,
                                         args.train_feature_path,
                                         args.prompt_id)
        if args.valid_split == 0:
            valid_ftr = dataset.get_features(args.dev_path,
                                             args.dev_feature_path,
                                             args.prompt_id)
        test_ftr = dataset.get_features(args.test_path, args.test_feature_path,
                                        args.prompt_id)
    else:
        test_ftr = None

    if not args.vocab_path:
        # Dump vocab
        with open(out_dir + '/vocab.pkl', 'wb') as vocab_file:
            pk.dump(vocab, vocab_file)

    # Pad sequences for mini-batch processing
    # 	if args.model_type in {'breg', 'bregp', 'clsp', 'cls', 'mlp'}:
    # 	assert args.rnn_dim > 0
    # 	assert args.recurrent_unit == 'lstm'
    train_x = sequence.pad_sequences(train_x, maxlen=overal_maxlen)
    if args.valid_split == 0:
        dev_x = sequence.pad_sequences(dev_x, maxlen=overal_maxlen)
    if args.pre_train_path:
        pre_train_x = sequence.pad_sequences(pre_train_x, maxlen=overal_maxlen)
    test_x = sequence.pad_sequences(test_x, maxlen=overal_maxlen)
    # 	else:
    # 		train_x = sequence.pad_sequences(train_x)
    # 		dev_x = sequence.pad_sequences(dev_x)
    # 		test_x = sequence.pad_sequences(test_x)
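    # pad_sequences left-pads with 0 by default (padding='pre'); index 0 is
    # reserved for the padding token, so every sample ends up with length
    # overal_maxlen. For example:
    #   sequence.pad_sequences([[1, 2], [3, 4, 5]], maxlen=4)
    #   -> array([[0, 0, 1, 2],
    #             [0, 3, 4, 5]], dtype=int32)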

    ###############################################################################################################################
    ## Some statistics
    #

    import keras.backend as K

    train_y = np.array(train_y, dtype=K.floatx())
    if args.valid_split == 0:
        dev_y = np.array(dev_y, dtype=K.floatx())
    if args.pre_train_path:
        pre_train_y = np.array(pre_train_y, dtype=K.floatx())
    test_y = np.array(test_y, dtype=K.floatx())

    if args.prompt_id >= 0:
        train_pmt = np.array(train_pmt, dtype='int32')
        if args.valid_split == 0:
            dev_pmt = np.array(dev_pmt, dtype='int32')
        test_pmt = np.array(test_pmt, dtype='int32')

    # count score distribution
    bincounts, mfs_list = U.bincounts(train_y)
    with open('%s/bincounts.txt' % out_dir, 'w') as output_file:
        for bincount in bincounts:
            output_file.write(str(bincount) + '\n')
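    # U.bincounts is a project utility; a rough stdlib equivalent (sketch only):
    #   from collections import Counter
    #   counts = Counter(train_y.tolist())   # score -> frequency
    #   mfc = counts.most_common(1)[0][0]    # most frequent class (the MFC above)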

    train_mean = train_y.mean(axis=0)
    train_std = train_y.std(axis=0)
    train_max = train_y.max(axis=0)
    train_min = train_y.min(axis=0)
    # 	dev_mean = dev_y.mean(axis=0)
    # 	dev_std = dev_y.std(axis=0)
    # 	test_mean = test_y.mean(axis=0)
    # 	test_std = test_y.std(axis=0)

    logger.info('Statistics:')

    logger.info('  train_x shape: ' + str(np.array(train_x).shape))
    if args.valid_split == 0:
        logger.info('  dev_x shape:   ' + str(np.array(dev_x).shape))
    logger.info('  test_x shape:  ' + str(np.array(test_x).shape))

    logger.info('  train_y shape: ' + str(train_y.shape))
    if args.valid_split == 0:
        logger.info('  dev_y shape:   ' + str(dev_y.shape))
    logger.info('  test_y shape:  ' + str(test_y.shape))

    logger.info(
        '  train_y max: %d, min: %d, mean: %.2f, stdev: %.3f, MFC: %s' %
        (train_max, train_min, train_mean, train_std, str(mfs_list)))
    logger.info('  train_y statistic: %s' % (str(bincounts[0]), ))

    # We need the dev and test sets in the original scale for evaluation
    # 	if args.valid_split == 0:
    # 		dev_y_org = dev_y.astype(dataset.get_ref_dtype())
    test_y_org = test_y.astype(dataset.get_ref_dtype())

    if "reg" in args.model_type:
        if args.normalize:
            logger.info('  normalize score to range (0,1)')
            # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
            train_y = dataset.get_model_friendly_scores(train_y, train_pmt)
            if args.valid_split == 0:
                dev_y = dataset.get_model_friendly_scores(dev_y, dev_pmt)
            test_y = dataset.get_model_friendly_scores(test_y, test_pmt)
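            # get_model_friendly_scores rescales each prompt's score range to
            # [0, 1], i.e. in sketch form (y - min_p) / (max_p - min_p) for the
            # prompt-specific bounds; evaluation still uses the original-scale
            # test_y_org kept above.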
    else:
        logger.info('  convert train_y to one-hot encoding')
        assert len(bincounts) == 1, "only a single score column is supported"
        categ = int(max(bincounts[0].keys())) + 1
        if args.pre_train_path:
            categ = 5  # fixed number of classes for the pre-training data
        # to_categorical expects zero-based integer class labels
        train_y = to_categorical(train_y, categ)
        if args.valid_split == 0:
            dev_y = to_categorical(dev_y, categ)
        if args.pre_train_path:
            pre_train_y = to_categorical(pre_train_y, categ)
        test_y = to_categorical(test_y, categ)
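        # to_categorical one-hot encodes integer labels, e.g.
        #   to_categorical([0, 2], 3) -> [[1., 0., 0.],
        #                                 [0., 0., 1.]]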

    ###############################################################################################################################
    ## Optimizer algorithm
    #

    from nea.optimizers import get_optimizer
    optimizer = get_optimizer(args)
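    # get_optimizer presumably maps args.algorithm to the matching Keras class;
    # a minimal sketch of that mapping (an assumption, not the project's code):
    #   from keras import optimizers
    #   optimizer = {'rmsprop': optimizers.RMSprop, 'sgd': optimizers.SGD,
    #                'adagrad': optimizers.Adagrad, 'adadelta': optimizers.Adadelta,
    #                'adam': optimizers.Adam, 'adamax': optimizers.Adamax}[args.algorithm]()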

    ###############################################################################################################################
    ## Building model
    #

    if "reg" in args.model_type:
        logger.info('  use regression model')
        final_categ = train_y.mean(axis=0)
        if args.loss == 'mae':
            loss = 'mean_absolute_error'
            metric = 'mean_squared_error'
        elif args.loss == 'mse':
            loss = 'mean_squared_error'
            metric = 'mean_absolute_error'
        else:
            raise NotImplementedError
    else:
        logger.info('  use classification model')
        final_categ = categ
        if args.loss == 'cnp':
            loss = 'categorical_crossentropy'
            metric = 'categorical_accuracy'
        elif args.loss == 'hng':
            loss = 'hinge'
            metric = 'squared_hinge'
        else:
            raise NotImplementedError

    from nea.models import create_model
    model = create_model(args, final_categ, overal_maxlen, vocab)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    if args.onscreen: model.summary()

    ###############################################################################################################################
    ## Plotting model
    #

    from keras.utils.visualize_util import plot
    plot(model, to_file=out_dir + '/' + timestr + 'model_plot.png')
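    # Note: keras.utils.visualize_util is the Keras 1.x module path; Keras 2
    # renamed this to `from keras.utils import plot_model`. Both require
    # pydot and graphviz to be installed.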

    ###############################################################################################################################
    ## Save model architecture
    #

    logger.info('Saving model architecture')
    with open(out_dir + '/' + timestr + 'model_config.json', 'w') as arch:
        arch.write(model.to_json(indent=2))
    logger.info('  Done')

    ###############################################################################################################################
    ## Initialize Evaluator
    #

    evl = Evaluator(args,
                    out_dir,
                    timestr,
                    metric,
                    test_x,
                    test_y,
                    test_y_org,
                    test_pmt,
                    test_pca=test_pca,
                    test_ftr=test_ftr)
    earlystop = EarlyStopping(patience=args.earlystop, verbose=1, mode='auto')
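    # EarlyStopping monitors 'val_loss' by default and stops once it fails to
    # improve for `patience` consecutive epochs; mode='auto' infers the
    # improvement direction from the monitored quantity's name.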

    ###############################################################################################################################
    ## Training
    #

    logger.info(
        '------------------------------------------------------------------------------------------'
    )
    logger.info('Initial Evaluation:')
    evl.eval(model, -1, print_info=True)

    model_train_x = [train_x]
    if not args.valid_split:
        model_dev_x = [dev_x]
    if args.tfidf > 0:
        model_train_x.append(train_pca)
        if not args.valid_split:
            model_dev_x.append(dev_pca)
    if args.features:
        model_train_x.append(train_ftr)
        if not args.valid_split:
            model_dev_x.append(valid_ftr)

    if args.pre_train_path:
        model.fit(pre_train_x,
                  pre_train_y,
                  validation_split=0.12,
                  batch_size=args.batch_size,
                  nb_epoch=args.pre_epochs,
                  verbose=args.verbose,
                  callbacks=[earlystop, evl])

    if args.valid_split > 0:
        model.fit(model_train_x,
                  train_y,
                  validation_split=args.valid_split,
                  batch_size=args.batch_size,
                  nb_epoch=args.epochs,
                  verbose=args.verbose,
                  callbacks=[earlystop, evl])
    else:
        model.fit(model_train_x,
                  train_y,
                  validation_data=(model_dev_x, dev_y),
                  batch_size=args.batch_size,
                  nb_epoch=args.epochs,
                  verbose=args.verbose,
                  callbacks=[earlystop, evl])
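    # Note: nb_epoch is the Keras 1.x keyword; Keras 2 renamed it to `epochs`.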

    return evl.print_final_info()