Example #1
0
def eval_on_dev():
    """Split train into train and dev and fit.

    Carves a fixed 20% dev set out of the training data, trains the
    model, and reports the best validation ROC AUC.

    Returns:
        The epoch count at which validation ROC AUC peaked (as reported
        by fit()), suitable for passing to eval_on_test().
    """

    model, config = get_model()

    # FIX: 'base' was referenced below without being defined (NameError
    # at runtime); read the data root from the environment, consistent
    # with the other entry points in this file.
    base = os.environ['DATA_ROOT']

    # load training data
    train_data_provider = DatasetProvider(
        os.path.join(base, cfg.get('data', 'train')),
        cfg.get('data', 'tokenizer_pickle'))
    x_train, y_train = train_data_provider.load_as_int_seqs()

    # fixed seed so the train/dev split is reproducible across runs
    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.20,
                                                      random_state=2020)

    train_loader = make_data_loader(x_train, torch.tensor(y_train),
                                    cfg.getint('model', 'batch'), 'train',
                                    config['max_len'])
    val_loader = make_data_loader(x_val, torch.tensor(y_val),
                                  cfg.getint('model', 'batch'), 'dev',
                                  config['max_len'])

    # inverse-frequency class weights; the 2.0 assumes a binary task
    label_counts = torch.bincount(torch.tensor(y_train))
    weights = len(y_train) / (2.0 * label_counts)
    print('class weights:', weights)

    best_roc_auc, optimal_epochs = fit(model, train_loader, val_loader,
                                       weights, cfg.getint('model', 'epochs'))
    print('best roc %.4f after %d epochs\n' % (best_roc_auc, optimal_epochs))

    return optimal_epochs
Example #2
0
def data_dense():
    """Data to feed into code prediction model.

    Runs both data splits through the representation layer of a
    pre-trained Keras model and returns the resulting dense vectors.

    Returns:
        (x_train, y_train, x_test, y_test)
    """

    base = os.environ['DATA_ROOT']
    train_path = os.path.join(base, cfg.get('data', 'train'))
    test_path = os.path.join(base, cfg.get('data', 'test'))

    # type of pre-training (e.g. 'sparse', 'continuous')
    is_sparse = cfg.get('data', 'pretraining') == 'sparse'

    # pre-trained network, truncated at the configured representation
    # layer so predict() yields intermediate features
    full_model = load_model(cfg.get('data', 'model_file'))
    rep_output = full_model.get_layer(cfg.get('data', 'rep_layer')).output
    interm_layer_model = Model(inputs=full_model.input, outputs=rep_output)

    # sparse inputs carry no fixed length; otherwise read it off the
    # embedding layer ('EL') configuration
    maxlen = (None if is_sparse else
              full_model.get_layer(name='EL').get_config()['input_length'])

    # training split
    train_provider = DatasetProvider(train_path,
                                     cfg.get('data', 'tokenizer_pickle'),
                                     maxlen)
    if is_sparse:
        x_train, y_train = train_provider.load_as_one_hot()
    else:
        x_train, y_train = train_provider.load_as_int_seqs()

    # project training samples into the dense representation space
    print('original x_train shape:', x_train.shape)
    x_train = interm_layer_model.predict(x_train)
    print('new x_train shape:', x_train.shape)

    # test split
    test_provider = DatasetProvider(test_path,
                                    cfg.get('data', 'tokenizer_pickle'),
                                    maxlen)
    if is_sparse:
        x_test, y_test = test_provider.load_as_one_hot()
    else:
        x_test, y_test = test_provider.load_as_int_seqs()

    # project test samples the same way
    print('original x_test shape:', x_test.shape)
    x_test = interm_layer_model.predict(x_test)
    print('new x_test shape:', x_test.shape)

    return x_train, y_train, x_test, y_test
Example #3
0
def data_dense():
    """Data to feed into code prediction model.

    Loads train and test splits, pushes them through a pre-trained
    transformer encoder, and returns dense feature vectors.

    Returns:
        (x_train, y_train, x_test, y_test)
    """

    # FIX: 'base' was referenced below without being defined (NameError
    # at runtime); read the data root from the environment, consistent
    # with the other entry points in this file.
    base = os.environ['DATA_ROOT']
    train_data = os.path.join(base, cfg.get('data', 'train'))
    test_data = os.path.join(base, cfg.get('data', 'test'))

    # load model configuration
    # FIX: use a context manager so the pickle file handle is closed
    # instead of leaking (the original never closed 'pkl').
    with open(cfg.get('data', 'config_pickle'), 'rb') as pkl:
        config = pickle.load(pkl)

    # instantiate model and load parameters; eval() switches off
    # training-only behavior such as dropout
    model = trans.TransformerEncoder(**config, save_config=False)
    state_dict = torch.load(cfg.get('data', 'model_file'))
    model.load_state_dict(state_dict)
    model.eval()

    # load training data first
    train_data_provider = DatasetProvider(train_data,
                                          cfg.get('data', 'tokenizer_pickle'))

    x_train, y_train = train_data_provider.load_as_int_seqs()

    # make training vectors for target task
    x_train = get_dense_representations(model, x_train, config['max_len'])

    # now load the test set
    test_data_provider = DatasetProvider(test_data,
                                         cfg.get('data', 'tokenizer_pickle'))

    x_test, y_test = test_data_provider.load_as_int_seqs()

    # make test vectors for target task
    x_test = get_dense_representations(model, x_test, config['max_len'])

    return x_train, y_train, x_test, y_test
Example #4
0
def eval_on_test(n_epochs):
    """Train on training set and evaluate on test.

    Args:
        n_epochs: number of epochs to train for (typically the optimum
            found on the dev split by eval_on_dev()).
    """

    model, config = get_model()

    # FIX: 'base' was referenced below without being defined (NameError
    # at runtime); read the data root from the environment, consistent
    # with the other entry points in this file.
    base = os.environ['DATA_ROOT']

    # training data
    train_data_provider = DatasetProvider(
        os.path.join(base, cfg.get('data', 'train')),
        cfg.get('data', 'tokenizer_pickle'))

    # test set
    test_data_provider = DatasetProvider(
        os.path.join(base, cfg.get('data', 'test')),
        cfg.get('data', 'tokenizer_pickle'))

    x_train, y_train = train_data_provider.load_as_int_seqs()
    x_test, y_test = test_data_provider.load_as_int_seqs()

    train_loader = make_data_loader(x_train, torch.tensor(y_train),
                                    cfg.getint('model', 'batch'), 'train',
                                    config['max_len'])
    test_loader = make_data_loader(x_test, torch.tensor(y_test),
                                   cfg.getint('model', 'batch'), 'dev',
                                   config['max_len'])

    # inverse-frequency class weights; the 2.0 assumes a binary task
    label_counts = torch.bincount(torch.tensor(y_train))
    weights = len(y_train) / (2.0 * label_counts)

    fit(model, train_loader, test_loader, weights, n_epochs)
Example #5
0
def main():
    """Train a linear head on pre-trained representations, optionally
    fine-tune the base layers, and report ROC/PR AUC.

    Evaluates on a held-out validation split when [data] val_size > 0,
    otherwise on the test set. Side effects: recreates ./Model/ and
    (during validation) writes the best checkpoint there.
    """

    data_root = os.environ['DATA_ROOT']

    # start each run with a fresh checkpoint directory
    if os.path.isdir('./Model/'):
        shutil.rmtree('./Model/')
    os.mkdir('./Model/')

    # NOTE(review): trailing None presumably is a maxlen-style argument
    # to DatasetProvider — confirm against its definition.
    train_data_provider = DatasetProvider(
        os.path.join(data_root, cfg.get('data', 'train')),
        cfg.get('data', 'tokenizer_pickle'), None)
    x_train, y_train = train_data_provider.load_as_one_hot()
    print('loaded x_train:', x_train.shape)

    # are we evaluating on test or dev?
    if cfg.getfloat('data', 'val_size') != 0:
        # dev mode: carve a validation split off train and checkpoint
        # the best model seen during training
        x_train, x_val, y_train, y_val = train_test_split(
            x_train, y_train, test_size=cfg.getfloat('data', 'val_size'))
        callbacks = [
            ModelCheckpoint('./Model/model.h5', verbose=1, save_best_only=True)
        ]
        validation_data = (x_val, y_val)
        print('x_train shape:', x_train.shape)
        print('x_val shape:', x_val.shape)

    else:
        # test mode: load the test set now; no checkpointing, no
        # in-training validation
        test_data_provider = DatasetProvider(
            os.path.join(data_root, cfg.get('data', 'test')),
            cfg.get('data', 'tokenizer_pickle'), None)
        x_test, y_test = test_data_provider.load_as_one_hot()
        print('loaded x_test:', x_test.shape)
        validation_data = None
        callbacks = None

    # train the linear classification layer
    model = get_model(len(train_data_provider.label2int))
    # resolve the optimizer class by name from the config
    optim = getattr(optimizers, cfg.get('linear', 'optimizer'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optim(lr=cfg.getfloat('linear', 'lr')),
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              validation_data=validation_data,
              epochs=cfg.getint('linear', 'epochs'),
              batch_size=cfg.getint('linear', 'batch'),
              validation_split=0.0,
              callbacks=callbacks)

    # fine-tune the pre-trained layers
    # (re-compiling does not reset weights learned above; see
    # https://stackoverflow.com/questions/47995324/
    # does-model-compile-initialize-all-the-weights-and-biases-in-keras-tensorflow/47996024)

    if cfg.getboolean('base', 'finetune'):

        # unfreeze everything, then recompile with the base-model
        # optimizer/learning rate before the second fit
        print()
        for layer in model.layers:
            layer.trainable = True
            print('%s: %s' % (layer.name, layer.trainable))

        optim = getattr(optimizers, cfg.get('base', 'optimizer'))
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optim(lr=cfg.getfloat('base', 'lr')),
                      metrics=['accuracy'])

        model.fit(x_train,
                  y_train,
                  validation_data=validation_data,
                  epochs=cfg.getint('base', 'epochs'),
                  batch_size=cfg.getint('base', 'batch'),
                  validation_split=0.0,
                  callbacks=callbacks)

    if cfg.getfloat('data', 'val_size') != 0:
        # during validation, load last best model
        model = load_model('./Model/model.h5')
        x_test, y_test = x_val, y_val

    # distribution.shape: (test size, num of classes)
    distribution = model.predict(x_test)
    predictions = np.argmax(distribution, axis=1)

    # score the probability assigned to the positive ('yes') class
    pos_label = train_data_provider.label2int['yes']
    metrics.report_roc_auc(y_test, distribution[:, pos_label])
    metrics.report_pr_auc(y_test, distribution[:, pos_label])