def eval_on_dev():
  """Split train into train and dev and fit"""

  model, config = get_model()

  base = os.environ['DATA_ROOT']

  # load training data
  train_data_provider = DatasetProvider(
    os.path.join(base, cfg.get('data', 'train')),
    cfg.get('data', 'tokenizer_pickle'))
  x_train, y_train = train_data_provider.load_as_int_seqs()

  x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.20, random_state=2020)

  train_loader = make_data_loader(
    x_train,
    torch.tensor(y_train),
    cfg.getint('model', 'batch'),
    'train',
    config['max_len'])
  val_loader = make_data_loader(
    x_val,
    torch.tensor(y_val),
    cfg.getint('model', 'batch'),
    'dev',
    config['max_len'])

  # inverse-frequency class weights to counter label imbalance
  label_counts = torch.bincount(torch.tensor(y_train))
  weights = len(y_train) / (2.0 * label_counts)
  print('class weights:', weights)

  best_roc_auc, optimal_epochs = fit(
    model,
    train_loader,
    val_loader,
    weights,
    cfg.getint('model', 'epochs'))
  print('best roc %.4f after %d epochs\n' % (best_roc_auc, optimal_epochs))

  return optimal_epochs
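# The inverse-frequency weights computed in eval_on_dev() are passed to fit(),
# which is defined elsewhere in this repository. The snippet below is only a
# minimal sketch, assuming the standard PyTorch pattern, of how such weights
# are typically consumed by a class-weighted cross-entropy loss; the name
# weighted_loss_sketch and its signature are hypothetical, not this project's API.

import torch
import torch.nn as nn

def weighted_loss_sketch(weights, logits, labels):
  """Hypothetical helper: class-weighted cross-entropy for imbalanced labels"""
  # CrossEntropyLoss expects one float weight per class
  loss_fn = nn.CrossEntropyLoss(weight=weights.float())
  return loss_fn(logits, labels)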
def data_dense():
  """Data to feed into code prediction model"""

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train'))
  test_data = os.path.join(base, cfg.get('data', 'test'))

  # type of pre-training (e.g. 'sparse', 'continuous')
  pretraining = cfg.get('data', 'pretraining')

  # load pre-trained model
  model = load_model(cfg.get('data', 'model_file'))
  interm_layer_model = Model(
    inputs=model.input,
    outputs=model.get_layer(cfg.get('data', 'rep_layer')).output)

  if pretraining == 'sparse':
    maxlen = None
  else:
    maxlen = model.get_layer(name='EL').get_config()['input_length']

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    cfg.get('data', 'tokenizer_pickle'),
    maxlen)
  if pretraining == 'sparse':
    x_train, y_train = train_data_provider.load_as_one_hot()
  else:
    x_train, y_train = train_data_provider.load_as_int_seqs()

  # make training vectors for target task
  print('original x_train shape:', x_train.shape)
  x_train = interm_layer_model.predict(x_train)
  print('new x_train shape:', x_train.shape)

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    cfg.get('data', 'tokenizer_pickle'),
    maxlen)
  if pretraining == 'sparse':
    x_test, y_test = test_data_provider.load_as_one_hot()
  else:
    x_test, y_test = test_data_provider.load_as_int_seqs()

  # make test vectors for target task
  print('original x_test shape:', x_test.shape)
  x_test = interm_layer_model.predict(x_test)
  print('new x_test shape:', x_test.shape)

  return x_train, y_train, x_test, y_test
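# The dense vectors returned above are meant to feed a downstream code
# prediction model that lives elsewhere in this repository. As a minimal
# sketch only, assuming a scikit-learn linear probe (not the project's actual
# downstream model), the returned arrays could be consumed like this:

from sklearn.linear_model import LogisticRegression

def linear_probe_sketch():
  """Hypothetical: fit a linear classifier on the extracted representations"""
  x_train, y_train, x_test, y_test = data_dense()
  clf = LogisticRegression(max_iter=1000)
  clf.fit(x_train, y_train)
  print('test accuracy:', clf.score(x_test, y_test))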
def data_dense():
  """Data to feed into code prediction model"""

  base = os.environ['DATA_ROOT']
  train_data = os.path.join(base, cfg.get('data', 'train'))
  test_data = os.path.join(base, cfg.get('data', 'test'))

  # load model configuration
  pkl = open(cfg.get('data', 'config_pickle'), 'rb')
  config = pickle.load(pkl)

  # instantiate model and load parameters
  model = trans.TransformerEncoder(**config, save_config=False)
  state_dict = torch.load(cfg.get('data', 'model_file'))
  model.load_state_dict(state_dict)
  model.eval()

  # load training data first
  train_data_provider = DatasetProvider(
    train_data,
    cfg.get('data', 'tokenizer_pickle'))
  x_train, y_train = train_data_provider.load_as_int_seqs()

  # make training vectors for target task
  x_train = get_dense_representations(model, x_train, config['max_len'])

  # now load the test set
  test_data_provider = DatasetProvider(
    test_data,
    cfg.get('data', 'tokenizer_pickle'))
  x_test, y_test = test_data_provider.load_as_int_seqs()

  # make test vectors for target task
  x_test = get_dense_representations(model, x_test, config['max_len'])

  return x_train, y_train, x_test, y_test
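# get_dense_representations() is defined elsewhere in this repository. The
# sketch below is only an assumption about what such a helper commonly does:
# pad the integer sequences to max_len, run them through the encoder with
# gradients disabled, and mean-pool token vectors into one fixed-size vector
# per sample. The name, the encoder's output shape, and the pooling choice
# are all illustrative, not the project's actual implementation.

import torch

def get_dense_representations_sketch(model, int_seqs, max_len):
  """Hypothetical: encode padded int sequences into fixed-size vectors"""
  padded = torch.zeros(len(int_seqs), max_len, dtype=torch.long)
  for i, seq in enumerate(int_seqs):
    seq = seq[:max_len]
    padded[i, :len(seq)] = torch.tensor(seq)
  with torch.no_grad():
    token_reps = model(padded)  # assumed shape: (samples, max_len, hidden)
  return token_reps.mean(dim=1).numpy()  # one dense vector per sample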
def eval_on_test(n_epochs):
  """Train on training set and evaluate on test"""

  model, config = get_model()

  base = os.environ['DATA_ROOT']

  # training data
  train_data_provider = DatasetProvider(
    os.path.join(base, cfg.get('data', 'train')),
    cfg.get('data', 'tokenizer_pickle'))

  # test set
  test_data_provider = DatasetProvider(
    os.path.join(base, cfg.get('data', 'test')),
    cfg.get('data', 'tokenizer_pickle'))

  x_train, y_train = train_data_provider.load_as_int_seqs()
  x_test, y_test = test_data_provider.load_as_int_seqs()

  train_loader = make_data_loader(
    x_train,
    torch.tensor(y_train),
    cfg.getint('model', 'batch'),
    'train',
    config['max_len'])
  test_loader = make_data_loader(
    x_test,
    torch.tensor(y_test),
    cfg.getint('model', 'batch'),
    'dev',
    config['max_len'])

  # inverse-frequency class weights to counter label imbalance
  label_counts = torch.bincount(torch.tensor(y_train))
  weights = len(y_train) / (2.0 * label_counts)

  fit(model, train_loader, test_loader, weights, n_epochs)
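# make_data_loader() is defined elsewhere in this repository. As an assumption
# about what such a helper usually does, the sketch below pads the integer
# sequences to max_len, wraps inputs and labels in a TensorDataset, and
# shuffles only the training partition. The name and exact signature are
# illustrative, not the project's API.

import torch
from torch.utils.data import DataLoader, TensorDataset

def make_data_loader_sketch(int_seqs, labels, batch_size, partition, max_len):
  """Hypothetical: pad int sequences and wrap them in a DataLoader"""
  padded = torch.zeros(len(int_seqs), max_len, dtype=torch.long)
  for i, seq in enumerate(int_seqs):
    seq = seq[:max_len]
    padded[i, :len(seq)] = torch.tensor(seq)
  dataset = TensorDataset(padded, labels)
  return DataLoader(dataset, batch_size=batch_size, shuffle=(partition == 'train'))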
def main():
  """Train and evaluate"""

  data_root = os.environ['DATA_ROOT']

  if os.path.isdir('./Model/'):
    shutil.rmtree('./Model/')
  os.mkdir('./Model/')

  train_data_provider = DatasetProvider(
    os.path.join(data_root, cfg.get('data', 'train')),
    cfg.get('data', 'tokenizer_pickle'),
    None)
  x_train, y_train = train_data_provider.load_as_one_hot()
  print('loaded x_train:', x_train.shape)

  # are we evaluating on test or dev?
  if cfg.getfloat('data', 'val_size') != 0:
    x_train, x_val, y_train, y_val = train_test_split(
      x_train, y_train, test_size=cfg.getfloat('data', 'val_size'))
    callbacks = [ModelCheckpoint(
      './Model/model.h5',
      verbose=1,
      save_best_only=True)]
    validation_data = (x_val, y_val)
    print('x_train shape:', x_train.shape)
    print('x_val shape:', x_val.shape)
  else:
    test_data_provider = DatasetProvider(
      os.path.join(data_root, cfg.get('data', 'test')),
      cfg.get('data', 'tokenizer_pickle'),
      None)
    x_test, y_test = test_data_provider.load_as_one_hot()
    print('loaded x_test:', x_test.shape)
    validation_data = None
    callbacks = None

  # train the linear classification layer
  model = get_model(len(train_data_provider.label2int))
  optim = getattr(optimizers, cfg.get('linear', 'optimizer'))
  model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optim(lr=cfg.getfloat('linear', 'lr')),
    metrics=['accuracy'])
  model.fit(
    x_train,
    y_train,
    validation_data=validation_data,
    epochs=cfg.getint('linear', 'epochs'),
    batch_size=cfg.getint('linear', 'batch'),
    validation_split=0.0,
    callbacks=callbacks)

  # fine-tune the pre-trained layers
  # https://stackoverflow.com/questions/47995324/
  # does-model-compile-initialize-all-the-weights-and-biases-in-keras-tensorflow/47996024
  if cfg.getboolean('base', 'finetune'):
    print()
    for layer in model.layers:
      layer.trainable = True
      print('%s: %s' % (layer.name, layer.trainable))

    optim = getattr(optimizers, cfg.get('base', 'optimizer'))
    model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=optim(lr=cfg.getfloat('base', 'lr')),
      metrics=['accuracy'])
    model.fit(
      x_train,
      y_train,
      validation_data=validation_data,
      epochs=cfg.getint('base', 'epochs'),
      batch_size=cfg.getint('base', 'batch'),
      validation_split=0.0,
      callbacks=callbacks)

  if cfg.getfloat('data', 'val_size') != 0:
    # during validation, load last best model
    model = load_model('./Model/model.h5')
    x_test, y_test = x_val, y_val

  # distribution.shape: (test size, num of classes)
  distribution = model.predict(x_test)
  predictions = np.argmax(distribution, axis=1)

  pos_label = train_data_provider.label2int['yes']
  metrics.report_roc_auc(y_test, distribution[:, pos_label])
  metrics.report_pr_auc(y_test, distribution[:, pos_label])
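# metrics.report_roc_auc() and metrics.report_pr_auc() come from a local
# metrics module. For readers without that module, the sketch below shows
# equivalent computations assuming scikit-learn; the helper name is
# hypothetical and the printed format may differ from the project's output.

from sklearn.metrics import roc_auc_score, average_precision_score

def report_auc_sketch(y_true, pos_probs):
  """Hypothetical stand-in for the local metrics helpers"""
  print('roc auc: %.4f' % roc_auc_score(y_true, pos_probs))
  print('pr auc: %.4f' % average_precision_score(y_true, pos_probs))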