Example #1
def eval_on_dev():
    """Split train into train and dev and fit"""

    model, config = get_model()

    # load training data
    train_data_provider = DatasetProvider(
        os.path.join(base, cfg.get('data', 'train')),
        cfg.get('data', 'tokenizer_pickle'))
    x_train, y_train = train_data_provider.load_as_int_seqs()

    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.20,
                                                      random_state=2020)

    # convert to (n_train x vocab_size) matrix
    x_train = utils.sequences_to_matrix(x_train, config['input_vocab_size'])
    x_val = utils.sequences_to_matrix(x_val, config['input_vocab_size'])

    train_loader = make_data_loader(x_train, torch.tensor(y_train),
                                    cfg.getint('model', 'batch'), 'train')
    val_loader = make_data_loader(x_val, torch.tensor(y_val),
                                  cfg.getint('model', 'batch'), 'dev')

    label_counts = torch.bincount(torch.tensor(y_train))
    weights = len(y_train) / (2.0 * label_counts)
    print('class weights:', weights)

    best_roc_auc, optimal_epochs = fit(model, train_loader, val_loader,
                                       weights, cfg.getint('model', 'epochs'))
    print('best roc %.4f after %d epochs\n' % (best_roc_auc, optimal_epochs))

    return optimal_epochs
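Example #1 turns the integer sequences into a multi-hot matrix with utils.sequences_to_matrix, whose source is not included in these snippets. A minimal sketch of what it plausibly does (the body below is an assumption, not the real utils module), encoding each sequence as an (n_samples x vocab_size) indicator matrix:

import torch

def sequences_to_matrix(int_seqs, vocab_size):
    """Multi-hot encode integer sequences into an (n x vocab_size) tensor"""
    matrix = torch.zeros(len(int_seqs), vocab_size)
    for row, seq in enumerate(int_seqs):
        for token_index in seq:  # assumes all token ids are < vocab_size
            matrix[row, token_index] = 1.0
    return matrix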
Example #2
def eval_on_test(n_epochs):
    """Train on training set and evaluate on test"""

    model, config = get_model()

    # training data
    train_data_provider = DatasetProvider(
        os.path.join(base, cfg.get('data', 'train')),
        cfg.get('data', 'tokenizer_pickle'))

    # test set
    test_data_provider = DatasetProvider(
        os.path.join(base, cfg.get('data', 'test')),
        cfg.get('data', 'tokenizer_pickle'))

    x_train, y_train = train_data_provider.load_as_int_seqs()
    x_test, y_test = test_data_provider.load_as_int_seqs()

    x_train = utils.sequences_to_matrix(x_train, config['input_vocab_size'])
    x_test = utils.sequences_to_matrix(x_test, config['input_vocab_size'])

    train_loader = make_data_loader(x_train, torch.tensor(y_train),
                                    cfg.getint('model', 'batch'), 'train')
    test_loader = make_data_loader(x_test, torch.tensor(y_test),
                                   cfg.getint('model', 'batch'), 'dev')

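    # weight classes inversely to their frequency in the training set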
    label_counts = torch.bincount(torch.tensor(y_train))
    weights = len(y_train) / (2.0 * label_counts)

    fit(model, train_loader, test_loader, weights, n_epochs)
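Both evaluation functions batch their data through a make_data_loader helper that is also not shown. A minimal sketch, assuming both arguments are already tensors and that only the 'train' partition is shuffled:

import torch
from torch.utils.data import TensorDataset, DataLoader

def make_data_loader(x, y, batch_size, partition):
    """Batch (x, y) pairs; shuffle only the training partition"""
    dataset = TensorDataset(x, y)
    return DataLoader(dataset, batch_size=batch_size,
                      shuffle=(partition == 'train'))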
Example #3
def main():
    """My main main"""

    dp = data.DatasetProvider(os.path.join(base, cfg.get('data', 'cuis')),
                              os.path.join(base, cfg.get('data', 'codes')),
                              cfg.get('args', 'cui_vocab_size'),
                              cfg.get('args', 'code_vocab_size'))
    in_seqs, out_seqs = dp.load_as_sequences()

    tr_in_seqs, val_in_seqs, tr_out_seqs, val_out_seqs = train_test_split(
        in_seqs, out_seqs, test_size=0.20, random_state=2020)
    print('loaded %d training and %d validation samples' %
          (len(tr_in_seqs), len(val_in_seqs)))

    max_cui_seq_len = max(len(seq) for seq in tr_in_seqs)
    max_code_seq_len = max(len(seq) for seq in tr_out_seqs)
    print('longest cui sequence:', max_cui_seq_len)
    print('longest code sequence:', max_code_seq_len)

    train_loader = make_data_loader(
        utils.pad_sequences(tr_in_seqs, max_len=cfg.getint('args', 'max_len')),
        utils.sequences_to_matrix(tr_out_seqs, len(dp.output_tokenizer.stoi)),
        cfg.getint('model', 'batch'), 'train')

    val_loader = make_data_loader(
        utils.pad_sequences(val_in_seqs, max_len=cfg.getint('args',
                                                            'max_len')),
        utils.sequences_to_matrix(val_out_seqs, len(dp.output_tokenizer.stoi)),
        cfg.getint('model', 'batch'), 'dev')

    model = TransformerEncoder(input_vocab_size=len(dp.input_tokenizer.stoi),
                               output_vocab_size=len(dp.output_tokenizer.stoi),
                               d_model=cfg.getint('model', 'd_model'),
                               d_inner=cfg.getint('model', 'd_inner'),
                               n_layers=cfg.getint('model', 'n_layers'),
                               n_head=cfg.getint('model', 'n_head'),
                               d_k=cfg.getint('model', 'd_k'),
                               d_v=cfg.getint('model', 'd_v'),
                               dropout=cfg.getfloat('model', 'dropout'),
                               max_len=cfg.getint('args', 'max_len'))

    best_loss, optimal_epochs = fit(model, train_loader, val_loader,
                                    cfg.getint('model', 'epochs'))
    print('best loss %.4f after %d epochs' % (best_loss, optimal_epochs))
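Example #3 additionally pads the CUI sequences with utils.pad_sequences before batching. A minimal sketch, under the assumption that it zero-pads (padding index 0) and truncates anything longer than max_len:

import torch

def pad_sequences(int_seqs, max_len):
    """Zero-pad (and truncate) integer sequences to a fixed length"""
    padded = torch.zeros(len(int_seqs), max_len, dtype=torch.long)
    for row, seq in enumerate(int_seqs):
        seq = seq[:max_len]  # truncate long sequences
        padded[row, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    return padded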
Example #4
def main():
    """My main main"""

    dp = data.DatasetProvider(os.path.join(base, cfg.get('data', 'cuis')),
                              os.path.join(base, cfg.get('data', 'codes')),
                              cfg.get('args', 'cui_vocab_size'),
                              cfg.get('args', 'code_vocab_size'))
    in_seqs, out_seqs = dp.load_as_sequences()

    tr_in_seqs, val_in_seqs, tr_out_seqs, val_out_seqs = train_test_split(
        in_seqs, out_seqs, test_size=0.10, random_state=2020)

    print('loaded %d training and %d validation samples' %
          (len(tr_in_seqs), len(val_in_seqs)))

    max_cui_seq_len = max(len(seq) for seq in tr_in_seqs)
    print('longest cui sequence:', max_cui_seq_len)

    max_code_seq_len = max(len(seq) for seq in tr_out_seqs)
    print('longest code sequence:', max_code_seq_len)

    train_loader = make_data_loader(
        tr_in_seqs,
        utils.sequences_to_matrix(tr_out_seqs, len(dp.output_tokenizer.stoi)),
        cfg.getint('model', 'batch'), 'train')

    val_loader = make_data_loader(
        val_in_seqs,
        utils.sequences_to_matrix(val_out_seqs, len(dp.output_tokenizer.stoi)),
        cfg.getint('model', 'batch'), 'dev')

    model = BagOfEmbeddings(input_vocab_size=len(dp.input_tokenizer.stoi),
                            output_vocab_size=len(dp.output_tokenizer.stoi),
                            embed_dim=cfg.getint('model', 'embed'),
                            hidden_units=cfg.getint('model', 'hidden'),
                            dropout_rate=cfg.getfloat('model', 'dropout'))

    best_loss, optimal_epochs = fit(model, train_loader, val_loader,
                                    cfg.getint('model', 'epochs'))
    print('best loss %.4f after %d epochs' % (best_loss, optimal_epochs))
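The fit function used in Examples #3 and #4 returns the best validation loss and the epoch at which it occurred. A minimal sketch of that contract (the optimizer, loss, and loop structure are all assumptions; only the signature and return values come from the examples above):

import torch
import torch.nn as nn

def fit(model, train_loader, val_loader, n_epochs):
    """Train, track validation loss, and report the best epoch"""
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()  # multi-label targets assumed
    best_loss, optimal_epochs = float('inf'), 0

    for epoch in range(1, n_epochs + 1):
        model.train()
        for x, y in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            optimizer.step()

        model.eval()
        total, n_batches = 0.0, 0
        with torch.no_grad():
            for x, y in val_loader:
                total += criterion(model(x), y).item()
                n_batches += 1
        val_loss = total / n_batches

        if val_loss < best_loss:
            best_loss, optimal_epochs = val_loss, epoch

    return best_loss, optimal_epochs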
Example #5
def data_dense():
    """Data to feed into code prediction model"""

    train_data = os.path.join(base, cfg.get('data', 'train'))
    test_data = os.path.join(base, cfg.get('data', 'test'))

    # load model configuration
    with open(cfg.get('data', 'config_pickle'), 'rb') as pkl:
        config = pickle.load(pkl)

    # instantiate model and load parameters
    model = bow.BagOfWords(**config, save_config=False)
    state_dict = torch.load(cfg.get('data', 'model_file'))
    model.load_state_dict(state_dict)
    model.eval()

    # load training data first
    train_data_provider = DatasetProvider(train_data,
                                          cfg.get('data', 'tokenizer_pickle'))

    x_train, y_train = train_data_provider.load_as_int_seqs()
    x_train = utils.sequences_to_matrix(x_train, config['input_vocab_size'])

    # make training vectors for target task
    x_train = get_dense_representations(model, x_train)

    # now load the test set
    test_data_provider = DatasetProvider(test_data,
                                         cfg.get('data', 'tokenizer_pickle'))

    x_test, y_test = test_data_provider.load_as_int_seqs()
    x_test = utils.sequences_to_matrix(x_test, config['input_vocab_size'])

    # make test vectors for target task
    x_test = get_dense_representations(model, x_test)

    return x_train, y_train, x_test, y_test
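data_dense relies on get_dense_representations to turn the multi-hot inputs into the pretrained model's hidden features. One way to sketch it without knowing the BagOfWords internals is a forward hook; the submodule name 'hidden' below is a guess, not the real attribute:

import torch

def get_dense_representations(model, x, layer_name='hidden'):
    """Capture a named submodule's activations with a forward hook"""
    features = []
    # assumption: the model has a submodule (e.g. model.hidden) whose
    # output is the dense representation we want to extract
    layer = getattr(model, layer_name)
    handle = layer.register_forward_hook(
        lambda module, inputs, output: features.append(output.detach()))
    with torch.no_grad():
        model(x)  # one forward pass over the full matrix
    handle.remove()
    return features[0]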