def lc_eval(author1, author2, model_path):
    # evaluate a saved linear classifier model
    # create and build evaluation dataset
    dataset = TextDataset([author1, author2], norm=None, vectorizer='tfidf')
    X, _, Y, _ = dataset.build_dataset(eval=True)
    # load model from pickle
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    # predict dataset labels
    predictions = model.predict(X)
    predictions_proba = model.predict_proba(X)
    # print results
    print(f'accuracy: {accuracy_score(Y, predictions)*100}%')
    print(f'logloss: {log_loss(Y, predictions_proba)}')
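
# A minimal usage sketch for the evaluation helper above (hypothetical file
# paths; assumes pickle, numpy, torch, tqdm, sklearn.metrics and the project's
# own TextDataset, StylometryLC and StylometryNN modules are importable):
#
#     lc_eval('data/author1.txt', 'data/author2.txt', 'models/nb_model.sav')
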
def nn_eval(author1, author2, model_path, w2v_path):
    # evaluate a saved neural network classifier
    # define batch size
    batch_size = 128
    # select device (CPU | GPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # create and build evaluation dataset
    dataset = TextDataset([author1, author2],
                          norm=None,
                          vectorizer='embed',
                          w2v_path=w2v_path)
    X, _, Y, _ = dataset.build_dataset(eval=True)
    # load model from file (map to the selected device so CPU-only machines work)
    model = torch.load(model_path, map_location=device)
    model.to(device)
    model.eval()
    valid_steps = len(X) // batch_size
    predictions = list()
    # loop over all evaluation dataset
    for step in tqdm(range(valid_steps)):
        # get x batch (labels are compared against the full Y after the loop)
        x_batch = X[step * batch_size:(step + 1) * batch_size]
        x_batch = np.stack(x_batch, axis=0)
        x_batch = torch.from_numpy(x_batch).float().to(device)
        # model forward pass
        y_out = model(x_batch)
        y_out = torch.squeeze(y_out, dim=1).cpu().detach().numpy()
        # save predictions
        predictions.append(y_out)
    # perform inference on the remaining samples, if any
    if valid_steps * batch_size < len(X):
        x_batch = X[valid_steps * batch_size:]
        x_batch = np.stack(x_batch, axis=0)
        x_batch = torch.from_numpy(x_batch).float().to(device)
        y_out = model(x_batch)
        y_out = torch.squeeze(y_out, dim=1).cpu().detach().numpy()
        predictions.append(y_out)
    # concatenate all predictions
    predictions_proba = np.concatenate(predictions, axis=0)
    predictions = predictions_proba > 0.5
    # print results
    print(f'accuracy: {accuracy_score(Y, predictions)*100}%')
    print(
        f'logloss: {F.binary_cross_entropy(torch.from_numpy(predictions_proba).float(), torch.from_numpy(np.array(Y)).float()).item()}'
    )
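
# A minimal usage sketch (hypothetical paths; the word2vec file is whatever the
# project's TextDataset 'embed' vectorizer expects):
#
#     nn_eval('data/author1.txt', 'data/author2.txt',
#             'models/deep_model.pt', 'models/w2v.bin')
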
def lc_train(config):
    # train linear classifier (Naive Bayes)
    # create and build dataset
    dataset = TextDataset(config['txt_list'],
                          norm=config['norm'],
                          vectorizer=config['vectorizer'])
    xtrain, xvalid, ytrain, yvalid = dataset.build_dataset()
    # define model
    model = StylometryLC(truncation=config['truncation'])
    # fit model
    model.fit(xtrain, ytrain)
    # infer on validation data
    predictions = model.predict(xvalid)
    predictions_proba = model.predict_proba(xvalid)
    # dump model pickle
    with open('models/nb_model.sav', 'wb') as f:
        pickle.dump(model, f)
    # print results
    print(f'accuracy: {accuracy_score(yvalid, predictions)*100}%')
    print(f'logloss: {log_loss(yvalid, predictions_proba)}')
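
# A sketch of the config expected by lc_train; the keys are inferred from the
# lookups above, and the values below are placeholders, not project defaults:
#
#     lc_train({
#         'txt_list': ['data/author1.txt', 'data/author2.txt'],
#         'norm': None,
#         'vectorizer': 'tfidf',
#         'truncation': 1000,
#     })
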
Example #4
    parser = argparse.ArgumentParser(
        description='Train a simple LSTM language model.')
    parser.add_argument('weights_file', help='Path to model weights file')
    parser.add_argument('train_dataset',
                        help='Path to processed train dataset file')
    parser.add_argument('valid_dataset',
                        help='Path to processed validation dataset file')
    parser.add_argument('test_dataset',
                        help='Path to processed test dataset file')
    parser.add_argument(
        '--vocab_unk_rate',
        type=float,
        help='UNKing rate to use for the vocabulary; by default the true UNK '
        'rate based on the validation set OOV rate is used',
        default=-1.0)
    args = parser.parse_args()

    train_dataset = TextDataset(args.train_dataset, 50)
    valid_dataset = TextDataset(args.valid_dataset, 50)
    test_dataset = TextDataset(args.test_dataset, 50)

    if args.vocab_unk_rate == -1.0:
        train_dataset.unk_vocabulary_with_true_oov_rate(valid_dataset)
    elif args.vocab_unk_rate > 0:
        train_dataset.unk_vocabulary_with_oov_rate(args.vocab_unk_rate)
    test_dataset.use_vocabulary_from_dataset(train_dataset)

    max_word_id = train_dataset.vocabulary.get_max_word_id()
    lm_min_word_id = train_dataset.vocabulary.get_min_valid_lm_output_word_id()

    dataset_transformer = transforms.Compose(
        [Seq2Seq(),
         RemapUsingMinWordID('target', lm_min_word_id),
Example #5
parser.add_argument('valid_dataset',
                    help='Path to processed validation dataset file')
parser.add_argument(
    '--vocab_unk_rate',
    type=float,
    help='UNKing rate to use for the vocabulary; by default the true UNK '
    'rate based on the validation set OOV rate is used',
    default=-1.0)
args = parser.parse_args()

n_epochs = 1000
train_samples_per_epoch = 1000
valid_samples_per_epoch = 100
batch_size = 4
max_sequence_length = 50

train_dataset = TextDataset(args.train_dataset, max_sequence_length)
valid_dataset = TextDataset(args.valid_dataset, max_sequence_length)

if args.vocab_unk_rate == -1.0:
    train_dataset.unk_vocabulary_with_true_oov_rate(valid_dataset)
elif args.vocab_unk_rate > 0:
    train_dataset.unk_vocabulary_with_oov_rate(args.vocab_unk_rate)
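
# Note (per the --vocab_unk_rate help text above): the -1.0 default UNKs the
# training vocabulary so that its OOV rate matches the rate observed on the
# validation set, while a positive value applies that UNKing rate directly.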

max_word_id = train_dataset.vocabulary.get_max_word_id()
lm_min_word_id = train_dataset.vocabulary.get_min_valid_lm_output_word_id()
vocabulary_size = train_dataset.vocabulary.get_vocab_size()
valid_dataset.use_vocabulary_from_dataset(train_dataset)
print(f'Vocabulary Size: {vocabulary_size}')

dataset_transformer = transforms.Compose(
    [Seq2Seq(),
Example #6
def main(args):
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    RANDOM_SEED = 42

    dataset = load_dataset("yelp_polarity", split="train")
    TRAIN_SIZE = len(dataset) - 2_000
    VALID_SIZE = 1_000
    TEST_SIZE = 1_000

    train_test_split = dataset.train_test_split(train_size=TRAIN_SIZE,
                                                seed=RANDOM_SEED)
    train_dataset = train_test_split["train"]
    test_val_dataset = train_test_split["test"].train_test_split(
        train_size=VALID_SIZE, test_size=TEST_SIZE, seed=RANDOM_SEED)
    val_dataset = test_val_dataset["train"]
    test_dataset = test_val_dataset["test"]

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    datasets = OrderedDict()
    datasets['train'] = TextDataset(train_dataset, tokenizer,
                                    args.max_sequence_length,
                                    not args.disable_sent_tokenize)
    datasets['valid'] = TextDataset(val_dataset, tokenizer,
                                    args.max_sequence_length,
                                    not args.disable_sent_tokenize)
    if args.test:
        datasets['test'] = TextDataset(test_dataset, tokenizer,
                                       args.max_sequence_length,
                                       not args.disable_sent_tokenize)

    print(
        f"Loading {args.model_name} model. Setting {args.trainable_layers} trainable layers."
    )
    encoder = AutoModel.from_pretrained(args.model_name, return_dict=True)
    if not args.train_embeddings:
        for p in encoder.embeddings.parameters():
            p.requires_grad = False
    encoder_layers = encoder.encoder.layer
    if args.trainable_layers > len(encoder_layers):
        warnings.warn(
            f"You are asking to train {args.trainable_layers} layers, but this model has only {len(encoder_layers)}"
        )
    for layer in range(len(encoder_layers) - args.trainable_layers):
        for p in encoder_layers[layer].parameters():
            p.requires_grad = False
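    # from here on only the top args.trainable_layers transformer layers (and
    # the embeddings, if args.train_embeddings was set) remain trainable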
    params = dict(vocab_size=datasets['train'].vocab_size,
                  embedding_size=args.embedding_size,
                  rnn_type=args.rnn_type,
                  hidden_size=args.hidden_size,
                  word_dropout=args.word_dropout,
                  embedding_dropout=args.embedding_dropout,
                  latent_size=args.latent_size,
                  num_layers=args.num_layers,
                  bidirectional=args.bidirectional,
                  max_sequence_length=args.max_sequence_length)
    model = SentenceVAE(encoder=encoder, tokenizer=tokenizer, **params)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    with open(os.path.join(save_model_path, 'model_params.json'), 'w') as f:
        json.dump(params, f, indent=4)
    with open(os.path.join(save_model_path, 'train_args.json'), 'w') as f:
        json.dump(vars(args), f, indent=4)

    def kl_anneal_function(anneal_function, step, k, x0):
        if step <= x0:
            return args.initial_kl_weight
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0 - 2500))))
        elif anneal_function == 'linear':
            return min(1, step / x0)
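
    # KL weight schedule: held at args.initial_kl_weight for the first x0
    # steps, then either a logistic sigmoid centred 2500 steps after x0
    # ('logistic') or min(1, step / x0) for 'linear', which is already 1 once
    # step exceeds x0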

    NLL = torch.nn.NLLLoss(ignore_index=datasets['train'].pad_idx,
                           reduction='sum')

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):

        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight
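
    # in the training loop below these terms are combined into the (negative)
    # ELBO as loss = (NLL_loss + KL_weight * KL_loss) / batch_size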

    params = [{
        'params': model.encoder.parameters(),
        'lr': args.encoder_learning_rate
    }, {
        'params': [
            *model.decoder_rnn.parameters(), *model.hidden2mean.parameters(),
            *model.hidden2logv.parameters(), *model.latent2hidden.parameters(),
            *model.outputs2vocab.parameters()
        ]
    }]
    optimizer = torch.optim.Adam(params,
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=(split == 'train'),
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available(),
                                     collate_fn=DataCollator(tokenizer))

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'],
                                            batch['attention_mask'],
                                            batch['length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                       batch['length'], mean,
                                                       logv,
                                                       args.anneal_function,
                                                       step, args.k, args.x0)

                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat(
                    (tracker['ELBO'], loss.data.view(1, -1)), dim=0)

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(),
                                      NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(),
                                      KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(),
                                      KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(
                        data_loader):
                    print(
                        "%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                        % (split.upper(), iteration, len(data_loader) - 1,
                           loss.item(), NLL_loss.item() / batch_size,
                           KL_loss.item() / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(
                        batch['target'].tolist(), tokenizer=tokenizer)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f" %
                  (split.upper(), epoch, args.epochs, tracker['ELBO'].mean()))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(),
                                  torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences, the encoded latent space and generated sequences
            if split == 'valid':
                samples, _ = model.inference(z=tracker['z'])
                generated_sents = idx2word(samples.tolist(), tokenizer)
                sents = [{
                    'original': target,
                    'generated': generated
                } for target, generated in zip(tracker['target_sents'],
                                               generated_sents)]
                dump = {'sentences': sents, 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps', ts, 'valid_E%i.json' % epoch),
                          'w') as dump_file:
                    json.dump(dump, dump_file, indent=3)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path,
                                               "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
Example #7
    args = parser.parse_args()

    n_epochs = 1000
    train_samples_per_epoch = 80000
    valid_samples_per_epoch = 500
    batch_size = 24
    max_sequence_length = 50

    logfile_prefix = os.path.splitext(args.log_file)[0]
    logfile_dir = os.path.dirname(args.log_file)

    weight_files = get_model_weight_files(logfile_dir)

    lm_train_vocab = None
    if not args.character_level and not args.phoneme_level:
        lm_train_dataset = TextDataset(args.lm_train_dataset,
                                       max_sequence_length)
        lm_valid_dataset = TextDataset(args.lm_valid_dataset,
                                       max_sequence_length)

        if args.vocab_unk_rate == -1.0:
            lm_train_dataset.unk_vocabulary_with_true_oov_rate(
                lm_valid_dataset)
        elif args.vocab_unk_rate > 0:
            lm_train_dataset.unk_vocabulary_with_oov_rate(args.vocab_unk_rate)

        lm_train_vocab = lm_train_dataset.vocabulary

    train_dataset = SpeechDataset(args.train_dataset,
                                  vocabulary=lm_train_vocab,
                                  character_level=args.character_level,
                                  phoneme_level=args.phoneme_level)
def nn_train(config):
    # train neural network classifier
    # select device (CPU | GPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # create and build dataset
    dataset = TextDataset(config['txt_list'],
                          norm=config['norm'],
                          vectorizer=config['vectorizer'],
                          w2v_path=config['w2v_path'])
    xtrain, xvalid, ytrain, yvalid = dataset.build_dataset()
    # define network configuration
    network_config = {
        'emb_dim': config['emb_dim'],
        'rnn_hid_dim': config['rnn_hid_dim'],
        'dense_hid_dim': config['dense_hid_dim'],
    }
    # define model
    model = StylometryNN(network_config)
    model.to(device)
    model.train()
    # define BCE loss
    criterion = nn.BCELoss()
    # define optimizer
    optimizer = Adam(model.parameters(),
                     lr=config['initial_lr'],
                     weight_decay=config['weight_decay'])
    train_steps = len(xtrain) // config['batch_size']
    valid_steps = len(xvalid) // config['batch_size']
    best_accuracy = 0.5
    # training loop
    for epoch in range(config['num_epochs']):
        total_loss = 0.0
        # loop over all training dataset samples
        for step in tqdm(range(train_steps)):
            # get x and y batches
            x_batch = xtrain[step * config['batch_size']:(step + 1) *
                             config['batch_size']]
            x_batch = np.stack(x_batch, axis=0)
            x_batch = torch.from_numpy(x_batch).float().to(device)
            y_batch = ytrain[step * config['batch_size']:(step + 1) *
                             config['batch_size']]
            y_batch = np.stack(y_batch, axis=0)
            y_batch = torch.from_numpy(y_batch).float().to(device)
            # model forward pass
            y_out = model(x_batch)
            y_out = torch.squeeze(y_out, dim=1)
            # calculate loss
            loss = criterion(y_out, y_batch)
            total_loss += loss.item()
            # back propagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(
            f'Epoch [{epoch + 1}/{config["num_epochs"]}], Total Epoch Loss: {total_loss / train_steps}'
        )
        model.eval()
        accuracies = list()
        losses = list()
        # loop over all validation dataset samples
        for step in tqdm(range(valid_steps)):
            # get x and y batches
            x_batch = xvalid[step * config['batch_size']:(step + 1) *
                             config['batch_size']]
            x_batch = np.stack(x_batch, axis=0)
            x_batch = torch.from_numpy(x_batch).float().to(device)
            y_batch = yvalid[step * config['batch_size']:(step + 1) *
                             config['batch_size']]
            y_batch = np.stack(y_batch, axis=0)
            # model forward pass
            y_out = model(x_batch)
            y_out = torch.squeeze(y_out, dim=1).cpu().detach().numpy()
            # calculate loss and accuracy
            y_out_labels = y_out > 0.5
            accuracies.append(accuracy_score(y_batch, y_out_labels))
            losses.append(
                F.binary_cross_entropy(
                    torch.from_numpy(y_out).float(),
                    torch.from_numpy(y_batch).float()).item())
        # print results
        print(f'Validation accuracy: {(sum(accuracies)/len(accuracies))*100}%')
        print(f'Validation logloss: {sum(losses)/len(losses)}')
        # save model (based on best validation accuracy)
        if sum(accuracies) / len(accuracies) > best_accuracy:
            torch.save(model, 'models/deep_model.pt')
            best_accuracy = sum(accuracies) / len(accuracies)
        model.train()
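
# A sketch of the config expected by nn_train; the keys are inferred from the
# lookups above, and the values below are placeholders, not project defaults:
#
#     nn_train({
#         'txt_list': ['data/author1.txt', 'data/author2.txt'],
#         'norm': None,
#         'vectorizer': 'embed',
#         'w2v_path': 'models/w2v.bin',
#         'emb_dim': 300,
#         'rnn_hid_dim': 128,
#         'dense_hid_dim': 64,
#         'initial_lr': 1e-3,
#         'weight_decay': 1e-5,
#         'batch_size': 128,
#         'num_epochs': 10,
#     })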