Ejemplo n.º 1
0
def main(params):
    """Train a CharTranslator and checkpoint it when the validation rank improves.

    With ``params['mode'] == 'generative'`` the model is trained through
    ``forward_mltrain`` with a cross-entropy loss over packed sequences;
    for any other mode it is trained through ``forward_classify`` with an
    NLL loss, using the author ids as targets.

    Args:
        params: dict of options. Keys read here: 'resume', 'atoms',
            'vocab_threshold', 'use_sgd', 'learning_rate', 'decay_rate',
            'smooth_eps', 'mode', 'batch_size', 'randomize_batches',
            'max_epochs', 'eval_interval', 'split_generators',
            'sample_by_len', 'max_seq_len', 'grad_clip', 'log_interval',
            'fappend' and 'checkpoint_output_directory'. The keys
            'vocabulary_size' and 'num_output_layers' are written into it
            before the model is constructed.

    Returns:
        None. Progress is printed and checkpoints are written through
        ``save_checkpoint``.
    """
    dp = DataProvider(params)

    # Build the vocabulary and author index from the data, or reuse the ones
    # stored in the checkpoint that is being resumed.
    if params['resume'] is None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.createCharVocab(
                params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.createWordVocab(
                params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.createAuthorIdx()
    else:
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_auth = saved_model['ix_to_auth']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)

    model = CharTranslator(params)
    # set to train mode, this activates dropout
    model.train()

    # Initialize the optimizer: SGD with momentum, or RMSprop. Note that
    # 'decay_rate' is used as the momentum for SGD and as alpha for RMSprop.
    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(),
                                lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        optim = torch.optim.RMSprop(model.parameters(),
                                    lr=params['learning_rate'],
                                    alpha=params['decay_rate'],
                                    eps=params['smooth_eps'])
    # Loss function
    if params['mode'] == 'generative':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.NLLLoss()

    # Restore saved checkpoint
    if params['resume'] is not None:
        model.load_state_dict(saved_model['state_dict'])
        optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])
    # Initialize the cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    best_val = 1000.
    # Clamp to at least 1: a small 'eval_interval' would otherwise truncate
    # to 0 and make `i % eval_every` raise ZeroDivisionError.
    eval_every = max(1, int(iter_per_epoch * params['eval_interval']))

    # Placeholders used in checkpoints written before the first evaluation.
    val_score = 0.
    val_rank = 1000

    eval_function = eval_translator if params[
        'mode'] == 'generative' else eval_classify

    print(total_iters)
    for i in xrange(total_iters):
        # TODO
        if params['split_generators']:
            c_aid = ix_to_auth[np.random.choice(auth_to_ix.values())]
        else:
            c_aid = None

        batch = dp.get_sentence_batch(params['batch_size'],
                                      split='train',
                                      atoms=params['atoms'],
                                      aid=c_aid,
                                      sample_by_len=params['sample_by_len'])
        inps, targs, auths, lens = dp.prepare_data(
            batch, char_to_ix, auth_to_ix, maxlen=params['max_seq_len'])

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()
        # TODO
        if params['mode'] == 'generative':
            # The same inputs/lengths are passed twice and the hidden state
            # always starts from zeros in this mode.
            output, _ = model.forward_mltrain(inps,
                                              lens,
                                              inps,
                                              lens,
                                              hidden_zeros,
                                              auths=auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0], targets[0])
        else:
            # for classifier auths is the target
            output, hidden = model.forward_classify(inps,
                                                    hidden,
                                                    compute_softmax=True)
            targets = Variable(auths).cuda()
            loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])

        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]

        # Periodically evaluate on the validation split.
        # NOTE(review): if the eval functions switch the model to eval mode,
        # confirm they switch it back; train mode is only set once above.
        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp,
                                                model,
                                                params,
                                                char_to_ix,
                                                auth_to_ix,
                                                split='val')

        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            elapsed = time.time() - start_time
            # Read the interval from `params`; `args` is not defined in this
            # function.
            print(
                '| epoch {:2.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    float(i) / iter_per_epoch, i, total_iters,
                    params['learning_rate'],
                    elapsed * 1000 / params['log_interval'], cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0.

            # Checkpoint whenever the latest validation rank is at least as
            # good as the best one seen so far (lower is better here).
            if val_rank <= best_val:
                save_checkpoint(
                    {
                        'iter': i,
                        'arch': params,
                        'val_loss': val_rank,
                        'val_pplx': val_score,
                        'char_to_ix': char_to_ix,
                        'ix_to_char': ix_to_char,
                        'auth_to_ix': auth_to_ix,
                        'ix_to_auth': ix_to_auth,
                        'state_dict': model.state_dict(),
                        'loss': cur_loss,
                        'optimizer': optim.state_dict(),
                    },
                    fappend=params['fappend'],
                    outdir=params['checkpoint_output_directory'])
                best_val = val_rank
            start_time = time.time()
Ejemplo n.º 2
0
def _rank_metrics(conf, target, topk):
    """Return (mean 0-based rank, top-k accuracy %, top-1 accuracy %) of the true class."""
    # Column of the true class once classes are sorted by descending score,
    # so rank 0 means the true class received the highest score.
    ranks = np.where(conf.argsort(axis=1)[:, ::-1] == target[:, None])[1]
    n_docs = len(target)
    mean_rank = ranks.mean()
    # Ranks are 0-based, so "within the top k" is `rank < k`; `<=` would
    # count k + 1 positions.
    topk_acc = (ranks < topk).sum() * 100. / n_docs
    accuracy = 100. * float((conf.argmax(axis=1) == target).sum()) / n_docs
    return mean_rank, topk_acc, accuracy


def _binary_eval(conf, target, n_auths):
    """Return (AUC, accuracy %) of true-author vs. random-author adjusted scores."""
    n_docs = conf.shape[0]
    # One random author per document serves as the negative; it is drawn
    # uniformly, so it can coincide with the true author.
    neg_auths = np.random.randint(0, n_auths, n_docs)
    columns = np.concatenate([target.astype(int), neg_auths])
    doc_ids = np.concatenate([np.arange(n_docs), np.arange(n_docs)])
    # Adjusted score of a (document, author) pair: the position of the
    # document among all documents sorted by ascending score for that author,
    # scaled into (0, 1].
    adjusted = ((np.argsort(conf[:, columns], axis=0) == doc_ids).argmax(
        axis=0) + 1) / float(n_docs)
    labels = np.concatenate(
        [np.ones(n_docs, dtype=int),
         np.zeros(n_docs, dtype=int)])
    auc = roc_auc_score(labels, adjusted)
    # First half are the positives (true author), second half the negatives.
    adjusted_acc = 100. * ((adjusted[:n_docs] >= 0.5).sum() +
                           (adjusted[n_docs:] < 0.5).sum()) / (2. * n_docs)
    return auc, adjusted_acc


def _print_split_report(title, accuracy, mean_rank, n_auths, topk, topk_acc,
                        adjusted_acc, auc):
    """Print the metric summary for one data split."""
    print(title)
    print('Accuracy is %.2f, Mean rank is %.2f / %d' %
          (accuracy, mean_rank, n_auths))
    print('Top-%d Accuracy is %.2f' % (topk, topk_acc))
    print('Accuracy per adjusted scores %.3f' % adjusted_acc)
    print('AUC is  %.2f' % auc)


def main(params):
    """Train and evaluate a bag-of-words author classifier.

    Documents are lower-cased, stripped of digits and tokenized; a vocabulary
    is built from the 'train' split, bag-of-words features are computed
    (optionally reduced with PCA) and normalized with training statistics,
    and either a LinearSVC or an MLP classifier is fit. Accuracy, mean rank,
    top-k accuracy and a binary true-vs-random-author AUC are printed for
    the train and val splits.

    Args:
        params: dict of options. Keys read here: 'nostop', 'nopunct',
            'vocab_threshold', 'tfidf', 'pca', 'mlp', 'linearsvm', 'topk',
            and for the MLP 'epochs', 'lr' and 'l2'. 'pca' is overwritten
            with the reduced dimensionality when PCA runs;
            'num_output_layers' and 'inp_size' are written for the MLP.

    Raises:
        ValueError: if neither 'mlp' nor 'linearsvm' is enabled, since no
            classifier would be available to evaluate.
    """
    # Fail fast, before the expensive preprocessing, instead of hitting an
    # unbound classifier variable at evaluation time.
    if not params['mlp'] and not params['linearsvm']:
        raise ValueError(
            "No classifier selected: enable params['mlp'] or "
            "params['linearsvm']")

    dp = DataProvider(params)
    # NOTE(review): the result is used as a single author->index mapping
    # (len() below); confirm createAuthorIdx does not return a tuple here.
    auth_to_ix = dp.createAuthorIdx()

    # Tokens to drop. Digits need no entry because the regex below already
    # strips them from the text before tokenization.
    stop_tokens = set()
    if params['nostop']:
        stop_tokens.update(stopwords.words('english'))
    if params['nopunct']:
        stop_tokens.update(string.punctuation)

    # Tokenize every document and count words over the training split only.
    all_words = Counter()
    for i, doc in enumerate(dp.data['docs']):
        no_num = re.sub(r'\d+', '', doc['text'].lower())
        curr_text = [
            w for w in wordpunct_tokenize(no_num) if w not in stop_tokens
        ]
        dp.data['docs'][i]['tokenized'] = curr_text
        if doc['split'] == 'train':
            all_words.update(curr_text)

    # Keep only words seen more than 'vocab_threshold' times in training.
    frequent_words = [
        wrd for wrd in all_words
        if all_words[wrd] > params['vocab_threshold']
    ]
    short_vocab = {w: i for i, w in enumerate(frequent_words)}

    docCounts_train, target_train = count(dp,
                                          short_vocab,
                                          auth_to_ix,
                                          split='train')
    bow_features_train, idf_train = bow_features(docCounts_train,
                                                 params['tfidf'])

    # Validation features reuse the idf computed on the training split.
    docCounts_val, target_val = count(dp, short_vocab, auth_to_ix, split='val')
    bow_features_val, _ = bow_features(docCounts_val,
                                       params['tfidf'],
                                       idf=idf_train)

    # Optional PCA, fit on the training features only.
    if params['pca'] > 0:
        pca_model = PCA(n_components=params['pca'])
        bow_features_train = pca_model.fit_transform(bow_features_train)
        print('Explained variance is %.2f' %
              sum(pca_model.explained_variance_ratio_))

        bow_features_val = pca_model.transform(bow_features_val)
        params['pca'] = bow_features_train.shape[-1]

    # Normalize the data with the training statistics.
    bow_features_train, mean_tr, std_tr = normalize(bow_features_train)
    bow_features_val, _, _ = normalize(bow_features_val, mean_tr, std_tr)

    if not params['mlp']:
        # Linear SVC already implements one-vs-rest
        classifier = LinearSVC()
        classifier.fit(bow_features_train, target_train)
    else:
        params['num_output_layers'] = len(auth_to_ix)
        # Use the real feature width; params['pca'] is 0 when PCA is
        # disabled and would give the MLP an empty input layer.
        params['inp_size'] = bow_features_train.shape[-1]
        classifier = MLP_classifier(params)
        classifier.fit(bow_features_train, target_train, bow_features_val,
                       target_val, params['epochs'], params['lr'],
                       params['l2'])

    # Time to evaluate now.
    confTr = classifier.decision_function(bow_features_train)
    confVal = classifier.decision_function(bow_features_val)

    n_auths = len(auth_to_ix)
    topk = params['topk']

    mean_rank_train, topk_train, train_accuracy = _rank_metrics(
        confTr, target_train, topk)
    mean_rank_val, topk_val, val_accuracy = _rank_metrics(
        confVal, target_val, topk)

    # Binary evaluation similar to Bagnall: true author vs. a random author.
    auc_tr, adjusted_acc_tr = _binary_eval(confTr, target_train, n_auths)
    auc_val, adjusted_acc_val = _binary_eval(confVal, target_val, n_auths)

    _print_split_report('------------- Training set-------------------',
                        train_accuracy, mean_rank_train, n_auths, topk,
                        topk_train, adjusted_acc_tr, auc_tr)
    _print_split_report('------------- Val set-------------------',
                        val_accuracy, mean_rank_val, n_auths, topk, topk_val,
                        adjusted_acc_val, auc_val)

    print('--------------------------------------------------------------------------')
    print('--------------------------------------------------------------------------\n\n')