def main(params):

    # Create vocabulary and author index
    saved_model = torch.load(params['model'])
    if 'misc' in saved_model:
        misc = saved_model['misc']
        char_to_ix = misc['char_to_ix']
        auth_to_ix = misc['auth_to_ix']
        ix_to_char = misc['ix_to_char']
        ix_to_auth = misc['ix_to_auth']
    else:
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_char = saved_model['ix_to_char']
        ix_to_auth = saved_model['ix_to_auth']
    cp_params = saved_model['arch']
    if params['softmax_scale']:
        cp_params['softmax_scale'] = params['softmax_scale']

    dp = DataProvider(cp_params)

    if params['m_type'] == 'generative':
        model = CharLstm(cp_params)
    else:
        model = CharTranslator(cp_params)
    # set to train mode, this activates dropout
    model.eval()
    auth_colors = ['red', 'blue']

    startc = dp.data['configs']['start']
    endc = dp.data['configs']['end']

    append_tensor = np.zeros((1, 1), dtype=np.int)
    append_tensor[0, 0] = char_to_ix[startc]
    append_tensor = torch.LongTensor(append_tensor).cuda()

    # Restore saved checkpoint
    model.load_state_dict(saved_model['state_dict'])
    hidden = model.init_hidden(1)
    jc = '' if cp_params.get('atoms','char') == 'char' else ' '

    for i in xrange(params['num_samples']):
        c_aid = np.random.choice(auth_to_ix.values())
        if params['m_type'] == 'generative':
            batch = dp.get_random_string(slen = params['seed_length'], split=params['split'])
        else:
            batch = dp.get_sentence_batch(1,split=params['split'], atoms=cp_params.get('atoms','char'), aid=ix_to_auth[c_aid])

        inps, targs, auths, lens = dp.prepare_data(batch, char_to_ix, auth_to_ix, maxlen=cp_params['max_seq_len'])
        auths_inp = 1 - auths if params['flip'] else auths
        outs = adv_forward_pass(model, inps, lens, end_c=char_to_ix[endc], maxlen=cp_params['max_seq_len'], auths=auths_inp,
                     cycle_compute=params['show_rev'], append_symb=append_tensor)
        #char_outs = model.forward_gen(inps, hidden, auths_inp, n_max = cp_params['max_len'],end_c=char_to_ix['.'])
        print '--------------------------------------------'
        #print 'Translate from %s to %s'%(batch[0]['author'], ix_to_auth[auths_inp[0]])
        print colored('Inp %6s: '%(ix_to_auth[auths[0]]),'green') + colored('%s'%(jc.join([ix_to_char[c[0]] for c in inps[1:]])),auth_colors[auths[0]])
        print colored('Out %6s: '%(ix_to_auth[auths_inp[0]]),'grey')+ colored('%s'%(jc.join([ix_to_char[c.data.cpu()[0]] for c in outs[0] if c.data.cpu()[0] in ix_to_char])),auth_colors[auths_inp[0]])

        if params['show_rev']:
            print colored('Rev %6s: '%(ix_to_auth[auths[0]]),'green')+ colored('%s'%(jc.join([ix_to_char[c.data.cpu()[0]] for c in outs[-1] if c.data.cpu()[0] in ix_to_char])),auth_colors[auths[0]])
def main(params):

    dp = DataProvider(params)

    # Create vocabulary and author index
    if params['resume'] == None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.createCharVocab(
                params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.createWordVocab(
                params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.createAuthorIdx()
    else:
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_auth = saved_model['ix_to_auth']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)

    model = CharTranslator(params)
    # set to train mode, this activates dropout
    model.train()
    #Initialize the RMSprop optimizer

    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(),
                                lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        optim = torch.optim.RMSprop(model.parameters(),
                                    lr=params['learning_rate'],
                                    alpha=params['decay_rate'],
                                    eps=params['smooth_eps'])
    # Loss function
    if params['mode'] == 'generative':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.NLLLoss()

    # Restore saved checkpoint
    if params['resume'] != None:
        model.load_state_dict(saved_model['state_dict'])
        optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])
    # Initialize the cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    best_loss = 1000000.
    best_val = 1000.
    eval_every = int(iter_per_epoch * params['eval_interval'])

    #val_score = eval_model(dp, model, params, char_to_ix, auth_to_ix, split='val', max_docs = params['num_eval'])
    val_score = 0.  #eval_model(dp, model, params, char_to_ix, auth_to_ix, split='val', max_docs = params['num_eval'])
    val_rank = 1000

    eval_function = eval_translator if params[
        'mode'] == 'generative' else eval_classify
    leakage = 0.  #params['leakage']

    print total_iters
    for i in xrange(total_iters):
        #TODO
        if params['split_generators']:
            c_aid = ix_to_auth[np.random.choice(auth_to_ix.values())]
        else:
            c_aid = None

        batch = dp.get_sentence_batch(params['batch_size'],
                                      split='train',
                                      atoms=params['atoms'],
                                      aid=c_aid,
                                      sample_by_len=params['sample_by_len'])
        inps, targs, auths, lens = dp.prepare_data(
            batch, char_to_ix, auth_to_ix, maxlen=params['max_seq_len'])
        # Reset the hidden states for which new docs have been sampled

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()
        #TODO
        if params['mode'] == 'generative':
            output, _ = model.forward_mltrain(inps,
                                              lens,
                                              inps,
                                              lens,
                                              hidden_zeros,
                                              auths=auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0], targets[0])
        else:
            # for classifier auths is the target
            output, hidden = model.forward_classify(inps,
                                                    hidden,
                                                    compute_softmax=True)
            targets = Variable(auths).cuda()
            loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])

        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]

        # Save the hidden states in cache for later use
        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp,
                                                model,
                                                params,
                                                char_to_ix,
                                                auth_to_ix,
                                                split='val')

        #if i % iter_per_epoch == 0 and i > 0 and leakage > params['leakage_min']:
        #    leakage = leakage * params['leakage_decay']

        #if (i % iter_per_epoch == 0) and ((i//iter_per_epoch) >= params['lr_decay_st']):
        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            elapsed = time.time() - start_time
            print(
                '| epoch {:2.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    float(i) / iter_per_epoch, i, total_iters,
                    params['learning_rate'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0.

            if val_rank <= best_val:
                save_checkpoint(
                    {
                        'iter': i,
                        'arch': params,
                        'val_loss': val_rank,
                        'val_pplx': val_score,
                        'char_to_ix': char_to_ix,
                        'ix_to_char': ix_to_char,
                        'auth_to_ix': auth_to_ix,
                        'ix_to_auth': ix_to_auth,
                        'state_dict': model.state_dict(),
                        'loss': cur_loss,
                        'optimizer': optim.state_dict(),
                    },
                    fappend=params['fappend'],
                    outdir=params['checkpoint_output_directory'])
                best_val = val_rank
            start_time = time.time()
Exemple #3
0
def main(params):
    dp = DataProvider(params)

    # Create vocabulary and author index
    if params['resume'] == None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.create_char_vocab(
                params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.create_word_vocab(
                params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.create_author_idx()
    else:
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)
    print
    params['vocabulary_size'], params['num_output_layers']

    model = get_classifier(params)
    # set to train mode, this activates dropout
    model.train()
    # Initialize the RMSprop optimizer

    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(),
                                lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        optim = torch.optim.RMSprop([{
            'params':
            [p[1] for p in model.named_parameters() if p[0] != 'decoder_W']
        }, {
            'params': model.decoder_W,
            'weight_decay': 0.000
        }],
                                    lr=params['learning_rate'],
                                    alpha=params['decay_rate'],
                                    eps=params['smooth_eps'])
    # Loss function
    if len(params['balance_loss']) == 0:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(
            torch.FloatTensor(params['balance_loss']).cuda())

    # Restore saved checkpoint
    if params['resume'] != None:
        model.load_state_dict(saved_model['state_dict'])
        # optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    class_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])
    # Initialize the cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    best_loss = 0.
    best_val = 1000.
    eval_every = int(iter_per_epoch * params['eval_interval'])

    # val_score = eval_model(dp, model, params, char_to_ix, auth_to_ix, split='val', max_docs = params['num_eval'])
    val_score = 0.  # eval_model(dp, model, params, char_to_ix, auth_to_ix, split='val', max_docs = params['num_eval'])
    val_rank = 0

    eval_function = eval_model if params[
        'mode'] == 'generative' else eval_classify

    leakage = params['leakage']
    for i in xrange(total_iters):
        # TODO
        if params['randomize_batches']:
            batch, reset_next = dp.get_rand_doc_batch(params['batch_size'],
                                                      split='train')
            b_ids = [b['id'] for b in batch]
            hidden = dp.get_hid_cache(b_ids, hidden)
        elif params['use_sentences']:
            c_aid = None  # ix_to_auth[np.random.choice(auth_to_ix.values())]
            batch = dp.get_sentence_batch(
                params['batch_size'],
                split='train',
                aid=c_aid,
                atoms=params['atoms'],
                sample_by_len=params['sample_by_len'])
            hidden = hidden_zeros
        else:
            batch, reset_h = dp.get_doc_batch(split='train')
            if len(reset_h) > 0:
                hidden[0].data.index_fill_(1,
                                           torch.LongTensor(reset_h).cuda(),
                                           0.)
                hidden[1].data.index_fill_(1,
                                           torch.LongTensor(reset_h).cuda(),
                                           0.)

        inps, targs, auths, lens = dp.prepare_data(batch,
                                                   char_to_ix,
                                                   auth_to_ix,
                                                   leakage=leakage)

        # Reset the hidden states for which new docs have been sampled

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()

        # TODO
        if params['mode'] == 'generative':
            output, hidden = model.forward(inps, lens, hidden, auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0], targets[0])
        else:
            # for classifier auths is the target
            output, _ = model.forward_classify(targs,
                                               hidden,
                                               compute_softmax=False,
                                               lens=lens)
            targets = Variable(auths).cuda()
            lossClass = criterion(output, targets)
            if params['compression_layer']:
                loss = lossClass + (model.compression_W.weight.norm(
                    p=1, dim=1)).mean()
            else:
                loss = lossClass
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])

        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]
        class_loss += lossClass.data.cpu().numpy()[0]

        # Save the hidden states in cache for later use
        if params['randomize_batches']:
            if len(reset_next) > 0:
                hidden[0].data.index_fill_(1,
                                           torch.LongTensor(reset_next).cuda(),
                                           0.)
                hidden[1].data.index_fill_(1,
                                           torch.LongTensor(reset_next).cuda(),
                                           0.)
            dp.set_hid_cache(b_ids, hidden)

        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp,
                                                model,
                                                params,
                                                char_to_ix,
                                                auth_to_ix,
                                                split='val',
                                                max_docs=params['num_eval'])

        if i % iter_per_epoch == 0 and i > 0 and leakage > params[
                'leakage_min']:
            leakage = leakage * params['leakage_decay']

        # if (i % iter_per_epoch == 0) and ((i//iter_per_epoch) >= params['lr_decay_st']):
        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            class_loss = class_loss / params['log_interval']
            elapsed = time.time() - start_time
            print(
                '| epoch {:3.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    float(i) / iter_per_epoch, i, total_iters,
                    params['learning_rate'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(class_loss)))

            if val_rank >= best_loss:
                best_loss = val_rank
                save_checkpoint(
                    {
                        'iter': i,
                        'arch': params,
                        'val_mean_rank': val_rank,
                        'val_auc': val_score,
                        'char_to_ix': char_to_ix,
                        'ix_to_char': ix_to_char,
                        'auth_to_ix': auth_to_ix,
                        'state_dict': model.state_dict(),
                        'loss': cur_loss,
                        'optimizer': optim.state_dict(),
                    },
                    fappend=params['fappend'],
                    outdir=params['checkpoint_output_directory'])
                best_val = val_rank
            start_time = time.time()
            total_loss = 0.
            class_loss = 0.
Exemple #4
0
def main(params):
    # Create vocabulary and author index
    saved_model = torch.load(params['model'])
    if 'misc' in saved_model:
        misc = saved_model['misc']
        char_to_ix = misc['char_to_ix']
        auth_to_ix = misc['auth_to_ix']
        ix_to_char = misc['ix_to_char']
        ix_to_auth = misc['ix_to_auth']
    else:
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_char = saved_model['ix_to_char']
        ix_to_auth = saved_model['ix_to_auth']
    cp_params = saved_model['arch']
    if params['softmax_scale']:
        cp_params['softmax_scale'] = params['softmax_scale']

    dp = DataProvider(cp_params)

    if params['m_type'] == 'generative':
        model = CharLstm(cp_params)
    else:
        model = CharTranslator(cp_params)
    # set to train mode, this activates dropout
    model.eval()
    auth_colors = ['red', 'blue']

    startc = dp.data['configs']['start']
    endc = dp.data['configs']['end']

    append_tensor = np.zeros((1, 1), dtype=np.int)
    append_tensor[0, 0] = char_to_ix[startc]
    append_tensor = torch.LongTensor(append_tensor).cuda()

    # Restore saved checkpoint
    model.load_state_dict(saved_model['state_dict'])
    hidden = model.init_hidden(1)
    jc = '' if cp_params.get('atoms', 'char') == 'char' else ' '

    for i in range(params['num_samples']):
        c_aid = np.random.choice(list(auth_to_ix.values()))
        if params['m_type'] == 'generative':
            batch = dp.get_random_string(slen=params['seed_length'],
                                         split=params['split'])
        else:
            batch = dp.get_sentence_batch(1,
                                          split=params['split'],
                                          atoms=cp_params.get('atoms', 'char'),
                                          aid=ix_to_auth[c_aid])

        inps, targs, auths, lens = dp.prepare_data(
            batch, char_to_ix, auth_to_ix, maxlen=cp_params['max_seq_len'])
        auths_inp = 1 - auths if params['flip'] else auths
        forward, backward = adv_forward_pass(model,
                                             inps,
                                             lens,
                                             end_c=char_to_ix[endc],
                                             maxlen=cp_params['max_seq_len'],
                                             auths=auths_inp,
                                             cycle_compute=params['show_rev'],
                                             append_symb=append_tensor)
        # char_outs = model.forward_gen(inps, hidden, auths_inp, n_max = cp_params['max_len'],end_c=char_to_ix['.'])
        print('--------------------------------------------')
        print('Translate from %s to %s' %
              (batch[0]['author'], ix_to_auth[auths_inp.item()]))

        # General helper functions

        # Clears whitespace but retains character for re.sub
        def strip_match(match):
            return match.group(0).strip()

        # Joins together decimals
        def fix_decimals(match):
            match = match.group(0)
            return re.sub('\s', '', match)

        # Cleans text by removing unnecessary whitespace and substituting back in some symbols
        def clean_text(text):
            text = re.sub('-lrb-', '(', text)
            text = re.sub('-rrb-', ')', text)
            text = re.sub('-lsb-', '[', text)
            text = re.sub('-rsb-', ']', text)
            text = re.sub('-lcb-', '{', text)
            text = re.sub('-rcb-', '}', text)
            text = re.sub('\'\'', '\"', text)
            text = re.sub('\si\s', ' I ', text)
            text = re.sub('^i\s', 'I ', text)
            text = re.sub('\sna\s', 'na ', text)
            text = re.sub('\$\s', strip_match, text)
            text = re.sub('[-#]\s|\s([-.!,\':;?]|n\'t)', strip_match, text)
            text = re.sub('\d+. \d+', fix_decimals, text)
            return text

        # Get original sentence and clean it up a bit
        input_list = [ix_to_char[c.item()] for c in inps[1:]]
        input_string = jc.join(input_list)
        input_string = clean_text(input_string)

        # Get translated sentence and clean it up a bit
        output_list = [
            ix_to_char[c.item()] for c in forward if c.item() in ix_to_char
        ]
        if output_list[-1] == 'END':
            output_list = output_list[:-1]
        output_string = jc.join(output_list)
        output_string = clean_text(output_string)

        print(
            colored('Inp %6s: ' % (ix_to_auth[auths.item()]), 'green') +
            colored('%s' % input_string, auth_colors[auths.item()]))
        print(
            colored('Out %6s: ' % (ix_to_auth[auths_inp.item()]), 'grey') +
            colored('%s' % output_string, auth_colors[auths_inp.item()]))

        if params['show_rev']:
            print(
                colored('Rev %6s: ' % (ix_to_auth[auths.item()]), 'green') +
                colored(
                    '%s' % (jc.join([
                        ix_to_char[c.item()]
                        for c in backward if c.item() in ix_to_char
                        and ix_to_char[c.item()] != 'END'
                    ])), auth_colors[auths.item()]))