Ejemplo n.º 1
0
def load_data(config):
    """Create the training iterator and, when enabled, the validation one.

    Progress is reported on stderr.  Returns the pair
    ``(text_iterator, valid_text_iterator)``; the second item is ``None``
    when ``config.validFreq`` is falsy.
    """
    print >> sys.stderr, 'Reading data...',

    train_kwargs = dict(
        source=config.source_dataset,
        target=config.target_dataset,
        source_dicts=[config.source_vocab],
        target_dict=config.target_vocab,
        batch_size=config.batch_size,
        maxlen=config.maxlen,
        n_words_source=config.source_vocab_size,
        n_words_target=config.target_vocab_size,
        skip_empty=True,
        shuffle_each_epoch=config.shuffle_each_epoch,
        sort_by_length=config.sort_by_length,
        maxibatch_size=config.maxibatch_size,
        keep_data_in_memory=config.keep_train_set_in_memory)
    text_iterator = TextIterator(**train_kwargs)

    # Validation data is only read when validation is switched on.
    valid_text_iterator = None
    if config.validFreq:
        valid_kwargs = dict(
            source=config.valid_source_dataset,
            target=config.valid_target_dataset,
            source_dicts=[config.source_vocab],
            target_dict=config.target_vocab,
            batch_size=config.valid_batch_size,
            maxlen=config.validation_maxlen,
            n_words_source=config.source_vocab_size,
            n_words_target=config.target_vocab_size,
            shuffle_each_epoch=False,
            sort_by_length=True,
            maxibatch_size=config.maxibatch_size)
        valid_text_iterator = TextIterator(**valid_kwargs)

    print >> sys.stderr, 'Done'
    return text_iterator, valid_text_iterator
Ejemplo n.º 2
0
def main(model, dictionary, dictionary_target, source, target, context, outfile, wordbyword):
    """Score a data set with a saved model and store the result.

    Reads the options pickled at ``<model>.pkl``, rebuilds the model with its
    saved parameters, runs ``pred_probs`` over the given data and writes the
    returned values to ``outfile`` with ``numpy.save``.  ``wordbyword`` is
    accepted but not used in this function.
    """
    # restore the options stored next to the model
    with open('%s.pkl' % model, 'rb') as options_file:
        options = pkl.load(options_file)

    valid_iter = TextIterator(
        source, target, context,
        dictionary, dictionary_target,
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words'],
        batch_size=options['valid_batch_size'],
        maxlen=2000,
        shuffle=False,
        tc=options['kwargs'].get('tc', False))

    # allocate the parameters, overwrite them with the saved values and
    # turn them into theano shared variables
    tparams = init_tparams(load_params(model, init_params(options)))

    (trng, use_noise,
     x, x_mask, y, y_mask, xc, xc_mask,
     opt_ret,
     cost, cost_, xc_mask_2, xc_mask_3) = build_model(tparams, options)

    f_log_probs = theano.function(
        [x, x_mask, y, y_mask, xc, xc_mask, xc_mask_2, xc_mask_3],
        cost,
        profile=profile)

    valid_errs = pred_probs(f_log_probs, prepare_data, options, valid_iter, verbose=True)
    numpy.save(outfile, valid_errs)
Ejemplo n.º 3
0
def score_model(source_file, target_file, scorer_settings, options):
    """Run ``nmt.calc_loss_per_sentence`` on a parallel text for each model.

    For every entry of ``options`` a new graph and session are created, the
    model is built with ``nmt.create_model`` and the sentence pairs from
    ``source_file`` / ``target_file`` are scored.  Returns a list with one
    result per entry of ``options``.
    """
    scores = []
    for option in options:
        graph = tf.Graph()
        with graph.as_default(), tf.Session() as sess:
            model, _ = nmt.create_model(option, sess)

            iterator_kwargs = dict(
                source=source_file.name,
                target=target_file.name,
                source_dicts=option.source_dicts,
                target_dict=option.target_dict,
                batch_size=scorer_settings.b,
                maxlen=float('inf'),
                source_vocab_sizes=option.source_vocab_sizes,
                target_vocab_size=option.target_vocab_size,
                use_factor=(option.factors > 1),
                sort_by_length=False)
            text_iterator = TextIterator(**iterator_kwargs)

            scores.append(nmt.calc_loss_per_sentence(
                option,
                sess,
                text_iterator,
                model,
                normalization_alpha=scorer_settings.normalization_alpha))
    return scores
Ejemplo n.º 4
0
def score_model(source_file, target_file, scorer_settings, options):
    """Run ``validate`` on a parallel text once per entry of ``options``.

    Each entry gets its own session in which the model is built with
    ``create_model``.  Returns the list of ``validate`` results in the same
    order as ``options``.
    """
    scores = []
    for option in options:
        with tf.Session() as sess:
            model, _ = create_model(option, sess)

            iterator_kwargs = dict(
                source=source_file.name,
                target=target_file.name,
                source_dicts=option.source_dicts,
                target_dict=option.target_dict,
                batch_size=scorer_settings.b,
                maxlen=float('inf'),
                source_vocab_sizes=option.source_vocab_sizes,
                target_vocab_size=option.target_vocab_size,
                use_factor=(option.factors > 1),
                sort_by_length=False)
            valid_text_iterator = TextIterator(**iterator_kwargs)

            scores.append(validate(
                option,
                sess,
                valid_text_iterator,
                model,
                normalization_alpha=scorer_settings.normalization_alpha))

    return scores
Ejemplo n.º 5
0
def score_model(source_file, target_file, scorer_settings, options):
    """Run ``nmt.calc_loss_per_sentence`` on a parallel text for each model.

    Every entry of ``options`` is handled in its own graph and session
    (created with soft device placement allowed): an ``RNNModel`` is built,
    ``model_loader.init_or_restore_variables`` is called, and the sentence
    pairs are scored.  Returns a list with one result per entry.
    """
    scores = []
    for option in options:
        graph = tf.Graph()
        with graph.as_default():
            session_config = tf.ConfigProto()
            session_config.allow_soft_placement = True
            with tf.Session(config=session_config) as sess:
                logging.info('Building model...')
                model = rnn_model.RNNModel(option)
                model_loader.init_or_restore_variables(option, sess)

                iterator_kwargs = dict(
                    source=source_file.name,
                    target=target_file.name,
                    source_dicts=option.source_dicts,
                    target_dict=option.target_dict,
                    batch_size=scorer_settings.minibatch_size,
                    maxlen=float('inf'),
                    source_vocab_sizes=option.source_vocab_sizes,
                    target_vocab_size=option.target_vocab_size,
                    use_factor=(option.factors > 1),
                    sort_by_length=False)
                text_iterator = TextIterator(**iterator_kwargs)

                scores.append(nmt.calc_loss_per_sentence(
                    option,
                    sess,
                    text_iterator,
                    model,
                    normalization_alpha=scorer_settings.normalization_alpha))
    return scores
Ejemplo n.º 6
0
def load_data(config):
    """Create the training iterator and, when configured, the validation one.

    Returns the pair ``(text_iterator, valid_text_iterator)``.  The second
    item is ``None`` unless ``config.valid_freq`` and both validation data
    set paths are truthy.
    """
    logging.info('Reading data...')

    train_kwargs = dict(
        source=config.source_dataset,
        target=config.target_dataset,
        source_dicts=config.source_dicts,
        target_dict=config.target_dict,
        pretrain_dict=config.pretrain_vocab,
        model_type=config.model_type,
        batch_size=config.batch_size,
        maxlen=config.maxlen,
        source_vocab_sizes=config.source_vocab_sizes,
        target_vocab_size=config.target_vocab_size,
        skip_empty=True,
        shuffle_each_epoch=config.shuffle_each_epoch,
        sort_by_length=config.sort_by_length,
        use_factor=(config.factors > 1),
        utf8_type=config.utf8_type,
        maxibatch_size=config.maxibatch_size,
        token_batch_size=config.token_batch_size,
        keep_data_in_memory=config.keep_train_set_in_memory)
    text_iterator = TextIterator(**train_kwargs)

    wants_validation = (config.valid_freq
                        and config.valid_source_dataset
                        and config.valid_target_dataset)
    if not wants_validation:
        logging.info('no validation set loaded')
        valid_text_iterator = None
    else:
        valid_kwargs = dict(
            source=config.valid_source_dataset,
            target=config.valid_target_dataset,
            source_dicts=config.source_dicts,
            target_dict=config.target_dict,
            pretrain_dict=config.pretrain_vocab,
            model_type=config.model_type,
            batch_size=config.valid_batch_size,
            maxlen=config.maxlen,
            source_vocab_sizes=config.source_vocab_sizes,
            target_vocab_size=config.target_vocab_size,
            shuffle_each_epoch=False,
            sort_by_length=True,
            use_factor=(config.factors > 1),
            utf8_type=config.utf8_type,
            maxibatch_size=config.maxibatch_size,
            token_batch_size=config.valid_token_batch_size)
        valid_text_iterator = TextIterator(**valid_kwargs)

    logging.info('Done')
    return text_iterator, valid_text_iterator
Ejemplo n.º 7
0
def decode():
    """Decode the configured input with a restored model and write the output.

    The configuration comes from ``load_config(FLAGS)``.  With
    ``FLAGS.write_n_best`` set, one output file per beam (``<output>_<k>``)
    is written; otherwise a single file ``FLAGS.decode_output`` is written.
    An ``IOError`` during decoding is reported and ends decoding early
    instead of propagating; every file that was opened is closed either way.
    """
    # Load model config
    config = load_config(FLAGS)

    # Load source data to decode
    test_set = TextIterator(source=config['decode_input'],
                            batch_size=config['decode_batch_size'],
                            source_dict=config['source_vocabulary'],
                            maxlen=None,
                            n_words_source=config['num_encoder_symbols'])

    # Load inverse dictionary used in decoding
    target_inverse_dict = data_utils.load_inverse_dict(
        config['target_vocabulary'])

    # Initiate TF session
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement,
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:

        # Reload existing checkpoint
        model = load_model(sess, config)

        # Bound before the try block and filled one file at a time, so the
        # cleanup below always has a list to walk and closes exactly the
        # files that were opened, even if a later open fails.
        fout = []
        try:
            print('Decoding {}..'.format(FLAGS.decode_input))
            if FLAGS.write_n_best:
                for k in range(FLAGS.beam_width):
                    fout.append(data_utils.fopen(
                        "%s_%d" % (FLAGS.decode_output, k), 'w'))
            else:
                fout.append(data_utils.fopen(FLAGS.decode_output, 'w'))

            for idx, source_seq in enumerate(test_set):
                source, source_len = prepare_batch(source_seq)
                # predicted_ids: GreedyDecoder; [batch_size, max_time_step, 1]
                # BeamSearchDecoder; [batch_size, max_time_step, beam_width]
                predicted_ids = model.predict(sess,
                                              encoder_inputs=source,
                                              encoder_inputs_length=source_len)

                # Write decoding results
                for k, f in reversed(list(enumerate(fout))):
                    for seq in predicted_ids:
                        f.write(
                            str(
                                data_utils.seq2words(
                                    seq[:, k], target_inverse_dict)) + '\n')
                    if not FLAGS.write_n_best:
                        break
                print('  {}th line decoded'.format(idx *
                                                   FLAGS.decode_batch_size))

            print('Decoding terminated')
        except IOError as err:
            # Still best-effort (no re-raise), but no longer silent.
            print('Decoding aborted by I/O error: {}'.format(err))
        finally:
            for f in fout:
                f.close()
Ejemplo n.º 8
0
def test():
    """Evaluate an ensemble of saved ENLI models and print the metrics.

    The configuration, data prefix and model paths are hard-coded below.
    Three values returned by ``pred_acc_ensemble`` are printed.
    """
    config = {
        'use_gpu': True,
        'hidden_units': 400,
        'vocab': './train_data/enli.dict',
        'word_dim': 200,
        'gpu_id': 14,
        'dropout': 0.2,
        'n_word': 42394,
        'batch_size': 32
    }

    # load dictionary
    worddicts = joblib.load(config['vocab'])

    print('Loading data')

    prefix = './train_data/bs_new.utf8'
    test_iter = TextIterator('{}.query'.format(prefix),
                             '{}.title'.format(prefix),
                             '{}.label'.format(prefix),
                             dict=worddicts,
                             batch_size=config['batch_size'])

    model_paths = [
        '/home/disk0/wangqi38/pytorch-final/save_files/enli_0.2_400_shuffle.pkl',
        '/home/disk0/wangqi38/pytorch-final/save_files/enli_0.3_400.pkl'
    ]
    ensemble = []
    print('load models')
    if config['use_gpu']:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(config['gpu_id'])
    for path in model_paths:
        net = ENLI_Model(config)
        if config['use_gpu']:
            state = torch.load(path)
        else:
            # remap tensors saved from cuda:0 onto the CPU
            state = torch.load(path, map_location={'cuda:0': 'cpu'})
        net.load_state_dict(state)
        net.eval()
        ensemble.append(net)

    results = pred_acc_ensemble(ensemble, prepare_data, test_iter,
                                config['use_gpu'])
    print('muti test accuracy', results[0])
    print('bi test accuracy', results[1])
    print('test auc', results[2])

    print('finish')
Ejemplo n.º 9
0
def rescore_model(source_file, target_file, saveto, models, options, b, normalization_alpha, verbose, alignweights):
    """Score every (source, target) sentence pair with each model.

    One line per target sentence is written to ``saveto``: the scores of all
    models separated by spaces, preceded by the target sentence itself when
    ``verbose`` is set.  When ``alignweights`` is set, the alignments are
    also written to a temporary file that is passed to
    ``combine_source_target_text_1to1``.
    """

    trng = RandomStreams(1234)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, model in enumerate(models):
            f_log_probs = load_scorer(model, options[i], alignweights=alignweights)
            score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalization_alpha=normalization_alpha, alignweights = alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(source_file.name, target_file.name,
                    options[0]['dictionaries'][:-1], options[0]['dictionaries'][-1],
                     n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'],
                     batch_size=b,
                     maxlen=float('inf'),
                     sort_by_length=False) #TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str,[s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            # Iterate the alignments returned by _score; the name used here
            # before was never defined in this function.
            # NOTE(review): `alignments` holds one entry per model -- confirm
            # that each entry can be concatenated with "\n" as done below.
            for line in alignments:
                align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file, saveto.name, align_OUT)
Ejemplo n.º 10
0
def validate_helper(config, sess):
    model, saver = create_model(config, sess)
    valid_text_iterator = TextIterator(
        source=config.valid_source_dataset,
        target=config.valid_target_dataset,
        source_dicts=[config.source_vocab],
        target_dict=config.target_vocab,
        batch_size=config.valid_batch_size,
        maxlen=config.validation_maxlen,
        n_words_source=config.source_vocab_size,
        n_words_target=config.target_vocab_size,
        shuffle_each_epoch=False,
        sort_by_length=False,  #TODO
        maxibatch_size=config.maxibatch_size)
    costs = validate(sess, valid_text_iterator, model)
    lines = open(config.valid_target_dataset).readlines()
    for cost, line in zip(costs, lines):
        print cost, line.strip()
Ejemplo n.º 11
0
def get_error(model, test_src, test_target):
    """Log the mean ``pred_probs`` value of a saved model on a test set.

    The options are read from ``<model>.pkl`` and the parameters are loaded
    with ``load_params``.  The dictionary paths come from the stored options,
    joined with their ``baseDir``.  Nothing is returned; the mean is logged.
    """
    profile = False

    # reload options; the with-block closes the pickle file, which the
    # previous bare open() never did
    with open('%s.pkl' % model, 'rb') as f:
        model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)

    # reload parameters
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    dict_src = os.path.join(model_options['baseDir'],
                            model_options['dictionaries'][0])
    # with a single stored dictionary no target dictionary is passed
    if len(model_options['dictionaries']) == 1:
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'],
                                   model_options['dictionaries'][1])

    valid = TextIterator(test_src,
                         test_target,
                         dict_src,
                         dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s' % (str(valid_err)))
Ejemplo n.º 12
0
def validate_helper(config, sess):
    """Log the validation cost next to each target line.

    Builds the model with ``create_model``, runs ``validate`` over the
    validation set (kept in file order) and logs each returned cost together
    with the corresponding stripped line of ``config.valid_target_dataset``.
    """
    model, saver = create_model(config, sess)
    valid_text_iterator = TextIterator(
        source=config.valid_source_dataset,
        target=config.valid_target_dataset,
        source_dicts=config.source_dicts,
        target_dict=config.target_dict,
        batch_size=config.valid_batch_size,
        maxlen=config.maxlen,
        source_vocab_sizes=config.source_vocab_sizes,
        target_vocab_size=config.target_vocab_size,
        shuffle_each_epoch=False,
        sort_by_length=False,  #TODO
        use_factor=(config.factors > 1),
        maxibatch_size=config.maxibatch_size)
    costs = validate(config, sess, valid_text_iterator, model)
    # The with-block closes the file; a bare open(...).readlines() left the
    # handle to be reclaimed by the garbage collector.
    with open(config.valid_target_dataset) as target_file:
        lines = target_file.readlines()
    for cost, line in zip(costs, lines):
        logging.info("{0} {1}".format(cost, line.strip()))
Ejemplo n.º 13
0
def pretrain(config):
    """Create and return the iterator over the pretraining data.

    The source and target files are ``config.pretrain_dictionary_src`` and
    ``config.pretrain_dictionary_trg``; all other settings are taken from the
    regular training fields of ``config``.
    """
    logging.info('Reading pretrain data...')
    # A stray bare-name expression statement used to sit here; it had no
    # effect at best and raised NameError at worst, so it was removed.
    text_iterator = TextIterator(
        source=config.pretrain_dictionary_src,
        target=config.pretrain_dictionary_trg,
        source_dicts=config.source_dicts,
        target_dict=config.target_dict,
        batch_size=config.batch_size,
        maxlen=config.maxlen,
        source_vocab_sizes=config.source_vocab_sizes,
        target_vocab_size=config.target_vocab_size,
        skip_empty=True,
        shuffle_each_epoch=config.shuffle_each_epoch,
        sort_by_length=config.sort_by_length,
        use_factor=(config.factors > 1),
        maxibatch_size=config.maxibatch_size,
        token_batch_size=config.token_batch_size,
        keep_data_in_memory=config.keep_train_set_in_memory)
    logging.info('Done')
    return text_iterator
Ejemplo n.º 14
0
def _reshuffle_corpora(source_data, target_data):
    """Run shuffle.py on both files and move the ``.shuf`` outputs over them."""
    import shutil
    import subprocess

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact, and call() blocks until shuffle.py has finished,
    # so the files are never replaced while they are still being written.
    try:
        status = subprocess.call(
            ['python', 'shuffle.py', source_data, target_data])
    except OSError as err:
        print('could not run shuffle.py ({}); keeping current order'.format(err))
        return
    if status != 0:
        print('shuffle.py exited with status {}; keeping current order'.format(
            status))
        return
    for path in (source_data, target_data):
        shutil.move(path + '.shuf', path)


def gen_force_train_iter(source_data, target_data, reshuffle, source_dict,
                         target_dict, batch_size, maxlen, n_words_src,
                         n_words_trg):
    """Yield ``(x, y, epoch)`` batches forever, one pass over the data per epoch.

    Each pass builds a new ``TextIterator`` over the two files.  With
    ``reshuffle`` set, the files are shuffled in place by ``shuffle.py``
    before every pass; a failing shuffle is reported and the pass continues
    with the current order.  Batches in which both ``x`` and ``y`` hold fewer
    than ``batch_size`` items are skipped.  ``epoch`` starts at 0.
    """
    epoch = 0
    while True:
        if reshuffle:
            _reshuffle_corpora(source_data, target_data)
        gen_force_train = TextIterator(source_data, target_data, source_dict,
                                       target_dict, batch_size, maxlen,
                                       n_words_src, n_words_trg)
        ExampleNum = 0
        EpochStart = time.time()
        for x, y in gen_force_train:
            if len(x) < batch_size and len(y) < batch_size:
                continue
            ExampleNum += len(x)
            yield x, y, epoch
        TimeCost = time.time() - EpochStart
        epoch += 1
        print('Seen', ExampleNum, 'generator samples. Time cost is ', TimeCost)
Ejemplo n.º 15
0
    worddicts = pkl.load(f)
n_words = len(worddicts)
wv_dict, wv_arr, wv_size = load_word_vectors(embedding_path, 'glove.840B',
                                             dim_word)
# Start from norm_weight's matrix, then overwrite one row per vocabulary word.
pretrained_emb = norm_weight(n_words, dim_word)
for word, word_index in worddicts.items():
    try:
        pretrained_emb[word_index] = wv_arr[wv_dict[word]].numpy()
    except KeyError:
        # Word missing from wv_dict: draw its row from N(0, 1) instead.
        # Only the lookup failure is caught, so unrelated errors (and
        # Ctrl-C) are no longer swallowed as they were by a bare except.
        pretrained_emb[word_index] = torch.normal(torch.zeros(dim_word),
                                                  std=1).numpy()
print('load data...')
train = TextIterator(datasets[0],
                     datasets[1],
                     datasets[2],
                     dictionary,
                     n_words=n_words,
                     batch_size=batch_size,
                     maxlen=maxlen,
                     shuffle=True)
# The test iterator keeps file order and is built without a maxlen argument.
test = TextIterator(test_datasets[0],
                    test_datasets[1],
                    test_datasets[2],
                    dictionary,
                    n_words=n_words,
                    batch_size=batch_size,
                    shuffle=False)
criterion = torch.nn.CrossEntropyLoss()
model = ESIM(dim_word, 2, n_words, dim_word, pretrained_emb)
# Move both the model and the loss to the GPU when one is available.
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()
Ejemplo n.º 16
0
def multi_rescore_model(source_file,
                        target_file,
                        savetos,
                        models,
                        options,
                        b,
                        normalization_alpha,
                        verbose,
                        alignweights,
                        extra_sources=None,
                        per_word=False):
    """Score multi-source sentence pairs with each model and write the scores.

    One line per target sentence is written to ``savetos[0]``: with
    ``per_word`` set, the per-word costs of the first model (cut to the
    number of space-separated target tokens plus one); otherwise the scores
    of all models separated by spaces.  ``verbose`` prefixes each line with
    the target sentence.  With ``alignweights`` set, each alignment set is
    written to a temporary file and combined with its source file (the main
    source first, then ``extra_sources`` in order) through
    ``combine_source_target_text_1to1``.

    ``extra_sources`` is an optional list of additional source file objects;
    ``None`` (the default) means there are none.
    """
    # None replaces the former mutable [] default so that no list object is
    # shared between calls.
    if extra_sources is None:
        extra_sources = []

    trng = RandomStreams(1234)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        costs_per_word = []
        for i, model in enumerate(models):
            f_log_probs = load_scorer(model,
                                      options[i],
                                      alignweights=alignweights)
            score, all_alignments, cost_per_word = multi_pred_probs(
                f_log_probs,
                prepare_multi_data,
                options[i],
                pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)

            scores.append(score)

            costs_per_word.append(cost_per_word)

        # only the alignments produced for the last model are returned
        return scores, tuple(all_alignments), costs_per_word

    # list of sources + target sentences (target sentences are the final list)
    # TODO: make TextIterator generic
    sents = TextIterator(source_file.name,
                         target_file.name,
                         options[0]['dictionaries'][:-1],
                         options[0]['dictionaries'][-1],
                         n_words_source=options[0]['n_words_src'],
                         n_words_target=options[0]['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False,
                         extra_sources=[ss.name for ss in extra_sources])
    # TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, all_alignments, costs_per_word = _score(sents, alignweights)

    # The source files are rewound and read in full here although the lines
    # are not used below; this is kept so the file positions seen by later
    # code stay the same.
    source_lines = []
    source_file.seek(0)
    source_lines.append([source_file.readlines()])

    extra_source_lines = []
    for ss in extra_sources:
        ss.seek(0)
        extra_source_lines.append([ss.readlines()])

    target_file.seek(0)
    target_lines = target_file.readlines()

    # print out scores for each translation
    for i, line in enumerate(target_lines):
        if per_word:
            word_costs = list(costs_per_word[0][i])
            score_str = ' '.join(
                map(str, word_costs[:len(line.split(" ")) + 1]))
        else:
            score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            savetos[0].write('{0} '.format(line.strip()))
        savetos[0].write('{0}\n'.format(score_str))

    # optional save weights mode.
    if alignweights:
        for i, alignments in enumerate(all_alignments):
            # write out the alignments.
            temp_name = savetos[i].name + str(i) + ".json"
            with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
                for line in alignments:
                    align_OUT.write(line + "\n")
                # combine the actual source and target words; alignment set 0
                # goes with the main source, set i with extra_sources[i - 1]
                if i == 0:
                    tmp_srcfile = source_file
                else:
                    tmp_srcfile = extra_sources[i - 1]
                combine_source_target_text_1to1(tmp_srcfile,
                                                target_file,
                                                savetos[i].name,
                                                align_OUT,
                                                suffix=str(i))
Ejemplo n.º 17
0
def rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights):
    """Rescore an n-best list with each model and append the scores.

    Every line of ``nbest_file`` is split on ``' ||| '``: field 0 is the
    0-based index of the sentence in ``source_file`` and field 1 is the
    hypothesis.  Each n-best line is written to ``saveto`` followed by the
    scores of all models.  With ``alignweights`` set, the alignments are also
    written to a temporary file that is passed to
    ``combine_source_target_text``.  ``verbose`` is accepted but not used in
    this function.
    """

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):

        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        # entries whose name starts with 'adam_' are left out
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write(
                "\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs,
                                          prepare_data,
                                          options[i],
                                          pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    if alignweights:  ### opening the temporary file.
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    with tempfile.NamedTemporaryFile(
            prefix='rescore-tmpin') as tmp_in, tempfile.NamedTemporaryFile(
                prefix='rescore-tmpout') as tmp_out:
        # Expand the n-best list into two line-aligned files: the source
        # sentence of each hypothesis and the hypothesis itself.
        for line in nbest_lines:
            linesplit = line.split(' ||| ')
            idx = int(
                linesplit[0])  ##index from the source file. Starting from 0.
            tmp_in.write(lines[idx])
            tmp_out.write(linesplit[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)
        # The target dictionary is the LAST entry: everything before it is
        # passed as the source dictionaries ([:-1]).  Index 1 only matched
        # when exactly two dictionaries were stored.
        pairs = TextIterator(
            tmp_in.name,
            tmp_out.name,
            options[0]['dictionaries'][:-1],
            options[0]['dictionaries'][-1],
            n_words_source=options[0]['n_words_src'],
            n_words_target=options[0]['n_words'],
            batch_size=b,
            maxlen=float('inf'),
            sort_by_length=False
        )  #TODO: sorting by length could be more efficient, but we'd have to synchronize scores with n-best list after

        scores, alignments = _score(pairs, alignweights)

        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str, [s[i] for s in scores]))
            saveto.write('{0} {1}\n'.format(line.strip(), score_str))

        ### optional save weights mode.
        if alignweights:
            for line in alignments:
                align_OUT.write(line + "\n")
    if alignweights:
        combine_source_target_text(source_file, nbest_file, saveto.name,
                                   align_OUT)
        align_OUT.close()
Ejemplo n.º 18
0
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of GRU units
          encoder='gru',
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 weight decay penalty
          lrate=0.01,
          n_words=100000,  # vocabulary size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq
          dataset='/data/lisatmp4/anirudhg/wiki.tok.txt.gz',
          valid_dataset='/data/lisatmp4/anirudhg/newstest2011.en.tok',
          dictionary='/data/lisatmp4/anirudhg/wiki.tok.txt.gz.pkl',
          use_dropout=False,
          reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionary
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)

    # invert dictionary
    worddicts_r = dict()
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    # create shared variables for parameters
    tparams = init_tparams(params)

    # build the symbolic computational graph
    trng, use_noise, \
        x, x_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask]

    print 'Buliding sampler'
    f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    # Training loop
    uidx = 0
    estop = False
    bad_counter = 0
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # pad batch and create mask
            x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(5):
                    sample, score = gen_sample(tparams, f_next,
                                               model_options, trng=trng,
                                               maxlen=30, argmax=False)
                    print 'Sample ', jj, ': ',
                    ss = sample
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r:
                            print worddicts_r[vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
Ejemplo n.º 19
0
def main(_):
    """Restore a saved model and evaluate it on the test set.

    No training is done here: the graph is built, the checkpoint
    ``FLAGS.model_file`` under ``FLAGS.output_dir`` is restored, and
    ``predict_metrics`` is run on ``FLAGS.test_file``.

    * If ``FLAGS.return_score`` is false, the test metrics are logged.
    * Otherwise one score per line is written to
      ``context_am_ranking_score.txt`` (with a blank line after every
      ``FLAGS.number_of_systems`` scores), and the mean score of each system
      is written, one per line, to
      ``context_am_ranking_score_system_level.txt``.  Score ``i`` is
      attributed to system ``i % FLAGS.number_of_systems``.
    """

    tf.logging.set_verbosity(tf.logging.INFO)

    # Load vocabulary
    tf.logging.info("***** Loading Vocabulary *****")
    token_to_idx = load_vocab(FLAGS.vocab_file)

    # Load text iterator
    tf.logging.info("***** Loading Text Iterator *****")
    test = TextIterator(FLAGS.test_file,
                        token_to_idx,
                        batch_size=FLAGS.test_batch_size,
                        vocab_size=FLAGS.vocab_size,
                        shuffle=False)

    # Initialize the word embedding
    tf.logging.info("***** Initialize Word Embedding *****")
    embedding = load_word_embedding(token_to_idx)

    # Build graph
    tf.logging.info("***** Build Computation Graph *****")
    probability_op, cost_op = create_model(embedding)
    loss_op = tf.reduce_mean(cost_op)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=5)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(init)
        # evaluation process
        tf.logging.info("***** Final Result ***** ")
        tf.logging.info("restore model at {}".format(FLAGS.model_file))
        saver.restore(sess, os.path.join(FLAGS.output_dir, FLAGS.model_file))

        if not FLAGS.return_score:
            test_metrics, test_scores = predict_metrics(
                sess, cost_op, probability_op, test)
            tf.logging.info(
                "test set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s",
                *test_metrics)
        else:
            test_scores = predict_metrics(sess, cost_op, probability_op, test)
            n_systems = FLAGS.number_of_systems
            # system_level_scores[k] collects the scores of system k
            system_level_scores = [[] for _ in range(n_systems)]
            tf.logging.info("Writing confidence score to file")
            # Open each output once; mode 'w' already truncates the file, so
            # there is no need to reopen it in append mode for every score.
            with codecs.open("context_am_ranking_score.txt",
                             mode='w',
                             encoding='utf-8') as wf:
                for i, score in enumerate(test_scores):
                    system_level_scores[i % n_systems].append(score)
                    wf.write(str(score) + '\n')
                    # blank line after each complete group of systems
                    if i % n_systems == n_systems - 1:
                        wf.write('\n')

            with codecs.open("context_am_ranking_score_system_level.txt",
                             mode='w',
                             encoding='utf-8') as wf:
                for v in system_level_scores:
                    # A system with no scores has no defined mean: write nan
                    # instead of raising ZeroDivisionError, so the file keeps
                    # one line per system.
                    avg_score = sum(v) / len(v) if v else float('nan')
                    wf.write(str(avg_score) + '\n')
            tf.logging.info("Done writing confidence score to file")
Ejemplo n.º 20
0
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of GRU units
        encoder='gru',
        decoder='gru_cond_simple',
        patience=10,  # early stopping patience
        max_epochs=50,
        finish_after=100000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        alpha_c=0.,  # not used
        lrate=0.01,  # learning rate
        n_words_src=100000,  # source vocabulary size
        n_words=100000,  # target vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=[
            '/home/ubuntu/codes/dl4mt-tutorial/data/europarl-v7.fr-en.en.tok',
            '/home/ubuntu/codes/dl4mt-tutorial/data/europarl-v7.fr-en.fr.tok'
        ],
        valid_datasets=[
            '/home/ubuntu/codes/dl4mt-tutorial/data/newstest2011.en.tok',
            '/home/ubuntu/codes/dl4mt-tutorial/data/newstest2011.fr.tok'
        ],
        dictionaries=[
            '/home/ubuntu/codes/dl4mt-tutorial/data/europarl-v7.fr-en.en.tok.pkl',
            '/home/ubuntu/codes/dl4mt-tutorial/data/europarl-v7.fr-en.fr.tok.pkl'
        ],
        use_dropout=False,
        reload_=False,
        overwrite=False):

    # Model options
    model_options = locals().copy()

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'Reloading model options'
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(datasets[0],
                         datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_datasets[0],
                         valid_datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # un used, attention weight regularization
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x,
                                                y,
                                                maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto,
                            history_errs=history_errs,
                            uidx=uidx,
                            **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx,
                                history_errs=history_errs,
                                uidx=uidx,
                                **unzip(tparams))
                    print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score = gen_sample(tparams,
                                               f_init,
                                               f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               trng=trng,
                                               k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                           valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
Ejemplo n.º 21
0
def main():
    model_name = os.path.basename(os.path.dirname(os.path.realpath(__file__)))
    model = '../../models/{}.npz'.format(model_name)
    valid_datasets   = ['../../data/sequence_and_features/premise_snli_1.0_dev_token.txt', 
                        '../../data/sequence_and_features/hypothesis_snli_1.0_dev_token.txt',
                        '../../data/sequence_and_features/premise_snli_1.0_dev_lemma.txt', 
                        '../../data/sequence_and_features/hypothesis_snli_1.0_dev_lemma.txt',
                        '../../data/sequence_and_features/label_snli_1.0_dev.txt']
    test_datasets    = ['../../data/sequence_and_features/premise_snli_1.0_test_token.txt', 
                        '../../data/sequence_and_features/hypothesis_snli_1.0_test_token.txt',
                        '../../data/sequence_and_features/premise_snli_1.0_test_lemma.txt', 
                        '../../data/sequence_and_features/hypothesis_snli_1.0_test_lemma.txt',
                        '../../data/sequence_and_features/label_snli_1.0_test.txt']
    dictionary       = ['../../data/sequence_and_features/vocab_cased.pkl',
                        '../../data/sequence_and_features/vocab_cased_lemma.pkl']
    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    print options
    # load dictionary and invert
    with open(dictionary[0], 'rb') as f:
        word_dict = pkl.load(f)

    print 'Loading knowledge base ...'
    kb_dicts = options['kb_dicts']
    with open(kb_dicts[0], 'rb') as f:
        kb_dict = pkl.load(f)

    n_words = options['n_words']
    valid_batch_size = options['valid_batch_size']

    valid = TextIterator(valid_datasets[0], valid_datasets[1], valid_datasets[2], valid_datasets[3], valid_datasets[4],
                         dictionary[0], dictionary[1],
                         n_words=n_words,
                         batch_size=valid_batch_size,
                         shuffle=False)
    test = TextIterator(test_datasets[0], test_datasets[1], test_datasets[2], test_datasets[3], test_datasets[4],
                         dictionary[0], dictionary[1],
                         n_words=n_words,
                         batch_size=valid_batch_size,
                         shuffle=False)

    # allocate model parameters
    params = init_params(options, word_dict)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x1, x1_mask, x1_kb, x2, x2_mask, x2_kb, kb_att, y, \
        opt_ret, \
        cost, \
        f_pred, \
        f_probs = \
        build_model(tparams, options)

    use_noise.set_value(0.)
    valid_acc = pred_acc(f_pred, prepare_data, options, valid, kb_dict)
    test_acc = pred_acc(f_pred, prepare_data, options, test, kb_dict)

    print 'valid accuracy', valid_acc
    print 'test accuracy', test_acc

    predict_labels_valid = pred_label(f_pred, prepare_data, options, valid, kb_dict)
    predict_labels_test = pred_label(f_pred, prepare_data, options, test, kb_dict)

    with open('predict_gold_samples_valid.txt', 'w') as fw:
        with open(valid_datasets[0], 'r') as f1:
            with open(valid_datasets[1], 'r') as f2:
                with open(valid_datasets[-1], 'r') as f3:
                    for a, b, c, d in zip(predict_labels_valid, f3, f1, f2):
                        fw.write(str(a) + '\t' + b.rstrip() + '\t' + c.rstrip() + '\t' + d.rstrip() + '\n')

    with open('predict_gold_samples_test.txt', 'w') as fw:
        with open(test_datasets[0], 'r') as f1:
            with open(test_datasets[1], 'r') as f2:
                with open(test_datasets[-1], 'r') as f3:
                    for a, b, c, d in zip(predict_labels_test, f3, f1, f2):
                        fw.write(str(a) + '\t' + b.rstrip() + '\t' + c.rstrip() + '\t' + d.rstrip() + '\n')

    print 'Done'
Ejemplo n.º 22
0
def train(
        dim_word=100,
        dim_word_src=200,
        enc_dim=1000,
        dec_dim=1000,  # the number of LSTM units
        patience=-1,  # early stopping patience
        max_epochs=5000,
        finish_after=-1,  # finish after this many updates
        decay_c=0.,  # L2 regularization penalty
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=100000,  # source vocabulary size
        n_words=100000,  # target vocabulary size
        maxlen=1000,  # maximum length of the description
        maxlen_trg=1000,  # maximum length of the description
        maxlen_sample=1000,
        optimizer='rmsprop',
        batch_size=[1, 2, 3, 4],
        valid_batch_size=16,
        sort_size=20,
        save_path=None,
        save_file_name='model',
        save_best_models=0,
        dispFreq=100,
        validFreq=100,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=-1,
        pbatchFreq=-1,
        verboseFreq=10000,
        datasets=[
            'data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'
        ],
        valid_datasets=[
            '../data/dev/newstest2011.en.tok',
            '../data/dev/newstest2011.fr.tok'
        ],
        dictionaries=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'
        ],
        source_word_level=0,
        target_word_level=0,
        use_dropout=False,
        re_load=False,
        re_load_old_setting=False,
        uidx=None,
        eidx=None,
        cidx=None,
        layers=None,
        save_every_saveFreq=0,
        save_burn_in=20000,
        use_bpe=0,
        init_params=None,
        build_model=None,
        build_sampler=None,
        gen_sample=None,
        **kwargs):
    # Model options
    model_options = locals().copy()
    del model_options['init_params']
    del model_options['build_model']
    del model_options['build_sampler']
    del model_options['gen_sample']

    # load dictionaries and invert them
    # dictionaries[0] : src
    # dictionaries[1] : trg
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    # ii, dd : 0 = source, 1 = target
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = cPickle.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    print 'Building model'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    file_name = '%s%s.npz' % (save_path, save_file_name)
    best_file_name = '%s%s.best.npz' % (save_path, save_file_name)
    opt_file_name = '%s%s%s.npz' % (save_path, save_file_name, '.grads')
    best_opt_file_name = '%s%s%s.best.npz' % (save_path, save_file_name,
                                              '.grads')
    model_name = '%s%s.pkl' % (save_path, save_file_name)
    params = init_params(model_options)
    cPickle.dump(model_options, open(model_name, 'wb'))
    history_errs = [[], [], [], []]

    # reload options
    # reload : False
    if re_load and os.path.exists(file_name):
        print 'You are reloading your experiment.. do not panic dude..'
        if re_load_old_setting:
            with open(model_name, 'rb') as f:
                models_options = cPickle.load(f)
        params = load_params(file_name, params)
        # reload history
        model = numpy.load(file_name)
        history_errs = list(lst.tolist() for lst in model['history_errs'])
        if uidx is None:
            uidx = model['uidx']
        if eidx is None:
            eidx = model['eidx']
        if cidx is None:
            try:
                cidx = model['cidx']
            except:
                cidx = 0
    else:
        if uidx is None:
            uidx = 0
        if eidx is None:
            eidx = 0
        if cidx is None:
            cidx = 0

    print 'Loading data'

    train = MultiTextIterator(source=datasets[0],
                              target=datasets[1],
                              source_dict=dictionaries[0],
                              target_dict=dictionaries[1],
                              n_words_source=n_words_src,
                              n_words_target=n_words,
                              source_word_level=source_word_level,
                              target_word_level=target_word_level,
                              batch_size=batch_size,
                              sort_size=sort_size)

    valid = [
        TextIterator(source=valid_dataset[0],
                     target=valid_dataset[1],
                     source_dict=dictionaries[0],
                     target_dict=dictionaries[1],
                     n_words_source=n_words_src,
                     n_words_target=n_words,
                     source_word_level=source_word_level,
                     target_word_level=target_word_level,
                     batch_size=valid_batch_size,
                     sort_size=sort_size) for valid_dataset in valid_datasets
    ]

    # create shared variables for parameters
    tparams = init_tparams(params)

    trng, use_noise, \
    x, x_mask, y, y_mask, \
    opt_ret, \
    cost = \
        build_model(tparams, model_options)
    # NOTE : this is where we build the model
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler...\n',
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)
    # print 'Done'

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    # NOTE : f_log_probs : [x, x_mask, y, y_mask], cost
    print 'Done'

    if re_load:  # NOTE : this whole thing is False
        use_noise.set_value(0.)
        valid_scores = []
        for ii, vv in enumerate(valid):

            valid_errs = pred_probs(f_log_probs,
                                    prepare_data,
                                    model_options,
                                    vv,
                                    verboseFreq=verboseFreq)
            valid_err = valid_errs.mean()

            if numpy.isnan(valid_err):
                import ipdb
                ipdb.set_trace()

            print 'Reload sanity check: Valid ', valid_err

    cost = cost.mean()

    # apply L2 regularization on weights
    # decay_c : 0
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    # alpha_c : 0
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    # NOTE : why is this not referenced somewhere later?
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    if clip_c > 0:
        grads, not_finite, clipped = gradient_clipping(grads, tparams, clip_c)
    else:
        not_finite = 0
        clipped = 0

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    if re_load and os.path.exists(file_name):
        if clip_c > 0:
            f_grad_shared, f_update, toptparams = eval(optimizer)(
                lr,
                tparams,
                grads,
                inps,
                cost=cost,
                not_finite=not_finite,
                clipped=clipped,
                file_name=opt_file_name)
        else:
            f_grad_shared, f_update, toptparams = eval(optimizer)(
                lr, tparams, grads, inps, cost=cost, file_name=opt_file_name)
    else:
        # re_load = False, clip_c = 1
        if clip_c > 0:
            f_grad_shared, f_update, toptparams = eval(optimizer)(
                lr,
                tparams,
                grads,
                inps,
                cost=cost,
                not_finite=not_finite,
                clipped=clipped)
        else:
            f_grad_shared, f_update, toptparams = eval(optimizer)(lr,
                                                                  tparams,
                                                                  grads,
                                                                  inps,
                                                                  cost=cost)

            # f_grad_shared = theano.function(inp, [cost, not_finite, clipped], updates=gsup, profile=profile)

            # f_update = theano.function([lr], [], updates=updates,
            #                   on_unused_input='ignore', profile=profile)
            # toptparams

    print 'Done'

    print 'Optimization'
    best_p = None
    bad_counter = 0

    # will never be true
    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    # Training loop
    ud_start = time.time()
    estop = False

    if re_load:
        # IndexError: index 14 is out of bounds for axis 1 with size 13
        print "Checkpointed minibatch number: %d" % cidx
        for cc in xrange(cidx):
            if numpy.mod(cc, 1000) == 0:
                print "Jumping [%d / %d] examples" % (cc, cidx)
            train.next()

    for epoch in xrange(max_epochs):
        time0 = time.time()
        n_samples = 0
        NaN_grad_cnt = 0
        NaN_cost_cnt = 0
        clipped_cnt = 0
        update_idx = 0
        if re_load:
            re_load = 0
        else:
            cidx = 0

        for x, y in train:
            # NOTE : x, y are [sen1, sen2, sen3 ...] where sen_i are of different length
            update_idx += 1
            cidx += 1
            uidx += 1
            use_noise.set_value(1.)

            # NOTE : n_x <= batch_size
            x, x_mask, y, y_mask, n_x = prepare_data(x,
                                                     y,
                                                     maxlen=maxlen,
                                                     maxlen_trg=maxlen_trg,
                                                     n_words_src=n_words_src,
                                                     n_words=n_words)
            n_samples += n_x

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                uidx = max(uidx, 0)
                continue

            # compute cost, grads and copy grads to shared variables

            if clip_c > 0:
                cost, not_finite, clipped = f_grad_shared(x, x_mask, y, y_mask)
            else:
                cost = f_grad_shared(x, x_mask, y, y_mask)

            if clipped:
                clipped_cnt += 1

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                import ipdb
                ipdb.set_trace()
                NaN_cost_cnt += 1

            if not_finite:
                import ipdb
                ipdb.set_trace()
                NaN_grad_cnt += 1
                continue

            # do the update on parameters
            f_update(lrate)

            if numpy.isnan(cost) or numpy.isinf(cost):
                continue

            if float(NaN_grad_cnt) > max_epochs * 0.5 or float(
                    NaN_cost_cnt) > max_epochs * 0.5:
                print 'Too many NaNs, abort training'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud = time.time() - ud_start
                wps = n_samples / float(time.time() - time0)
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'NaN_in_grad', NaN_grad_cnt, \
                    'NaN_in_cost', NaN_cost_cnt, 'Gradient_clipped', clipped_cnt, 'UD ', ud, "%.2f sentence/s" % wps
                ud_start = time.time()

            if numpy.mod(uidx, pbatchFreq) == 0 and pbatchFreq != -1:
                pbatch(x, worddicts_r[0])

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0 and sampleFreq != -1:

                gen_list = [
                    0, batch_size[0], batch_size[0] + batch_size[1],
                    batch_size[0] + batch_size[1] + batch_size[2]
                ]
                gen_list = [ii for ii in gen_list if ii < n_x]

                for jj in gen_list:
                    # jj = min(5, n_samples)
                    stochastic = True
                    use_noise.set_value(0.)

                    # x : maxlen X n_samples
                    sample, score = gen_sample(tparams,
                                               f_init,
                                               f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               trng=trng,
                                               k=1,
                                               maxlen=maxlen_sample,
                                               stochastic=stochastic,
                                               argmax=False)
                    print
                    print 'Source ', jj, ': ',
                    if source_word_level:
                        for vv in x[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[0]:
                                if use_bpe:
                                    print(worddicts_r[0][vv]).replace(
                                        '@@', ''),
                                else:
                                    print worddicts_r[0][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        source_ = []
                        for vv in x[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[0]:
                                source_.append(worddicts_r[0][vv])
                            else:
                                source_.append('UNK')
                        print "".join(source_)
                    print 'Truth ', jj, ' : ',
                    if target_word_level:
                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                if use_bpe:
                                    print(worddicts_r[1][vv]).replace(
                                        '@@', ''),
                                else:
                                    print worddicts_r[1][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        truth_ = []
                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                truth_.append(worddicts_r[1][vv])
                            else:
                                truth_.append('UNK')
                        print "".join(truth_)
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    if target_word_level:
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                if use_bpe:
                                    print(worddicts_r[1][vv]).replace(
                                        '@@', ''),
                                else:
                                    print worddicts_r[1][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        sample_ = []
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                sample_.append(worddicts_r[1][vv])
                            else:
                                sample_.append('UNK')
                        print "".join(sample_)
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                valid_scores = []
                for ii, vv in enumerate(valid):
                    use_noise.set_value(0.)
                    # NOTE : when validation, don't pass maxlen, maxlen_trg
                    # meaning, don't limit sentence lengths...
                    # sort of makes sense i suppose?
                    valid_errs = pred_probs(
                        f_log_probs,
                        prepare_data,
                        model_options,
                        vv,
                        verboseFreq=verboseFreq,
                    )
                    valid_err = valid_errs.mean()
                    valid_scores.append(valid_err)
                    history_errs[ii].append(valid_err)

                    # patience == -1, never happens
                    if len(history_errs[ii]) > patience and valid_err >= \
                            numpy.array(history_errs[ii])[:-patience].min() and patience != -1:
                        bad_counter += 1
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

                    if numpy.isnan(valid_err):
                        import ipdb
                        ipdb.set_trace()

                cnt = 0
                for ii in xrange(4):
                    if uidx == 0 or valid_scores[ii] <= numpy.array(
                            history_errs[ii]).min():
                        cnt += 1

                if len(history_errs[0]) > 1:
                    if numpy.sum(valid_scores) <= numpy.sum(
                        [aa[:-2] for aa in history_errs]):
                        less_sum = True
                    else:
                        less_sum = False
                else:
                    less_sum = True

                if cnt >= 2 and less_sum:
                    best_p = unzip(tparams)
                    best_optp = unzip(toptparams)
                    bad_counter = 0

                if saveFreq != validFreq and save_best_models:
                    numpy.savez(best_file_name,
                                history_errs=history_errs,
                                uidx=uidx,
                                eidx=eidx,
                                cidx=cdix,
                                **best_p)
                    numpy.savez(best_opt_file_name, **best_optp)

                print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format(
                    valid_scores[0], valid_scores[1], valid_scores[2],
                    valid_scores[3])

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if not os.path.exists(save_path):
                    os.mkdir(save_path)

                params = unzip(tparams)
                optparams = unzip(toptparams)
                numpy.savez(file_name,
                            history_errs=history_errs,
                            uidx=uidx,
                            eidx=eidx,
                            cidx=cidx,
                            **params)
                numpy.savez(opt_file_name, **optparams)

                if save_every_saveFreq and (uidx >= save_burn_in):
                    this_file_name = '%s%s.%d.npz' % (save_path,
                                                      save_file_name, uidx)
                    this_opt_file_name = '%s%s%s.%d.npz' % (
                        save_path, save_file_name, '.grads', uidx)
                    numpy.savez(this_file_name,
                                history_errs=history_errs,
                                uidx=uidx,
                                eidx=eidx,
                                cidx=cidx,
                                **params)
                    numpy.savez(this_opt_file_name,
                                history_errs=history_errs,
                                uidx=uidx,
                                eidx=eidx,
                                cidx=cidx,
                                **params)
                    if best_p is not None and saveFreq != validFreq:
                        this_best_file_name = '%s%s.%d.best.npz' % (
                            save_path, save_file_name, uidx)
                        numpy.savez(this_best_file_name,
                                    history_errs=history_errs,
                                    uidx=uidx,
                                    eidx=eidx,
                                    cidx=cidx,
                                    **best_p)
                print 'Done...',
                print 'Saved to %s' % file_name

            # finish after this many updates
            if uidx >= finish_after and finish_after != -1:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples
        lang_nos = (4535523, 12122376, 1926115, 2326893)
        lang_done = [x * update_idx for x in batch_size]
        lang_rem = [x - y for x, y in zip(lang_nos, lang_done)]
        print "Remaining : DE({}), CS({}), FI({}), RU({})".format(
            lang_rem[0], lang_rem[1], lang_rem[2], lang_rem[3])
        eidx += 1

        if estop:
            break

    use_noise.set_value(0.)

    valid_scores = []
    for ii, vv in enumerate(valid):
        valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                               vv).mean()
        valid_scores.append(valid_err)

    print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format(
        valid_scores[0], valid_scores[1], valid_scores[2], valid_scores[3])

    params = unzip(tparams)
    optparams = unzip(toptparams)
    file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx)
    opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads',
                                       uidx)
    numpy.savez(file_name,
                history_errs=history_errs,
                uidx=uidx,
                eidx=eidx,
                cidx=cidx,
                **params)
    numpy.savez(opt_file_name, **optparams)
    if best_p is not None and saveFreq != validFreq:
        best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx)
        best_opt_file_name = '%s%s%s.%d.best.npz' % (save_path, save_file_name,
                                                     '.grads', uidx)
        numpy.savez(best_file_name,
                    history_errs=history_errs,
                    uidx=uidx,
                    eidx=eidx,
                    cidx=cidx,
                    **best_p)
        numpy.savez(best_opt_file_name, **best_optp)

    return valid_err
Ejemplo n.º 23
0
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):
    """Score source/target sentence pairs with one or more saved models.

    For every (model, option) pair the parameters are loaded, the model
    graph is built and a scoring function is compiled.  All pairs read
    from ``source_file`` / ``target_file`` are then scored by every model
    and one line per target sentence is written to ``saveto`` holding the
    space-separated scores (prefixed by the target sentence when
    ``verbose`` is set).

    Args:
        source_file, target_file: open files; their ``.name`` is handed
            to ``TextIterator`` and they are rewound and re-read here.
        saveto: open, writable file receiving the scores; its ``.name`` is
            used when alignments are saved.
        models: paths of the saved model archives.
        options: one options dict per model; ``options[0]`` supplies the
            dictionaries and vocabulary sizes for the data iterator.
        b: minibatch size used for scoring.
        normalization_alpha: forwarded unchanged to ``pred_probs``.
        verbose: also write the target sentence in front of its scores.
        alignweights: additionally compute the decoder attention weights
            and hand them to ``combine_source_target_text_1to1``.
    """

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):

        # load model parameters and set theano shared variables
        # (optimizer state stored under 'adam_*' keys is skipped)
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            logging.debug(
                "Save weight mode ON, alignment matrix will be saved.")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(
                f_log_probs,
                prepare_data,
                options[i],
                pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(
        source_file.name,
        target_file.name,
        options[0]['dictionaries'][:-1],
        options[0]['dictionaries'][-1],
        n_words_source=options[0]['n_words_src'],
        n_words_target=options[0]['n_words'],
        batch_size=b,
        maxlen=float('inf'),
        sort_by_length=False
    )  #TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    # source_lines is not used below; the read is kept so source_file is
    # left at EOF exactly as before
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    # one output line per target sentence: the scores of all models
    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            # `alignments` holds one entry per model (the original iterated
            # the undefined name `all_alignments`).  An entry may itself be
            # a list of lines, so flatten one level when needed.
            for entry in alignments:
                if isinstance(entry, list):
                    for line in entry:
                        align_OUT.write(line + "\n")
                else:
                    align_OUT.write(entry + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
Ejemplo n.º 24
0
def train(
      dim_word=100,
      dim_word_src=200,
      enc_dim=1000,
      dec_dim=1000,  # the number of LSTM units
      patience=-1,  # early stopping patience
      max_epochs=5000,
      finish_after=-1,  # finish after this many updates
      decay_c=0.,  # L2 regularization penalty
      alpha_c=0.,  # alignment regularization
      clip_c=-1.,  # gradient clipping threshold
      lrate=0.01,  # learning rate
      n_words_src=100000,  # source vocabulary size
      n_words=100000,  # target vocabulary size
      maxlen=100,  # maximum length of the description
      maxlen_trg=None,  # maximum length of the description
      maxlen_sample=1000,
      optimizer='rmsprop',
      batch_size=16,
      valid_batch_size=16,
      sort_size=20,
      save_path=None,
      save_file_name='model',
      save_best_models=0,
      dispFreq=100,
      validFreq=100,
      saveFreq=1000,   # save the parameters after every saveFreq updates
      sampleFreq=-1,
      verboseFreq=10000,
      datasets=[
          'data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
          '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
      valid_datasets=['../data/dev/newstest2011.en.tok',
                      '../data/dev/newstest2011.fr.tok'],
      dictionaries=[
          '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
          '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
      source_word_level=0,
      target_word_level=0,
      use_dropout=False,
      re_load=False,
      re_load_old_setting=False,
      uidx=None,
      eidx=None,
      cidx=None,
      layers=None,
      save_every_saveFreq=0,
      save_burn_in=20000,
      use_bpe=0,
      init_params=None,
      build_model=None,
      build_sampler=None,
      gen_sample=None,
      **kwargs
    ):

    if maxlen_trg is None:
        maxlen_trg = maxlen * 10
    # Model options
    model_options = locals().copy()
    del model_options['init_params']
    del model_options['build_model']
    del model_options['build_sampler']
    del model_options['gen_sample']

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = cPickle.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    print 'Building model'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    file_name = '%s%s.npz' % (save_path, save_file_name)
    best_file_name = '%s%s.best.npz' % (save_path, save_file_name)
    opt_file_name = '%s%s%s.npz' % (save_path, save_file_name, '.grads')
    best_opt_file_name = '%s%s%s.best.npz' % (save_path, save_file_name, '.grads')
    model_name = '%s%s.pkl' % (save_path, save_file_name)
    params = init_params(model_options)
    cPickle.dump(model_options, open(model_name, 'wb'))
    history_errs = []

    # reload options
    if re_load and os.path.exists(file_name):
        print 'You are reloading your experiment.. do not panic dude..'
        if re_load_old_setting:
            with open(model_name, 'rb') as f:
                models_options = cPickle.load(f)
        params = load_params(file_name, params)
        # reload history
        model = numpy.load(file_name)
        history_errs = list(model['history_errs'])
        if uidx is None:
            uidx = model['uidx']
        if eidx is None:
            eidx = model['eidx']
        if cidx is None:
            cidx = model['cidx']
    else:
        if uidx is None:
            uidx = 0
        if eidx is None:
            eidx = 0
        if cidx is None:
            cidx = 0

    print 'Loading data'
    train = TextIterator(source=datasets[0],
                         target=datasets[1],
                         source_dict=dictionaries[0],
                         target_dict=dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         source_word_level=source_word_level,
                         target_word_level=target_word_level,
                         batch_size=batch_size,
                         sort_size=sort_size)
    valid = TextIterator(source=valid_datasets[0],
                         target=valid_datasets[1],
                         source_dict=dictionaries[0],
                         target_dict=dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         source_word_level=source_word_level,
                         target_word_level=target_word_level,
                         batch_size=valid_batch_size,
                         sort_size=sort_size)

    # create shared variables for parameters
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler...\n',
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)
    #print 'Done'

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'
    if re_load:
        use_noise.set_value(0.)
        valid_errs = pred_probs(f_log_probs, prepare_data,
                                model_options, valid, verboseFreq=verboseFreq)
        valid_err = valid_errs.mean()

        if numpy.isnan(valid_err):
            import ipdb
            ipdb.set_trace()

        print 'Reload sanity check: Valid ', valid_err

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    if clip_c > 0:
        grads, not_finite, clipped = gradient_clipping(grads, tparams, clip_c)
    else:
        not_finite = 0
        clipped = 0

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    if re_load and os.path.exists(file_name):
        if clip_c > 0:
            f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost,
                                                                  not_finite=not_finite, clipped=clipped,
                                                                  file_name=opt_file_name)
        else:
            f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost,
                                                                  file_name=opt_file_name)
    else:
        if clip_c > 0:
            f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost,
                                                                  not_finite=not_finite, clipped=clipped)
        else:
            f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost)
    print 'Done'

    print 'Optimization'
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    # Training loop
    ud_start = time.time()
    estop = False

    if re_load:
        print "Checkpointed minibatch number: %d" % cidx
        for cc in xrange(cidx):
            if numpy.mod(cc, 1000)==0:
                print "Jumping [%d / %d] examples" % (cc, cidx)
            train.next()

    for epoch in xrange(max_epochs):
        n_samples = 0
        NaN_grad_cnt = 0
        NaN_cost_cnt = 0
        clipped_cnt = 0
        if re_load:
            re_load = 0
        else:
            cidx = 0

        for x, y in train:
            cidx += 1
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask, n_x = prepare_data(x, y, maxlen=maxlen,
                                                     maxlen_trg=maxlen_trg,
                                                     n_words_src=n_words_src,
                                                     n_words=n_words)
            n_samples += n_x

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                uidx = max(uidx, 0)
                continue

            # compute cost, grads and copy grads to shared variables
            if clip_c > 0:
                cost, not_finite, clipped = f_grad_shared(x, x_mask, y, y_mask)
            else:
                cost = f_grad_shared(x, x_mask, y, y_mask)

            if clipped:
                clipped_cnt += 1

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                NaN_cost_cnt += 1

            if not_finite:
                NaN_grad_cnt += 1
                continue

            # do the update on parameters
            f_update(lrate)

            if numpy.isnan(cost) or numpy.isinf(cost):
                continue

            if float(NaN_grad_cnt) > max_epochs * 0.5 or float(NaN_cost_cnt) > max_epochs * 0.5:
                print 'Too many NaNs, abort training'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud = time.time() - ud_start
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'NaN_in_grad', NaN_grad_cnt,\
                      'NaN_in_cost', NaN_cost_cnt, 'Gradient_clipped', clipped_cnt, 'UD ', ud
                ud_start = time.time()

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0 and sampleFreq != -1:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    use_noise.set_value(0.)
                    sample, score = gen_sample(tparams, f_init, f_next,
                                               x[:, jj][:, None],
                                               model_options, trng=trng, k=1,
                                               maxlen=maxlen_sample,
                                               stochastic=stochastic,
                                               argmax=False)
                    print
                    print 'Source ', jj, ': ',
                    if source_word_level:
                        for vv in x[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[0]:
                                if use_bpe:
                                    print (worddicts_r[0][vv]).replace('@@', ''),
                                else:
                                    print worddicts_r[0][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        source_ = []
                        for vv in x[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[0]:
                                source_.append(worddicts_r[0][vv])
                            else:
                                source_.append('UNK')
                        print "".join(source_)
                    print 'Truth ', jj, ' : ',
                    if target_word_level:
                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                if use_bpe:
                                    print (worddicts_r[1][vv]).replace('@@', ''),
                                else:
                                    print worddicts_r[1][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        truth_ = []
                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                truth_.append(worddicts_r[1][vv])
                            else:
                                truth_.append('UNK')
                        print "".join(truth_)
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    if target_word_level:
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                if use_bpe:
                                    print (worddicts_r[1][vv]).replace('@@', ''),
                                else:
                                    print worddicts_r[1][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        sample_ = []
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                sample_.append(worddicts_r[1][vv])
                            else:
                                sample_.append('UNK')
                        print "".join(sample_)
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid, verboseFreq=verboseFreq)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    best_optp = unzip(toptparams)
                    bad_counter = 0

                if saveFreq != validFreq and save_best_models:
                    numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx,
                                cidx=cidx, **best_p)
                    numpy.savez(best_opt_file_name, **best_optp)

                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min() and patience != -1:
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if not os.path.exists(save_path):
                    os.mkdir(save_path)

                params = unzip(tparams)
                optparams = unzip(toptparams)
                numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx,
                            cidx=cidx, **params)
                numpy.savez(opt_file_name, **optparams)

                if save_every_saveFreq and (uidx >= save_burn_in):
                    this_file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx)
                    this_opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads', uidx)
                    numpy.savez(this_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx,
                                cidx=cidx, **params)
                    numpy.savez(this_opt_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx,
                                cidx=cidx, **params)
                    if best_p is not None and saveFreq != validFreq:
                        this_best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx)
                        numpy.savez(this_best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx,
                                    cidx=cidx, **best_p)
                print 'Done...',
                print 'Saved to %s' % file_name

            # finish after this many updates
            if uidx >= finish_after and finish_after != -1:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples
        eidx += 1

        if estop:
            break

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print 'Valid ', valid_err

    params = unzip(tparams)
    optparams = unzip(toptparams)
    file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx)
    opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads', uidx)
    numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params)
    numpy.savez(opt_file_name, **optparams)
    if best_p is not None and saveFreq != validFreq:
        best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx)
        best_opt_file_name = '%s%s%s.%d.best.npz' % (save_path, save_file_name, '.grads',uidx)
        numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p)
        numpy.savez(best_opt_file_name, **best_optp)

    return valid_err
Ejemplo n.º 25
0
def main(_):
    """Train the model, keep the best epoch's checkpoint, then evaluate it.

    Configuration is read from the module-level `FLAGS`.  Per epoch the
    function trains over `FLAGS.train_file`, evaluates on the validation and
    test files, and saves a checkpoint whenever the validation error
    (1 - Recall@1) is at least as good as every previous epoch.  When the
    validation error is worse than the best so far the learning rate is
    halved; training stops once that has happened more than `FLAGS.patience`
    times.  Finally the best checkpoint is restored and metrics are logged
    for the validation, test and training sets.

    The single positional argument is ignored.
    """

    ud_start_whole = time.time()

    tf.logging.set_verbosity(tf.logging.INFO)

    # Load vocabulary
    tf.logging.info("***** Loading Vocabulary *****")
    token_to_idx = load_vocab(FLAGS.vocab_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    # Load text iterator
    tf.logging.info("***** Loading Text Iterator *****")
    train = TextIterator(FLAGS.train_file, token_to_idx,
                         batch_size=FLAGS.train_batch_size,
                         vocab_size=FLAGS.vocab_size,
                         shuffle=True)
    print(type(train))
    valid = TextIterator(FLAGS.valid_file, token_to_idx,
                         batch_size=FLAGS.valid_batch_size,
                         vocab_size=FLAGS.vocab_size,
                         shuffle=False)
    test = TextIterator(FLAGS.test_file, token_to_idx,
                        batch_size=FLAGS.test_batch_size,
                        vocab_size=FLAGS.vocab_size,
                        shuffle=False)
    # Text iterator of training set for evaluation
    train_eval = TextIterator(FLAGS.train_file, token_to_idx,
                              vocab_size=FLAGS.vocab_size, batch_size=FLAGS.train_batch_size, shuffle=False)

    # Initialize the word embedding
    tf.logging.info("***** Initialize Word Embedding *****")
    embedding = load_word_embedding(token_to_idx)

    # Build graph
    tf.logging.info("***** Build Computation Graph *****")
    probability_op, cost_op = create_model(embedding)
    loss_op = tf.reduce_mean(cost_op)

    # The learning rate lives in a non-trainable variable so it can be
    # changed between epochs without rebuilding the optimizer.
    lr = tf.Variable(0.0, name="learning_rate", trainable=False)

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)

    tf.logging.info("***** Trainable Variables *****")

    tvars = tf.trainable_variables()
    for var in tvars:
        tf.logging.info(" name = %s, shape = %s", var.name, var.shape)

    # Always compute the gradients; clipping is the optional part.  Computing
    # them only inside the clipping branch left `grads` undefined (NameError
    # below) whenever FLAGS.clip_c <= 0.
    grads = tf.gradients(cost_op, tvars)
    if FLAGS.clip_c > 0.:
        grads, _global_norm = tf.clip_by_global_norm(grads, FLAGS.clip_c)

    train_op = optimizer.apply_gradients(zip(grads, tvars))
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=5)

    # training process
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(init)

        uidx = 0
        bad_counter = 0
        history_errs = []
        # Defined up front so the code after the loop also works when
        # FLAGS.max_train_epochs is 0 and the loop body never runs.
        best_epoch_num = None
        eidx = -1

        current_lr = FLAGS.learning_rate
        sess.run(tf.assign(lr, current_lr))

        for eidx in range(FLAGS.max_train_epochs):
            tf.logging.info("***** Training at Epoch %s *****", eidx)
            n_samples = 0
            for instance in train:
                n_samples += len(instance)
                uidx += 1

                (batch_x1, batch_x1_mask, batch_x2, batch_x2_mask, batch_y) = prepare_data(
                    instance)

                if batch_x1 is None:
                    # empty batch: do not count it as an update
                    tf.logging.info("Minibatch with zero sample")
                    uidx -= 1
                    continue

                ud_start = time.time()
                _, loss = sess.run([train_op, loss_op],
                                   feed_dict={
                    "x1:0": batch_x1, "x1_mask:0": batch_x1_mask,
                    "x2:0": batch_x2, "x2_mask:0": batch_x2_mask,
                    "y:0": batch_y, "keep_rate:0": 0.5})
                ud = time.time() - ud_start

                if numpy.mod(uidx, FLAGS.disp_freq) == 0:
                    tf.logging.info(
                        "epoch %s update %s loss %s samples/sec %s", eidx, uidx, loss, 1. * batch_x1.shape[1] / ud)

            tf.logging.info("***** Evaluation at Epoch %s *****", eidx)
            tf.logging.info("seen samples %s each epoch", n_samples)
            tf.logging.info("current learning rate: %s", current_lr)

            # validate model on validation set and early stop if necessary
            valid_metrics, valid_scores = predict_metrics(
                sess, cost_op, probability_op, valid)

            # select best model based on recall@1 of validation set
            valid_err = 1.0 - valid_metrics[3]
            history_errs.append(valid_err)

            tf.logging.info(
                "valid set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s", *valid_metrics)

            test_metrics, test_scores = predict_metrics(
                sess, cost_op, probability_op, test)

            tf.logging.info(
                "test set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s", *test_metrics)

            if eidx == 0 or valid_err <= numpy.array(history_errs).min():
                best_epoch_num = eidx
                tf.logging.info(
                    "saving current best model at epoch %s based on metrics on valid set", best_epoch_num)
                saver.save(sess, os.path.join(
                    FLAGS.output_dir, "model_epoch_{}.ckpt".format(best_epoch_num)))

            if valid_err > numpy.array(history_errs).min():
                bad_counter += 1
                tf.logging.info("bad_counter: %s", bad_counter)

                # no improvement this epoch: halve the learning rate
                current_lr = current_lr * 0.5
                sess.run(tf.assign(lr, current_lr))
                tf.logging.info(
                    "half the current learning rate to %s", current_lr)

            if bad_counter > FLAGS.patience:
                tf.logging.info("***** Early Stop *****")
                break

        # evaluation process
        tf.logging.info("***** Final Result ***** ")
        if best_epoch_num is not None:
            tf.logging.info(
                "restore best model at epoch %s ", best_epoch_num)
            saver.restore(sess, os.path.join(
                FLAGS.output_dir, "model_epoch_{}.ckpt".format(best_epoch_num)))

        valid_metrics, valid_scores = predict_metrics(
            sess, cost_op, probability_op, valid)
        tf.logging.info(
            "valid set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s", *valid_metrics)

        test_metrics, test_scores = predict_metrics(
            sess, cost_op, probability_op, test)
        tf.logging.info(
            "test set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s", *test_metrics)

        train_acc, train_cost = predict_accuracy(
            sess, cost_op, probability_op, train_eval)
        tf.logging.info("train set: ACC %s Cost %s", train_acc, train_cost)

        ud_whole = (time.time() - ud_start_whole) / 3600

        tf.logging.info("training epochs: %s", eidx + 1)
        tf.logging.info("training duration: %s hours", ud_whole)
Ejemplo n.º 26
0
def train(
        dim_word=100,  # word vector dimensionality
        dim=100,  # the number of GRU units
        encoder='lstm',  # encoder model
        decoder='lstm',  # decoder model
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        decay_c=0.,  # L2 regularization penalty
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.0004,  # learning rate
        n_words=100000,  # vocabulary size
        n_words_lemma=100000,
        maxlen=100,  # maximum length of the description
        optimizer='adam',
        batch_size=32,
        valid_batch_size=32,
        save_model='../../models/',
        saveto='model.npz',
        dispFreq=100,
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        use_dropout=False,
        reload_=False,
        verbose=False,  # print verbose information for debug but slow speed
        delay1=3,
        delay2=7,
        delay_tech=5,
        types='title',
        cut_word=False,
        cut_news=False,
        last_layer="LSTM",
        CNN_filter=64,
        CNN_kernel=3,
        keep_prob=0.8,
        datasets=[],
        valid_datasets=[],
        test_datasets=[],
        tech_data=[],
        dictionary=[],
        kb_dicts=[],
        embedding='',  # pretrain embedding file, such as word2vec, GLOVE
        dim_kb=5,
        RUN_NAME="histogram_visualization",
        wait_N=10
):
    """Train the model, checkpoint the best validation loss and report results.

    Every keyword argument is captured into ``model_options`` (via
    ``locals()``) before anything else happens; helpers such as
    ``init_params``, ``build_model`` and ``prepare_data`` receive that dict.
    Arguments not listed below are not read directly by this function and
    only reach the helpers through ``model_options``.

    Args:
        patience: training stops once ``bad_counter`` exceeds this value.
        max_epochs: upper bound on the number of passes over ``train``.
        finish_after: training stops once this many updates have been run.
        clip_c: passed (via ``model_options``) to ``tf.clip_by_global_norm``.
        lrate: value assigned to the learning-rate variable before training.
        n_words, batch_size, valid_batch_size, delay1, delay2, delay_tech,
        types, cut_word, cut_news: forwarded to ``TextIterator``.
        maxlen: forwarded to ``prepare_data`` / ``predict_pro_acc``.
        optimizer: stored in ``model_options`` only; the graph always uses
            ``tf.train.AdadeltaOptimizer`` regardless of this value.
        save_model: prefix handed to ``_s`` to build checkpoint paths.
        saveto: ``.npz`` file for parameters/history; ``saveto + '.pkl'``
            holds the pickled options.
        dispFreq, validFreq: logging / validation period, in updates.
        reload_: when true and ``saveto`` exists, reload options and the
            validation-error history.
        keep_prob: fed (via ``model_options``) to ``input/keep_prob:0``.
        datasets, valid_datasets, test_datasets: two-element sequences whose
            items are forwarded to ``TextIterator``.
        tech_data: third positional argument of every ``TextIterator``.
        dictionary: path of the pickled word dictionary (opened with
            ``open``), also forwarded to ``TextIterator`` as ``dict``.
        RUN_NAME: sub-directory of ``./logs`` used for summary writers.
        wait_N: number of non-improving validations (read back from
            ``model_options``) before ``bad_counter`` is incremented.

    Returns:
        None. Results are logged, printed, and written to ``saveto``,
        ``important_plot.pickle`` and ``history_predict.npz``.
    """
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s",
                        filename='./log_result.txt')
    # Model options. Must stay the first statement that creates locals so that
    # the snapshot holds exactly the call arguments.
    model_options = locals().copy()
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)

    logger.info("Loading knowledge base ...")

    # reload options
    if reload_ and os.path.exists(saveto):
        logger.info("Reload options")
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    logger.debug(pprint.pformat(model_options))

    logger.info("Loading data")
    train = TextIterator(datasets[0], datasets[1], tech_data,
                         dict=dictionary,
                         delay1=delay1,
                         delay2=delay2,
                         delay_tech=delay_tech,
                         types=types,
                         n_words=n_words,
                         batch_size=batch_size,
                         cut_word=cut_word,
                         cut_news=cut_news,
                         shuffle=True, shuffle_sentence=False)
    # Same files as `train`, but unshuffled and with the validation batch
    # size; used only for the final training-set evaluation.
    train_valid = TextIterator(datasets[0], datasets[1], tech_data,
                               dict=dictionary,
                               delay1=delay1,
                               delay2=delay2,
                               delay_tech=delay_tech,
                               types=types,
                               n_words=n_words,
                               batch_size=valid_batch_size,
                               cut_word=cut_word,
                               cut_news=cut_news,
                               shuffle=False, shuffle_sentence=False)
    valid = TextIterator(valid_datasets[0], valid_datasets[1], tech_data,
                         dict=dictionary,
                         delay1=delay1,
                         delay2=delay2,
                         delay_tech=delay_tech,
                         types=types,
                         n_words=n_words,
                         batch_size=valid_batch_size,
                         cut_word=cut_word,
                         cut_news=cut_news,
                         shuffle=False, shuffle_sentence=False)
    test = TextIterator(test_datasets[0], test_datasets[1], tech_data,
                        dict=dictionary,
                        delay1=delay1,
                        delay2=delay2,
                        delay_tech=delay_tech,
                        types=types,
                        n_words=n_words,
                        batch_size=valid_batch_size,
                        cut_word=cut_word,
                        cut_news=cut_news,
                        shuffle=False, shuffle_sentence=False)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the tensorflow graph
    logger.info("init_word_embedding")
    params = init_params(model_options, worddicts)
    embedding = word_embedding(model_options, params)
    is_training, cost, x, x_mask, y, n_timesteps, pred, summary = build_model(embedding, model_options)
    with tf.variable_scope('train'):
        lr = tf.Variable(0.0, trainable=False)

        def assign_lr(session, lr_value):
            """Set the learning-rate variable to ``lr_value`` in ``session``."""
            session.run(tf.assign(lr, lr_value))

        logger.info('Building optimizers...')
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr, rho=0.95)
        logger.info('Done')
        # print all variables
        tvars = tf.trainable_variables()
        for var in tvars:
            print(var.name, var.shape)
        # L2 penalty over every trainable variable whose name mentions
        # neither 'embeddings' nor 'bias'.
        lossL = tf.add_n([tf.nn.l2_loss(v) for v in tvars if ('embeddings' not in v.name and 'bias' not in v.name)])
        lossL2 = lossL * 0.0005
        print("don't do L2 variables:")
        print([v.name for v in tvars if ('embeddings' in v.name or 'bias' in v.name)])
        print("\n do L2 variables:")
        print([v.name for v in tvars if ('embeddings' not in v.name and 'bias' not in v.name)])
        cost = cost + lossL2
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), model_options['clip_c'])
        # Run the ops registered in UPDATE_OPS together with each step.
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            train_op = optimizer.apply_gradients(zip(grads, tvars))
        op_loss = tf.reduce_mean(cost)
        op_L2 = tf.reduce_mean(lossL)
        logger.info("correct_pred")
        correct_pred = tf.equal(tf.argmax(input=pred, axis=1), y)  # make prediction
        logger.info("Done")

        temp_accuracy = tf.cast(correct_pred, tf.float32)  # change to float32

    logger.info("init variables")
    init = tf.global_variables_initializer()
    logger.info("Done")
    # saver
    saver = tf.train.Saver(max_to_keep=15)

    # One session configuration shared by the training and the final
    # evaluation session. The original built a config with allow_growth but
    # never handed it to tf.Session, so the setting had no effect.
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        training_writer = tf.summary.FileWriter("./logs/{}/training".format(RUN_NAME), sess.graph)
        validate_writer = tf.summary.FileWriter("./logs/{}/validate".format(RUN_NAME), sess.graph)
        testing_writer = tf.summary.FileWriter("./logs/{}/testing".format(RUN_NAME), sess.graph)
        sess.run(init)
        history_errs = []
        history_valid_result = []
        history_test_result = []
        # reload history
        if reload_ and os.path.exists(saveto):
            logger.info("Reload history error")
            history_errs = list(numpy.load(saveto)['history_errs'])

        bad_counter = 0

        if validFreq == -1:
            validFreq = len(train[0]) / batch_size
        if saveFreq == -1:
            saveFreq = len(train[0]) / batch_size

        loss_plot = defaultdict(list)
        uidx = 0
        estop = False
        valid_acc_record = []
        test_acc_record = []
        best_num = -1
        best_epoch_num = 0
        lr_change_list = []
        wait_counter = 0
        wait_N = model_options['wait_N']
        learning_rate = model_options['lrate']
        assign_lr(sess, learning_rate)
        for eidx in range(max_epochs):
            n_samples = 0
            training_cost = 0
            training_acc = 0
            for x, x_d1, x_d2, y, y_tech in train:
                n_samples += len(x)
                uidx += 1
                keep_prob = model_options['keep_prob']
                is_training = True
                data_x, data_x_mask, data_x_d1, data_x_d1_mask, data_x_d2, data_x_d2_mask, data_y, final_mask = prepare_data(
                    x,
                    x_d1,
                    x_d2,
                    y,
                    model_options,
                    maxlen=maxlen)
                # Skip empty minibatches before touching the arrays. The
                # original tested the raw batch `x` (never None here, since
                # len(x) already succeeded) and only after reading `.shape`.
                # NOTE(review): assumes prepare_data signals "no sample under
                # maxlen" by returning None for data_x -- confirm.
                if data_x is None:
                    logger.debug('Minibatch with zero sample under length {0}'.format(maxlen))
                    uidx -= 1
                    continue
                print(data_x.shape, data_x_mask.shape, data_x_d1.shape, data_x_d1_mask.shape, data_x_d2.shape,
                      data_x_d2_mask.shape, final_mask.shape, data_y.shape)
                assert data_y.shape[0] == data_x.shape[0], 'Size does not match'
                ud_start = time.time()
                _, loss, loss_no_mean, temp_acc, l2_check = sess.run(
                    [train_op, op_loss, cost, temp_accuracy, op_L2],
                    feed_dict={'input/x:0': data_x, 'input/x_mask:0': data_x_mask, 'input/y:0': data_y,
                               'input/x_d1:0': data_x_d1, 'input/x_d1_mask:0': data_x_d1_mask,
                               'input/x_d2:0': data_x_d2, 'input/x_d2_mask:0': data_x_d2_mask,
                               'input/final_mask:0': final_mask,
                               'input/technical:0': y_tech,
                               'input/keep_prob:0': keep_prob, 'input/is_training:0': is_training})
                ud = time.time() - ud_start
                training_cost += loss_no_mean.sum()
                training_acc += temp_acc.sum()
                loss_plot['training'].append(loss)
                if numpy.mod(uidx, dispFreq) == 0:
                    logger.debug('Epoch {0} Update {1} Cost {2} L2 {3} TIME {4}'.format(eidx, uidx, loss, l2_check, ud))

                # validate model on validation set and early stop if necessary
                if numpy.mod(uidx, validFreq) == 0:
                    is_training = False

                    valid_acc, valid_loss, valid_final_result = predict_pro_acc(
                        sess, cost, prepare_data, model_options, valid, maxlen,
                        correct_pred, pred, summary, eidx, is_training, train_op, loss_plot,
                        validate_writer, validate=True)
                    test_acc, test_loss, test_final_result = predict_pro_acc(
                        sess, cost, prepare_data, model_options, test, maxlen,
                        correct_pred, pred, summary, eidx, is_training, train_op, loss_plot,
                        testing_writer)
                    # Model selection is driven by validation loss, not by
                    # 1 - accuracy.
                    valid_err = valid_loss
                    history_errs.append(valid_err)
                    history_valid_result.append(valid_final_result)
                    history_test_result.append(test_final_result)
                    loss_plot['validate_ep'].append(valid_loss)
                    loss_plot['testing_ep'].append(test_loss)
                    logger.debug('Epoch  {0}'.format(eidx))
                    logger.debug('Valid cost  {0}'.format(valid_loss))
                    logger.debug('Valid accuracy  {0}'.format(valid_acc))
                    logger.debug('Test cost  {0}'.format(test_loss))
                    logger.debug('Test accuracy  {0}'.format(test_acc))
                    logger.debug('learning_rate:  {0}'.format(learning_rate))

                    valid_acc_record.append(valid_acc)
                    test_acc_record.append(test_acc)
                    # valid_err was just appended, so "<= min" means it is
                    # (tied for) the best validation loss seen so far.
                    if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                        best_num = best_num + 1
                        best_epoch_num = eidx
                        wait_counter = 0
                        logger.info("Saving...")
                        saver.save(sess, _s(_s(_s(save_model, "epoch"), str(best_num)), "model.ckpt"))
                        logger.info(_s(_s(_s(save_model, "epoch"), str(best_num)), "model.ckpt"))
                        numpy.savez(saveto, history_errs=history_errs, **params)
                        # Close the options file deterministically instead of
                        # leaking the handle returned by a bare open().
                        with open('{}.pkl'.format(saveto), 'wb') as options_file:
                            pkl.dump(model_options, options_file)
                        logger.info("Done")

                    if valid_err > numpy.array(history_errs).min():
                        wait_counter += 1
                    if wait_counter >= wait_N:
                        logger.info("wait_counter max, need to half the lr")
                        bad_counter += 1
                        wait_counter = 0
                        logger.debug('bad_counter:  {0}'.format(bad_counter))
                        # TODO change the learning rate. Decay is currently
                        # disabled, so learning_rate is logged unchanged:
                        # learning_rate = learning_rate * 0.9
                        # assign_lr(sess, learning_rate)
                        lr_change_list.append(eidx)
                        logger.debug('lrate change to:   {0}'.format(learning_rate))

                    if bad_counter > patience:
                        logger.info("Early Stop!")
                        estop = True
                        break

                    if numpy.isnan(valid_err):
                        pdb.set_trace()

                # finish after this many updates
                if uidx >= finish_after:
                    logger.debug('Finishing after iterations!  {0}'.format(uidx))
                    estop = True
                    break
            logger.debug('Seen samples:  {0}'.format(n_samples))
            logger.debug('Training accuracy:  {0}'.format(1.0 * training_acc/n_samples))
            loss_plot['training_ep'].append(training_cost/n_samples)
            logger.debug('Saved loss_plot pickle')
            with open("important_plot.pickle", 'wb') as handle:
                pkl.dump(loss_plot, handle, protocol=pkl.HIGHEST_PROTOCOL)
            if estop:
                break

    with tf.Session(config=config) as sess:
        # Restore variables from disk.
        saver.restore(sess, _s(_s(_s(save_model, "epoch"), str(best_num)), "model.ckpt"))
        keep_prob = 1
        is_training = False
        logger.info('=' * 80)
        logger.info('Final Result')
        logger.info('=' * 80)
        logger.debug('best epoch   {0}'.format(best_epoch_num))

        # NOTE(review): the three calls below pass `train_op, is_training`
        # where the in-loop calls pass `is_training, train_op`, and they omit
        # the summary writer. predict_pro_acc is defined elsewhere, so the
        # order is left untouched -- confirm against its signature.
        valid_acc, valid_cost, valid_final_result = predict_pro_acc(
            sess, cost, prepare_data, model_options, valid,
            maxlen, correct_pred, pred, summary, eidx, train_op, is_training, None)
        logger.debug('Valid cost   {0}'.format(valid_cost))
        logger.debug('Valid accuracy   {0}'.format(valid_acc))

        test_acc, test_cost, test_final_result = predict_pro_acc(
            sess, cost, prepare_data, model_options, test,
            maxlen, correct_pred, pred, summary, eidx, train_op, is_training, None)
        logger.debug('Test cost   {0}'.format(test_cost))
        logger.debug('Test accuracy   {0}'.format(test_acc))

        train_acc, train_cost, _ = predict_pro_acc(
            sess, cost, prepare_data, model_options, train_valid,
            maxlen, correct_pred, pred, summary, eidx, train_op, is_training, None)
        logger.debug('Train cost   {0}'.format(train_cost))
        logger.debug('Train accuracy   {0}'.format(train_acc))
        valid_m = numpy.array(history_valid_result)
        test_m = numpy.array(history_test_result)
        # Wrap in a one-row array and flag the entries that compare equal to
        # False, so runs can be stacked along axis 0 below.
        valid_final_result = (numpy.array([valid_final_result]) == False)
        test_final_result = (numpy.array([test_final_result]) == False)
        print('validation: all prediction through every epoch that are the same:', numpy.where(numpy.all(valid_m, axis=0)))
        print('testing: all prediction through every epoch that are the same:', numpy.where(numpy.all(test_m, axis=0)))
        print('validation: final prediction that is False:', numpy.where(valid_final_result))
        print('testing: final prediction that is False:', numpy.where(test_final_result))
        if os.path.exists('history_predict.npz'):
            logger.info("Load and save to history_predict.npz")
            # Read both arrays from a single handle and close it before the
            # same file is overwritten by numpy.savez below.
            with numpy.load('history_predict.npz') as previous:
                valid_history = previous['valid_final_result']
                test_history = previous['test_final_result']
            vv = numpy.concatenate((valid_history, valid_final_result), axis=0)
            # Fixed: the test history used to be extended with the
            # *validation* result of this run.
            tt = numpy.concatenate((test_history, test_final_result), axis=0)
            print('Concate shape valid:', vv.shape)
            print('Print all validate history outputs that return False', numpy.where(numpy.all(vv, axis=0)))
            print('Concate shape test:', tt.shape)
            print('Print all test history outputs that return False', numpy.where(numpy.all(tt, axis=0)))
            numpy.savez('history_predict.npz', valid_final_result=vv, test_final_result=tt, **params)
        else:
            numpy.savez('history_predict.npz', valid_final_result=valid_final_result,
                        test_final_result=test_final_result, **params)

        return None
Ejemplo n.º 27
0
def main(model,
         src_dict,
         trg_dict,
         src,
         trg,
         multibleu,
         batch_size=60,
         pred_dir='',
         model_list=False):
    if pred_dir is not '' and not os.path.exists(pred_dir):
        os.makedirs(pred_dir)
    if model_list:
        model_list_file = model
        with open(model_list_file) as f:
            model = f.readline().strip()

    # load dictionaries and invert them
    worddicts = [None] * 2
    worddicts_r = [None] * 2
    for ii, dd in enumerate([src_dict, trg_dict]):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    trng = RandomStreams(options['trng'])
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    f_init_2, f_next_2 = build_sampler_2(tparams, options, trng, use_noise)

    iterator = TextIterator(src,
                            trg,
                            src_dict,
                            trg_dict,
                            n_words_source=options['n_words_src'],
                            n_words_target=options['n_words'],
                            batch_size=batch_size,
                            maxlen=2000,
                            shuffle=False,
                            replace=False)

    if not model_list:
        try:
            valid_out, valid_bleu = greedy_decoding(
                options,
                trg,
                iterator,
                worddicts_r,
                tparams,
                prepare_data,
                gen_sample_2,
                f_init_2,
                f_next_2,
                trng,
                multibleu,
                fname=os.path.join(pred_dir,
                                   os.path.basename(model)[:-3] + 'out'),
                maxlen=100,
                verbose=False)
        except:
            valid_out = ''
            valid_bleu = 0.0
        print valid_out, valid_bleu
    else:
        best_score = 0.
        best_model = ''
        with open(model_list_file) as f:
            for line in f:
                start = time.time()
                model = line.strip()
                if model == '':
                    continue
                params = load_params(model, params)
                for kk, pp in params.iteritems():
                    tparams[kk].set_value(params[kk])
                print model,
                try:
                    valid_out, valid_bleu = greedy_decoding(
                        options,
                        trg,
                        iterator,
                        worddicts_r,
                        tparams,
                        prepare_data,
                        gen_sample_2,
                        f_init_2,
                        f_next_2,
                        trng,
                        multibleu,
                        fname=os.path.join(
                            pred_dir,
                            os.path.basename(model)[:-3] + 'out'),
                        maxlen=100,
                        verbose=False)
                except:
                    valid_out = ''
                    valid_bleu = 0.0
                print valid_out, valid_bleu,
                if valid_bleu > best_score:
                    best_score = valid_bleu
                    best_model = model
                end = time.time()
                print "Time: ", end - start
        print 'Best model: ', best_model
        print 'Best BLEU: ', best_score
Ejemplo n.º 28
0
def main(model,
         src_dict,
         target_dict,
         source_file,
         target_file,
         saveto,
         source_word_level=1,
         target_word_level=0,
         valid_batch_size=128,
         n_words_src=302,
         n_words=302):
    from char_base import (init_params, build_model, build_sampler)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from nmt import (pred_probs, prepare_data)

    # load model model_options
    pkl_file = model.split('.')[0] + '.pkl'
    with open(pkl_file, 'rb') as f:
        options = pkl.load(f)

    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)

    # create shared variables for parameters
    tparams = init_tparams(params)

    trng, use_noise, \
    x, x_mask, y, y_mask, \
    opt_ret, \
    cost = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler...\n',
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)
    print 'Done'

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost)
    print 'Done'

    print('Preparing dataset...')
    dataset = TextIterator(source=source_file,
                           target=target_file,
                           source_dict=src_dict,
                           target_dict=target_dict,
                           n_words_source=n_words_src,
                           n_words_target=n_words,
                           source_word_level=source_word_level,
                           target_word_level=target_word_level,
                           batch_size=valid_batch_size,
                           sort_size=sort_size)

    print('Predicting probs...')
    log_probs = pred_probs(f_log_probs,
                           prepare_data,
                           options,
                           dataset,
                           verboseFreq=10000)
    print('Done...')
    output_file = open(saveto, 'w')
    pwd_cnt = 0
    for line in open(target_file):
        output_file.writelines(line.rstrip() + '\t' +
                               str(1.0 / (math.e**log_probs[pwd_cnt])) + '\n')
        pwd_cnt += 1
    """
    for prob in log_probs:
        output_file.writelines(str(prob) + '\n')
    """
    output_file.flush()
    output_file.close()
    print('Evaluation finished...')
Ejemplo n.º 29
0
    for kk, vv in worddicts[ii].iteritems():
        worddicts_r[ii][vv] = kk


funcs, tp = build_networks(model_options)

if model_options['see_pretrain']:
    tparams, tparams_xy0 = tp
else:
    tparams = tp
# print 'save the compiled functions/tparams for temperal usage'


print 'Loading data'
train = TextIterator(model_options['datasets'], model_options['dictionaries'],
                     [0 for _ in range(model_options['n_inputs'])],
                     batch_size=model_options['batch_size'], maxlen=model_options['maxlen'])
valid = TextIterator(model_options['valid_datasets'], model_options['dictionaries'],
                     [0 for _ in range(model_options['n_inputs'])],
                     batch_size=model_options['batch_size'], maxlen=500)


print clr('-------------------------------------------- Main-Loop -------------------------------------------------', 'yellow')

# ------------------ initlization --------------- #
best_p       = None
bad_counter  = 0
uidx         = 0
estop        = False
history_errs = []
max_epochs   = 100
Ejemplo n.º 30
0
def train(
        dim_word=100,  # word vector dimensionality
        dim=100,  # the number of GRU units
        encoder='lstm',  # encoder model
        decoder='lstm',  # decoder model 
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        decay_c=0.,  # L2 regularization penalty
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words=100000,  # vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='adadelta',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        dispFreq=100,
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        use_dropout=False,
        reload_=False,
        verbose=False,  # print verbose information for debug but slow speed
        datasets=[],
        valid_datasets=[],
        test_datasets=[],
        dictionary='',
        embedding='',  # pretrain embedding file, such as word2vec, GLOVE
):

    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    # Model options
    model_options = locals().copy()

    model_options[
        'alphabet'] = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    model_options['l_alphabet'] = len(model_options['alphabet'])
    model_options['dim_char_emb'] = 15
    model_options['char_nout'] = 100
    model_options['char_k_rows'] = 5
    model_options['char_k_cols'] = model_options['dim_char_emb']

    # load dictionary and invert them
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)
    worddicts_r = dict()
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'Reload options'
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    logger.debug(pprint.pformat(model_options))

    print 'Loading data'
    train = TextIterator(datasets[0],
                         datasets[1],
                         datasets[2],
                         dictionary,
                         n_words=n_words,
                         batch_size=batch_size)
    train_valid = TextIterator(datasets[0],
                               datasets[1],
                               datasets[2],
                               dictionary,
                               n_words=n_words,
                               batch_size=valid_batch_size,
                               shuffle=False)
    valid = TextIterator(valid_datasets[0],
                         valid_datasets[1],
                         valid_datasets[2],
                         dictionary,
                         n_words=n_words,
                         batch_size=valid_batch_size,
                         shuffle=False)
    test = TextIterator(test_datasets[0],
                        test_datasets[1],
                        test_datasets[2],
                        dictionary,
                        n_words=n_words,
                        batch_size=valid_batch_size,
                        shuffle=False)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options, worddicts)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print 'Reload parameters'
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    trng, use_noise, \
        x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y, \
        opt_ret, \
        cost, \
        f_pred, f_prods = \
        build_model(tparams, model_options)
    inps = [
        x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask,
        y
    ]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    updated_params = OrderedDict([(key, value)
                                  for (key, value) in tparams.iteritems()
                                  if not key.startswith('Wemb')])

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(updated_params))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads
        if verbose:
            print 'Building function of gradient\'s norm'
            f_norm_g = theano.function(inps, tensor.sqrt(g2))

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, updated_params, grads, inps,
                                              cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        print 'Reload history error'
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    valid_acc_record = []
    test_acc_record = []
    best_epoch_num = 0
    lr_change_list = []
    wait_counter = 0
    wait_N = 1
    for eidx in xrange(max_epochs):
        n_samples = 0
        for x1, x2, y in train:
            n_samples += len(x1)
            uidx += 1
            use_noise.set_value(1.)
            x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data(
                x1, x2, y, worddicts_r, maxlen=maxlen)

            if x1 is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x1, x1_mask, char_x1, char_x1_mask, x2,
                                 x2_mask, char_x2, char_x2_mask, y)
            if verbose:
                if clip_c > 0.:
                    norm_g = f_norm_g(x1, x1_mask, char_x1, char_x1_mask, x2,
                                      x2_mask, char_x2, char_x2_mask, y)

            # do the update on parameters
            f_update(lrate)
            ud = time.time() - ud_start
            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return None

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                logger.debug('Epoch {0} Update {1} Cost {2} UD {3}'.format(
                    eidx, uidx, cost, ud))
                if verbose:
                    if clip_c > 0.:
                        logger.debug('Grad {0}'.format(norm_g))

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_cost = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid,
                                        worddicts_r).mean()
                valid_acc = pred_acc(f_pred, prepare_data, model_options,
                                     valid, worddicts_r)
                valid_err = 1.0 - valid_acc
                history_errs.append(valid_err)
                test_cost = pred_probs(f_log_probs, prepare_data,
                                       model_options, test,
                                       worddicts_r).mean()
                test_acc = pred_acc(f_pred, prepare_data, model_options, test,
                                    worddicts_r)

                print 'Valid cost', valid_cost
                print 'Valid accuracy', valid_acc
                print 'Test cost', test_cost
                print 'Test accuracy', test_acc
                print 'lrate:', lrate

                valid_acc_record.append(valid_acc)
                test_acc_record.append(test_acc)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    best_epoch_num = eidx
                    wait_counter = 0

                if valid_err > numpy.array(history_errs).min():
                    wait_counter += 1

                if wait_counter >= wait_N:
                    print 'wait_counter max, need to half the lr'
                    bad_counter += 1
                    wait_counter = 0
                    print 'bad_counter: ' + str(bad_counter)
                    lrate = lrate * 0.5
                    lr_change_list.append(eidx)
                    print 'lrate change to: ' + str(lrate)
                    zipp(best_p, tparams)

                if bad_counter > patience:
                    print 'Early Stop!'
                    estop = True
                    break

                if numpy.isnan(valid_err):
                    pdb.set_trace()

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    with open('record.csv', 'w') as f:
        f.write(str(best_epoch_num) + '\n')
        f.write(','.join(map(str, lr_change_list)) + '\n')
        f.write(','.join(map(str, valid_acc_record)) + '\n')
        f.write(','.join(map(str, test_acc_record)) + '\n')

    use_noise.set_value(0.)

    print '=' * 80
    print 'Final Result'
    print '=' * 80
    train_cost = pred_probs(f_log_probs, prepare_data, model_options,
                            train_valid, worddicts_r).mean()
    train_acc = pred_acc(f_pred, prepare_data, model_options, train_valid,
                         worddicts_r)
    print 'Train cost', train_cost
    print 'Train accuracy', train_acc
    valid_cost = pred_probs(f_log_probs, prepare_data, model_options, valid,
                            worddicts_r).mean()
    valid_acc = pred_acc(f_pred, prepare_data, model_options, valid,
                         worddicts_r)
    print 'Valid cost', valid_cost
    print 'Valid accuracy', valid_acc
    test_cost = pred_probs(f_log_probs, prepare_data, model_options, test,
                           worddicts_r).mean()
    test_acc = pred_acc(f_pred, prepare_data, model_options, test, worddicts_r)
    print 'Test cost', test_cost
    print 'Test accuracy', test_acc
    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)
    logger.debug('Done')

    return None
Ejemplo n.º 31
0
def ptb_iterator(source, source_dict, batch_size, maxlen, char_level=False,
                 n_words_source=-1, rng=None):
    """Load a PTB-style corpus into memory and return a minibatch generator.

    Args:
        source: Path to the corpus. In character-level mode a path ending
            in '.gz' is read through ``gzip``.
        source_dict: Passed through to ``TextIterator`` in word-level mode;
            unused in character-level mode.
        batch_size: Number of rows per yielded batch (the final batch of an
            epoch may be smaller).
        maxlen: Width of the padded arrays. In character-level mode, raw
            lines longer than this are skipped; in word-level mode the
            value is forwarded to ``TextIterator``.
        char_level: If True, encode each line character by character with
            the built-in table below; otherwise use ``TextIterator``.
        n_words_source: Forwarded to ``TextIterator`` in word-level mode.
        rng: Optional ``np.random.RandomState`` used to shuffle rows; a
            fresh unseeded one is created when omitted.

    Returns:
        A zero-argument function. Each call draws a new permutation of the
        rows and returns a generator yielding ``(x, m, y)`` triples, where
        ``x`` is the zero-padded int32 sequences, ``m`` the uint8 mask and
        ``y`` is ``x`` shifted left by one position (last column zero).
    """
    data = []
    if char_level:
        # Character level PTB
        # Known characters map to their position in this list:
        #   0 is 'unk'
        #   1 is 'end of sentence' (the newline character)
        # (49 entries, counting '<unk>')
        chars = ['<unk>', '\n', '#', '$', '&', "'", '*', '-', '.', '/', '\\',
                 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'N', ' ',
                 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                 'y', 'z']
        char_dict = dict((ch, idx) for idx, ch in enumerate(chars))

        opener = gzip.open if source.endswith('.gz') else open
        # The context manager guarantees the handle is closed, even if
        # encoding a line raises.
        with opener(source, 'r') as source_file:
            for line in source_file:
                # gzip yields bytes on Python 3; decode so that single
                # characters can be looked up in char_dict.
                if not isinstance(line, str):
                    line = line.decode('utf-8', 'replace')
                if len(line) > maxlen:
                    continue
                encoded_line = []
                pos = 0
                while pos < len(line):
                    code = char_dict.get(line[pos])
                    if code is None:
                        # Unknown characters are 0, including '<unk>'
                        encoded_line.append(0)
                        if line[pos:pos + 5] == '<unk>':
                            # Consume the whole literal token as one symbol
                            pos += 4
                    else:
                        encoded_line.append(code)
                    pos += 1
                data.append(encoded_line)

    else:
        # Word level PTB
        text_iter = TextIterator(source=source,
                                 source_dict=source_dict,
                                 batch_size=batch_size,
                                 maxlen=maxlen,
                                 n_words_source=n_words_source)
        for batch in text_iter:
            data.extend(batch)

    # Prepare data to sample batches from
    x_arr = np.zeros((len(data), maxlen), dtype=np.int32)
    m_arr = np.zeros((len(data), maxlen), dtype=np.uint8)
    y_arr = np.zeros((len(data), maxlen), dtype=np.int32)
    for i, line in enumerate(data):
        x_arr[i, 0:len(line)] = line
        # The mask extends one position past the encoded sequence
        # (clipped at maxlen by the slice).
        # NOTE(review): presumably this covers a trailing end-of-sequence
        # step - confirm against the consumer of the mask.
        m_arr[i, 0:len(line) + 1] = 1
    # Targets are the inputs shifted left by one; the last column stays 0
    y_arr[:, :-1] = x_arr[:, 1:]

    if rng is None:
        rng = np.random.RandomState()

    # Ceiling division: a trailing partial batch still counts as a batch
    num_batches = (len(data) + batch_size - 1) // batch_size

    def gen():
        # A fresh permutation per call, i.e. rows are reshuffled each epoch
        indices = rng.permutation(len(data))
        for b in range(num_batches):
            rows = indices[b * batch_size:(b + 1) * batch_size]
            yield x_arr[rows], m_arr[rows], y_arr[rows]

    return gen