def BuildTokenList(filter_freq, drop_ends=False):
    # optional delimiters from hparams are appended as extra tokens
    input_delimiters = []
    if 'input_delimiters' in hparams:
        input_delimiters = list(hparams['input_delimiters'])
    return ['UNK'] + \
           BuildStatFromCorpus(lambda x: [Multiencoder.prepare_token(x)],
                               filter_freq) + \
           input_delimiters
Example #2
def _i_generator(so_file, hparams):
    # get encoder
    input_encoder = Multiencoder(
        hparams['input_encoders'], hparams['max_source_len'],
        'tokenizer_emit_spaces' in hparams
        and hparams['tokenizer_emit_spaces'],
        'tokenizer_omit_ending_space' in hparams
        and hparams['tokenizer_omit_ending_space'])
    # generate batches
    batch_so = []
    for so_l in so_file:
        so_l = so_l.strip()
        batch_so.append(input_encoder.encode(so_l))
        if len(batch_so) == hparams['batch_size']:
            yield {'so': np.array(batch_so)}
            batch_so = []
    if len(batch_so) != 0:
        yield {'so': np.array(batch_so)}
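A minimal way to exercise this generator, assuming a plain-text source file and an hparams dict like the ones used throughout these snippets (the file name and all values below are placeholders):

# Hypothetical usage of _i_generator; values are illustrative only.
hparams_demo = {
    'input_encoders': ['subword'],  # assumed encoder spec format
    'max_source_len': 64,
    'batch_size': 32,
}
with open('test_input.txt') as so_file_demo:
    for batch in _i_generator(so_file_demo, hparams_demo):
        print(len(batch['so']))  # at most batch_size rows per batch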
def BuildTrigramList(filter_freq, drop_ends=False):
    input_delimiters = []
    if 'input_delimiters' in hparams:
        input_delimiters = [
            delim + '_' for delim in hparams['input_delimiters']
        ]
    return ['UNK'] + BuildStatFromCorpus(
        lambda x: Multiencoder.prepare_ngrams(x, 3, drop_ends),
        filter_freq, input_delimiters) + input_delimiters
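BuildStatFromCorpus itself is not shown on this page. A minimal frequency-counting sketch consistent with the two calls above (the corpus variable and the exact filtering rule are assumptions):

from collections import Counter

corpus = []  # placeholder; the real corpus source is not shown here

def BuildStatFromCorpus(extract_fn, filter_freq, exclude=()):
    # Count the items extract_fn produces for every corpus line, keep those
    # seen at least filter_freq times, and skip anything in exclude.
    counts = Counter()
    for line in corpus:
        counts.update(extract_fn(line))
    return [item for item, n in counts.most_common()
            if n >= filter_freq and item not in exclude]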
Example #4
# Get current epoch
epoch_data_filename = hparams['checkpoints_path'] + 'current_epoch.txt'
current_epoch = 0
try:
    with open(epoch_data_filename, 'r') as epoch_data_file:
        current_epoch = int(epoch_data_file.read())
except (IOError, ValueError):
    pass
if current_epoch >= hparams['num_epochs']:
    print('Training is already finished. hparams["num_epochs"] = %i' %
          hparams['num_epochs'])
    exit()

# Load encoder
output_encoder = Multiencoder([hparams['output_encoder']],
                              hparams['max_answer_len'])

# Get the best BLEU seen so far (file holds two lines: epoch, then score)
try:
    max_bleu_epoch, max_bleu = [
        float(l.strip())
        for l in open(hparams['checkpoints_path'] + 'max_bleu.txt')
    ]
except (IOError, ValueError):
    max_bleu_epoch, max_bleu = -1, 0
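The writer side is not shown here; given the reader above, the file holds just two lines, epoch then score. A minimal counterpart sketch:

def save_max_bleu(checkpoints_path, epoch, bleu):
    # Hypothetical helper matching the two-line format parsed above.
    with open(checkpoints_path + 'max_bleu.txt', 'w') as f:
        f.write('%d\n%f\n' % (epoch, bleu))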

# Make checkpointing config
chkp_config = tf.estimator.RunConfig(
    keep_checkpoint_max=hparams['keep_checkpoint_max'])
summary_writer = tf.summary.FileWriter(hparams["checkpoints_path"])
# For future versions with distribution
import grpc
from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc


def do_inference(server, data_so, model_name):
    # The def line is missing from this snippet; the signature is inferred
    # from the call in encode_batch below, and the channel/stub setup is the
    # standard TF Serving gRPC pattern (an assumption, not in the source).
    channel = grpc.insecure_channel(server)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    request = predict_pb2.PredictRequest()
    request.model_spec.name = model_name
    request.model_spec.signature_name = 'output'
    request.inputs['examples'].CopyFrom(
        tf.contrib.util.make_tensor_proto(data_so.SerializeToString(),
                                          shape=[1]))
    result = stub.Predict(request, 100.0)  # 100 second timeout
    if result.outputs['output'].int64_val:
        return result.outputs['output'].int64_val
    elif result.outputs['output'].int_val:
        return result.outputs['output'].int_val
    else:
        return result.outputs['output'].float_val
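A hypothetical call, assuming a TF Serving instance at localhost:8500 and a serialized tf.train.Example as the payload (the server address, feature values, and model name are placeholders):

example = tf.train.Example(features=tf.train.Features(feature={
    'so': tf.train.Feature(int64_list=tf.train.Int64List(value=[1, 2, 3])),
}))
values = do_inference('localhost:8500', example, 'vae_encoder')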


encoder_in = Multiencoder(hparams['input_encoders'], hparams['max_source_len'])
encoder_out = Multiencoder([hparams['output_encoder']],
                           hparams['max_answer_len'])


def encode_batch(lines, model_name='vae_encoder'):
    sources = []
    if len(lines) != hparams['batch_size']:
        raise ValueError('number of lines must equal the batch size')
    for l in lines:
        source = encoder_in.encode(l)
        # zero-pad every encoded line to the maximum source length
        source = np.concatenate(
            (source, np.zeros(hparams['max_source_len'] - len(source),
                              'int32')))
        sources.append(source)
    sources = np.concatenate(sources)
    mu_sigmas = do_inference(server, {'so': sources}, model_name)
    return mu_sigmas
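A hypothetical call, assuming a served encoder under the default model name (the input line is a placeholder):

lines = ['how do I sort a list in python'] * hparams['batch_size']
mu_sigmas = encode_batch(lines)  # presumably the VAE encoder's mu/sigma values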
Example #6
def _io_generator(so_file, a_file, hparams):
    # Load encoders
    input_encoder = Multiencoder(
        hparams['input_encoders'], hparams['max_source_len'],
        'tokenizer_emit_spaces' in hparams
        and hparams['tokenizer_emit_spaces'],
        'tokenizer_omit_ending_space' in hparams
        and hparams['tokenizer_omit_ending_space'])
    output_encoder = Multiencoder([hparams['output_encoder']],
                                  hparams['max_answer_len'])

    def generate_batch(pairs):
        # Seed the batch with the lowest-index pair, then greedily add the
        # pairs whose (input, output) lengths are closest to it, so padding
        # waste inside the batch stays small.
        index0 = min(pairs)
        lens0 = pairs[index0]['lens']
        dists_dict = {
            np.sum(np.abs(pairs[index]['lens'] - lens0)) +
            np.random.random() / 100: index
            for index in pairs if index != index0
        }  # small random jitter keeps the keys unique
        dists = sorted(dists_dict)  # lowest to highest
        mli, mlo = pairs[index0]['lens']
        sum_words = mli + mlo
        batch_so = [pairs[index0]['so_encoded']]
        batch_a = [pairs[index0]['a_encoded']]
        del pairs[index0]
        for d in dists:
            so = pairs[dists_dict[d]]['so_encoded']
            a = pairs[dists_dict[d]]['a_encoded']
            max_len_in = max(mli, pairs[dists_dict[d]]['lens'][0])
            max_len_out = max(mlo, pairs[dists_dict[d]]['lens'][1])
            # stop before the padded batch exceeds the token budget
            words_count = (max_len_in + max_len_out) * (len(batch_so) + 1)
            if words_count > hparams['tokens_per_batch']:
                break
            sum_words += pairs[dists_dict[d]]['lens'][0] + pairs[
                dists_dict[d]]['lens'][1]
            mli = max_len_in
            mlo = max_len_out
            batch_so.append(so)
            batch_a.append(a)
            del pairs[dists_dict[d]]
        # fraction of non-padding tokens in the batch, 0 to 1 (higher is
        # better); diagnostic only
        percent_words = sum_words / ((mli + mlo) * len(batch_so))
        return {'so': np.array(batch_so)}, np.array(batch_a)

    # read lines, encode them, and pool them into a chunk for length-bucketing
    pairs = {}
    for this_index, (so_l, a_l) in enumerate(zip(so_file, a_file)):
        so, a = so_l.strip(), a_l.strip()
        if len(so) == 0 or len(a) == 0:
            continue
        if len(pairs) < hparams['max_chunk_size']:
            try:
                so_encoded = input_encoder.encode(so)
                a_encoded = output_encoder.encode(a, emit_eos=True)
            except Exception:
                print(
                    'Warning: failed to encode pair (check hparams '
                    'max_source_len and max_answer_len)')
                continue
            pairs[this_index] = {
                'so': so,
                'a': a,
                'so_encoded': so_encoded,
                'a_encoded': a_encoded,
                # count of non-padding tokens on each side
                'lens': np.array([
                    np.sum(np.sign(so_encoded[:hparams['max_source_len']])),
                    np.sum(np.sign(a_encoded[:hparams['max_answer_len']]))
                ])
            }
            continue
        if this_index % 100000 == 0:
            # periodically rebuild the dict to reclaim space from deletions
            pairs = {index: pairs[index] for index in pairs}
        yield generate_batch(pairs)
    yield generate_batch(pairs)
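A hypothetical training-side consumer, assuming parallel source/answer files and the hparams dict used above (the file names are placeholders):

with open('train_input.txt') as so_f, open('train_output.txt') as a_f:
    for features, labels in _io_generator(so_f, a_f, hparams):
        # each yield is one length-bucketed batch
        print(len(features['so']), len(labels))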
def process_hp(hparams):
    # read generated answers
    f = open(hparams['answers_path'] + 'answers.txt')
    answers = [l[:-1] for l in f]
    # read p_gen: one bracketed list of probabilities per answer, possibly
    # wrapped over several lines, so rejoin the file and re-split on '['
    if hparams['copy']:
        f = open(hparams['answers_path'] + 'p_gen.txt')
        p_gens = []
        line = ''.join([l[:-1] for l in f])
        for l in line.split('['):
            p = l[:-1].split(' ')
            p = [float(pp) for pp in p if pp != '']
            p_gens.append(p)
        p_gens = p_gens[1:]  # drop the empty chunk before the first '['
    if len(answers) != len(pairs):
        print('Error: len(answers) != len(pairs) (%d != %d)' %
              (len(answers), len(pairs)))
    # calculate scores
    correct_answers = sum([
        1 if conv_line(a) == conv_line(p[1]) else 0
        for (a, p) in zip(answers, pairs)
    ])
    bleu_scores = check_output(
        't2t-bleu --translation=%sanswers.txt --reference=%stest_output.txt'
        % (hparams['answers_path'], list_hparams[0]['pairs_path']),
        shell=True).decode().strip()
    print(hparams['model_name'])
    print('Accuracy: %.03f' % (correct_answers / len(answers)))
    print(bleu_scores)
    f = open(hparams['answers_path'] + 'scores.txt', 'w')
    f.write('Accuracy: ' + str(correct_answers / len(answers)) + '\n')
    f.write(bleu_scores + '\n')
    # calculate top-n accuracy
    if hparams['use_beam_search']:
        topn_lines = [
            l.strip()
            for l in open(hparams['answers_path'] + 'answers_topn.txt')
        ]
        count_top = [0 for _ in range(hparams['beam_width'])]
        for i, pair in enumerate(pairs):
            story, exp_answer = pair
            tn = []
            for j in range(hparams['beam_width']):
                tn.append(topn_lines[i * hparams['beam_width'] + j])
            tn = [conv_line(l.split('\t')[0]) for l in tn]
            if conv_line(exp_answer) in tn:
                for j in range(tn.index(conv_line(exp_answer)),
                               hparams['beam_width']):
                    count_top[j] += 1
        for n, cn in enumerate(count_top):
            f.write(hparams['model_name'] + ' Top-%i accuracy: ' % (n + 1) +
                    str(cn / len(answers)) + '\n')
    # make copied parts colored blue
    if hparams['copy']:
        encoder = Multiencoder([hparams['output_encoder']],
                               hparams['max_answer_len'])
        new_answers = []
        for i, answer in enumerate(answers):
            subwords = encoder.decode_list(encoder.encode(answer))[0]
            words = []
            for j, w in enumerate(subwords):
                if j < hparams['max_answer_len'] and hparams['copy'] and not (
                        'train_only_vectors' in hparams
                        and hparams['train_only_vectors']):
                    p = p_gens[i][j]
                else:
                    p = 1
                c = '%02X' % int((1 - p) * 255)
                for pun in string.punctuation + '\\u':
                    if (j != len(subwords) - 1 and pun != '_'
                            and pun in subwords[j + 1]):
                        w = w.replace('_', '')
                    if pun != '_' and pun in w:
                        # drop the word-boundary marker around pure punctuation
                        rep = True
                        for l in letters:
                            if l in w.lower():
                                rep = False
                        if rep:
                            w = w.replace('_', '')
                w = w.replace('_', ' ')
                w1 = ('<font color=#0000%s title="p_gen = %f">' % (c, p) +
                      w + '</font>')
                words.append(w1)
            ans = ''.join(words)
            ans = ans.replace('\\u', '_')
            new_answers.append(ans)
        answers = new_answers
    # write table
    copy_info, beam_info = '', ''
    if hparams['use_beam_search']:
        beam_info = '<h3>Beam size %i</h3>' % hparams['beam_width']
    if hparams['copy']:
        copy_info = '<h3><font title="Hover mouse over words to see generation probability" color=#0000ff>Using copy</font></h3>'
    result = {
        'header': '<h3>Model: ' + hparams['model_name'] + '</h3><br>' \
                + 'Accuracy: %.03f' % (correct_answers / len(answers)) + '<br>' \
                + bleu_scores.replace('\n', '<br>') \
                + beam_info + copy_info,
        'content': answers,
    }
    return result
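conv_line (like the pairs and letters globals) is used above but not defined on this page; a minimal normalizer consistent with those comparisons might look like this (an assumption, not the repository's implementation):

def conv_line(line):
    # Hypothetical: lowercase and collapse whitespace so answers are compared
    # modulo trivial formatting differences.
    return ' '.join(line.lower().split())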
def main(unused_argv):
    # Create the Estimator
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=hparams["checkpoints_path"])

    # read sources
    if ('draw_attention' in hparams
            and hparams['draw_attention']) or ('draw_entropy' in hparams
                                               and hparams['draw_entropy']):
        sources = [
            l.strip() for l in open(hparams['pairs_path'] + 'test_input.txt')
        ]
        in_encoder = Multiencoder(hparams['input_encoders'],
                                  hparams['max_source_len'])
        from tensor2tensor.data_generators import text_encoder, tokenizer

    # Predict function
    so_file = open(hparams['pairs_path'] + 'test_input.txt')
    predict_input_fn = lambda: i_input_fn(so_file, hparams)

    # Prepare for making predictions
    results = estimator.predict(input_fn=predict_input_fn)
    if not os.path.isdir(hparams["answers_path"]):
        os.makedirs(hparams["answers_path"])
    f1 = open(hparams['answers_path'] + 'answers.txt', 'w')
    if not ('train_only_vectors' in hparams
            and hparams['train_only_vectors']) and hparams['use_beam_search']:
        f1_topn = open(hparams['answers_path'] + 'answers_topn.txt', 'w')
    if hparams['copy']:
        f2 = open(hparams['answers_path'] + 'p_gen.txt', 'w')
    entropy = np.zeros(hparams['max_answer_len'])
    count = np.zeros(hparams['max_answer_len'])

    # Get predictions
    for i, r in enumerate(results):
        sent_vec = r['classes']
        if 1 in sent_vec:
            sent_vec = sent_vec[:list(sent_vec).index(1)]
        sent = encoder.decode(sent_vec)
        f1.write(sent + '\n')
        if hparams['use_beam_search']:
            for j in range(hparams['beam_width']):
                sent_j = r['classes_topn'][:, j]
                if 1 in sent_j:
                    sent_j = sent_j[:list(sent_j).index(1)]
                sent_j_w = encoder.decode(sent_j)
                f1_topn.write(sent_j_w + '\t' + str(r['beam_scores'][0, j]) +
                              '\n')
        if hparams['copy']:
            f2.write(str(r['p_gens']) + '\n')
        # draw attention matrix image
        if ('draw_attention' in hparams and hparams['draw_attention']) or (
                'draw_entropy' in hparams and hparams['draw_entropy']):
            img = r['attention_image']
            print(sources[i])
            print(in_encoder.decode_list(in_encoder.encode(sources[i])))
            in_words = []
            cur_words = []
            for w in in_encoder.decode_list(in_encoder.encode(sources[i]))[0]:
                if 'NEWWORD' in w:
                    in_words.append(' '.join(cur_words).replace('_', '') +
                                    '  ')
                    cur_words = []
                else:
                    cur_words.append(w)
            out_words = encoder.decode_list(encoder.encode(sent))[0]
            img = img[:len(in_words), :len(sent_vec)]
        if 'draw_attention' in hparams and hparams['draw_attention']:
            plt.figure(figsize=(len(out_words), len(in_words)))
            plt.imshow(img,
                       cmap='hot',
                       vmin=0,
                       vmax=1,
                       interpolation='nearest')
            plt.yticks(range(len(in_words)), in_words)
            plt.xticks(range(len(out_words)), out_words,
                       rotation='vertical')
            mng = plt.get_current_fig_manager()
            plt.show()
        # calc entropy
        if 'draw_entropy' in hparams and hparams['draw_entropy']:
            ent = np.zeros(hparams['max_answer_len'])
            ent[:len(sent_vec)] = -np.sum(img * np.log(img + 1e-8),
                                          axis=0).reshape(len(sent_vec))
            # normalize by log of the input length, roughly the maximum
            # possible entropy of the attention distribution
            entropy += ent / np.log(len(in_words) + 2)
            mask = np.zeros(hparams['max_answer_len'])
            mask[:len(sent_vec)] = 1
            count += mask
    entropy /= count + 1e-8
    if 'draw_entropy' in hparams and hparams['draw_entropy']:
        plt.title('Entropy of attention distribution')
        plt.xlabel('Decoder step')
        plt.ylabel('Relative entropy')
        plt.plot(entropy)
        plt.show()
hparams = NormalizeEncoderSettings(hparams)

PrintHparamsInfo(hparams)

# Create answers folder if necessary
if not os.path.exists(hparams['answers_path']):
    os.makedirs(hparams['answers_path'])

if ('draw_attention' in hparams
        and hparams['draw_attention']) or ('draw_entropy' in hparams
                                           and hparams['draw_entropy']):
    import matplotlib.pyplot as plt

model_fn = get_model_fn(hparams)

encoder = Multiencoder([hparams['output_encoder']], hparams['max_answer_len'])


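main(unused_argv) follows the TF1 tf.app.run convention, so the script presumably ends with the standard entry-point guard (not shown in the snippet above):

if __name__ == '__main__':
    tf.app.run()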