default=os.path.join('~', 'cased_L-12_H-768_A-12/'),
                    help='Path to Tensorflow checkpoint folder. '
                    'Default is /home/ubuntu/cased_L-12_H-768_A-12/')
parser.add_argument('--out_dir',
                    type=str,
                    default=os.path.join('~', 'output'),
                    help='Path to output folder. The folder must exist. '
                    'Default is /home/ubuntu/output/')
parser.add_argument('--debug', action='store_true', help='debugging mode')
args = parser.parse_args()
# --debug switches the root logger to DEBUG; otherwise INFO.
logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
logging.info(args)

# convert vocabulary
# Read the TF checkpoint's vocab.txt and convert it; convert_vocab also
# returns a mapping for reserved-token indices (used later, if at all,
# outside this view).
vocab_path = os.path.join(args.tf_checkpoint_dir, 'vocab.txt')
vocab, reserved_token_idx_map = convert_vocab(vocab_path)

# vocab serialization
# The vocab JSON is first written to a temporary file only so its content
# hash can be computed; the final file name embeds the short hash.
# NOTE(review): the 'tmp' file is never deleted afterwards — confirm that
# leaving it in out_dir is intentional.
tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp'))
with open(tmp_file_path, 'w') as f:
    f.write(vocab.to_json())
hash_full, hash_short = get_hash(tmp_file_path)
gluon_vocab_path = os.path.expanduser(
    os.path.join(args.out_dir, hash_short + '.vocab'))
with open(gluon_vocab_path, 'w') as f:
    f.write(vocab.to_json())
    # Logging inside the with-block: runs before the file is closed,
    # which is harmless here.
    logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path,
                 hash_full)

# load tf model
tf_checkpoint_file = os.path.expanduser(
# --- Example #2 ---
def test(args: Namespace):
    """Interactively translate sentences with a trained encoder/decoder.

    Restores the latest checkpoint described by the JSON config at
    ``args.config_path``, then loops: read a sentence from stdin,
    preprocess and tokenize it, greedily decode a translation, and print
    it. An empty input line exits the loop.

    Args:
        args: parsed CLI namespace; only ``args.config_path`` is read.
    """
    # Use a context manager so the config file handle is closed
    # deterministically (the original leaked it via json.load(open(...))).
    with open(args.config_path, 'r', encoding='UTF-8') as f:
        cfg = json.load(f)

    batch_size = 1  # for predicting one sentence.

    encoder = Encoder(cfg['vocab_input_size'], cfg['embedding_dim'],
                      cfg['units'], batch_size, 0)
    decoder = Decoder(cfg['vocab_target_size'], cfg['embedding_dim'],
                      cfg['units'], cfg['method'], batch_size, 0)
    optimizer = select_optimizer(cfg['optimizer'], cfg['learning_rate'])

    ckpt = tf.train.Checkpoint(optimizer=optimizer,
                               encoder=encoder,
                               decoder=decoder)
    manager = tf.train.CheckpointManager(ckpt,
                                         cfg['checkpoint_dir'],
                                         max_to_keep=3)
    ckpt.restore(manager.latest_checkpoint)

    # Vocabularies and tokenizers are loop-invariant: build them once up
    # front instead of reloading them from disk for every input sentence
    # (the original redid all of this on each loop iteration).
    input_vocab = load_vocab('./data/', 'en')
    target_vocab = load_vocab('./data/', 'de')

    input_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='', oov_token='<unk>')
    input_lang_tokenizer.word_index = input_vocab

    target_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='', oov_token='<unk>')
    target_lang_tokenizer.word_index = target_vocab

    convert_vocab(input_lang_tokenizer, input_vocab)
    convert_vocab(target_lang_tokenizer, target_vocab)

    while True:
        sentence = input(
            'Input Sentence or If you want to quit, type Enter Key : ')

        if sentence == '':
            break

        # Pad punctuation with spaces and collapse runs of spaces so the
        # tokens match the training-time preprocessing.
        sentence = re.sub(r"(\.\.\.|[?.!,¿])", r" \1 ", sentence)
        sentence = re.sub(r'[" "]+', " ", sentence)

        sentence = '<s> ' + sentence.lower().strip() + ' </s>'

        # Map tokens to ids, falling back to <unk> for out-of-vocabulary
        # words, then pad to the fixed encoder input length.
        inputs = [
            input_lang_tokenizer.word_index[i]
            if i in input_lang_tokenizer.word_index else
            input_lang_tokenizer.word_index['<unk>']
            for i in sentence.split(' ')
        ]
        inputs = tf.keras.preprocessing.sequence.pad_sequences(
            [inputs], maxlen=cfg['max_len_input'], padding='post')

        inputs = tf.convert_to_tensor(inputs)

        result = ''

        # Four identical [hidden, cell] pairs — presumably one per encoder
        # layer; verify against the Encoder definition.
        enc_hidden = encoder.initialize_hidden_state()
        enc_cell = encoder.initialize_cell_state()
        enc_state = [[enc_hidden, enc_cell], [enc_hidden, enc_cell],
                     [enc_hidden, enc_cell], [enc_hidden, enc_cell]]

        enc_output, enc_hidden = encoder(inputs, enc_state)

        dec_hidden = enc_hidden
        # Greedy decoding starts from the <s> start-of-sentence token.
        dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<s>']],
                                   1)

        print('dec_input:', dec_input)

        h_t = tf.zeros((batch_size, 1, cfg['embedding_dim']))

        for t in range(int(cfg['max_len_target'])):
            predictions, dec_hidden, h_t = decoder(dec_input, dec_hidden,
                                                   enc_output, h_t)

            # predictions[0] is the score vector over the target vocabulary;
            # take the argmax (greedy decoding).
            predicted_id = tf.argmax(predictions[0]).numpy()
            print('predicted_id', predicted_id)

            result += target_lang_tokenizer.index_word[predicted_id] + ' '

            if target_lang_tokenizer.index_word[predicted_id] == '</s>':
                print('Early stopping')
                break

            # Feed the predicted token back as the next decoder input.
            dec_input = tf.expand_dims([predicted_id], 1)
            print('dec_input:', dec_input)

        print('<s> ' + result)
        print(sentence)
        sys.stdout.flush()