Code Example #1
def main(args):
    print(args)
    print('-' * 100)
    print('Loading models and options...')
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    print('Loading vocabulary...')
    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    shards = glob(args.test_prefix)
    shards.sort()
    # print(shards)
    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }
    print('Building dataset...')
    datasets = []
    for shard in shards:
        if options.get('bidirectional'):
            datasets.append(BidirectionalLMDataset(shard, vocab, **kwargs))
        else:
            datasets.append(LMDataset(shard, vocab, **kwargs))

    print('Predicting...')
    tag(options, ckpt_file, shards, datasets, batch_size=args.batch_size)
    print('-' * 100)
    print('done.')
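For reference, a minimal command-line driver for a script like this might look as follows. This is only a sketch: the flag names (--save_dir, --vocab_file, --test_prefix, --batch_size) are inferred from the args attributes read inside main() above and are not taken from the original project.

# Hypothetical entry point; flag names are inferred from the args.* attributes
# used in main() above and may differ from the original project's CLI.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Run a trained biLM over test shards.')
    parser.add_argument('--save_dir', help='Directory with checkpoint and options files')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--test_prefix', help='Glob pattern for test data shards')
    parser.add_argument('--batch_size', type=int, default=128, help='Batch size')
    main(parser.parse_args())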
Code Example #2
File: run_test.py Project: cheng18/crs
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    # vocab = load_vocab(args.vocab_file, max_word_length)
    vocab = load_vocab(args.vocab_file, args.stroke_vocab_file,
                       50)  # Winfred stroke_vocab

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Code Example #3
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    permute_number = options.get('permute_number', 4)

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    elif options.get('multidirectional'):
        data = MultidirectionalLMDataset(test_prefix, vocab, permute_number, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size, permute_number=permute_number)
Code Example #4
def main(args):

    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        if options.get('polyglot'):
            data = BidirectionalPolyglotLMDataset(test_prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    #ipy.embed()

    test(options, ckpt_file, data, batch_size=args.batch_size)
Code Example #5
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
Code Example #6
def test_original_dataset_implementation():
    """Trying to show how the original `LMDataset`
    and `BidirectionalLMDataset` works.
    """
    from bilm.data import LMDataset, BidirectionalLMDataset, \
         UnicodeCharsVocabulary

    test_prefix = 'data/test/violin_test.txt'
    vocab_path = 'dump/bilm_pretrain/vocab-2016-09-10.txt'

    vocabulary = UnicodeCharsVocabulary(vocab_path, max_word_length=50)
    dataset = LMDataset(test_prefix, vocabulary)
    a = dataset.iter_batches(batch_size=10, num_steps=50)
    b = next(a)
    print(f'Keys: {b.keys()}')
    for k, v in b.items():
        print(f'Shape of {k}: {v.shape}')

    print(vocabulary.decode(b['token_ids'][0]))
    print(vocabulary.decode(b['next_token_id'][0]))
    print(vocabulary.decode_chars(b['tokens_characters'][0]))

    from IPython import embed
    embed()
    import os
    os._exit(1)
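As a companion to the snippet above, the bidirectional dataset can be inspected the same way. A minimal sketch, assuming the same vocabulary and test file as above, and assuming the bilm-tf convention that BidirectionalLMDataset batches carry the forward keys plus '_reverse' counterparts:

def test_bidirectional_dataset_implementation():
    """Sketch: inspect one BidirectionalLMDataset batch.

    Uses the same paths as the test above; the '*_reverse' keys are an
    assumption based on bilm-tf's data.py, so we simply print whatever
    keys the batch actually contains.
    """
    from bilm.data import BidirectionalLMDataset, UnicodeCharsVocabulary

    test_prefix = 'data/test/violin_test.txt'
    vocab_path = 'dump/bilm_pretrain/vocab-2016-09-10.txt'

    vocabulary = UnicodeCharsVocabulary(vocab_path, max_word_length=50)
    dataset = BidirectionalLMDataset(test_prefix, vocabulary)
    batch = next(dataset.iter_batches(batch_size=10, num_steps=50))
    for key, value in batch.items():
        print(f'Shape of {key}: {value.shape}')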
Code Example #7
File: restart.py Project: jungokasai/poly_share
def main(args):

    if args.gpu is not None:
        if ',' in args.gpu:
            args.gpu = args.gpu.split(',')
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    # ensure polyglot is always defined, not only when the condition holds
    polyglot = 'polyglot' in options or args.polyglot
    vocab = load_vocab(args.vocab_files,
                       max_word_length=max_word_length,
                       polyglot=polyglot)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        if 'polyglot' in options or args.polyglot:
            data = BidirectionalPolyglotLMDataset(prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options,
          data,
          None,
          args.n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=ckpt_file)
Code Example #8
    def _load_data(self, reverse, chars, bidirectional=False):
        if chars:
            vocab = UnicodeCharsVocabulary(self._tmp_vocab, 5)
        else:
            vocab = Vocabulary(self._tmp_vocab)

        if not bidirectional:
            data = LMDataset(self._tmp_train, vocab, reverse=reverse)
        else:
            data = BidirectionalLMDataset(self._tmp_train, vocab)

        return data
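A hedged usage sketch for the helper above: a sibling test method could build a character-level bidirectional dataset and pull one batch. It assumes self._tmp_vocab and self._tmp_train are fixture files written by the test's setUp (not shown), and the batch and step sizes are illustrative, not taken from the original suite.

    def test_bidirectional_chars(self):
        # Assumes setUp() has written the vocabulary and training fixtures
        # to self._tmp_vocab and self._tmp_train; sizes are illustrative.
        data = self._load_data(reverse=False, chars=True, bidirectional=True)
        batch = next(data.iter_batches(batch_size=2, num_steps=5))
        self.assertIn('token_ids', batch)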
Code Example #9
    def _get_data(self, bidirectional, use_chars, test=False):
        vocab_file = os.path.join(FIXTURES, 'vocab.txt')
        if use_chars:
            vocab = load_vocab(vocab_file, 10)
        else:
            vocab = load_vocab(vocab_file, None)

        prefix = os.path.join(FIXTURES, 'data.txt')

        if bidirectional:
            data = BidirectionalLMDataset(prefix, vocab, test=test)
        else:
            data = LMDataset(prefix, vocab, test=test, reverse=False)

        return data, vocab
Code Example #10
File: restarter.py Project: davidchan2/elmoUser
def resume(options, prefix, vocab, n_gpus, tf_save_dir, tf_log_dir, ckpt_file):
    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }
    tf.reset_default_graph()
    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    train(options,
          data,
          n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=ckpt_file)
    clean_checkpoint(tf_save_dir)
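A minimal sketch of how resume() might be driven. The helper names (load_options_latest_checkpoint, load_vocab) are assumed to be the same ones used in the other examples on this page; the caller itself is hypothetical and its paths are placeholders.

# Hypothetical caller for resume(); the path arguments are placeholders and
# the helper functions are assumed to match the other examples on this page.
def resume_from_latest(save_dir, vocab_file, train_prefix, n_gpus=1):
    options, ckpt_file = load_options_latest_checkpoint(save_dir)
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(vocab_file, max_word_length)
    resume(options, train_prefix, vocab, n_gpus,
           tf_save_dir=save_dir, tf_log_dir=save_dir, ckpt_file=ckpt_file)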
Code Example #11
File: tester.py Project: davidchan2/elmoUser
def top_level(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    vocab_file = os.path.join(args.save_dir, 'vocabs.txt')

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Code Example #12
print('====> word \t{}\t char ids result: {}'.format(
    words, vocab_unicodechars.word_to_char_ids(words)))
print('====> word \t{}\t encoded chars id result: {}'.format(
    words, vocab_unicodechars.encode_chars(words)))
ids = [1234, 3234, 22, 34, 341324, 21, 345]
print('====> decode \t{}\t to words: {}'.format(
    ids, vocab_unicodechars.decode(ids)))
'''
UE for LMDataset
'''
print('\n\n\tUE for LMDataset:')
vocab_file = '../data/vocab_seg_words_elmo.txt'
vocab_unicodechars = UnicodeCharsVocabulary(vocab_file,
                                            max_word_length=10,
                                            validate_file=True)
filepattern = '../data/example/*_seg_words.txt'
lmds = LMDataset(filepattern, vocab_unicodechars, test=True)
batch_size = 128
n_gpus = 1
unroll_steps = 10
data_gen = lmds.iter_batches(batch_size * n_gpus, unroll_steps)
jump_cnt = 0
for num, batch in enumerate(data_gen, start=1):
    jump_cnt += 1
    if jump_cnt > 10:
        break
    print('====> iter [{}]\ttoken ids shape: {}'.format(
        num, batch['token_ids'].shape))
    print('====> iter [{}]\ttokens characters shape: {}'.format(
        num, batch['tokens_characters'].shape))
    print('====> iter [{}]\tnext token ids shape: {}'.format(
        num, batch['next_token_id'].shape))
Code Example #13
        if args.model == 'hmm':
            res0, res1, res2, OOV, IN = get_perplexities(sents, model, k=k)
            res_perplexities2.append(res2)
            count_in += IN
            count_oov += OOV
            res_perplexities0.append(res0)
            res_perplexities1.append(res1)

        if args.model == 'elmo':

            filepath = subdir + os.sep
            if options.get('bidirectional'):
                data = BidirectionalLMDataset(filepath, vocab, **kwargs)
                # print(data)
            else:
                data = LMDataset(filepath, vocab, **kwargs)

            res2 = test(options, ckpt_file, data, batch_size=args.batch_size)

            res_perplexities2.append(res2)

        outfile.write(file + '\t' + label + '\t' + str(res2) + '\n')

        if count % 5 == 0:
            print('I have calculated perplexities for %s files' % count,
                  file=sys.stderr)

print('=== Just a sanity check on the perplexity calculations: ')
print(labels[:5], fns[:5], res_perplexities2[:5])

print('Texts with the most extreme text-level perplexities:')