Example #1
    def __init__(self, lm, data_fn, batch_size, target_seq_len, logger=None, tokenize_regime='words'):
        if logger:
            self.logger = logger
        else:
            self.logger = logging.getLogger('EnblockEvaluator')
        self.batch_size = batch_size
        self.lm = lm

        # Tokenize the evaluation data and count out-of-vocabulary (OOV) tokens
        ids = tokens_from_fn(data_fn, lm.vocab, regime=tokenize_regime, randomize=False)
        oov_mask = ids == lm.vocab.unk_ind
        nb_oovs = oov_mask.sum().item()

        nb_tokens = len(ids)
        oov_msg = 'Nb oovs: {} / {} ({:.2f} %)\n'.format(nb_oovs, nb_tokens, 100.0 * nb_oovs / nb_tokens)
        if nb_oovs / nb_tokens > 0.05:
            self.logger.warning(oov_msg)
        else:
            self.logger.info(oov_msg)

        # batchify's cuda flag: put the data on GPU only when the LM itself sits on cuda:0
        batched = batchify(ids, batch_size, lm.device == torch.device('cuda:0'))
        data_tb = TemporalSplits(
            batched,
            nb_inputs_necessary=lm.model.in_len,
            nb_targets_parallel=target_seq_len
        )
        self.data = TransposeWrapper(data_tb)
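
All of these snippets build a TemporalSplits over a token sequence. As a rough mental model, inferred only from the call sites on this page (the repository's actual implementation may differ, for instance in how windows overlap), it can be pictured as a sliding window pairing nb_inputs_necessary history tokens with the nb_targets_parallel tokens that follow them:

import torch

# Hypothetical stand-in for TemporalSplits, for illustration only.
class TemporalSplitsSketch:
    def __init__(self, seq, nb_inputs_necessary, nb_targets_parallel):
        self.seq = seq                    # (time,) or (time, batch) tensor
        self.in_len = nb_inputs_necessary
        self.tgt_len = nb_targets_parallel

    def __iter__(self):
        t = 0
        # Each step yields in_len input tokens plus the tgt_len tokens
        # that immediately follow, then advances by tgt_len.
        while t + self.in_len + self.tgt_len <= len(self.seq):
            yield (self.seq[t:t + self.in_len],
                   self.seq[t + self.in_len:t + self.in_len + self.tgt_len])
            t += self.tgt_len

for x, y in TemporalSplitsSketch(torch.arange(8), 3, 2):
    print(x.tolist(), '->', y.tolist())   # [0, 1, 2] -> [3, 4], then [2, 3, 4] -> [5, 6]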
Example #2
 def __init__(self, f, vocab, unroll_length):
     """
         Args:
             f (file): File with a document.
             vocab (Vocabulary): Vocabulary for translation word -> index
     """
     ts_builder = lambda seq: TemporalSplits(
         seq, nb_inputs_necessary=unroll_length, nb_targets_parallel=1)
     super().__init__(f, vocab, ts_builder)
Example #3
 def __init__(self, f, vocab, hist_len, nb_targets_parallel, end_portion):
     """
         Args:
             f (file): File with a document.
             vocab (Vocabulary): Vocabulary for translation word -> index
     """
     ts_builder = lambda seq: TemporalSplits(seq,
                                             nb_inputs_necessary=hist_len,
                                             nb_targets_parallel=
                                             nb_targets_parallel)
     super().__init__(f, vocab, end_portion, ts_builder)
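
Examples #2 and #3 share a pattern: the subclass fixes the split geometry in a ts_builder callback and leaves file handling to the parent class. A minimal illustration of the same closure idea, reusing the hypothetical TemporalSplitsSketch from above (the actual parent class is not shown on this page):

def make_builder(hist_len, nb_targets_parallel):
    # Fix the split geometry now; the parent can apply the callback
    # to every tokenized sequence it produces later.
    return lambda seq: TemporalSplitsSketch(seq, hist_len, nb_targets_parallel)

builder = make_builder(3, 1)   # e.g. trigram history, one target per step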
Example #4
    if args.cuda:
        lm.cuda()
    print(lm.model)

    print("preparing data...")
    tokenize_regime = 'words'
    if args.characters:
        tokenize_regime = 'chars'

    train_ids = tokens_from_fn(args.train,
                               lm.vocab,
                               randomize=False,
                               regime=tokenize_regime)
    train_batched = batchify(train_ids, args.batch_size, args.cuda)
    train_data_tb = TemporalSplits(train_batched,
                                   nb_inputs_necessary=lm.model.in_len,
                                   nb_targets_parallel=args.target_seq_len)
    train_data = TransposeWrapper(train_data_tb)

    valid_ids = tokens_from_fn(args.valid,
                               lm.vocab,
                               randomize=False,
                               regime=tokenize_regime)
    valid_batched = batchify(valid_ids, 10, args.cuda)  # validation uses a fixed batch size of 10
    valid_data_tb = TemporalSplits(valid_batched,
                                   nb_inputs_necessary=lm.model.in_len,
                                   nb_targets_parallel=args.target_seq_len)
    valid_data = TransposeWrapper(valid_data_tb)

    print('Initial perplexity {:.2f}'.format(
        math.exp(
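
batchify itself is not shown on this page. A common formulation, given here as a hypothetical sketch in the style of the PyTorch word-language-model example (not necessarily this repository's version), trims the token stream and reshapes it so that each column is a contiguous slice of the corpus:

import torch

def batchify_sketch(ids, batch_size, cuda):
    # Drop the ragged tail so the stream divides evenly into batch_size columns.
    nb_steps = ids.size(0) // batch_size
    data = ids[:nb_steps * batch_size]
    # (nb_steps, batch_size): reading column b top-to-bottom walks the corpus.
    data = data.view(batch_size, -1).t().contiguous()
    return data.cuda() if cuda else data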
Example #5
 def ts_from_file(f):
     return TokenizedSplitFFBase(
         f, lm.vocab,
         lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len))
Example #6
 def ivec_ts_from_file(f):
     ts = TokenizedSplitFFBase(
         f, lm.vocab,
         lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len))
     return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)
Example #7
 def temp_splits_from_fn(fn):
     tokens = tokens_from_file(fn, lm.vocab, randomize=False)
     return TemporalSplits(tokens, lm.model.in_len, args.target_seq_len)
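
The helpers in Examples #5 to #7 are naturally mapped over a list of documents. A usage sketch, assuming lm and args are defined by the enclosing script as in the snippets above, and assuming (as in the earlier sketch) that a TemporalSplits iterates as (input, target) pairs:

# Hypothetical driver; temp_splits_from_fn is Example #7 above.
filenames = ['doc1.txt', 'doc2.txt']   # placeholder paths
for fn in filenames:
    for x, targets in temp_splits_from_fn(fn):
        pass  # feed (x, targets) to the model's evaluation step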