def __init__(self, lm, data_fn, batch_size, target_seq_len, logger=None, tokenize_regime='words'):
    if logger:
        self.logger = logger
    else:
        self.logger = logging.getLogger('EnblockEvaluator')
    self.batch_size = batch_size
    self.lm = lm

    ids = tokens_from_fn(data_fn, lm.vocab, regime=tokenize_regime, randomize=False)

    # Report the OOV rate; warn if it exceeds 5 %.
    oov_mask = ids == lm.vocab.unk_ind
    nb_oovs = oov_mask.sum().item()
    nb_tokens = len(ids)
    oov_msg = 'Nb oovs: {} / {} ({:.2f} %)\n'.format(nb_oovs, nb_tokens, 100.0 * nb_oovs / nb_tokens)
    if nb_oovs / nb_tokens > 0.05:
        self.logger.warning(oov_msg)
    else:
        self.logger.info(oov_msg)

    batched = batchify(ids, batch_size, lm.device == torch.device('cuda:0'))
    data_tb = TemporalSplits(
        batched,
        nb_inputs_necessary=lm.model.in_len,
        nb_targets_parallel=target_seq_len,
    )
    self.data = TransposeWrapper(data_tb)
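# Usage sketch (illustrative, not taken from the repo): assuming the
# __init__ above belongs to an EnblockEvaluator class, as the logger name
# suggests, and that `lm` exposes .vocab, .model.in_len and .device as used
# above; the path and sizes below are hypothetical.
#
#     evaluator = EnblockEvaluator(lm, 'valid.txt', batch_size=32, target_seq_len=35)
#     for inputs, targets in evaluator.data:
#         ...  # one transposed (inputs, targets) split per step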
def __init__(self, f, vocab, unroll_length):
    """
    Args:
        f (file): File with a document.
        vocab (Vocabulary): Vocabulary for translation word -> index.
        unroll_length (int): Number of history tokens required before
            each single target.
    """
    ts_builder = lambda seq: TemporalSplits(
        seq,
        nb_inputs_necessary=unroll_length,
        nb_targets_parallel=1,
    )
    super().__init__(f, vocab, ts_builder)
def __init__(self, f, vocab, hist_len, nb_targets_parallel, end_portion):
    """
    Args:
        f (file): File with a document.
        vocab (Vocabulary): Vocabulary for translation word -> index.
        hist_len (int): Number of history tokens required before the targets.
        nb_targets_parallel (int): Number of targets produced per split.
    """
    ts_builder = lambda seq: TemporalSplits(
        seq,
        nb_inputs_necessary=hist_len,
        nb_targets_parallel=nb_targets_parallel,
    )
    super().__init__(f, vocab, end_portion, ts_builder)
if args.cuda:
    lm.cuda()
print(lm.model)

print("preparing data...")

tokenize_regime = 'words'
if args.characters:
    tokenize_regime = 'chars'

train_ids = tokens_from_fn(args.train, lm.vocab, randomize=False, regime=tokenize_regime)
train_batched = batchify(train_ids, args.batch_size, args.cuda)
train_data_tb = TemporalSplits(
    train_batched,
    nb_inputs_necessary=lm.model.in_len,
    nb_targets_parallel=args.target_seq_len,
)
train_data = TransposeWrapper(train_data_tb)

valid_ids = tokens_from_fn(args.valid, lm.vocab, randomize=False, regime=tokenize_regime)
valid_batched = batchify(valid_ids, 10, args.cuda)
valid_data_tb = TemporalSplits(
    valid_batched,
    nb_inputs_necessary=lm.model.in_len,
    nb_targets_parallel=args.target_seq_len,
)
valid_data = TransposeWrapper(valid_data_tb)

print('Initial perplexity {:.2f}'.format(
    math.exp(
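# Sketch of the batchify recipe the calls above appear to rely on, following
# the common PyTorch language-modelling pattern (a 1-D tensor of token ids is
# cut into batch_size parallel streams); the repo's own batchify may differ,
# e.g. in how the cuda flag is handled.
import torch

def batchify_sketch(ids, batch_size, cuda):
    # Drop the ragged tail so the stream divides evenly into batch_size columns.
    nb_steps = ids.size(0) // batch_size
    data = ids.narrow(0, 0, nb_steps * batch_size)
    # Reshape to (time, batch): column b holds the b-th contiguous slice of text.
    data = data.view(batch_size, -1).t().contiguous()
    return data.cuda() if cuda else data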
def ts_from_file(f):
    return TokenizedSplitFFBase(
        f, lm.vocab,
        lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len)
    )
def ivec_ts_from_file(f):
    ts = TokenizedSplitFFBase(
        f, lm.vocab,
        lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len)
    )
    return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)
def temp_splits_from_fn(fn):
    tokens = tokens_from_file(fn, lm.vocab, randomize=False)
    return TemporalSplits(tokens, lm.model.in_len, args.target_seq_len)
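# Usage sketch (hypothetical, not from the repo): the three helpers above all
# turn tokenized text into a stream of TemporalSplits; the call sites below
# are assumptions for illustration.
#
#     with open(args.train) as f:
#         plain = ts_from_file(f)            # token splits only
#     with open(args.train) as f:
#         with_ivecs = ivec_ts_from_file(f)  # splits paired with "cheating" i-vectors
#     from_name = temp_splits_from_fn(args.train)  # same idea, from a filename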