def __init__(self, lm, data_fn, batch_size, target_seq_len, corruptor, nb_rounds, logger=None, tokenize_regime='words'):
    """Build the corrupted-data evaluation pipeline.

    Tokenizes ``data_fn`` with the LM's vocabulary, logs the OOV rate,
    then chains corruption -> lazy batching -> sequence splitting ->
    transposition -> CUDA transfer into ``self.data``.

    Args:
        lm: language model; must expose ``vocab`` with ``unk_ind``.
        data_fn: path of the text file to evaluate on.
        batch_size: number of parallel streams per batch.
        target_seq_len: sequence length used when splitting batches.
        corruptor: callable wrapping the (input, target) streams with
            substitutional noise.
        nb_rounds: number of evaluation rounds (stored for later use).
        logger: optional logger; a module-specific one is created when
            a falsy value is given.
        tokenize_regime: tokenization unit forwarded to ``tokens_from_fn``.
    """
    if logger:
        self.logger = logger
    else:
        self.logger = logging.getLogger('SubstitutionalEnblockEvaluator_v2')
    self.batch_size = batch_size
    self.lm = lm
    self.nb_rounds = nb_rounds

    ids = tokens_from_fn(data_fn, lm.vocab, regime=tokenize_regime, randomize=False)
    # `ids` supports elementwise `==` and `.sum().item()` -- presumably a
    # torch tensor of token indices.
    oov_mask = ids == lm.vocab.unk_ind
    nb_oovs = oov_mask.sum().item()
    nb_tokens = len(ids)
    # Guard against an empty data file: report a 0 % OOV rate instead of
    # crashing with ZeroDivisionError. Compute the ratio once and reuse it
    # for both the message and the warning threshold.
    oov_ratio = nb_oovs / nb_tokens if nb_tokens else 0.0
    oov_msg = 'Nb oovs: {} / {} ({:.2f} %)\n'.format(nb_oovs, nb_tokens, 100.0 * oov_ratio)
    if oov_ratio > 0.05:
        self.logger.warning(oov_msg)
    else:
        self.logger.info(oov_msg)

    # Pipeline: raw streams -> corruption -> batches -> fixed-length
    # splits -> transposed layout -> CUDA-resident stream.
    streams = form_input_targets(ids)
    corrupted_provider = corruptor(streams)
    batch_former = LazyBatcher(batch_size, corrupted_provider)
    data_tb = TemplSplitterClean(target_seq_len, batch_former)
    self.data = CudaStream(TransposeWrapper(data_tb))
# Evaluation driver: optionally move the LM to GPU, load an SMM iVector
# extractor, build an iVector-augmented data stream and report loss/perplexity.
if args.cuda:
    lm.cuda()
print(lm.model)

print("loading SMM iVector extractor ...")
with open(args.ivec_extractor, 'rb') as f:
    ivec_extractor = smm_ivec_extractor.load(f)
if args.ivec_nb_iters is not None:
    # NOTE(review): overrides a private attribute of the extractor --
    # confirm there is no public way to set the iteration count.
    ivec_extractor._nb_iters = args.ivec_nb_iters
print(ivec_extractor)

print("preparing data...")

def ts_from_file(f):
    # Wrap one tokenized file into temporal (input, target) splits sized by
    # the model's input length and the requested target sequence length.
    return TokenizedSplitFFBase(
        f, lm.vocab,
        lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len))

tss = filelist_to_objects(args.file_list, ts_from_file)
data = BatchBuilder(tss, args.batch_size,
                    discard_h=not args.concat_articles)
if args.cuda:
    data = CudaStream(data)
# Attach iVectors to each batch; the appender name suggests extraction runs
# in parallel with consumption -- verify in ivec_appenders.
data_ivecs = ivec_appenders.ParalelIvecAppender(
    data, ivec_extractor, ivec_extractor.build_translator(lm.vocab))

print("evaluating...")
loss = evaluate(lm, data_ivecs, use_ivecs=True)
print('loss {:5.2f} | ppl {:8.2f}'.format(loss, math.exp(loss)))
def ivec_ts_from_file(f):
    # Build temporal splits for one file and wrap them with a "cheating"
    # appender -- presumably an oracle setup where the iVector comes from
    # the sequence being scored itself; confirm in ivec_appenders.
    ts = TokenizedSplitFFBase(
        f, lm.vocab,
        lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len))
    return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)

train_data_ivecs = filelist_to_objects(args.train_list, ivec_ts_from_file)
print("\tvalidation...")
valid_data_ivecs = filelist_to_objects(args.valid_list, ivec_ts_from_file)
valid_data = BatchBuilder(valid_data_ivecs, args.batch_size,
                          discard_h=not args.concat_articles)
if args.cuda:
    valid_data = CudaStream(valid_data)

print("training...")
lr = args.lr
best_val_loss = None

for epoch in range(1, args.epochs + 1):
    # Reshuffle the per-file splits each epoch and rebuild the batch
    # stream from scratch.
    random.shuffle(train_data_ivecs)
    train_data = BatchBuilder(train_data_ivecs, args.batch_size,
                              discard_h=not args.concat_articles)
    if args.cuda:
        train_data = CudaStream(train_data)

    logger = InfinityLogger(epoch, args.log_interval, lr)
    # Statement continues beyond this chunk.
    train_data_filtered = BatchFilter(train_data, args.batch_size,