def ivec_ts_from_file(f):
    """Build a cheating i-vector appender for one training file.

    Wraps *f* in a domain-adaptation split driven by the loaded model's
    input length and the command-line sequence/domain settings, then pairs
    it with the global i-vector extractor.
    """
    split = DomainAdaptationSplitFFMultiTarget(
        f,
        lm.vocab,
        lm.model.in_len,
        args.target_seq_len,
        end_portion=args.domain_portion,
    )
    return ivec_appenders.CheatingIvecAppender(split, ivec_extractor)
def test_iter_ends(self):
    """After the appender yields all its items, iteration must stop."""
    word_stream = getStream(self.test_words_short)
    tokenized = split_corpus_dataset.TokenizedSplit(word_stream, self.vocab, 1)
    it = iter(ivec_appenders.CheatingIvecAppender(tokenized, self.ivec_eetor))

    # the short test document produces exactly three items
    for _ in range(3):
        next(it)

    self.assertRaises(StopIteration, next, it)
def test_single_data(self):
    """The first item yielded by the appender carries the i-vector
    computed over the whole document except its last word."""
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
    appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)

    # We cannot access ts._tokens directly -- it's an implementation
    # detail -- so the expected i-vector is recomputed from the raw words.
    expectation = self.ivec_eetor(" ".join(self.test_words_short[:-1]))

    seqs = next(iter(appender))
    # the i-vector is the third element of the yielded tuple
    self.assertEqual(seqs[2], expectation)
def test_whole_seq_with_next(self):
    """Every item yielded by the appender carries the same i-vector,
    computed over the whole document except its last word."""
    data_source = getStream(self.test_words_short)
    ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
    appender = iter(ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor))

    # We cannot access ts._tokens directly -- it's an implementation
    # detail -- so the expected i-vectors are recomputed from the raw
    # words. The extractor is called once per expected item, exactly as
    # the original spelled-out version did.
    expectations = [
        self.ivec_eetor(" ".join(self.test_words_short[:-1]))
        for _ in range(3)
    ]

    for expected in expectations:
        # the i-vector is the third element of the yielded tuple
        self.assertEqual(next(appender)[2], expected)
def ivec_ts_from_file(f):
    """Build a cheating i-vector appender for one data file.

    The file is wrapped in a feed-forward tokenized split whose temporal
    slicing is driven by the loaded model's input length and the
    command-line target sequence length.
    """
    def make_splits(seq):
        # slice the token sequence into (history, target) windows
        return TemporalSplits(seq, lm.model.in_len, args.target_seq_len)

    ts = TokenizedSplitFFBase(f, lm.vocab, make_splits)
    return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)
# Load a previously trained LSTM language model; move it to GPU when requested.
print("loading LSTM model...")
lm = torch.load(args.load)
if args.cuda:
    lm.cuda()
print(lm.model)

# Load the SMM i-vector extractor from disk; optionally override its number
# of inference iterations from the command line (0/None keeps the stored value).
print("loading SMM iVector extractor ...")
with open(args.ivec_extractor, 'rb') as f:
    ivec_extractor = smm_ivec_extractor.load(f)
if args.ivec_nb_iters:
    ivec_extractor._nb_iters = args.ivec_nb_iters
print(ivec_extractor)

print("preparing data...")
# Factory pairing a tokenized split with "cheating" i-vectors (extracted from
# the data itself rather than predicted).
ivec_app_creator = lambda ts: ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)

print("\ttraining...")

def ivec_ts_from_file(f):
    # Wrap file `f` in a feed-forward tokenized split (temporal slicing driven
    # by the model's input length and the target sequence length), then attach
    # the cheating i-vector appender.
    ts = TokenizedSplitFFBase(
        f, lm.vocab,
        lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len))
    return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)

# Build per-file dataset objects for the train and validation file lists.
train_data_ivecs = filelist_to_objects(args.train_list, ivec_ts_from_file)

print("\tvalidation...")
valid_data_ivecs = filelist_to_objects(args.valid_list, ivec_ts_from_file)
# NOTE(review): this call is truncated in the visible chunk -- the remaining
# BatchBuilder arguments continue beyond this excerpt.
valid_data = BatchBuilder(valid_data_ivecs, args.batch_size,
def setUp(self):
    """Prepare a toy vocabulary, a deterministic fake i-vector extractor,
    and an appender constructor bound to that extractor."""
    self.vocab = {"a": 0, "b": 1, "c": 2}

    def fake_extractor(text):
        # hash-based pseudo i-vector: a single float32 in [0, 1337)
        values = np.asarray([hash(text) % 1337], dtype=np.float32)
        return torch.from_numpy(values)

    self.ivec_eetor = fake_extractor
    self.ivec_app_ctor = lambda ts: ivec_appenders.CheatingIvecAppender(
        ts, self.ivec_eetor)