Example #1
def ivec_ts_from_file(f):
    da_ts = DomainAdaptationSplitFFMultiTarget(
        f,
        lm.vocab,
        lm.model.in_len,
        args.target_seq_len,
        end_portion=args.domain_portion,
    )
    return ivec_appenders.CheatingIvecAppender(da_ts, ivec_extractor)
    def test_iter_ends(self):
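        # the short word stream yields exactly three items; a fourth next() is expected to raise StopIteration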
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
        appender = iter(appender)

        next(appender)
        next(appender)
        next(appender)

        self.assertRaises(StopIteration, next, appender)
    def test_single_data(self):
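        # position 2 of a yielded tuple holds the i-vector; it should equal the extractor output on the whole text except the last word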
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)

        # cannot access ts._tokens, it's an implementation detail
        tokens = [self.vocab[w] for w in self.test_words_short]

        expectation = self.ivec_eetor(" ".join(self.test_words_short[:-1]))
        seqs = next(iter(appender))
        first = seqs[2]

        self.assertEqual(first, expectation)
    def test_whole_seq_with_next(self):
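        # every yielded item should carry the same i-vector, computed from the full text except the last word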
        data_source = getStream(self.test_words_short)
        ts = split_corpus_dataset.TokenizedSplit(data_source, self.vocab, 1)
        appender = ivec_appenders.CheatingIvecAppender(ts, self.ivec_eetor)
        appender = iter(appender)

        # cannot access ts._tokens, it's an implementation detail
        tokens = [self.vocab[w] for w in self.test_words_short]
        expectation = [
            self.ivec_eetor(" ".join(self.test_words_short[:-1])),
            self.ivec_eetor(" ".join(self.test_words_short[:-1])),
            self.ivec_eetor(" ".join(self.test_words_short[:-1]))
        ]

        seq0 = next(appender)[2]
        self.assertEqual(seq0, expectation[0])

        seq1 = next(appender)[2]
        self.assertEqual(seq1, expectation[1])

        seq2 = next(appender)[2]
        self.assertEqual(seq2, expectation[2])
Example #5
def ivec_ts_from_file(f):
    ts = TokenizedSplitFFBase(
        f, lm.vocab,
        lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len))
    return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)
Example #6
    print("loading LSTM model...")
    lm = torch.load(args.load)
    if args.cuda:
        lm.cuda()
    print(lm.model)

    print("loading SMM iVector extractor ...")
    with open(args.ivec_extractor, 'rb') as f:
        ivec_extractor = smm_ivec_extractor.load(f)
    if args.ivec_nb_iters:
        ivec_extractor._nb_iters = args.ivec_nb_iters
    print(ivec_extractor)

    print("preparing data...")
    ivec_app_creator = lambda ts: ivec_appenders.CheatingIvecAppender(
        ts, ivec_extractor)

    print("\ttraining...")

    def ivec_ts_from_file(f):
        ts = TokenizedSplitFFBase(
            f, lm.vocab,
            lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len))
        return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)

    train_data_ivecs = filelist_to_objects(args.train_list, ivec_ts_from_file)

    print("\tvalidation...")
    valid_data_ivecs = filelist_to_objects(args.valid_list, ivec_ts_from_file)
    valid_data = BatchBuilder(valid_data_ivecs,
                              args.batch_size,
Example #7
    def setUp(self):
        # toy vocabulary and a deterministic stand-in "extractor" that hashes the text into a single-float tensor
        self.vocab = {"a": 0, "b": 1, "c": 2}
        self.ivec_eetor = lambda x: torch.from_numpy(
            np.asarray([hash(x) % 1337], dtype=np.float32))
        self.ivec_app_ctor = lambda ts: ivec_appenders.CheatingIvecAppender(
            ts, self.ivec_eetor)