def test_from_corpus(self): self.sentences1_file.seek(0) vocabulary = Vocabulary.from_corpus([self.sentences1_file]) self.assertEqual(vocabulary.num_words(), 10 + 3) self.assertEqual(vocabulary.num_classes(), 10 + 3) self.sentences1_file.seek(0) self.sentences2_file.seek(0) vocabulary = Vocabulary.from_corpus([self.sentences1_file, self.sentences2_file], 3) self.assertEqual(vocabulary.num_words(), 10 + 3) self.assertEqual(vocabulary.num_classes(), 3 + 3) sos_id = vocabulary.word_to_id['<s>'] eos_id = vocabulary.word_to_id['</s>'] unk_id = vocabulary.word_to_id['<unk>'] self.assertEqual(sos_id, 10) self.assertEqual(eos_id, 11) self.assertEqual(unk_id, 12) self.assertEqual(vocabulary.word_id_to_class_id[sos_id], 3) self.assertEqual(vocabulary.word_id_to_class_id[eos_id], 4) self.assertEqual(vocabulary.word_id_to_class_id[unk_id], 5) word_ids = set() class_ids = set() for word in vocabulary.words(): if not word.startswith('<'): word_id = vocabulary.word_to_id[word] word_ids.add(word_id) class_ids.add(vocabulary.word_id_to_class_id[word_id]) self.assertEqual(word_ids, set(range(10))) self.assertEqual(class_ids, set(range(3)))
def test_from_state(self): self.classes_file.seek(0) vocabulary1 = Vocabulary.from_file(self.classes_file, 'srilm-classes') f = h5py.File('in-memory.h5', driver='core', backing_store=False) vocabulary1.get_state(f) vocabulary2 = Vocabulary.from_state(f) self.assertTrue(numpy.array_equal(vocabulary1.id_to_word, vocabulary2.id_to_word)) self.assertDictEqual(vocabulary1.word_to_id, vocabulary2.word_to_id) self.assertTrue(numpy.array_equal(vocabulary1.word_id_to_class_id, vocabulary2.word_id_to_class_id)) self.assertListEqual(list(vocabulary1._word_classes), list(vocabulary2._word_classes))
def setUp(self): script_path = os.path.dirname(os.path.realpath(__file__)) vocabulary_path = os.path.join(script_path, 'vocabulary.txt') with open(vocabulary_path) as vocabulary_file: self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words') self.sos_id = self.vocabulary.word_to_id['<s>'] self.yksi_id = self.vocabulary.word_to_id['yksi'] self.kaksi_id = self.vocabulary.word_to_id['kaksi'] self.eos_id = self.vocabulary.word_to_id['</s>'] projection_vector = tensor.zeros(shape=(self.vocabulary.num_words(),), dtype=theano.config.floatX) self.sos_prob = 0.1 projection_vector = tensor.set_subtensor(projection_vector[self.sos_id], self.sos_prob) self.yksi_prob = 0.2 projection_vector = tensor.set_subtensor(projection_vector[self.yksi_id], self.yksi_prob) self.kaksi_prob = 0.3 projection_vector = tensor.set_subtensor(projection_vector[self.kaksi_id], self.kaksi_prob) self.eos_prob = 0.4 projection_vector = tensor.set_subtensor(projection_vector[self.eos_id], self.eos_prob) self.network = DummyNetwork(self.vocabulary, projection_vector) lattice_path = os.path.join(script_path, 'lattice.slf') with open(lattice_path) as lattice_file: self.lattice = SLFLattice(lattice_file)
def from_file(cls, model_path, mode=None, exclude_unk=False):
    """Reads a model from an HDF5 file.

    :type model_path: str
    :param model_path: path to a HDF5 model file

    :type mode: Network.Mode
    :param mode: selects mini-batch or single time step processing

    :type exclude_unk: bool
    :param exclude_unk: if set to ``True``, sets ``<unk>`` probability to zero
    """

    with h5py.File(model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of words in shortlist:",
              vocabulary.num_shortlist_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        result = cls(architecture, vocabulary, mode=mode,
                     exclude_unk=exclude_unk)
        print("Restoring neural network state.")
        sys.stdout.flush()
        result.set_state(state)
        return result
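# --- Usage sketch (added for illustration) ---
# A minimal, hedged example of calling the from_file() classmethod above. The
# import path and the model file name are assumptions, not taken from this
# code; Network.Mode(minibatch=False) mirrors how the mode argument is used
# elsewhere in these functions for single time step processing.
from theanolm import Network  # assumed import path

network = Network.from_file('model.h5',  # hypothetical model file
                            mode=Network.Mode(minibatch=False),
                            exclude_unk=True)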
def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)

    print("Building text sampler.")
    sys.stdout.flush()
    sampler = TextSampler(network)

    sequences = sampler.generate(30, args.num_sentences)
    for sequence in sequences:
        try:
            eos_pos = sequence.index('</s>')
            sequence = sequence[:eos_pos + 1]
        except ValueError:
            pass
        args.output_file.write(' '.join(sequence) + '\n')
def setUp(self): script_path = os.path.dirname(os.path.realpath(__file__)) vocabulary_path = os.path.join(script_path, 'vocabulary.txt') oos_words = ['yksitoista', 'kaksitoista'] with open(vocabulary_path) as vocabulary_file: self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words', oos_words=oos_words) word_counts = { 'yksi': 1, 'kaksi': 2, 'kolme': 3, 'neljä': 4, 'viisi': 5, 'kuusi': 6, 'seitsemän': 7, 'kahdeksan': 8, 'yhdeksän': 9, 'kymmenen': 10, '<s>': 11, '</s>': 12, '<unk>': 13, 'yksitoista': 3, 'kaksitoista': 7 } self.vocabulary.compute_probs(word_counts) self.dummy_network = DummyNetwork(self.vocabulary)
def test_get_class_memberships(self): vocabulary = Vocabulary.from_file(self.classes_file, 'srilm-classes') word_ids = numpy.array([vocabulary.word_to_id['yksi'], vocabulary.word_to_id['kaksi'], vocabulary.word_to_id['kolme'], vocabulary.word_to_id['neljä'], vocabulary.word_to_id['viisi'], vocabulary.word_to_id['kuusi'], vocabulary.word_to_id['seitsemän'], vocabulary.word_to_id['kahdeksan'], vocabulary.word_to_id['yhdeksän'], vocabulary.word_to_id['kymmenen']]) class_ids, probs = vocabulary.get_class_memberships(word_ids) assert_equal(class_ids, vocabulary.word_id_to_class_id[word_ids]) assert_equal(class_ids[3], vocabulary.word_id_to_class_id[word_ids[3]]) assert_almost_equal(probs, [1.0, 1.0, 0.599 / (0.599 + 0.400), 0.400 / (0.599 + 0.400), 1.0, 0.281 / (0.281 + 0.226 + 0.262 + 0.228), 0.226 / (0.281 + 0.226 + 0.262 + 0.228), 0.262 / (0.281 + 0.226 + 0.262 + 0.228), 0.228 / (0.281 + 0.226 + 0.262 + 0.228), 1.0])
def setUp(self): script_path = os.path.dirname(os.path.realpath(__file__)) sentences1_path = os.path.join(script_path, 'sentences1.txt') sentences2_path = os.path.join(script_path, 'sentences2.txt') sentences3_path = os.path.join(script_path, 'sentences3.txt') vocabulary_path = os.path.join(script_path, 'vocabulary.txt') self.sentences1_file = open(sentences1_path) self.sentences2_file = open(sentences2_path) self.sentences3_file = open(sentences3_path) self.vocabulary_file = open(vocabulary_path) self.vocabulary = Vocabulary.from_file(self.vocabulary_file, 'words') self.vocabulary_file.seek(0) self.shortlist_vocabulary = \ Vocabulary.from_file(self.vocabulary_file, 'words', oos_words=['yksitoista'])
def test_bigram_statistics(self): self.sentences_file.seek(0) word_counts = compute_word_counts([self.sentences_file]) self.vocabulary = Vocabulary.from_word_counts(word_counts) self.sentences_file.seek(0) statistics = BigramStatistics([self.sentences_file], self.vocabulary) unigram_counts = statistics.unigram_counts vocabulary = self.vocabulary self.assertEqual(unigram_counts[vocabulary.word_to_id['a']], 13) self.assertEqual(unigram_counts[vocabulary.word_to_id['b']], 8) self.assertEqual(unigram_counts[vocabulary.word_to_id['c']], 8) self.assertEqual(unigram_counts[vocabulary.word_to_id['d']], 11) self.assertEqual(unigram_counts[vocabulary.word_to_id['e']], 15) self.assertEqual(unigram_counts[vocabulary.word_to_id['<unk>']], 0) self.assertEqual(unigram_counts[vocabulary.word_to_id['<s>']], 11) self.assertEqual(unigram_counts[vocabulary.word_to_id['</s>']], 11) bigram_counts = statistics.bigram_counts vocabulary = self.vocabulary a_id = vocabulary.word_to_id['a'] b_id = vocabulary.word_to_id['b'] self.assertEqual(bigram_counts[a_id, a_id], 3) self.assertEqual(bigram_counts[a_id, b_id], 2) self.assertEqual(bigram_counts[b_id, a_id], 1) self.assertEqual(bigram_counts[b_id, b_id], 0)
def setUp(self): script_path = os.path.dirname(os.path.realpath(__file__)) vocabulary_path = os.path.join(script_path, 'vocabulary.txt') with open(vocabulary_path) as vocabulary_file: self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words') self.sos_id = self.vocabulary.word_to_id['<s>'] self.yksi_id = self.vocabulary.word_to_id['yksi'] self.kaksi_id = self.vocabulary.word_to_id['kaksi'] self.eos_id = self.vocabulary.word_to_id['</s>'] projection_vector = tensor.zeros(shape=(self.vocabulary.num_words(), ), dtype=theano.config.floatX) self.sos_prob = 0.1 projection_vector = tensor.set_subtensor( projection_vector[self.sos_id], self.sos_prob) self.yksi_prob = 0.2 projection_vector = tensor.set_subtensor( projection_vector[self.yksi_id], self.yksi_prob) self.kaksi_prob = 0.3 projection_vector = tensor.set_subtensor( projection_vector[self.kaksi_id], self.kaksi_prob) self.eos_prob = 0.4 projection_vector = tensor.set_subtensor( projection_vector[self.eos_id], self.eos_prob) self.network = DummyNetwork(self.vocabulary, projection_vector) lattice_path = os.path.join(script_path, 'lattice.slf') with open(lattice_path) as lattice_file: self.lattice = SLFLattice(lattice_file)
def sample(args): numpy.random.seed(args.random_seed) if args.debug: theano.config.compute_test_value = 'warn' else: theano.config.compute_test_value = 'off' with h5py.File(args.model_path, 'r') as state: print("Reading vocabulary from network state.") sys.stdout.flush() vocabulary = Vocabulary.from_state(state) print("Number of words in vocabulary:", vocabulary.num_words()) print("Number of word classes:", vocabulary.num_classes()) print("Building neural network.") sys.stdout.flush() architecture = Architecture.from_state(state) network = Network(vocabulary, architecture, predict_next_distribution=True) print("Restoring neural network state.") network.set_state(state) print("Building text sampler.") sys.stdout.flush() sampler = TextSampler(network) for i in range(args.num_sentences): words = sampler.generate() args.output_file.write('{}: {}\n'.format( i, ' '.join(words)))
def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary, architecture,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)

    print("Building text sampler.")
    sys.stdout.flush()
    sampler = TextSampler(network)

    sequences = sampler.generate(30, args.num_sentences)
    for sequence in sequences:
        try:
            eos_pos = sequence.index('</s>')
            sequence = sequence[:eos_pos + 1]
        except ValueError:
            pass
        args.output_file.write(' '.join(sequence) + '\n')
def setUp(self): script_path = os.path.dirname(os.path.realpath(__file__)) sentences_path = os.path.join(script_path, 'sentences.txt') with open(sentences_path) as sentences_file: self.vocabulary = Vocabulary.from_corpus(sentences_file) sentences_file.seek(0) self.statistics = WordStatistics([sentences_file], self.vocabulary)
def test_compute_probs(self): self.classes_file.seek(0) vocabulary = Vocabulary.from_file(self.classes_file, 'srilm-classes') vocabulary.compute_probs([self.sentences1_file, self.sentences2_file]) word_id = vocabulary.word_to_id['yksi'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 1.0) word_id = vocabulary.word_to_id['kaksi'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 1.0) word_id = vocabulary.word_to_id['kolme'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.5) word_id = vocabulary.word_to_id['neljä'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.5) word_id = vocabulary.word_to_id['viisi'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 1.0) word_id = vocabulary.word_to_id['kuusi'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.25) word_id = vocabulary.word_to_id['seitsemän'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.25) word_id = vocabulary.word_to_id['kahdeksan'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.25) word_id = vocabulary.word_to_id['yhdeksän'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.25) word_id = vocabulary.word_to_id['kymmenen'] self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 1.0)
def setUp(self): theano.config.compute_test_value = 'warn' script_path = os.path.dirname(os.path.realpath(__file__)) vocabulary_path = os.path.join(script_path, 'vocabulary.txt') with open(vocabulary_path) as vocabulary_file: self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words') self.dummy_network = DummyNetwork(self.vocabulary)
def setUp(self): self.maxDiff = None script_path = os.path.dirname(os.path.realpath(__file__)) vocabulary_path = os.path.join(script_path, 'vocabulary.txt') with open(vocabulary_path) as vocabulary_file: self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words', oos_words=['oos1', 'oos2']) self.vocabulary.compute_probs({ 'yksi': 1, 'kaksi': 1, 'kolme': 1, 'neljä': 1, 'viisi': 1, 'kuusi': 1, 'seitsemän': 1, 'kahdeksan': 1, 'yhdeksän': 1, 'kymmenen': 1, 'oos1': 1, 'oos2': 2 }) self.sos_id = self.vocabulary.word_to_id['<s>'] self.yksi_id = self.vocabulary.word_to_id['yksi'] self.kaksi_id = self.vocabulary.word_to_id['kaksi'] self.eos_id = self.vocabulary.word_to_id['</s>'] self.unk_id = self.vocabulary.word_to_id['<unk>'] self.oos1_id = self.vocabulary.word_to_id['oos1'] self.oos2_id = self.vocabulary.word_to_id['oos2'] projection_vector = tensor.zeros( shape=(self.vocabulary.num_shortlist_words(), ), dtype=theano.config.floatX) self.sos_prob = 0.1 projection_vector = tensor.set_subtensor( projection_vector[self.sos_id], self.sos_prob) self.yksi_prob = 0.2 projection_vector = tensor.set_subtensor( projection_vector[self.yksi_id], self.yksi_prob) self.kaksi_prob = 0.3 projection_vector = tensor.set_subtensor( projection_vector[self.kaksi_id], self.kaksi_prob) self.eos_prob = 0.4 projection_vector = tensor.set_subtensor( projection_vector[self.eos_id], self.eos_prob) self.unk_prob = 0.3 projection_vector = tensor.set_subtensor( projection_vector[self.unk_id], self.unk_prob) self.network = DummyNetwork(self.vocabulary, projection_vector) lattice_path = os.path.join(script_path, 'lattice.slf') with open(lattice_path) as lattice_file: self.lattice = SLFLattice(lattice_file)
def restoreModel(path): with h5py.File(path, 'r') as state: print("Reading vocabulary from network state.") sys.stdout.flush() vocabulary = Vocabulary.from_state(state) print("Number of words in vocabulary:", vocabulary.num_words()) print("Number of words in shortlist:", vocabulary.num_shortlist_words()) print("Number of word classes:", vocabulary.num_classes()) print("Building neural network.") sys.stdout.flush() architecture = Architecture.from_state(state) network = Network(architecture, vocabulary, mode=Network.Mode(minibatch=False)) print("Restoring neural network state.") network.set_state(state) return network
def test_class_ids(self): self.classes_file.seek(0) vocabulary = Vocabulary.from_file(self.classes_file, 'srilm-classes') word_id = vocabulary.word_to_id['yksi'] yksi_class_id = vocabulary.word_id_to_class_id[word_id] word_id = vocabulary.word_to_id['kaksi'] kaksi_class_id = vocabulary.word_id_to_class_id[word_id] word_id = vocabulary.word_to_id['kolme'] kolme_class_id = vocabulary.word_id_to_class_id[word_id] word_id = vocabulary.word_to_id['neljä'] nelja_class_id = vocabulary.word_id_to_class_id[word_id] word_id = vocabulary.word_to_id['</s>'] eos_class_id = vocabulary.word_id_to_class_id[word_id] self.assertNotEqual(yksi_class_id, kaksi_class_id) self.assertEqual(kolme_class_id, nelja_class_id) self.assertNotEqual(kolme_class_id, eos_class_id) self.assertEqual(kaksi_class_id, eos_class_id)
def from_file(cls, model_path, mode=None, exclude_unk=False,
              default_device=None):
    """Reads a model from an HDF5 file.

    :type model_path: str
    :param model_path: path to a HDF5 model file

    :type mode: Network.Mode
    :param mode: selects mini-batch or single time step processing

    :type exclude_unk: bool
    :param exclude_unk: if set to ``True``, set ``<unk>`` probability to zero
                        before normalizing the network outputs (required to
                        get exact normalization during inference)

    :type default_device: str
    :param default_device: default device where to store the shared variables
    """

    with h5py.File(model_path, 'r') as state:
        logging.info("Reading vocabulary from network state.")
        #sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        logging.info("Number of words in vocabulary: {}".format(
            vocabulary.num_words()))
        logging.info("Number of words in shortlist: {}".format(
            vocabulary.num_shortlist_words()))
        logging.info("Number of word classes: {}".format(
            vocabulary.num_classes()))
        logging.info("Building neural network.")
        #sys.stdout.flush()
        architecture = Architecture.from_state(state)
        result = cls(architecture, vocabulary, mode=mode,
                     exclude_unk=exclude_unk, default_device=default_device)
        logging.info("Restoring neural network state.")
        result.set_training()
        logging.info("Resetting the neural network to evaluate.")
        result.set_state(state)
        return result
def sample(args): """A function that performs the "theanolm sample" command. :type args: argparse.Namespace :param args: a collection of command line arguments """ numpy.random.seed(args.random_seed) if args.debug: theano.config.compute_test_value = 'warn' else: theano.config.compute_test_value = 'off' with h5py.File(args.model_path, 'r') as state: logging.info("Reading vocabulary from network state.") vocabulary = Vocabulary.from_state(state) logging.info("Number of words in vocabulary: %d", vocabulary.num_words()) logging.info("Number of words in shortlist: %d", vocabulary.num_shortlist_words()) logging.info("Number of word classes: %d", vocabulary.num_classes()) logging.info("Building neural network.") architecture = Architecture.from_state(state) default_device = get_default_device(args.default_device) network = Network(architecture, vocabulary, mode=Network.Mode(minibatch=False), default_device=default_device) logging.info("Restoring neural network state.") network.set_state(state) logging.info("Building text sampler.") sampler = TextSampler(network) sequences = sampler.generate(args.sentence_length, args.num_sentences, seed_sequence=args.seed_sequence) for sequence in sequences: try: eos_pos = sequence.index('</s>') sequence = sequence[:eos_pos + 1] except ValueError: pass args.output_file.write(' '.join(sequence) + '\n')
def score(args): with h5py.File(args.model_path, 'r') as state: print("Reading vocabulary from network state.") sys.stdout.flush() vocabulary = Vocabulary.from_state(state) print("Number of words in vocabulary:", vocabulary.num_words()) print("Number of word classes:", vocabulary.num_classes()) print("Building neural network.") sys.stdout.flush() architecture = Architecture.from_state(state) network = Network(architecture, vocabulary) print("Restoring neural network state.") sys.stdout.flush() network.set_state(state) print("Building text scorer.") sys.stdout.flush() if args.unk_penalty is None: ignore_unk = False unk_penalty = None elif args.unk_penalty == 0: ignore_unk = True unk_penalty = None else: ignore_unk = False unk_penalty = args.unk_penalty scorer = TextScorer(network, ignore_unk, unk_penalty) print("Scoring text.") if args.output == 'perplexity': _score_text(args.input_file, vocabulary, scorer, args.output_file, args.log_base, False) elif args.output == 'word-scores': _score_text(args.input_file, vocabulary, scorer, args.output_file, args.log_base, True) elif args.output == 'utterance-scores': _score_utterances(args.input_file, vocabulary, scorer, args.output_file, args.log_base) else: print("Invalid output format requested:", args.output) sys.exit(1)
def __init__(self, model_path): self.model_path = model_path numpy.random.seed() theano.config.compute_test_value = 'off' with h5py.File(model_path, 'r') as self.state: print("Reading vocabulary from network state.") #sys.stdout.flush() self.vocabulary = Vocabulary.from_state(self.state) print("Number of words in vocabulary:", self.vocabulary.num_words()) print("Number of words in shortlist:", self.vocabulary.num_shortlist_words()) print("Number of word classes:", self.vocabulary.num_classes()) print("Building neural network.") #sys.stdout.flush() self.architecture = Architecture.from_state(self.state) self.network = Network(self.architecture, self.vocabulary, mode=Network.Mode(minibatch=False)) print("Restoring neural network state.") self.network.set_state(self.state) print("Building text sampler.") #sys.stdout.flush() self.sampler = TextSampler(self.network)
def score(args): with h5py.File(args.model_path, 'r') as state: print("Reading vocabulary from network state.") sys.stdout.flush() vocabulary = Vocabulary.from_state(state) print("Number of words in vocabulary:", vocabulary.num_words()) print("Number of word classes:", vocabulary.num_classes()) print("Building neural network.") sys.stdout.flush() architecture = Architecture.from_state(state) network = Network(vocabulary, architecture) print("Restoring neural network state.") sys.stdout.flush() network.set_state(state) print("Building text scorer.") sys.stdout.flush() if args.unk_penalty is None: ignore_unk = False unk_penalty = None elif args.unk_penalty == 0: ignore_unk = True unk_penalty = None else: ignore_unk = False unk_penalty = args.unk_penalty scorer = TextScorer(network, ignore_unk, unk_penalty) print("Scoring text.") if args.output == 'perplexity': _score_text(args.input_file, vocabulary, scorer, args.output_file, args.log_base, False) elif args.output == 'word-scores': _score_text(args.input_file, vocabulary, scorer, args.output_file, args.log_base, True) elif args.output == 'utterance-scores': _score_utterances(args.input_file, vocabulary, scorer, args.output_file, args.log_base)
def test_word_ids_to_names(self): self.classes_file.seek(0) vocabulary = Vocabulary.from_file(self.classes_file, 'srilm-classes') word_ids = [vocabulary.word_to_id['yksi'], vocabulary.word_to_id['kaksi'], vocabulary.word_to_id['kolme'], vocabulary.word_to_id['neljä'], vocabulary.word_to_id['viisi'], vocabulary.word_to_id['kuusi'], vocabulary.word_to_id['seitsemän'], vocabulary.word_to_id['kahdeksan'], vocabulary.word_to_id['yhdeksän'], vocabulary.word_to_id['kymmenen']] names = vocabulary.word_ids_to_names(word_ids) self.assertEqual(names[0], 'yksi') self.assertEqual(names[1], 'kaksi') self.assertTrue(names[2].startswith('CLASS-')) self.assertEqual(names[2], names[3]) self.assertEqual(names[4], 'viisi') self.assertTrue(names[5].startswith('CLASS-')) self.assertEqual(names[5], names[6]) self.assertEqual(names[5], names[7]) self.assertEqual(names[5], names[8]) self.assertEqual(names[9], 'kymmenen')
def decode(args): log_file = args.log_file log_level = getattr(logging, args.log_level.upper(), None) if not isinstance(log_level, int): print("Invalid logging level requested:", args.log_level) sys.exit(1) log_format = '%(asctime)s %(funcName)s: %(message)s' if args.log_file == '-': logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level) else: logging.basicConfig(filename=log_file, format=log_format, level=log_level) if args.debug: theano.config.compute_test_value = 'warn' else: theano.config.compute_test_value = 'off' theano.config.profile = args.profile theano.config.profile_memory = args.profile with h5py.File(args.model_path, 'r') as state: print("Reading vocabulary from network state.") sys.stdout.flush() vocabulary = Vocabulary.from_state(state) print("Number of words in vocabulary:", vocabulary.num_words()) print("Number of word classes:", vocabulary.num_classes()) print("Building neural network.") sys.stdout.flush() architecture = Architecture.from_state(state) network = Network(architecture, vocabulary, mode=Network.Mode(minibatch=False)) print("Restoring neural network state.") sys.stdout.flush() network.set_state(state) log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base) if args.wi_penalty is None: wi_penalty = None else: wi_penalty = args.wi_penalty * log_scale if args.unk_penalty is None: ignore_unk = False unk_penalty = None elif args.unk_penalty == 0: ignore_unk = True unk_penalty = None else: ignore_unk = False unk_penalty = args.unk_penalty decoding_options = { 'nnlm_weight': args.nnlm_weight, 'lm_scale': args.lm_scale, 'wi_penalty': wi_penalty, 'ignore_unk': ignore_unk, 'unk_penalty': unk_penalty, 'linear_interpolation': args.linear_interpolation, 'max_tokens_per_node': args.max_tokens_per_node, 'beam': args.beam, 'recombination_order': args.recombination_order } logging.debug("DECODING OPTIONS") for option_name, option_value in decoding_options.items(): logging.debug("%s: %s", option_name, str(option_value)) print("Building word lattice decoder.") sys.stdout.flush() decoder = LatticeDecoder(network, decoding_options) # Combine paths from command line and lattice list. lattices = args.lattices lattices.extend(args.lattice_list.readlines()) lattices = [path.strip() for path in lattices] # Ignore empty lines in the lattice list. lattices = list(filter(None, lattices)) # Pick every Ith lattice, if --num-jobs is specified and > 1. if args.num_jobs < 1: print("Invalid number of jobs specified:", args.num_jobs) sys.exit(1) if (args.job < 0) or (args.job > args.num_jobs - 1): print("Invalid job specified:", args.job) sys.exit(1) lattices = lattices[args.job::args.num_jobs] file_type = TextFileType('r') for index, path in enumerate(lattices): logging.info("Reading word lattice: %s", path) lattice_file = file_type(path) lattice = SLFLattice(lattice_file) if not lattice.utterance_id is None: utterance_id = lattice.utterance_id else: utterance_id = os.path.basename(lattice_file.name) logging.info("Utterance `%s' -- %d/%d of job %d", utterance_id, index + 1, len(lattices), args.job) tokens = decoder.decode(lattice) for index in range(min(args.n_best, len(tokens))): line = format_token(tokens[index], utterance_id, vocabulary, log_scale, args.output) args.output_file.write(line + "\n")
def train(args):
    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = "%(asctime)s %(funcName)s: %(message)s"
    if args.log_file == "-":
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = "warn"
        print("Enabled computing test values for tensor variables.")
        print("Warning: GpuArray backend will fail random number generation!")
    else:
        theano.config.compute_test_value = "off"
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, "a", driver="core") as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, "rt", encoding="utf-8") as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
            if args.vocabulary_format == "classes":
                print("Computing class membership probabilities from "
                      "unigram word counts.")
                sys.stdout.flush()
                vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        if args.num_noise_samples > vocabulary.num_classes():
            print("Number of noise samples ({}) is larger than the number of "
                  "classes. This doesn't make sense and would cause sampling "
                  "to fail.".format(args.num_noise_samples))
            sys.exit(1)

        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        training_options = {
            "batch_size": args.batch_size,
            "sequence_length": args.sequence_length,
            "validation_frequency": args.validation_frequency,
            "patience": args.patience,
            "stopping_criterion": args.stopping_criterion,
            "max_epochs": args.max_epochs,
            "min_epochs": args.min_epochs,
            "max_annealing_count": args.max_annealing_count,
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        optimization_options = {
            "method": args.optimization_method,
            "epsilon": args.numerical_stability_term,
            "gradient_decay_rate": args.gradient_decay_rate,
            "sqr_gradient_decay_rate": args.sqr_gradient_decay_rate,
            "learning_rate": args.learning_rate,
            "weights": weights,
            "momentum": args.momentum,
            "max_gradient_norm": args.gradient_normalization,
            "cost_function": args.cost,
            "num_noise_samples": args.num_noise_samples,
            "noise_sharing": args.noise_sharing,
            "ignore_unk": ignore_unk,
            "unk_penalty": unk_penalty,
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if type(option_value) is list:
                value_str = ", ".join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)

        print("Creating trainer.")
        sys.stdout.flush()
        trainer = Trainer(training_options, vocabulary, args.training_set,
                          args.sampling)
        trainer.set_logging(args.log_interval)

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture == "lstm300" or args.architecture == "lstm1500":
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, "rt", encoding="utf-8") as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(architecture, vocabulary, trainer.class_prior_probs,
                          args.noise_dampening,
                          default_device=args.default_device,
                          profile=args.profile)

        print("Compiling optimization function.")
        sys.stdout.flush()
        optimizer = create_optimizer(optimization_options, network,
                                     device=args.default_device,
                                     profile=args.profile)

        if args.print_graph:
            print("Cost function computation graph:")
            theano.printing.debugprint(optimizer.gradient_update_function)

        trainer.initialize(network, state, optimizer)

        if args.validation_file is not None:
            print("Building text scorer for cross-validation.")
            sys.stdout.flush()
            scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)
            print("Validation text:", args.validation_file.name)
            validation_mmap = mmap.mmap(args.validation_file.fileno(), 0,
                                        prot=mmap.PROT_READ)
            validation_iter = LinearBatchIterator(
                validation_mmap, vocabulary, batch_size=args.batch_size,
                max_sequence_length=None)
            trainer.set_validation(validation_iter, scorer)
        else:
            print("Cross-validation will not be performed.")
            validation_iter = None

        print("Training neural network.")
        sys.stdout.flush()
        trainer.train()

        if "layers" not in state.keys():
            print("The model has not been trained. No cross-validations were "
                  "performed or training did not improve the model.")
        elif validation_iter is not None:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
def train(args):
    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'a', driver='core') as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
            if args.vocabulary_format == 'classes':
                print("Computing class membership probabilities from "
                      "unigram word counts.")
                sys.stdout.flush()
                vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture == 'lstm300' or args.architecture == 'lstm1500':
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(vocabulary, architecture, profile=args.profile)
        sys.stdout.flush()

        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        print("Building text scorer.")
        scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)

        validation_mmap = mmap.mmap(args.validation_file.fileno(), 0,
                                    prot=mmap.PROT_READ)
        validation_iter = LinearBatchIterator(validation_mmap, vocabulary,
                                              batch_size=args.batch_size,
                                              max_sequence_length=None)

        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'cost_function': args.cost,
            'num_noise_samples': args.num_noise_samples,
            'ignore_unk': ignore_unk,
            'unk_penalty': unk_penalty
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if type(option_value) is list:
                value_str = ', '.join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        training_options = {
            'strategy': args.training_strategy,
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        print("Building neural network trainer.")
        sys.stdout.flush()
        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)
        trainer = create_trainer(training_options, optimization_options,
                                 network, vocabulary, scorer,
                                 args.training_set, args.sampling,
                                 validation_iter, state, args.profile)
        trainer.set_logging(args.log_interval)

        print("Training neural network.")
        sys.stdout.flush()
        trainer.train()

        if 'layers' not in state.keys():
            print("The model has not been trained. No cross-validations were "
                  "performed or training did not improve the model.")
        else:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
def test_decode(self): vocabulary = Vocabulary.from_word_counts({ 'to': 1, 'and': 1, 'it': 1, 'but': 1, 'a.': 1, 'in': 1, 'a': 1, 'at': 1, 'the': 1, "didn't": 1, 'elaborate': 1 }) projection_vector = tensor.ones( shape=(vocabulary.num_shortlist_words(), ), dtype=theano.config.floatX) projection_vector *= 0.05 network = DummyNetwork(vocabulary, projection_vector) decoding_options = { 'nnlm_weight': 0.0, 'lm_scale': None, 'wi_penalty': None, 'unk_penalty': None, 'use_shortlist': False, 'unk_from_lattice': False, 'linear_interpolation': True, 'max_tokens_per_node': None, 'beam': None, 'recombination_order': 20 } decoder = LatticeDecoder(network, decoding_options) tokens = decoder.decode(self.lattice)[0] # Compare tokens to n-best list given by SRILM lattice-tool. log_scale = math.log(10) print() for token in tokens: print(token.ac_logprob / log_scale, token.lat_lm_logprob / log_scale, token.total_logprob / log_scale, ' '.join(token.history_words(vocabulary))) all_paths = [ "<s> it didn't elaborate </s>", "<s> but it didn't elaborate </s>", "<s> the didn't elaborate </s>", "<s> and it didn't elaborate </s>", "<s> e. didn't elaborate </s>", "<s> in it didn't elaborate </s>", "<s> a didn't elaborate </s>", "<s> at it didn't elaborate </s>", "<s> it it didn't elaborate </s>", "<s> to it didn't elaborate </s>", "<s> a. it didn't elaborate </s>", "<s> a it didn't elaborate </s>" ] paths = [' '.join(token.history_words(vocabulary)) for token in tokens] self.assertListEqual(paths, all_paths) token = tokens[0] history = ' '.join(token.history_words(vocabulary)) self.assertAlmostEqual(token.ac_logprob / log_scale, -8686.28, places=2) self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -94.3896, places=2) self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 4) token = tokens[1] self.assertAlmostEqual(token.ac_logprob / log_scale, -8743.96, places=2) self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -111.488, places=2) self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5) token = tokens[-1] self.assertAlmostEqual(token.ac_logprob / log_scale, -8696.26, places=2) self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -178.00, places=2) self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5)
def test_decode(self): vocabulary = Vocabulary.from_word_counts({ 'TO': 1, 'AND': 1, 'IT': 1, 'BUT': 1, 'A.': 1, 'IN': 1, 'A': 1, 'AT': 1, 'THE': 1, 'E.': 1, "DIDN'T": 1, 'ELABORATE': 1 }) projection_vector = tensor.ones(shape=(vocabulary.num_words(), ), dtype=theano.config.floatX) projection_vector *= 0.05 network = DummyNetwork(vocabulary, projection_vector) decoding_options = { 'nnlm_weight': 0.0, 'lm_scale': None, 'wi_penalty': None, 'ignore_unk': False, 'unk_penalty': None, 'linear_interpolation': True, 'max_tokens_per_node': None, 'beam': None, 'recombination_order': None } decoder = LatticeDecoder(network, decoding_options) tokens = decoder.decode(self.lattice) # Compare tokens to n-best list given by SRILM lattice-tool. log_scale = math.log(10) print() for token in tokens: print(token.ac_logprob / log_scale, token.lat_lm_logprob / log_scale, token.total_logprob / log_scale, ' '.join(vocabulary.id_to_word[token.history])) all_paths = [ "<s> IT DIDN'T ELABORATE </s>", "<s> BUT IT DIDN'T ELABORATE </s>", "<s> THE DIDN'T ELABORATE </s>", "<s> AND IT DIDN'T ELABORATE </s>", "<s> E. DIDN'T ELABORATE </s>", "<s> IN IT DIDN'T ELABORATE </s>", "<s> A DIDN'T ELABORATE </s>", "<s> AT IT DIDN'T ELABORATE </s>", "<s> IT IT DIDN'T ELABORATE </s>", "<s> TO IT DIDN'T ELABORATE </s>", "<s> A. IT DIDN'T ELABORATE </s>", "<s> A IT DIDN'T ELABORATE </s>" ] paths = [ ' '.join(vocabulary.id_to_word[token.history]) for token in tokens ] self.assertListEqual(paths, all_paths) token = tokens[0] history = ' '.join(vocabulary.id_to_word[token.history]) self.assertAlmostEqual(token.ac_logprob / log_scale, -8686.28, places=2) self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -94.3896, places=2) self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 4) token = tokens[1] self.assertAlmostEqual(token.ac_logprob / log_scale, -8743.96, places=2) self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -111.488, places=2) self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5) token = tokens[-1] self.assertAlmostEqual(token.ac_logprob / log_scale, -8696.26, places=2) self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -178.00, places=2) self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5)
def train(args):
    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'a', driver='core') as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
            if args.vocabulary_format == 'classes':
                print("Computing class membership probabilities from "
                      "unigram word counts.")
                sys.stdout.flush()
                vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture == 'lstm300' or args.architecture == 'lstm1500':
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(vocabulary, architecture, profile=args.profile)
        sys.stdout.flush()

        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        print("Building text scorer.")
        scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)

        validation_mmap = mmap.mmap(args.validation_file.fileno(), 0,
                                    prot=mmap.PROT_READ)
        validation_iter = LinearBatchIterator(validation_mmap, vocabulary,
                                              batch_size=32)

        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'ignore_unk': ignore_unk,
            'unk_penalty': unk_penalty
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if type(option_value) is list:
                value_str = ', '.join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        training_options = {
            'strategy': args.training_strategy,
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        print("Building neural network trainer.")
        sys.stdout.flush()
        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)
        trainer = create_trainer(training_options, optimization_options,
                                 network, vocabulary, scorer,
                                 args.training_set, args.sampling,
                                 validation_iter, state, args.profile)
        trainer.set_logging(args.log_interval)

        print("Training neural network.")
        sys.stdout.flush()
        trainer.run()

        if not state.keys():
            print("The model has not been trained.")
        else:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
def decode(args): log_file = args.log_file log_level = getattr(logging, args.log_level.upper(), None) if not isinstance(log_level, int): print("Invalid logging level requested:", args.log_level) sys.exit(1) log_format = '%(asctime)s %(funcName)s: %(message)s' if args.log_file == '-': logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level) else: logging.basicConfig(filename=log_file, format=log_format, level=log_level) if args.debug: theano.config.compute_test_value = 'warn' else: theano.config.compute_test_value = 'off' theano.config.profile = args.profile theano.config.profile_memory = args.profile with h5py.File(args.model_path, 'r') as state: print("Reading vocabulary from network state.") sys.stdout.flush() vocabulary = Vocabulary.from_state(state) print("Number of words in vocabulary:", vocabulary.num_words()) print("Number of word classes:", vocabulary.num_classes()) print("Building neural network.") sys.stdout.flush() architecture = Architecture.from_state(state) network = Network(vocabulary, architecture, mode=Network.Mode.target_words) print("Restoring neural network state.") sys.stdout.flush() network.set_state(state) log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base) if args.wi_penalty is None: wi_penalty = None else: wi_penalty = args.wi_penalty * log_scale if args.unk_penalty is None: ignore_unk = False unk_penalty = None elif args.unk_penalty == 0: ignore_unk = True unk_penalty = None else: ignore_unk = False unk_penalty = args.unk_penalty decoding_options = { 'nnlm_weight': args.nnlm_weight, 'lm_scale': args.lm_scale, 'wi_penalty': wi_penalty, 'ignore_unk': ignore_unk, 'unk_penalty': unk_penalty, 'linear_interpolation': args.linear_interpolation, 'max_tokens_per_node': args.max_tokens_per_node, 'beam': args.beam, 'recombination_order': args.recombination_order } logging.debug("DECODING OPTIONS") for option_name, option_value in decoding_options.items(): logging.debug("%s: %s", option_name, str(option_value)) print("Building word lattice decoder.") sys.stdout.flush() decoder = LatticeDecoder(network, decoding_options) # Combine paths from command line and lattice list. lattices = args.lattices lattices.extend(args.lattice_list.readlines()) lattices = [path.strip() for path in lattices] # Ignore empty lines in the lattice list. lattices = list(filter(None, lattices)) # Pick every Ith lattice, if --num-jobs is specified and > 1. if args.num_jobs < 1: print("Invalid number of jobs specified:", args.num_jobs) sys.exit(1) if (args.job < 0) or (args.job > args.num_jobs - 1): print("Invalid job specified:", args.job) sys.exit(1) lattices = lattices[args.job::args.num_jobs] file_type = TextFileType('r') for index, path in enumerate(lattices): logging.info("Reading word lattice: %s", path) lattice_file = file_type(path) lattice = SLFLattice(lattice_file) if not lattice.utterance_id is None: utterance_id = lattice.utterance_id else: utterance_id = os.path.basename(lattice_file.name) logging.info("Utterance `%s' -- %d/%d of job %d", utterance_id, index + 1, len(lattices), args.job) tokens = decoder.decode(lattice) for index in range(min(args.n_best, len(tokens))): line = format_token(tokens[index], utterance_id, vocabulary, log_scale, args.output) args.output_file.write(line + "\n")
def setUp(self): script_path = os.path.dirname(os.path.realpath(__file__)) vocabulary_path = os.path.join(script_path, 'vocabulary.txt') with open(vocabulary_path) as vocabulary_file: self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words') self.dummy_network = DummyNetwork(self.vocabulary)
def _read_vocabulary(args, state): """If ``state`` contains data, reads the vocabulary from the HDF5 state. Otherwise reads a vocabulary file or constructs the vocabulary from the training set and writes it to the HDF5 state. If the state does not contain data and --vocabulary argument is given, reads the vocabulary from the file given after the argument. The rest of the words in the training set will be added as out-of-shortlist words. If the state does not contain data and no vocabulary is given, constructs a vocabulary that contains all the training set words. In that case, --num-classes argument can be used to control the number of classes. :type args: argparse.Namespace :param args: a collection of command line arguments :type state: hdf5.File :param state: HDF5 file where the vocabulary should be saved :rtype: Vocabulary :returns: the created vocabulary """ if state.keys(): print("Reading vocabulary from existing network state.") sys.stdout.flush() result = Vocabulary.from_state(state) if not result.has_unigram_probs(): # This is for backward compatibility. Remove at some point. print("Computing unigram word probabilities from training set.") sys.stdout.flush() word_counts = compute_word_counts(args.training_set) shortlist_words = list(result.id_to_word) shortlist_set = set(shortlist_words) oos_words = [ x for x in word_counts.keys() if x not in shortlist_set ] result.id_to_word = numpy.asarray(shortlist_words + oos_words, dtype=object) result.word_to_id = { word: word_id for word_id, word in enumerate(result.id_to_word) } result.compute_probs(word_counts, update_class_probs=False) result.get_state(state) elif args.vocabulary is None: print("Constructing vocabulary from training set.") sys.stdout.flush() word_counts = compute_word_counts(args.training_set) result = Vocabulary.from_word_counts(word_counts, args.num_classes) result.get_state(state) else: print("Reading vocabulary from {}.".format(args.vocabulary)) sys.stdout.flush() word_counts = compute_word_counts(args.training_set) oos_words = word_counts.keys() with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file: result = Vocabulary.from_file(vocab_file, args.vocabulary_format, oos_words=oos_words) if args.vocabulary_format == 'classes': print("Computing class membership probabilities and unigram " "probabilities for out-of-shortlist words.") sys.stdout.flush() update_class_probs = True else: print( "Computing unigram probabilities for out-of-shortlist words.") sys.stdout.flush() update_class_probs = False result.compute_probs(word_counts, update_class_probs=update_class_probs) result.get_state(state) print("Number of words in vocabulary:", result.num_words()) print("Number of words in shortlist:", result.num_shortlist_words()) print("Number of word classes:", result.num_classes()) return result
def test_decode(self): vocabulary = Vocabulary.from_word_counts({ 'TO': 1, 'AND': 1, 'IT': 1, 'BUT': 1, 'A.': 1, 'IN': 1, 'A': 1, 'AT': 1, 'THE': 1, 'E.': 1, "DIDN'T": 1, 'ELABORATE': 1}) projection_vector = tensor.ones(shape=(vocabulary.num_words(),), dtype=theano.config.floatX) projection_vector *= 0.05 network = DummyNetwork(vocabulary, projection_vector) decoding_options = { 'nnlm_weight': 0.0, 'lm_scale': None, 'wi_penalty': None, 'ignore_unk': False, 'unk_penalty': None, 'linear_interpolation': True, 'max_tokens_per_node': None, 'beam': None, 'recombination_order': None } decoder = LatticeDecoder(network, decoding_options) tokens = decoder.decode(self.lattice) # Compare tokens to n-best list given by SRILM lattice-tool. log_scale = math.log(10) print() for token in tokens: print(token.ac_logprob / log_scale, token.lat_lm_logprob / log_scale, token.total_logprob / log_scale, ' '.join(vocabulary.id_to_word[token.history])) all_paths = ["<s> IT DIDN'T ELABORATE </s>", "<s> BUT IT DIDN'T ELABORATE </s>", "<s> THE DIDN'T ELABORATE </s>", "<s> AND IT DIDN'T ELABORATE </s>", "<s> E. DIDN'T ELABORATE </s>", "<s> IN IT DIDN'T ELABORATE </s>", "<s> A DIDN'T ELABORATE </s>", "<s> AT IT DIDN'T ELABORATE </s>", "<s> IT IT DIDN'T ELABORATE </s>", "<s> TO IT DIDN'T ELABORATE </s>", "<s> A. IT DIDN'T ELABORATE </s>", "<s> A IT DIDN'T ELABORATE </s>"] paths = [' '.join(vocabulary.id_to_word[token.history]) for token in tokens] self.assertListEqual(paths, all_paths) token = tokens[0] history = ' '.join(vocabulary.id_to_word[token.history]) self.assertAlmostEqual(token.ac_logprob / log_scale, -8686.28, places=2) self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -94.3896, places=2) self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 4) token = tokens[1] self.assertAlmostEqual(token.ac_logprob / log_scale, -8743.96, places=2) self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -111.488, places=2) self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5) token = tokens[-1] self.assertAlmostEqual(token.ac_logprob / log_scale, -8696.26, places=2) self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -178.00, places=2) self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5)
def main():
    parser = argparse.ArgumentParser(prog='wctool')

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        '--training-set', metavar='FILE', type=TextFileType('r'), nargs='+',
        required=True,
        help='text or .gz files containing training data (one sentence per '
             'line)')
    argument_group.add_argument(
        '--vocabulary', metavar='FILE', type=TextFileType('r'), default=None,
        help='text or .gz file containing a list of words to include in class '
             'forming, and possibly their initial classes')
    argument_group.add_argument(
        '--vocabulary-format', metavar='FORMAT', type=str, default='words',
        help='vocabulary format, one of "words" (one word per line, default), '
             '"classes" (word and class ID per line), "srilm-classes" (class '
             'name, membership probability, and word per line)')
    argument_group.add_argument(
        '--output-file', metavar='FILE', type=TextFileType('w'), default='-',
        help='where to write the word classes (default stdout)')
    argument_group.add_argument(
        '--output-format', metavar='FORMAT', type=str, default='srilm-classes',
        help='format of the output file, one of "classes" (word and class ID '
             'per line), "srilm-classes" (default; class name, membership '
             'probability, and word per line)')
    argument_group.add_argument(
        '--output-frequency', metavar='N', type=int, default=1,
        help='save classes N times per optimization iteration (default 1)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--num-classes', metavar='N', type=int, default=2000,
        help='number of classes to form, if vocabulary is not specified '
             '(default 2000)')
    argument_group.add_argument(
        '--method', metavar='NAME', type=str, default='bigram-theano',
        help='method for creating word classes, one of "bigram-theano", '
             '"bigram-numpy" (default "bigram-theano")')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file', metavar='FILE', type=str, default='-',
        help='path where to write log file (default is standard output)')
    argument_group.add_argument(
        '--log-level', metavar='LEVEL', type=str, default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
             '(default "info")')
    argument_group.add_argument(
        '--log-interval', metavar='N', type=int, default=1000,
        help='print statistics after every Nth word; quiet if less than one '
             '(default 1000)')

    args = parser.parse_args()

    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=args.log_file, format=log_format,
                            level=log_level)

    if args.vocabulary is None:
        vocabulary = Vocabulary.from_corpus(args.training_set,
                                            args.num_classes)
        # Building the vocabulary consumes the training files, so rewind them
        # before collecting the bigram statistics below.
        for subset_file in args.training_set:
            subset_file.seek(0)
    else:
        vocabulary = Vocabulary.from_file(args.vocabulary,
                                          args.vocabulary_format)

    print("Number of words in vocabulary:", vocabulary.num_words())
    print("Number of word classes:", vocabulary.num_classes())
    print("Number of normal word classes:", vocabulary.num_normal_classes)

    logging.info("Reading word unigram and bigram statistics.")
    statistics = WordStatistics(args.training_set, vocabulary)

    if args.method == 'bigram-theano':
        optimizer = TheanoBigramOptimizer(statistics, vocabulary)
    elif args.method == 'bigram-numpy':
        optimizer = NumpyBigramOptimizer(statistics, vocabulary)
    else:
        raise ValueError("Invalid method requested: " + args.method)

    iteration = 1
    while True:
        logging.info("Starting iteration %d.", iteration)
        num_words = 0
        num_moves = 0
        for word in vocabulary.words():
            start_time = time()
            num_words += 1
            if optimizer.move_to_best_class(word):
                num_moves += 1
            duration = time() - start_time
            if (args.log_interval >= 1) and \
               (num_words % args.log_interval == 0):
                logging.info(
                    "[%d] (%.1f %%) of iteration %d -- moves = %d, "
                    "cost = %.2f, duration = %.1f ms",
                    num_words,
                    num_words / vocabulary.num_words() * 100,
                    iteration,
                    num_moves,
                    optimizer.log_likelihood(),
                    duration * 1000)
            if is_scheduled(num_words, args.output_frequency,
                            vocabulary.num_words()):
                save(optimizer, args.output_file, args.output_format)
        if num_moves == 0:
            break
        iteration += 1

    logging.info("Optimization finished.")
    save(optimizer, args.output_file, args.output_format)
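# main() relies on two helpers that are outside this excerpt, is_scheduled()
# and save(). The sketch below is one plausible reading of is_scheduled(),
# assuming it fires whenever the word counter crosses one of
# --output-frequency evenly spaced checkpoints within the vocabulary, so the
# classes are written that many times per iteration. This is an assumption
# for illustration, not the project's actual helper.
def is_scheduled(num_words, frequency, total_words):
    """Return True when ``num_words`` crosses one of ``frequency`` evenly
    spaced checkpoints within ``total_words`` items."""
    if frequency < 1:
        return False
    return (num_words * frequency) // total_words != \
           ((num_words - 1) * frequency) // total_words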
def test_from_file(self):
    self.vocabulary_file.seek(0)
    vocabulary = Vocabulary.from_file(self.vocabulary_file, 'words')
    self.assertEqual(vocabulary.num_words(), 10 + 3)
    self.assertEqual(vocabulary.num_classes(), 10 + 3)