    def test_from_corpus(self):
        self.sentences1_file.seek(0)
        vocabulary = Vocabulary.from_corpus([self.sentences1_file])
        self.assertEqual(vocabulary.num_words(), 10 + 3)
        self.assertEqual(vocabulary.num_classes(), 10 + 3)

        self.sentences1_file.seek(0)
        self.sentences2_file.seek(0)
        vocabulary = Vocabulary.from_corpus([self.sentences1_file,
                                             self.sentences2_file],
                                            3)
        self.assertEqual(vocabulary.num_words(), 10 + 3)
        self.assertEqual(vocabulary.num_classes(), 3 + 3)

        sos_id = vocabulary.word_to_id['<s>']
        eos_id = vocabulary.word_to_id['</s>']
        unk_id = vocabulary.word_to_id['<unk>']
        self.assertEqual(sos_id, 10)
        self.assertEqual(eos_id, 11)
        self.assertEqual(unk_id, 12)
        self.assertEqual(vocabulary.word_id_to_class_id[sos_id], 3)
        self.assertEqual(vocabulary.word_id_to_class_id[eos_id], 4)
        self.assertEqual(vocabulary.word_id_to_class_id[unk_id], 5)
        word_ids = set()
        class_ids = set()
        for word in vocabulary.words():
            if not word.startswith('<'):
                word_id = vocabulary.word_to_id[word]
                word_ids.add(word_id)
                class_ids.add(vocabulary.word_id_to_class_id[word_id])
        self.assertEqual(word_ids, set(range(10)))
        self.assertEqual(class_ids, set(range(3)))
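A minimal sketch of the pattern this test exercises, using an in-memory file; it assumes the theanolm Vocabulary API as used throughout these examples, and the io.StringIO corpus is illustrative only:

import io
from theanolm import Vocabulary

corpus = io.StringIO("yksi kaksi kolme\nneljä viisi kuusi\n")
# With no class count given, every word becomes its own class.
vocabulary = Vocabulary.from_corpus([corpus])
# The special tokens <s>, </s> and <unk> are appended after the corpus
# words, which is where the "+ 3" in the assertions above comes from.
print(vocabulary.num_words())    # 6 corpus words + 3 special tokens
print(vocabulary.num_classes())  # same, since each word is its own class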
Example #3
 def test_from_state(self):
     self.classes_file.seek(0)
     vocabulary1 = Vocabulary.from_file(self.classes_file, 'srilm-classes')
     f = h5py.File('in-memory.h5', driver='core', backing_store=False)
     vocabulary1.get_state(f)
     vocabulary2 = Vocabulary.from_state(f)
     self.assertTrue(numpy.array_equal(vocabulary1.id_to_word, vocabulary2.id_to_word))
     self.assertDictEqual(vocabulary1.word_to_id, vocabulary2.word_to_id)
     self.assertTrue(numpy.array_equal(vocabulary1.word_id_to_class_id, vocabulary2.word_id_to_class_id))
     self.assertListEqual(list(vocabulary1._word_classes),
                          list(vocabulary2._word_classes))
Example #6
    def from_file(cls, model_path, mode=None, exclude_unk=False):
        """Reads a model from an HDF5 file.

        :type model_path: str
        :param model_path: path to a HDF5 model file

        :type mode: Network.Mode
        :param mode: selects mini-batch or single time step processing

        :type exclude_unk: bool
        :param exclude_unk: if set to ``True``, sets ``<unk>`` probability to
                            zero.
        """

        with h5py.File(model_path, 'r') as state:
            print("Reading vocabulary from network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
            print("Number of words in vocabulary:", vocabulary.num_words())
            print("Number of words in shortlist:",
                  vocabulary.num_shortlist_words())
            print("Number of word classes:", vocabulary.num_classes())
            print("Building neural network.")
            sys.stdout.flush()
            architecture = Architecture.from_state(state)
            result = cls(architecture,
                         vocabulary,
                         mode=mode,
                         exclude_unk=exclude_unk)
            print("Restoring neural network state.")
            sys.stdout.flush()
            result.set_state(state)
            return result
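A hypothetical call site for the classmethod above; 'model.h5' is a placeholder path, and Mode(minibatch=False) selects single time step processing as the docstring describes:

# Sketch only: the path and keyword values are assumptions.
network = Network.from_file('model.h5',
                            mode=Network.Mode(minibatch=False),
                            exclude_unk=True)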
Example #7
def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary, mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)

    print("Building text sampler.")
    sys.stdout.flush()
    sampler = TextSampler(network)

    sequences = sampler.generate(30, args.num_sentences)
    for sequence in sequences:
        try:
            eos_pos = sequence.index('</s>')
            sequence = sequence[:eos_pos + 1]
        except ValueError:
            pass
        args.output_file.write(' '.join(sequence) + '\n')
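An illustration, with assumed data, of the trimming step above: everything up to and including the first '</s>' is kept, and a sequence without an end token passes through unchanged:

# Hypothetical generated sequence for illustration.
sequence = ['<s>', 'yksi', 'kaksi', '</s>', 'kolme']
eos_pos = sequence.index('</s>')
print(sequence[:eos_pos + 1])  # ['<s>', 'yksi', 'kaksi', '</s>']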
Example #8
 def setUp(self):
     script_path = os.path.dirname(os.path.realpath(__file__))
     vocabulary_path = os.path.join(script_path, 'vocabulary.txt')
     oos_words = ['yksitoista', 'kaksitoista']
     with open(vocabulary_path) as vocabulary_file:
         self.vocabulary = Vocabulary.from_file(vocabulary_file,
                                                'words',
                                                oos_words=oos_words)
     word_counts = {
         'yksi': 1,
         'kaksi': 2,
         'kolme': 3,
         'neljä': 4,
         'viisi': 5,
         'kuusi': 6,
         'seitsemän': 7,
         'kahdeksan': 8,
         'yhdeksän': 9,
         'kymmenen': 10,
         '<s>': 11,
         '</s>': 12,
         '<unk>': 13,
         'yksitoista': 3,
         'kaksitoista': 7
     }
     self.vocabulary.compute_probs(word_counts)
     self.dummy_network = DummyNetwork(self.vocabulary)
Example #9
 def test_get_class_memberships(self):
     vocabulary = Vocabulary.from_file(self.classes_file, 'srilm-classes')
     word_ids = numpy.array([vocabulary.word_to_id['yksi'],
                             vocabulary.word_to_id['kaksi'],
                             vocabulary.word_to_id['kolme'],
                             vocabulary.word_to_id['neljä'],
                             vocabulary.word_to_id['viisi'],
                             vocabulary.word_to_id['kuusi'],
                             vocabulary.word_to_id['seitsemän'],
                             vocabulary.word_to_id['kahdeksan'],
                             vocabulary.word_to_id['yhdeksän'],
                             vocabulary.word_to_id['kymmenen']])
     class_ids, probs = vocabulary.get_class_memberships(word_ids)
     assert_equal(class_ids, vocabulary.word_id_to_class_id[word_ids])
     assert_equal(class_ids[3], vocabulary.word_id_to_class_id[word_ids[3]])
     assert_almost_equal(probs, [1.0,
                                 1.0,
                                 0.599 / (0.599 + 0.400),
                                 0.400 / (0.599 + 0.400),
                                 1.0,
                                 0.281 / (0.281 + 0.226 + 0.262 + 0.228),
                                 0.226 / (0.281 + 0.226 + 0.262 + 0.228),
                                 0.262 / (0.281 + 0.226 + 0.262 + 0.228),
                                 0.228 / (0.281 + 0.226 + 0.262 + 0.228),
                                 1.0])
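The expected probabilities above are unigram class-membership weights normalized within each class; a worked line (not part of the test) for the class shared by 'kolme' and 'neljä':

kolme_weight, nelja_weight = 0.599, 0.400
# Each membership probability is the word's weight divided by the
# total weight of its class.
print(kolme_weight / (kolme_weight + nelja_weight))  # ~0.5996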
Example #11
    def setUp(self):
        script_path = os.path.dirname(os.path.realpath(__file__))
        sentences1_path = os.path.join(script_path, 'sentences1.txt')
        sentences2_path = os.path.join(script_path, 'sentences2.txt')
        sentences3_path = os.path.join(script_path, 'sentences3.txt')
        vocabulary_path = os.path.join(script_path, 'vocabulary.txt')

        self.sentences1_file = open(sentences1_path)
        self.sentences2_file = open(sentences2_path)
        self.sentences3_file = open(sentences3_path)
        self.vocabulary_file = open(vocabulary_path)
        self.vocabulary = Vocabulary.from_file(self.vocabulary_file, 'words')
        self.vocabulary_file.seek(0)
        self.shortlist_vocabulary = \
            Vocabulary.from_file(self.vocabulary_file, 'words',
                                 oos_words=['yksitoista'])
Example #12
    def test_bigram_statistics(self):
        self.sentences_file.seek(0)
        word_counts = compute_word_counts([self.sentences_file])
        self.vocabulary = Vocabulary.from_word_counts(word_counts)
        self.sentences_file.seek(0)
        statistics = BigramStatistics([self.sentences_file], self.vocabulary)

        unigram_counts = statistics.unigram_counts
        vocabulary = self.vocabulary
        self.assertEqual(unigram_counts[vocabulary.word_to_id['a']], 13)
        self.assertEqual(unigram_counts[vocabulary.word_to_id['b']], 8)
        self.assertEqual(unigram_counts[vocabulary.word_to_id['c']], 8)
        self.assertEqual(unigram_counts[vocabulary.word_to_id['d']], 11)
        self.assertEqual(unigram_counts[vocabulary.word_to_id['e']], 15)
        self.assertEqual(unigram_counts[vocabulary.word_to_id['<unk>']], 0)
        self.assertEqual(unigram_counts[vocabulary.word_to_id['<s>']], 11)
        self.assertEqual(unigram_counts[vocabulary.word_to_id['</s>']], 11)

        bigram_counts = statistics.bigram_counts
        vocabulary = self.vocabulary
        a_id = vocabulary.word_to_id['a']
        b_id = vocabulary.word_to_id['b']
        self.assertEqual(bigram_counts[a_id, a_id], 3)
        self.assertEqual(bigram_counts[a_id, b_id], 2)
        self.assertEqual(bigram_counts[b_id, a_id], 1)
        self.assertEqual(bigram_counts[b_id, b_id], 0)
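A sketch of the same counting on a toy in-memory corpus, assuming compute_word_counts, Vocabulary.from_word_counts and BigramStatistics behave as this test exercises them:

import io

corpus = io.StringIO("a b a\nb a d\n")
word_counts = compute_word_counts([corpus])
vocabulary = Vocabulary.from_word_counts(word_counts)
corpus.seek(0)
statistics = BigramStatistics([corpus], vocabulary)
a_id = vocabulary.word_to_id['a']
b_id = vocabulary.word_to_id['b']
print(statistics.unigram_counts[a_id])       # 3 occurrences of 'a'
print(statistics.bigram_counts[a_id, b_id])  # 1 occurrence of 'a b'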
Example #13
    def setUp(self):
        script_path = os.path.dirname(os.path.realpath(__file__))

        vocabulary_path = os.path.join(script_path, 'vocabulary.txt')
        with open(vocabulary_path) as vocabulary_file:
            self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words')

        self.sos_id = self.vocabulary.word_to_id['<s>']
        self.yksi_id = self.vocabulary.word_to_id['yksi']
        self.kaksi_id = self.vocabulary.word_to_id['kaksi']
        self.eos_id = self.vocabulary.word_to_id['</s>']

        projection_vector = tensor.zeros(shape=(self.vocabulary.num_words(), ),
                                         dtype=theano.config.floatX)
        self.sos_prob = 0.1
        projection_vector = tensor.set_subtensor(
            projection_vector[self.sos_id], self.sos_prob)
        self.yksi_prob = 0.2
        projection_vector = tensor.set_subtensor(
            projection_vector[self.yksi_id], self.yksi_prob)
        self.kaksi_prob = 0.3
        projection_vector = tensor.set_subtensor(
            projection_vector[self.kaksi_id], self.kaksi_prob)
        self.eos_prob = 0.4
        projection_vector = tensor.set_subtensor(
            projection_vector[self.eos_id], self.eos_prob)
        self.network = DummyNetwork(self.vocabulary, projection_vector)

        lattice_path = os.path.join(script_path, 'lattice.slf')
        with open(lattice_path) as lattice_file:
            self.lattice = SLFLattice(lattice_file)
Example #14
def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary, architecture,
                          predict_next_distribution=True)
        print("Restoring neural network state.")
        network.set_state(state)

    print("Building text sampler.")
    sys.stdout.flush()
    sampler = TextSampler(network)

    for i in range(args.num_sentences):
        words = sampler.generate()
        args.output_file.write('{}: {}\n'.format(
            i, ' '.join(words)))
Example #15
def sample(args):
    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary,
                          architecture,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)

    print("Building text sampler.")
    sys.stdout.flush()
    sampler = TextSampler(network)

    sequences = sampler.generate(30, args.num_sentences)
    for sequence in sequences:
        try:
            eos_pos = sequence.index('</s>')
            sequence = sequence[:eos_pos + 1]
        except ValueError:
            pass
        args.output_file.write(' '.join(sequence) + '\n')
Example #16
 def setUp(self):
     script_path = os.path.dirname(os.path.realpath(__file__))
     sentences_path = os.path.join(script_path, 'sentences.txt')
     with open(sentences_path) as sentences_file:
         self.vocabulary = Vocabulary.from_corpus([sentences_file])
         sentences_file.seek(0)
         self.statistics = WordStatistics([sentences_file], self.vocabulary)
Example #17
    def test_compute_probs(self):
        self.classes_file.seek(0)
        vocabulary = Vocabulary.from_file(self.classes_file, 'srilm-classes')
        vocabulary.compute_probs([self.sentences1_file, self.sentences2_file])

        word_id = vocabulary.word_to_id['yksi']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 1.0)
        word_id = vocabulary.word_to_id['kaksi']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 1.0)
        word_id = vocabulary.word_to_id['kolme']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.5)
        word_id = vocabulary.word_to_id['neljä']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.5)
        word_id = vocabulary.word_to_id['viisi']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 1.0)
        word_id = vocabulary.word_to_id['kuusi']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.25)
        word_id = vocabulary.word_to_id['seitsemän']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.25)
        word_id = vocabulary.word_to_id['kahdeksan']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.25)
        word_id = vocabulary.word_to_id['yhdeksän']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 0.25)
        word_id = vocabulary.word_to_id['kymmenen']
        self.assertAlmostEqual(vocabulary.get_word_prob(word_id), 1.0)
Example #19
    def setUp(self):
        theano.config.compute_test_value = 'warn'

        script_path = os.path.dirname(os.path.realpath(__file__))
        vocabulary_path = os.path.join(script_path, 'vocabulary.txt')
        with open(vocabulary_path) as vocabulary_file:
            self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words')
        self.dummy_network = DummyNetwork(self.vocabulary)
Example #20
    def setUp(self):
        self.maxDiff = None
        script_path = os.path.dirname(os.path.realpath(__file__))

        vocabulary_path = os.path.join(script_path, 'vocabulary.txt')
        with open(vocabulary_path) as vocabulary_file:
            self.vocabulary = Vocabulary.from_file(vocabulary_file,
                                                   'words',
                                                   oos_words=['oos1', 'oos2'])
        self.vocabulary.compute_probs({
            'yksi': 1,
            'kaksi': 1,
            'kolme': 1,
            'neljä': 1,
            'viisi': 1,
            'kuusi': 1,
            'seitsemän': 1,
            'kahdeksan': 1,
            'yhdeksän': 1,
            'kymmenen': 1,
            'oos1': 1,
            'oos2': 2
        })

        self.sos_id = self.vocabulary.word_to_id['<s>']
        self.yksi_id = self.vocabulary.word_to_id['yksi']
        self.kaksi_id = self.vocabulary.word_to_id['kaksi']
        self.eos_id = self.vocabulary.word_to_id['</s>']
        self.unk_id = self.vocabulary.word_to_id['<unk>']
        self.oos1_id = self.vocabulary.word_to_id['oos1']
        self.oos2_id = self.vocabulary.word_to_id['oos2']

        projection_vector = tensor.zeros(
            shape=(self.vocabulary.num_shortlist_words(), ),
            dtype=theano.config.floatX)
        self.sos_prob = 0.1
        projection_vector = tensor.set_subtensor(
            projection_vector[self.sos_id], self.sos_prob)
        self.yksi_prob = 0.2
        projection_vector = tensor.set_subtensor(
            projection_vector[self.yksi_id], self.yksi_prob)
        self.kaksi_prob = 0.3
        projection_vector = tensor.set_subtensor(
            projection_vector[self.kaksi_id], self.kaksi_prob)
        self.eos_prob = 0.4
        projection_vector = tensor.set_subtensor(
            projection_vector[self.eos_id], self.eos_prob)
        self.unk_prob = 0.3
        projection_vector = tensor.set_subtensor(
            projection_vector[self.unk_id], self.unk_prob)
        self.network = DummyNetwork(self.vocabulary, projection_vector)

        lattice_path = os.path.join(script_path, 'lattice.slf')
        with open(lattice_path) as lattice_file:
            self.lattice = SLFLattice(lattice_file)
Example #21
def restoreModel(path):
    with h5py.File(path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of words in shortlist:", vocabulary.num_shortlist_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary, mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        network.set_state(state)
        return network
Example #22
 def test_class_ids(self):
     self.classes_file.seek(0)
     vocabulary = Vocabulary.from_file(self.classes_file, 'srilm-classes')
     word_id = vocabulary.word_to_id['yksi']
     yksi_class_id = vocabulary.word_id_to_class_id[word_id]
     word_id = vocabulary.word_to_id['kaksi']
     kaksi_class_id = vocabulary.word_id_to_class_id[word_id]
     word_id = vocabulary.word_to_id['kolme']
     kolme_class_id = vocabulary.word_id_to_class_id[word_id]
     word_id = vocabulary.word_to_id['neljä']
     nelja_class_id = vocabulary.word_id_to_class_id[word_id]
     word_id = vocabulary.word_to_id['</s>']
     eos_class_id = vocabulary.word_id_to_class_id[word_id]
     self.assertNotEqual(yksi_class_id, kaksi_class_id)
     self.assertEqual(kolme_class_id, nelja_class_id)
     self.assertNotEqual(kolme_class_id, eos_class_id)
     self.assertEqual(kaksi_class_id, eos_class_id)
Example #23
    def from_file(cls,
                  model_path,
                  mode=None,
                  exclude_unk=False,
                  default_device=None):
        """Reads a model from an HDF5 file.

        :type model_path: str
        :param model_path: path to a HDF5 model file

        :type mode: Network.Mode
        :param mode: selects mini-batch or single time step processing

        :type exclude_unk: bool
        :param exclude_unk: if set to ``True``, set ``<unk>`` probability to
                            zero before normalizing the network outputs
                            (required to get exact normalization during
                            inference)

        :type default_device: str
        :param default_device: default device where to store the shared variables
        """

        with h5py.File(model_path, 'r') as state:
            logging.info("Reading vocabulary from network state.")
            #sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
            logging.info("Number of words in vocabulary: {}".format(
                vocabulary.num_words()))
            logging.info("Number of words in shortlist: {}".format(
                vocabulary.num_shortlist_words()))
            logging.info("Number of word classes: {}".format(
                vocabulary.num_classes()))
            logging.info("Building neural network.")
            #sys.stdout.flush()
            architecture = Architecture.from_state(state)
            result = cls(architecture,
                         vocabulary,
                         mode=mode,
                         exclude_unk=exclude_unk,
                         default_device=default_device)
            logging.info("Restoring neural network state.")
            result.set_training()
            logging.info("Reseting to the neural network to evaluate.")
            result.set_state(state)
            return result
Example #24
def sample(args):
    """A function that performs the "theanolm sample" command.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments
    """

    numpy.random.seed(args.random_seed)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'

    with h5py.File(args.model_path, 'r') as state:
        logging.info("Reading vocabulary from network state.")
        vocabulary = Vocabulary.from_state(state)
        logging.info("Number of words in vocabulary: %d",
                     vocabulary.num_words())
        logging.info("Number of words in shortlist: %d",
                     vocabulary.num_shortlist_words())
        logging.info("Number of word classes: %d", vocabulary.num_classes())
        logging.info("Building neural network.")
        architecture = Architecture.from_state(state)
        default_device = get_default_device(args.default_device)
        network = Network(architecture,
                          vocabulary,
                          mode=Network.Mode(minibatch=False),
                          default_device=default_device)
        logging.info("Restoring neural network state.")
        network.set_state(state)

    logging.info("Building text sampler.")
    sampler = TextSampler(network)

    sequences = sampler.generate(args.sentence_length,
                                 args.num_sentences,
                                 seed_sequence=args.seed_sequence)
    for sequence in sequences:
        try:
            eos_pos = sequence.index('</s>')
            sequence = sequence[:eos_pos + 1]
        except ValueError:
            pass
        args.output_file.write(' '.join(sequence) + '\n')
Example #25
def score(args):
    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary)
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

    print("Building text scorer.")
    sys.stdout.flush()
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    scorer = TextScorer(network, ignore_unk, unk_penalty)

    print("Scoring text.")
    if args.output == 'perplexity':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, False)
    elif args.output == 'word-scores':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, True)
    elif args.output == 'utterance-scores':
        _score_utterances(args.input_file, vocabulary, scorer, args.output_file,
                          args.log_base)
    else:
        print("Invalid output format requested:", args.output)
        sys.exit(1)
Example #26
    def __init__(self, model_path):
        self.model_path = model_path
        numpy.random.seed()
        theano.config.compute_test_value = 'off'

        with h5py.File(model_path, 'r') as self.state:
            print("Reading vocabulary from network state.")
            #sys.stdout.flush()
            self.vocabulary = Vocabulary.from_state(self.state)
            print("Number of words in vocabulary:", self.vocabulary.num_words())
            print("Number of words in shortlist:", self.vocabulary.num_shortlist_words())
            print("Number of word classes:", self.vocabulary.num_classes())
            print("Building neural network.")
            #sys.stdout.flush()
            self.architecture = Architecture.from_state(self.state)
            self.network = Network(self.architecture, self.vocabulary, mode=Network.Mode(minibatch=False))
            print("Restoring neural network state.")
            self.network.set_state(self.state)

        print("Building text sampler.")
        #sys.stdout.flush()
        self.sampler = TextSampler(self.network)
Example #27
def score(args):
    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary, architecture)
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

    print("Building text scorer.")
    sys.stdout.flush()
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    scorer = TextScorer(network, ignore_unk, unk_penalty)

    print("Scoring text.")
    if args.output == 'perplexity':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, False)
    elif args.output == 'word-scores':
        _score_text(args.input_file, vocabulary, scorer, args.output_file,
                    args.log_base, True)
    elif args.output == 'utterance-scores':
        _score_utterances(args.input_file, vocabulary, scorer,
                          args.output_file, args.log_base)
Example #28
 def test_word_ids_to_names(self):
     self.classes_file.seek(0)
     vocabulary = Vocabulary.from_file(self.classes_file, 'srilm-classes')
     word_ids = [vocabulary.word_to_id['yksi'],
                 vocabulary.word_to_id['kaksi'],
                 vocabulary.word_to_id['kolme'],
                 vocabulary.word_to_id['neljä'],
                 vocabulary.word_to_id['viisi'],
                 vocabulary.word_to_id['kuusi'],
                 vocabulary.word_to_id['seitsemän'],
                 vocabulary.word_to_id['kahdeksan'],
                 vocabulary.word_to_id['yhdeksän'],
                 vocabulary.word_to_id['kymmenen']]
     names = vocabulary.word_ids_to_names(word_ids)
     self.assertEqual(names[0], 'yksi')
     self.assertEqual(names[1], 'kaksi')
     self.assertTrue(names[2].startswith('CLASS-'))
     self.assertEqual(names[2], names[3])
     self.assertEqual(names[4], 'viisi')
     self.assertTrue(names[5].startswith('CLASS-'))
     self.assertEqual(names[5], names[6])
     self.assertEqual(names[5], names[7])
     self.assertEqual(names[5], names[8])
     self.assertEqual(names[9], 'kymmenen')
Example #29
def decode(args):
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(architecture, vocabulary,
                          mode=Network.Mode(minibatch=False))
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

    log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base)

    if args.wi_penalty is None:
        wi_penalty = None
    else:
        wi_penalty = args.wi_penalty * log_scale
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    decoding_options = {
        'nnlm_weight': args.nnlm_weight,
        'lm_scale': args.lm_scale,
        'wi_penalty': wi_penalty,
        'ignore_unk': ignore_unk,
        'unk_penalty': unk_penalty,
        'linear_interpolation': args.linear_interpolation,
        'max_tokens_per_node': args.max_tokens_per_node,
        'beam': args.beam,
        'recombination_order': args.recombination_order
    }
    logging.debug("DECODING OPTIONS")
    for option_name, option_value in decoding_options.items():
        logging.debug("%s: %s", option_name, str(option_value))

    print("Building word lattice decoder.")
    sys.stdout.flush()
    decoder = LatticeDecoder(network, decoding_options)

    # Combine paths from command line and lattice list.
    lattices = args.lattices
    lattices.extend(args.lattice_list.readlines())
    lattices = [path.strip() for path in lattices]
    # Ignore empty lines in the lattice list.
    lattices = list(filter(None, lattices))
    # Pick every Ith lattice, if --num-jobs is specified and > 1.
    if args.num_jobs < 1:
        print("Invalid number of jobs specified:", args.num_jobs)
        sys.exit(1)
    if (args.job < 0) or (args.job > args.num_jobs - 1):
        print("Invalid job specified:", args.job)
        sys.exit(1)
    lattices = lattices[args.job::args.num_jobs]

    file_type = TextFileType('r')
    for index, path in enumerate(lattices):
        logging.info("Reading word lattice: %s", path)
        lattice_file = file_type(path)
        lattice = SLFLattice(lattice_file)

        if lattice.utterance_id is not None:
            utterance_id = lattice.utterance_id
        else:
            utterance_id = os.path.basename(lattice_file.name)
        logging.info("Utterance `%s' -- %d/%d of job %d",
                     utterance_id,
                     index + 1,
                     len(lattices),
                     args.job)
        tokens = decoder.decode(lattice)

        for index in range(min(args.n_best, len(tokens))):
            line = format_token(tokens[index],
                                utterance_id,
                                vocabulary,
                                log_scale,
                                args.output)
            args.output_file.write(line + "\n")
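A tiny illustration, with assumed values, of the job-splitting slice used above: with num_jobs=3, job 1 handles every third lattice starting from index 1.

lattices = ['a.slf', 'b.slf', 'c.slf', 'd.slf', 'e.slf']
print(lattices[1::3])  # ['b.slf', 'e.slf']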
Example #30
def train(args):
    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = "%(asctime)s %(funcName)s: %(message)s"
    if args.log_file == "-":
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

    if args.debug:
        theano.config.compute_test_value = "warn"
        print("Enabled computing test values for tensor variables.")
        print("Warning: GpuArray backend will fail random number generation!")
    else:
        theano.config.compute_test_value = "off"
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, "a", driver="core") as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set, args.num_classes)
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, "rt", encoding="utf-8") as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file, args.vocabulary_format)
                if args.vocabulary_format == "classes":
                    print("Computing class membership probabilities from " "unigram word counts.")
                    sys.stdout.flush()
                    vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        if args.num_noise_samples > vocabulary.num_classes():
            print(
                "Number of noise samples ({}) is larger than the number of "
                "classes. This doesn't make sense and would cause sampling "
                "to fail.".format(args.num_noise_samples)
            )
            sys.exit(1)

        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        training_options = {
            "batch_size": args.batch_size,
            "sequence_length": args.sequence_length,
            "validation_frequency": args.validation_frequency,
            "patience": args.patience,
            "stopping_criterion": args.stopping_criterion,
            "max_epochs": args.max_epochs,
            "min_epochs": args.min_epochs,
            "max_annealing_count": args.max_annealing_count,
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        optimization_options = {
            "method": args.optimization_method,
            "epsilon": args.numerical_stability_term,
            "gradient_decay_rate": args.gradient_decay_rate,
            "sqr_gradient_decay_rate": args.sqr_gradient_decay_rate,
            "learning_rate": args.learning_rate,
            "weights": weights,
            "momentum": args.momentum,
            "max_gradient_norm": args.gradient_normalization,
            "cost_function": args.cost,
            "num_noise_samples": args.num_noise_samples,
            "noise_sharing": args.noise_sharing,
            "ignore_unk": ignore_unk,
            "unk_penalty": unk_penalty,
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if isinstance(option_value, list):
                value_str = ", ".join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training " "files.")
            sys.exit(1)

        print("Creating trainer.")
        sys.stdout.flush()
        trainer = Trainer(training_options, vocabulary, args.training_set, args.sampling)
        trainer.set_logging(args.log_interval)

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture == "lstm300" or args.architecture == "lstm1500":
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, "rt", encoding="utf-8") as arch_file:
                architecture = Architecture.from_description(arch_file)

        network = Network(
            architecture,
            vocabulary,
            trainer.class_prior_probs,
            args.noise_dampening,
            default_device=args.default_device,
            profile=args.profile,
        )

        print("Compiling optimization function.")
        sys.stdout.flush()
        optimizer = create_optimizer(optimization_options, network, device=args.default_device, profile=args.profile)

        if args.print_graph:
            print("Cost function computation graph:")
            theano.printing.debugprint(optimizer.gradient_update_function)

        trainer.initialize(network, state, optimizer)

        if args.validation_file is not None:
            print("Building text scorer for cross-validation.")
            sys.stdout.flush()
            scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)
            print("Validation text:", args.validation_file.name)
            validation_mmap = mmap.mmap(args.validation_file.fileno(), 0, prot=mmap.PROT_READ)
            validation_iter = LinearBatchIterator(
                validation_mmap, vocabulary, batch_size=args.batch_size, max_sequence_length=None
            )
            trainer.set_validation(validation_iter, scorer)
        else:
            print("Cross-validation will not be performed.")
            validation_iter = None

        print("Training neural network.")
        sys.stdout.flush()
        trainer.train()

        if not "layers" in state.keys():
            print(
                "The model has not been trained. No cross-validations were "
                "performed or training did not improve the model."
            )
        elif validation_iter is not None:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
Example #31
def train(args):
    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'a', driver='core') as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
                if args.vocabulary_format == 'classes':
                    print("Computing class membership probabilities from "
                          "unigram word counts.")
                    sys.stdout.flush()
                    vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture == 'lstm300' or args.architecture == 'lstm1500':
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(vocabulary, architecture, profile=args.profile)

        sys.stdout.flush()
        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        print("Building text scorer.")
        scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)

        validation_mmap = mmap.mmap(args.validation_file.fileno(),
                                    0,
                                    prot=mmap.PROT_READ)
        validation_iter = \
            LinearBatchIterator(validation_mmap,
                                vocabulary,
                                batch_size=args.batch_size,
                                max_sequence_length=None)

        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'cost_function': args.cost,
            'num_noise_samples': args.num_noise_samples,
            'ignore_unk': ignore_unk,
            'unk_penalty': unk_penalty
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if isinstance(option_value, list):
                value_str = ', '.join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        training_options = {
            'strategy': args.training_strategy,
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        print("Building neural network trainer.")
        sys.stdout.flush()
        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)
        trainer = create_trainer(
            training_options, optimization_options,
            network, vocabulary, scorer,
            args.training_set, args.sampling, validation_iter,
            state, args.profile)
        trainer.set_logging(args.log_interval)

        print("Training neural network.")
        sys.stdout.flush()
        trainer.train()

        if 'layers' not in state:
            print("The model has not been trained. No cross-validations were "
                  "performed or training did not improve the model.")
        else:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
Example #32
    def test_decode(self):
        vocabulary = Vocabulary.from_word_counts({
            'to': 1,
            'and': 1,
            'it': 1,
            'but': 1,
            'a.': 1,
            'in': 1,
            'a': 1,
            'at': 1,
            'the': 1,
            "didn't": 1,
            'elaborate': 1
        })
        projection_vector = tensor.ones(
            shape=(vocabulary.num_shortlist_words(), ),
            dtype=theano.config.floatX)
        projection_vector *= 0.05
        network = DummyNetwork(vocabulary, projection_vector)

        decoding_options = {
            'nnlm_weight': 0.0,
            'lm_scale': None,
            'wi_penalty': None,
            'unk_penalty': None,
            'use_shortlist': False,
            'unk_from_lattice': False,
            'linear_interpolation': True,
            'max_tokens_per_node': None,
            'beam': None,
            'recombination_order': 20
        }
        decoder = LatticeDecoder(network, decoding_options)
        tokens = decoder.decode(self.lattice)[0]

        # Compare tokens to n-best list given by SRILM lattice-tool.
        log_scale = math.log(10)

        print()
        for token in tokens:
            print(token.ac_logprob / log_scale,
                  token.lat_lm_logprob / log_scale,
                  token.total_logprob / log_scale,
                  ' '.join(token.history_words(vocabulary)))

        all_paths = [
            "<s> it didn't elaborate </s>", "<s> but it didn't elaborate </s>",
            "<s> the didn't elaborate </s>",
            "<s> and it didn't elaborate </s>", "<s> e. didn't elaborate </s>",
            "<s> in it didn't elaborate </s>", "<s> a didn't elaborate </s>",
            "<s> at it didn't elaborate </s>",
            "<s> it it didn't elaborate </s>",
            "<s> to it didn't elaborate </s>",
            "<s> a. it didn't elaborate </s>", "<s> a it didn't elaborate </s>"
        ]
        paths = [' '.join(token.history_words(vocabulary)) for token in tokens]
        self.assertListEqual(paths, all_paths)

        token = tokens[0]
        history = ' '.join(token.history_words(vocabulary))
        self.assertAlmostEqual(token.ac_logprob / log_scale,
                               -8686.28,
                               places=2)
        self.assertAlmostEqual(token.lat_lm_logprob / log_scale,
                               -94.3896,
                               places=2)
        self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 4)

        token = tokens[1]
        self.assertAlmostEqual(token.ac_logprob / log_scale,
                               -8743.96,
                               places=2)
        self.assertAlmostEqual(token.lat_lm_logprob / log_scale,
                               -111.488,
                               places=2)
        self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5)

        token = tokens[-1]
        self.assertAlmostEqual(token.ac_logprob / log_scale,
                               -8696.26,
                               places=2)
        self.assertAlmostEqual(token.lat_lm_logprob / log_scale,
                               -178.00,
                               places=2)
        self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5)
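A note on the unit conversion in this test: SRILM's lattice-tool reports base-10 log probabilities while the decoder accumulates natural logarithms, so values are divided by math.log(10) before comparison. A worked line:

import math

nn_lm_logprob = math.log(0.1) * 4    # natural log, as asserted above
print(nn_lm_logprob / math.log(10))  # the same quantity in log10: -4.0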
Example #33
    def test_decode(self):
        vocabulary = Vocabulary.from_word_counts({
            'TO': 1,
            'AND': 1,
            'IT': 1,
            'BUT': 1,
            'A.': 1,
            'IN': 1,
            'A': 1,
            'AT': 1,
            'THE': 1,
            'E.': 1,
            "DIDN'T": 1,
            'ELABORATE': 1
        })
        projection_vector = tensor.ones(shape=(vocabulary.num_words(), ),
                                        dtype=theano.config.floatX)
        projection_vector *= 0.05
        network = DummyNetwork(vocabulary, projection_vector)

        decoding_options = {
            'nnlm_weight': 0.0,
            'lm_scale': None,
            'wi_penalty': None,
            'ignore_unk': False,
            'unk_penalty': None,
            'linear_interpolation': True,
            'max_tokens_per_node': None,
            'beam': None,
            'recombination_order': None
        }
        decoder = LatticeDecoder(network, decoding_options)
        tokens = decoder.decode(self.lattice)

        # Compare tokens to n-best list given by SRILM lattice-tool.
        log_scale = math.log(10)

        print()
        for token in tokens:
            print(token.ac_logprob / log_scale,
                  token.lat_lm_logprob / log_scale,
                  token.total_logprob / log_scale,
                  ' '.join(vocabulary.id_to_word[token.history]))

        all_paths = [
            "<s> IT DIDN'T ELABORATE </s>", "<s> BUT IT DIDN'T ELABORATE </s>",
            "<s> THE DIDN'T ELABORATE </s>",
            "<s> AND IT DIDN'T ELABORATE </s>", "<s> E. DIDN'T ELABORATE </s>",
            "<s> IN IT DIDN'T ELABORATE </s>", "<s> A DIDN'T ELABORATE </s>",
            "<s> AT IT DIDN'T ELABORATE </s>",
            "<s> IT IT DIDN'T ELABORATE </s>",
            "<s> TO IT DIDN'T ELABORATE </s>",
            "<s> A. IT DIDN'T ELABORATE </s>", "<s> A IT DIDN'T ELABORATE </s>"
        ]
        paths = [
            ' '.join(vocabulary.id_to_word[token.history]) for token in tokens
        ]
        self.assertListEqual(paths, all_paths)

        token = tokens[0]
        self.assertAlmostEqual(token.ac_logprob / log_scale,
                               -8686.28,
                               places=2)
        self.assertAlmostEqual(token.lat_lm_logprob / log_scale,
                               -94.3896,
                               places=2)
        self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 4)

        token = tokens[1]
        self.assertAlmostEqual(token.ac_logprob / log_scale,
                               -8743.96,
                               places=2)
        self.assertAlmostEqual(token.lat_lm_logprob / log_scale,
                               -111.488,
                               places=2)
        self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5)

        token = tokens[-1]
        self.assertAlmostEqual(token.ac_logprob / log_scale,
                               -8696.26,
                               places=2)
        self.assertAlmostEqual(token.lat_lm_logprob / log_scale,
                               -178.00,
                               places=2)
        self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5)
Example #34
def train(args):
    numpy.random.seed(args.random_seed)

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

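    # compute_test_value='warn' makes Theano evaluate each graph node on test
    # data as the graph is built, which helps locate shape errors when
    # debugging.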
    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'a', driver='core') as state:
        if state.keys():
            print("Reading vocabulary from existing network state.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_state(state)
        elif args.vocabulary is None:
            print("Constructing vocabulary from training set.")
            sys.stdout.flush()
            vocabulary = Vocabulary.from_corpus(args.training_set,
                                                args.num_classes)
            for training_file in args.training_set:
                training_file.seek(0)
            vocabulary.get_state(state)
        else:
            print("Reading vocabulary from {}.".format(args.vocabulary))
            sys.stdout.flush()
            with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file:
                vocabulary = Vocabulary.from_file(vocab_file,
                                                  args.vocabulary_format)
                if args.vocabulary_format == 'classes':
                    print("Computing class membership probabilities from "
                          "unigram word counts.")
                    sys.stdout.flush()
                    vocabulary.compute_probs(args.training_set)
            vocabulary.get_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())

        print("Building neural network.")
        sys.stdout.flush()
        if args.architecture in ('lstm300', 'lstm1500'):
            architecture = Architecture.from_package(args.architecture)
        else:
            with open(args.architecture, 'rt', encoding='utf-8') as arch_file:
                architecture = Architecture.from_description(arch_file)
        network = Network(vocabulary, architecture, profile=args.profile)

        sys.stdout.flush()
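        # Map --unk-penalty onto the scorer flags: None means <unk> words are
        # scored with the model probability, 0 means they are ignored
        # entirely, and any other value is applied as a fixed logprob penalty.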
        if args.unk_penalty is None:
            ignore_unk = False
            unk_penalty = None
        elif args.unk_penalty == 0:
            ignore_unk = True
            unk_penalty = None
        else:
            ignore_unk = False
            unk_penalty = args.unk_penalty

        num_training_files = len(args.training_set)
        if len(args.weights) > num_training_files:
            print("You specified more weights than training files.")
            sys.exit(1)
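        # Training files without an explicit weight default to 1.0.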
        weights = numpy.ones(num_training_files).astype(theano.config.floatX)
        for index, weight in enumerate(args.weights):
            weights[index] = weight

        print("Building text scorer.")
        scorer = TextScorer(network, ignore_unk, unk_penalty, args.profile)

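        # Memory-map the validation file so the iterator can make repeated
        # passes without reading the whole file into memory.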
        validation_mmap = mmap.mmap(args.validation_file.fileno(),
                                    0,
                                    prot=mmap.PROT_READ)
        validation_iter = LinearBatchIterator(validation_mmap,
                                              vocabulary,
                                              batch_size=32)

        optimization_options = {
            'method': args.optimization_method,
            'epsilon': args.numerical_stability_term,
            'gradient_decay_rate': args.gradient_decay_rate,
            'sqr_gradient_decay_rate': args.sqr_gradient_decay_rate,
            'learning_rate': args.learning_rate,
            'weights': weights,
            'momentum': args.momentum,
            'max_gradient_norm': args.gradient_normalization,
            'ignore_unk': ignore_unk,
            'unk_penalty': unk_penalty
        }
        logging.debug("OPTIMIZATION OPTIONS")
        for option_name, option_value in optimization_options.items():
            if isinstance(option_value, list):
                value_str = ', '.join(str(x) for x in option_value)
                logging.debug("%s: [%s]", option_name, value_str)
            else:
                logging.debug("%s: %s", option_name, str(option_value))

        training_options = {
            'strategy': args.training_strategy,
            'batch_size': args.batch_size,
            'sequence_length': args.sequence_length,
            'validation_frequency': args.validation_frequency,
            'patience': args.patience,
            'stopping_criterion': args.stopping_criterion,
            'max_epochs': args.max_epochs,
            'min_epochs': args.min_epochs,
            'max_annealing_count': args.max_annealing_count
        }
        logging.debug("TRAINING OPTIONS")
        for option_name, option_value in training_options.items():
            logging.debug("%s: %s", option_name, str(option_value))

        print("Building neural network trainer.")
        sys.stdout.flush()
        if len(args.sampling) > len(args.training_set):
            print("You specified more sampling coefficients than training "
                  "files.")
            sys.exit(1)
        trainer = create_trainer(
            training_options, optimization_options,
            network, vocabulary, scorer,
            args.training_set, args.sampling, validation_iter,
            state, args.profile)
        trainer.set_logging(args.log_interval)

        print("Training neural network.")
        sys.stdout.flush()
        trainer.run()

        if not state.keys():
            print("The model has not been trained.")
        else:
            network.set_state(state)
            perplexity = scorer.compute_perplexity(validation_iter)
            print("Best validation set perplexity:", perplexity)
Example #35
def decode(args):
    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout,
                            format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file,
                            format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    with h5py.File(args.model_path, 'r') as state:
        print("Reading vocabulary from network state.")
        sys.stdout.flush()
        vocabulary = Vocabulary.from_state(state)
        print("Number of words in vocabulary:", vocabulary.num_words())
        print("Number of word classes:", vocabulary.num_classes())
        print("Building neural network.")
        sys.stdout.flush()
        architecture = Architecture.from_state(state)
        network = Network(vocabulary,
                          architecture,
                          mode=Network.Mode.target_words)
        print("Restoring neural network state.")
        sys.stdout.flush()
        network.set_state(state)

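    # Penalties on the command line are given in the --log-base base;
    # multiplying by log(base) converts them to the natural logarithms used
    # internally.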
    log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base)

    if args.wi_penalty is None:
        wi_penalty = None
    else:
        wi_penalty = args.wi_penalty * log_scale
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    decoding_options = {
        'nnlm_weight': args.nnlm_weight,
        'lm_scale': args.lm_scale,
        'wi_penalty': wi_penalty,
        'ignore_unk': ignore_unk,
        'unk_penalty': unk_penalty,
        'linear_interpolation': args.linear_interpolation,
        'max_tokens_per_node': args.max_tokens_per_node,
        'beam': args.beam,
        'recombination_order': args.recombination_order
    }
    logging.debug("DECODING OPTIONS")
    for option_name, option_value in decoding_options.items():
        logging.debug("%s: %s", option_name, str(option_value))

    print("Building word lattice decoder.")
    sys.stdout.flush()
    decoder = LatticeDecoder(network, decoding_options)

    # Combine paths from command line and lattice list.
    lattices = args.lattices
    lattices.extend(args.lattice_list.readlines())
    lattices = [path.strip() for path in lattices]
    # Ignore empty lines in the lattice list.
    lattices = list(filter(None, lattices))
    # Pick every Ith lattice, if --num-jobs is specified and > 1.
    if args.num_jobs < 1:
        print("Invalid number of jobs specified:", args.num_jobs)
        sys.exit(1)
    if (args.job < 0) or (args.job > args.num_jobs - 1):
        print("Invalid job specified:", args.job)
        sys.exit(1)
    lattices = lattices[args.job::args.num_jobs]
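    # For example, with 10 lattices and --num-jobs 3: job 0 decodes lattices
    # 0, 3, 6, and 9, job 1 decodes 1, 4, and 7, and job 2 decodes 2, 5, and 8.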

    file_type = TextFileType('r')
    for index, path in enumerate(lattices):
        logging.info("Reading word lattice: %s", path)
        lattice_file = file_type(path)
        lattice = SLFLattice(lattice_file)

        if lattice.utterance_id is not None:
            utterance_id = lattice.utterance_id
        else:
            utterance_id = os.path.basename(lattice_file.name)
        logging.info("Utterance `%s' -- %d/%d of job %d", utterance_id,
                     index + 1, len(lattices), args.job)
        tokens = decoder.decode(lattice)

        for rank in range(min(args.n_best, len(tokens))):
            line = format_token(tokens[rank], utterance_id, vocabulary,
                                log_scale, args.output)
            args.output_file.write(line + "\n")
Example #36
    def setUp(self):
        script_path = os.path.dirname(os.path.realpath(__file__))
        vocabulary_path = os.path.join(script_path, 'vocabulary.txt')
        with open(vocabulary_path) as vocabulary_file:
            self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words')
        self.dummy_network = DummyNetwork(self.vocabulary)
Example #37
def _read_vocabulary(args, state):
    """If ``state`` contains data, reads the vocabulary from the HDF5 state.
    Otherwise reads a vocabulary file or constructs the vocabulary from the
    training set and writes it to the HDF5 state.

    If the state does not contain data and the --vocabulary argument is given,
    reads the vocabulary from that file. The rest of the words in the training
    set will be added as out-of-shortlist words.

    If the state does not contain data and no vocabulary is given, constructs a
    vocabulary that contains all the training set words. In that case, the
    --num-classes argument can be used to control the number of classes.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments

    :type state: h5py.File
    :param state: HDF5 file where the vocabulary should be saved

    :rtype: Vocabulary
    :returns: the created vocabulary
    """

    if state.keys():
        print("Reading vocabulary from existing network state.")
        sys.stdout.flush()
        result = Vocabulary.from_state(state)
        if not result.has_unigram_probs():
            # This is for backward compatibility. Remove at some point.
            print("Computing unigram word probabilities from training set.")
            sys.stdout.flush()
            word_counts = compute_word_counts(args.training_set)
            shortlist_words = list(result.id_to_word)
            shortlist_set = set(shortlist_words)
            oos_words = [
                x for x in word_counts.keys() if x not in shortlist_set
            ]
            result.id_to_word = numpy.asarray(shortlist_words + oos_words,
                                              dtype=object)
            result.word_to_id = {
                word: word_id
                for word_id, word in enumerate(result.id_to_word)
            }
            result.compute_probs(word_counts, update_class_probs=False)
            result.get_state(state)

    elif args.vocabulary is None:
        print("Constructing vocabulary from training set.")
        sys.stdout.flush()
        word_counts = compute_word_counts(args.training_set)
        result = Vocabulary.from_word_counts(word_counts, args.num_classes)
        result.get_state(state)

    else:
        print("Reading vocabulary from {}.".format(args.vocabulary))
        sys.stdout.flush()
        word_counts = compute_word_counts(args.training_set)
        oos_words = word_counts.keys()
        with open(args.vocabulary, 'rt', encoding='utf-8') as vocab_file:
            result = Vocabulary.from_file(vocab_file,
                                          args.vocabulary_format,
                                          oos_words=oos_words)

        if args.vocabulary_format == 'classes':
            print("Computing class membership probabilities and unigram "
                  "probabilities for out-of-shortlist words.")
            sys.stdout.flush()
            update_class_probs = True
        else:
            print(
                "Computing unigram probabilities for out-of-shortlist words.")
            sys.stdout.flush()
            update_class_probs = False
        result.compute_probs(word_counts,
                             update_class_probs=update_class_probs)
        result.get_state(state)

    print("Number of words in vocabulary:", result.num_words())
    print("Number of words in shortlist:", result.num_shortlist_words())
    print("Number of word classes:", result.num_classes())
    return result
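
# A hypothetical usage sketch (not part of the original source): assuming an
# argparse.Namespace with the attributes referenced in the docstring, the
# function would be driven the same way train() opens its model file:
#
#     with h5py.File(args.model_path, 'a', driver='core') as state:
#         vocabulary = _read_vocabulary(args, state)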
Example #38
    def setUp(self):
        script_path = os.path.dirname(os.path.realpath(__file__))
        vocabulary_path = os.path.join(script_path, 'vocabulary.txt')
        with open(vocabulary_path) as vocabulary_file:
            self.vocabulary = Vocabulary.from_file(vocabulary_file, 'words')
        self.dummy_network = DummyNetwork(self.vocabulary)
Example #39
    def test_decode(self):
        vocabulary = Vocabulary.from_word_counts({
            'TO': 1,
            'AND': 1,
            'IT': 1,
            'BUT': 1,
            'A.': 1,
            'IN': 1,
            'A': 1,
            'AT': 1,
            'THE': 1,
            'E.': 1,
            "DIDN'T": 1,
            'ELABORATE': 1})
        projection_vector = tensor.ones(shape=(vocabulary.num_words(),),
                                        dtype=theano.config.floatX)
        projection_vector *= 0.05
        network = DummyNetwork(vocabulary, projection_vector)

        decoding_options = {
            'nnlm_weight': 0.0,
            'lm_scale': None,
            'wi_penalty': None,
            'ignore_unk': False,
            'unk_penalty': None,
            'linear_interpolation': True,
            'max_tokens_per_node': None,
            'beam': None,
            'recombination_order': None
        }
        decoder = LatticeDecoder(network, decoding_options)
        tokens = decoder.decode(self.lattice)

        # Compare tokens to n-best list given by SRILM lattice-tool.
        log_scale = math.log(10)

        print()
        for token in tokens:
            print(token.ac_logprob / log_scale,
                  token.lat_lm_logprob / log_scale,
                  token.total_logprob / log_scale,
                  ' '.join(vocabulary.id_to_word[token.history]))

        all_paths = ["<s> IT DIDN'T ELABORATE </s>",
                     "<s> BUT IT DIDN'T ELABORATE </s>",
                     "<s> THE DIDN'T ELABORATE </s>",
                     "<s> AND IT DIDN'T ELABORATE </s>",
                     "<s> E. DIDN'T ELABORATE </s>",
                     "<s> IN IT DIDN'T ELABORATE </s>",
                     "<s> A DIDN'T ELABORATE </s>",
                     "<s> AT IT DIDN'T ELABORATE </s>",
                     "<s> IT IT DIDN'T ELABORATE </s>",
                     "<s> TO IT DIDN'T ELABORATE </s>",
                     "<s> A. IT DIDN'T ELABORATE </s>",
                     "<s> A IT DIDN'T ELABORATE </s>"]
        paths = [' '.join(vocabulary.id_to_word[token.history])
                 for token in tokens]
        self.assertListEqual(paths, all_paths)

        token = tokens[0]
        self.assertAlmostEqual(token.ac_logprob / log_scale, -8686.28, places=2)
        self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -94.3896, places=2)
        self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 4)

        token = tokens[1]
        self.assertAlmostEqual(token.ac_logprob / log_scale, -8743.96, places=2)
        self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -111.488, places=2)
        self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5)

        token = tokens[-1]
        self.assertAlmostEqual(token.ac_logprob / log_scale, -8696.26, places=2)
        self.assertAlmostEqual(token.lat_lm_logprob / log_scale, -178.00, places=2)
        self.assertAlmostEqual(token.nn_lm_logprob, math.log(0.1) * 5)
Example #40
def main():
    parser = argparse.ArgumentParser(prog='wctool')

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        '--training-set',
        metavar='FILE',
        type=TextFileType('r'),
        nargs='+',
        required=True,
        help='text or .gz files containing training data (one sentence per '
        'line)')
    argument_group.add_argument(
        '--vocabulary',
        metavar='FILE',
        type=TextFileType('r'),
        default=None,
        help='text or .gz file containing a list of words to include in class '
        'forming, and possibly their initial classes')
    argument_group.add_argument(
        '--vocabulary-format',
        metavar='FORMAT',
        type=str,
        default='words',
        help='vocabulary format, one of "words" (one word per line, default), '
        '"classes" (word and class ID per line), "srilm-classes" (class '
        'name, membership probability, and word per line)')
    argument_group.add_argument(
        '--output-file',
        metavar='FILE',
        type=TextFileType('w'),
        default='-',
        help='where to write the word classes (default stdout)')
    argument_group.add_argument(
        '--output-format',
        metavar='FORMAT',
        type=str,
        default='srilm-classes',
        help='format of the output file, one of "classes" (word and class ID '
        'per line), "srilm-classes" (default; class name, membership '
        'probability, and word per line)')
    argument_group.add_argument(
        '--output-frequency',
        metavar='N',
        type=int,
        default=1,
        help='save classes N times per optimization iteration (default 1)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--num-classes',
        metavar='N',
        type=int,
        default=2000,
        help='number of classes to form, if vocabulary is not specified '
        '(default 2000)')
    argument_group.add_argument(
        '--method',
        metavar='NAME',
        type=str,
        default='bigram-theano',
        help='method for creating word classes, one of "bigram-theano", '
        '"bigram-numpy" (default "bigram-theano")')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file',
        metavar='FILE',
        type=str,
        default='-',
        help='path where to write log file (default is standard output)')
    argument_group.add_argument(
        '--log-level',
        metavar='LEVEL',
        type=str,
        default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
        '(default "info")')
    argument_group.add_argument(
        '--log-interval',
        metavar='N',
        type=int,
        default=1000,
        help='print statistics after every Nth word; quiet if less than one '
        '(default 1000)')

    args = parser.parse_args()

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout,
                            format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file,
                            format=log_format,
                            level=log_level)

    if args.vocabulary is None:
        vocabulary = Vocabulary.from_corpus(args.training_set,
                                            args.num_classes)
        for subset_file in args.training_set:
            subset_file.seek(0)
    else:
        vocabulary = Vocabulary.from_file(args.vocabulary,
                                          args.vocabulary_format)

    print("Number of words in vocabulary:", vocabulary.num_words())
    print("Number of word classes:", vocabulary.num_classes())
    print("Number of normal word classes:", vocabulary.num_normal_classes)

    logging.info("Reading word unigram and bigram statistics.")
    statistics = WordStatistics(args.training_set, vocabulary)

    if args.method == 'bigram-theano':
        optimizer = TheanoBigramOptimizer(statistics, vocabulary)
    elif args.method == 'bigram-numpy':
        optimizer = NumpyBigramOptimizer(statistics, vocabulary)
    else:
        raise ValueError("Invalid method requested: " + args.method)

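    # Exchange-style optimization: sweep the whole vocabulary, moving each
    # word to the class that most improves the log-likelihood, and repeat
    # until a full pass makes no moves.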
    iteration = 1
    while True:
        logging.info("Starting iteration %d.", iteration)
        num_words = 0
        num_moves = 0
        for word in vocabulary.words():
            start_time = time()
            num_words += 1
            if optimizer.move_to_best_class(word):
                num_moves += 1
            duration = time() - start_time
            if (args.log_interval >= 1) and \
               (num_words % args.log_interval == 0):
                logging.info(
                    "[%d] (%.1f %%) of iteration %d -- moves = %d, cost = %.2f, duration = %.1f ms",
                    num_words, num_words / vocabulary.num_words() * 100,
                    iteration, num_moves, optimizer.log_likelihood(),
                    duration * 1000)
            if is_scheduled(num_words, args.output_frequency,
                            vocabulary.num_words()):
                save(optimizer, args.output_file, args.output_format)

        if num_moves == 0:
            break
        iteration += 1

    logging.info("Optimization finished.")
    save(optimizer, args.output_file, args.output_format)
Example #41
def main():
    parser = argparse.ArgumentParser(prog='wctool')

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        '--training-set', metavar='FILE', type=TextFileType('r'),
        nargs='+', required=True,
        help='text or .gz files containing training data (one sentence per '
             'line)')
    argument_group.add_argument(
        '--vocabulary', metavar='FILE', type=TextFileType('r'), default=None,
        help='text or .gz file containing a list of words to include in class '
             'forming, and possibly their initial classes')
    argument_group.add_argument(
        '--vocabulary-format', metavar='FORMAT', type=str, default='words',
        help='vocabulary format, one of "words" (one word per line, default), '
             '"classes" (word and class ID per line), "srilm-classes" (class '
             'name, membership probability, and word per line)')
    argument_group.add_argument(
        '--output-file', metavar='FILE', type=TextFileType('w'), default='-',
        help='where to write the word classes (default stdout)')
    argument_group.add_argument(
        '--output-format', metavar='FORMAT', type=str, default='srilm-classes',
        help='format of the output file, one of "classes" (word and class ID '
             'per line), "srilm-classes" (default; class name, membership '
             'probability, and word per line)')
    argument_group.add_argument(
        '--output-frequency', metavar='N', type=int, default=1,
        help='save classes N times per optimization iteration (default 1)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--num-classes', metavar='N', type=int, default=2000,
        help='number of classes to form, if vocabulary is not specified '
             '(default 2000)')
    argument_group.add_argument(
        '--method', metavar='NAME', type=str, default='bigram-theano',
        help='method for creating word classes, one of "bigram-theano", '
             '"bigram-numpy" (default "bigram-theano")')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file', metavar='FILE', type=str, default='-',
        help='path where to write log file (default is standard output)')
    argument_group.add_argument(
        '--log-level', metavar='LEVEL', type=str, default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
             '(default "info")')
    argument_group.add_argument(
        '--log-interval', metavar='N', type=int, default=1000,
        help='print statistics after every Nth word; quiet if less than one '
             '(default 1000)')

    args = parser.parse_args()

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format, level=log_level)

    if args.vocabulary is None:
        vocabulary = Vocabulary.from_corpus(args.training_set,
                                            args.num_classes)
        for subset_file in args.training_set:
            subset_file.seek(0)
    else:
        vocabulary = Vocabulary.from_file(args.vocabulary,
                                          args.vocabulary_format)

    print("Number of words in vocabulary:", vocabulary.num_words())
    print("Number of word classes:", vocabulary.num_classes())
    print("Number of normal word classes:", vocabulary.num_normal_classes)

    logging.info("Reading word unigram and bigram statistics.")
    statistics = WordStatistics(args.training_set, vocabulary)

    if args.method == 'bigram-theano':
        optimizer = TheanoBigramOptimizer(statistics, vocabulary)
    elif args.method == 'bigram-numpy':
        optimizer = NumpyBigramOptimizer(statistics, vocabulary)
    else:
        raise ValueError("Invalid method requested: " + args.method)

    iteration = 1
    while True:
        logging.info("Starting iteration %d.", iteration)
        num_words = 0
        num_moves = 0
        for word in vocabulary.words():
            start_time = time()
            num_words += 1
            if optimizer.move_to_best_class(word):
                num_moves += 1
            duration = time() - start_time
            if (args.log_interval >= 1) and \
               (num_words % args.log_interval == 0):
                logging.info("[%d] (%.1f %%) of iteration %d -- moves = %d, cost = %.2f, duration = %.1f ms",
                     num_words,
                     num_words / vocabulary.num_words() * 100,
                     iteration,
                     num_moves,
                     optimizer.log_likelihood(),
                     duration * 100)
            if is_scheduled(num_words,
                            args.output_frequency,
                            vocabulary.num_words()):
                save(optimizer, args.output_file, args.output_format)

        if num_moves == 0:
            break
        iteration += 1

    logging.info("Optimization finished.")
    save(optimizer, args.output_file, args.output_format)
Example #42
    def test_from_file(self):
        self.vocabulary_file.seek(0)
        vocabulary = Vocabulary.from_file(self.vocabulary_file, 'words')
        self.assertEqual(vocabulary.num_words(), 10 + 3)
        self.assertEqual(vocabulary.num_classes(), 10 + 3)
Example #43
    def test_from_file(self):
        self.vocabulary_file.seek(0)
        vocabulary = Vocabulary.from_file(self.vocabulary_file, 'words')
        self.assertEqual(vocabulary.num_words(), 10 + 3)
        self.assertEqual(vocabulary.num_classes(), 10 + 3)