Example #1
0
    def __init__(self, statistics, vocabulary):
        """Validates the word statistics, stores them, and derives the initial
        class statistics from the word-to-class mapping of the vocabulary.

        :type statistics: WordStatistics
        :param statistics: word statistics from the training corpus

        :type vocabulary: theanolm.Vocabulary
        :param vocabulary: words to include in the optimization and initial
                           classes

        :raises ValueError: if the unigram counts contain no nonzero value, or
                            if the bigram counts report ``nnz == 0``
        """

        # The bigram counts are tested through nnz because, according to the
        # original author's note, count_nonzero() and any() seem to fail on
        # the sparse matrix.
        has_unigrams = statistics.unigram_counts.any()
        if not has_unigrams:
            raise ValueError("Empty word unigram statistics.")
        num_bigram_entries = statistics.bigram_counts.nnz
        if num_bigram_entries == 0:
            raise ValueError("Empty word bigram statistics.")

        super().__init__(vocabulary)

        # Word-level counts: the unigram counts are kept as given, the bigram
        # counts are converted with tocsc().
        unigram_counts = statistics.unigram_counts
        self._word_counts = unigram_counts
        word_counts_size = byte_size(unigram_counts.nbytes)
        logging.debug("Allocated %s for word counts.", word_counts_size)

        bigram_counts_csc = statistics.bigram_counts.tocsc()
        self._ww_counts = bigram_counts_csc
        ww_counts_size = byte_size(bigram_counts_csc.data.nbytes)
        logging.debug("Allocated %s for sparse word-word counts.",
                      ww_counts_size)

        # The initial class of every word is read from the vocabulary.
        class_ids = numpy.array(vocabulary.word_id_to_class_id)
        self._word_to_class = class_ids
        mapping_size = byte_size(class_ids.nbytes)
        logging.debug("Allocated %s for word-to-class mapping.", mapping_size)

        # Derive the class-level counts from the word-level counts.
        logging.info("Computing class and class/word statistics.")
        class_statistics = self._compute_class_statistics(
            unigram_counts, bigram_counts_csc, class_ids)
        (self._class_counts,
         self._cc_counts,
         self._cw_counts,
         self._wc_counts) = class_statistics
Example #2
0
    def __init__(self, statistics, vocabulary):
        """Validates the word statistics, wraps the word and class statistics
        in Theano shared variables, and creates the Theano functions.

        :type statistics: BigramStatistics
        :param statistics: word statistics from the training corpus

        :type vocabulary: theanolm.Vocabulary
        :param vocabulary: words to include in the optimization and initial
                           classes

        :raises ValueError: if the unigram counts contain no nonzero value, or
                            if the bigram counts report ``nnz == 0``
        """

        # The bigram counts are tested through nnz because, according to the
        # original author's note, count_nonzero() and any() seem to fail on
        # the sparse matrix.
        has_unigrams = statistics.unigram_counts.any()
        if not has_unigrams:
            raise ValueError("Empty word unigram statistics.")
        num_bigram_entries = statistics.bigram_counts.nnz
        if num_bigram_entries == 0:
            raise ValueError("Empty word bigram statistics.")

        # 'int32' is requested because sparse classes in Theano 0.8 support
        # only int32 indices.
        super().__init__(vocabulary, 'int32')

        # Unigram counts.
        unigrams = statistics.unigram_counts
        self._word_counts = theano.shared(unigrams, 'word_counts')
        unigrams_size = byte_size(unigrams.nbytes)
        logging.debug("Allocated %s for word counts.", unigrams_size)

        # Bigram counts, shared both in CSC and in CSR format.
        bigrams_csc = statistics.bigram_counts.tocsc()
        self._ww_counts = theano.shared(bigrams_csc, 'ww_counts_csc')
        csc_size = byte_size(bigrams_csc.data.nbytes)
        logging.debug("Allocated %s for CSC word-word counts.", csc_size)

        bigrams_csr = statistics.bigram_counts.tocsr()
        self._ww_counts_csr = theano.shared(bigrams_csr, 'ww_counts_csr')
        csr_size = byte_size(bigrams_csr.data.nbytes)
        logging.debug("Allocated %s for CSR word-word counts.", csr_size)

        # The initial class of every word is read from the vocabulary.
        class_ids = numpy.array(vocabulary.word_id_to_class_id)
        self._word_to_class = theano.shared(class_ids, 'word_to_class')
        mapping_size = byte_size(class_ids.nbytes)
        logging.debug("Allocated %s for word-to-class mapping.", mapping_size)

        # Class-level counts are computed from the plain (non-shared) arrays
        # and only then wrapped in shared variables.
        logging.info("Computing class and class/word statistics.")
        class_statistics = self._compute_class_statistics(
            unigrams, bigrams_csc, class_ids)
        class_totals, class_class, class_word, word_class = class_statistics
        self._class_counts = theano.shared(class_totals, 'class_counts')
        self._cc_counts = theano.shared(class_class, 'cc_counts')
        self._cw_counts = theano.shared(class_word, 'cw_counts')
        self._wc_counts = theano.shared(word_class, 'wc_counts')

        # Create the Theano functions; this is done last, after all the
        # shared variables above have been assigned.
        self._create_get_word_prob_function()
        self._create_evaluate_function()
        self._create_move_function()
        self._create_log_likelihood_function()
        self._create_class_size_function()
Example #3
0
    def __init__(self, statistics, vocabulary):
        """Checks that the statistics are non-empty, creates Theano shared
        variables for the word and class statistics, and creates the Theano
        functions.

        :type statistics: WordStatistics
        :param statistics: word statistics from the training corpus

        :type vocabulary: theanolm.Vocabulary
        :param vocabulary: words to include in the optimization and initial
                           classes

        :raises ValueError: if the unigram counts contain no nonzero value, or
                            if the bigram counts report ``nnz == 0``
        """

        # nnz is used for the bigram counts since, as noted by the original
        # author, count_nonzero() and any() seem to fail on the sparse matrix.
        unigrams_present = statistics.unigram_counts.any()
        if not unigrams_present:
            raise ValueError("Empty word unigram statistics.")
        bigram_nnz = statistics.bigram_counts.nnz
        if bigram_nnz == 0:
            raise ValueError("Empty word bigram statistics.")

        # Sparse classes in Theano 0.8 support only int32 indices, hence the
        # 'int32' argument.
        super().__init__(vocabulary, 'int32')

        # Share the unigram counts.
        unigram_array = statistics.unigram_counts
        shared_unigrams = theano.shared(unigram_array, 'word_counts')
        self._word_counts = shared_unigrams
        logging.debug(
            "Allocated %s for word counts.",
            byte_size(unigram_array.nbytes))

        # Share the bigram counts in CSC format ...
        csc_matrix = statistics.bigram_counts.tocsc()
        shared_csc = theano.shared(csc_matrix, 'ww_counts_csc')
        self._ww_counts = shared_csc
        logging.debug(
            "Allocated %s for CSC word-word counts.",
            byte_size(csc_matrix.data.nbytes))

        # ... and in CSR format.
        csr_matrix = statistics.bigram_counts.tocsr()
        shared_csr = theano.shared(csr_matrix, 'ww_counts_csr')
        self._ww_counts_csr = shared_csr
        logging.debug(
            "Allocated %s for CSR word-word counts.",
            byte_size(csr_matrix.data.nbytes))

        # Share the initial word-to-class mapping taken from the vocabulary.
        mapping = numpy.array(vocabulary.word_id_to_class_id)
        shared_mapping = theano.shared(mapping, 'word_to_class')
        self._word_to_class = shared_mapping
        logging.debug(
            "Allocated %s for word-to-class mapping.",
            byte_size(mapping.nbytes))

        # Compute the class-level counts from the plain arrays, then share
        # each result.
        logging.info("Computing class and class/word statistics.")
        computed = self._compute_class_statistics(
            unigram_array, csc_matrix, mapping)
        per_class, per_class_pair, per_class_word, per_word_class = computed
        self._class_counts = theano.shared(per_class, 'class_counts')
        self._cc_counts = theano.shared(per_class_pair, 'cc_counts')
        self._cw_counts = theano.shared(per_class_word, 'cw_counts')
        self._wc_counts = theano.shared(per_word_class, 'wc_counts')

        # Finally create the Theano functions.
        self._create_get_word_prob_function()
        self._create_evaluate_function()
        self._create_move_function()
        self._create_log_likelihood_function()
        self._create_class_size_function()
Example #4
0
    def _compute_class_statistics(self, word_counts, ww_counts, word_to_class):
        """Aggregates word unigram and bigram counts into class-level counts
        according to the given word-to-class mapping.

        :type word_counts: numpy.ndarray
        :param word_counts: word unigram counts

        :type ww_counts: scipy.sparse.csc_matrix
        :param ww_counts: word bigram counts

        :type word_to_class: numpy.ndarray
        :param word_to_class: gives the class ID of each word ID

        :rtype: tuple of four numpy.ndarrays
        :returns: class counts (one element per class), class-class counts
                  (classes x classes), class-word counts (classes x
                  vocabulary), and word-class counts (vocabulary x classes)
        """

        num_classes = self.num_classes
        count_type = self._count_type

        # Allocate the zero-initialized result arrays.
        class_counts = numpy.zeros(num_classes, count_type)
        class_counts_size = byte_size(class_counts.nbytes)
        logging.debug("Allocated %s for class counts.", class_counts_size)

        cc_shape = (num_classes, num_classes)
        cc_counts = numpy.zeros(cc_shape, dtype=count_type)
        cc_counts_size = byte_size(cc_counts.nbytes)
        logging.debug("Allocated %s for class-class counts.", cc_counts_size)

        vocabulary_size = self.vocabulary_size
        cw_shape = (num_classes, vocabulary_size)
        cw_counts = numpy.zeros(cw_shape, dtype=count_type)
        cw_counts_size = byte_size(cw_counts.nbytes)
        logging.debug("Allocated %s for class-word counts.", cw_counts_size)

        wc_shape = (vocabulary_size, num_classes)
        wc_counts = numpy.zeros(wc_shape, dtype=count_type)
        wc_counts_size = byte_size(wc_counts.nbytes)
        logging.debug("Allocated %s for word-class counts.", wc_counts_size)

        # add.at() accumulates repeated indices, so every word's count is
        # summed into the count of its class.
        numpy.add.at(class_counts, word_to_class, word_counts)

        # Look up the position and the value of each nonzero bigram count, and
        # the classes of the two words of the bigram.
        first_word_ids, second_word_ids = ww_counts.nonzero()
        bigram_counts = ww_counts[first_word_ids, second_word_ids].flat
        first_class_ids = word_to_class[first_word_ids]
        second_class_ids = word_to_class[second_word_ids]

        # Sum every bigram count into the class-class, class-word and
        # word-class tables.
        class_class_index = (first_class_ids, second_class_ids)
        class_word_index = (first_class_ids, second_word_ids)
        word_class_index = (first_word_ids, second_class_ids)
        numpy.add.at(cc_counts, class_class_index, bigram_counts)
        numpy.add.at(cw_counts, class_word_index, bigram_counts)
        numpy.add.at(wc_counts, word_class_index, bigram_counts)

        return class_counts, cc_counts, cw_counts, wc_counts
Example #5
0
    def _compute_class_statistics(self, word_counts, ww_counts, word_to_class):
        """Sums the word unigram and bigram counts into per-class tables,
        using the given word-to-class mapping.

        :type word_counts: numpy.ndarray
        :param word_counts: word unigram counts

        :type ww_counts: scipy.sparse.csc_matrix
        :param ww_counts: word bigram counts

        :type word_to_class: numpy.ndarray
        :param word_to_class: gives the class ID of each word ID

        :rtype: tuple of four numpy.ndarrays
        :returns: class counts (one element per class), class-class counts
                  (classes x classes), class-word counts (classes x
                  vocabulary), and word-class counts (vocabulary x classes)
        """

        n_classes = self.num_classes
        dtype = self._count_type

        # Zero-initialized unigram table of the classes.
        class_counts = numpy.zeros(n_classes, dtype=dtype)
        logging.debug(
            "Allocated %s for class counts.",
            byte_size(class_counts.nbytes))

        # Zero-initialized class-class bigram table.
        cc_counts = numpy.zeros((n_classes, n_classes), dtype=dtype)
        logging.debug(
            "Allocated %s for class-class counts.",
            byte_size(cc_counts.nbytes))

        # Zero-initialized class-word and word-class bigram tables.
        n_words = self.vocabulary_size
        cw_counts = numpy.zeros((n_classes, n_words), dtype=dtype)
        logging.debug(
            "Allocated %s for class-word counts.",
            byte_size(cw_counts.nbytes))
        wc_counts = numpy.zeros((n_words, n_classes), dtype=dtype)
        logging.debug(
            "Allocated %s for word-class counts.",
            byte_size(wc_counts.nbytes))

        # Unbuffered addition: counts of words that share a class are all
        # summed into the same element.
        numpy.add.at(class_counts, word_to_class, word_counts)

        # Positions and values of the nonzero bigram counts.
        left_words, right_words = ww_counts.nonzero()
        values = ww_counts[left_words, right_words].flat

        # Classes of the left and the right word of each nonzero bigram.
        left_classes = word_to_class[left_words]
        right_classes = word_to_class[right_words]

        # Scatter the bigram counts into the three class-level tables.
        numpy.add.at(cc_counts, (left_classes, right_classes), values)
        numpy.add.at(cw_counts, (left_classes, right_words), values)
        numpy.add.at(wc_counts, (left_words, right_classes), values)

        return class_counts, cc_counts, cw_counts, wc_counts
    def __init__(self, statistics, vocabulary):
        """Checks that the statistics are non-empty, stores the word counts
        and the word-to-class mapping, and computes the initial class
        statistics.

        :type statistics: WordStatistics
        :param statistics: word statistics from the training corpus

        :type vocabulary: theanolm.Vocabulary
        :param vocabulary: words to include in the optimization and initial
                           classes

        :raises ValueError: if the unigram counts contain no nonzero value, or
                            if the bigram counts report ``nnz == 0``
        """

        # nnz is used for the bigram counts since, as noted by the original
        # author, count_nonzero() and any() seem to fail on the sparse matrix.
        unigrams_present = statistics.unigram_counts.any()
        if not unigrams_present:
            raise ValueError("Empty word unigram statistics.")
        bigram_nnz = statistics.bigram_counts.nnz
        if bigram_nnz == 0:
            raise ValueError("Empty word bigram statistics.")

        super().__init__(vocabulary)

        # Store the unigram counts unchanged.
        word_counts = statistics.unigram_counts
        self._word_counts = word_counts
        logging.debug(
            "Allocated %s for word counts.",
            byte_size(word_counts.nbytes))

        # Store the bigram counts converted with tocsc().
        ww_counts = statistics.bigram_counts.tocsc()
        self._ww_counts = ww_counts
        logging.debug(
            "Allocated %s for sparse word-word counts.",
            byte_size(ww_counts.data.nbytes))

        # Store the initial word-to-class mapping taken from the vocabulary.
        word_to_class = numpy.array(vocabulary.word_id_to_class_id)
        self._word_to_class = word_to_class
        logging.debug(
            "Allocated %s for word-to-class mapping.",
            byte_size(word_to_class.nbytes))

        # Compute the class-level counts from the word-level counts.
        logging.info("Computing class and class/word statistics.")
        computed = self._compute_class_statistics(
            word_counts, ww_counts, word_to_class)
        class_counts, cc_counts, cw_counts, wc_counts = computed
        self._class_counts = class_counts
        self._cc_counts = cc_counts
        self._cw_counts = cw_counts
        self._wc_counts = wc_counts