def __init__(self, statistics, vocabulary):
    """Computes initial statistics.

    :type statistics: WordStatistics
    :param statistics: word statistics from the training corpus

    :type vocabulary: theanolm.Vocabulary
    :param vocabulary: words to include in the optimization and initial
                       classes
    """

    unigram_counts = statistics.unigram_counts
    bigram_counts = statistics.bigram_counts
    # count_nonzero() and any() seem to fail on the sparse matrix, so
    # emptiness is checked on the dense unigrams and on nnz instead.
    if not unigram_counts.any():
        raise ValueError("Empty word unigram statistics.")
    if bigram_counts.nnz == 0:
        raise ValueError("Empty word bigram statistics.")

    super().__init__(vocabulary)

    # Word unigram counts and the bigram matrix in CSC layout.
    self._word_counts = unigram_counts
    logging.debug("Allocated %s for word counts.",
                  byte_size(self._word_counts.nbytes))
    self._ww_counts = bigram_counts.tocsc()
    logging.debug("Allocated %s for sparse word-word counts.",
                  byte_size(self._ww_counts.data.nbytes))

    # Initial word-to-class assignment comes from the vocabulary.
    self._word_to_class = numpy.array(vocabulary.word_id_to_class_id)
    logging.debug("Allocated %s for word-to-class mapping.",
                  byte_size(self._word_to_class.nbytes))

    # Derive class-level counts from the word-level counts.
    logging.info("Computing class and class/word statistics.")
    counts = self._compute_class_statistics(self._word_counts,
                                            self._ww_counts,
                                            self._word_to_class)
    self._class_counts, self._cc_counts, self._cw_counts, self._wc_counts = \
        counts
def __init__(self, statistics, vocabulary):
    """Computes initial statistics.

    :type statistics: BigramStatistics
    :param statistics: word statistics from the training corpus

    :type vocabulary: theanolm.Vocabulary
    :param vocabulary: words to include in the optimization and initial
                       classes
    """

    unigram_counts = statistics.unigram_counts
    bigram_counts = statistics.bigram_counts
    # count_nonzero() and any() seem to fail on the sparse matrix, so
    # emptiness is checked on the dense unigrams and on nnz instead.
    if not unigram_counts.any():
        raise ValueError("Empty word unigram statistics.")
    if bigram_counts.nnz == 0:
        raise ValueError("Empty word bigram statistics.")

    # Sparse classes in Theano 0.8 support only int32 indices.
    super().__init__(vocabulary, 'int32')

    # Counts live in Theano shared variables.  The bigram matrix is kept
    # in both CSC and CSR layouts for efficient access by column and row.
    self._word_counts = theano.shared(unigram_counts, 'word_counts')
    logging.debug("Allocated %s for word counts.",
                  byte_size(unigram_counts.nbytes))
    bigrams_csc = bigram_counts.tocsc()
    self._ww_counts = theano.shared(bigrams_csc, 'ww_counts_csc')
    logging.debug("Allocated %s for CSC word-word counts.",
                  byte_size(bigrams_csc.data.nbytes))
    bigrams_csr = bigram_counts.tocsr()
    self._ww_counts_csr = theano.shared(bigrams_csr, 'ww_counts_csr')
    logging.debug("Allocated %s for CSR word-word counts.",
                  byte_size(bigrams_csr.data.nbytes))

    # Initial word-to-class assignment comes from the vocabulary.
    class_ids = numpy.array(vocabulary.word_id_to_class_id)
    self._word_to_class = theano.shared(class_ids, 'word_to_class')
    logging.debug("Allocated %s for word-to-class mapping.",
                  byte_size(class_ids.nbytes))

    # Derive class-level counts from the word-level counts.
    logging.info("Computing class and class/word statistics.")
    class_counts, cc_counts, cw_counts, wc_counts = \
        self._compute_class_statistics(unigram_counts, bigrams_csc,
                                       class_ids)
    self._class_counts = theano.shared(class_counts, 'class_counts')
    self._cc_counts = theano.shared(cc_counts, 'cc_counts')
    self._cw_counts = theano.shared(cw_counts, 'cw_counts')
    self._wc_counts = theano.shared(wc_counts, 'wc_counts')

    # Compile the Theano functions used during optimization.
    self._create_get_word_prob_function()
    self._create_evaluate_function()
    self._create_move_function()
    self._create_log_likelihood_function()
    self._create_class_size_function()
def __init__(self, statistics, vocabulary):
    """Computes initial statistics and creates the Theano functions.

    :type statistics: BigramStatistics
    :param statistics: word statistics from the training corpus

    :type vocabulary: theanolm.Vocabulary
    :param vocabulary: words to include in the optimization and initial
                       classes
    """

    # count_nonzero() and any() seem to fail on the sparse matrix.
    if not statistics.unigram_counts.any():
        raise ValueError("Empty word unigram statistics.")
    if statistics.bigram_counts.nnz == 0:
        raise ValueError("Empty word bigram statistics.")

    # Sparse classes in Theano 0.8 support only int32 indices.
    super().__init__(vocabulary, 'int32')

    # Create word counts.  The bigram matrix is kept in both CSC and CSR
    # layouts for efficient access by column and by row.
    word_counts = statistics.unigram_counts
    self._word_counts = theano.shared(word_counts, 'word_counts')
    logging.debug("Allocated %s for word counts.",
                  byte_size(word_counts.nbytes))
    ww_counts_csc = statistics.bigram_counts.tocsc()
    self._ww_counts = theano.shared(ww_counts_csc, 'ww_counts_csc')
    logging.debug("Allocated %s for CSC word-word counts.",
                  byte_size(ww_counts_csc.data.nbytes))
    ww_counts_csr = statistics.bigram_counts.tocsr()
    self._ww_counts_csr = theano.shared(ww_counts_csr, 'ww_counts_csr')
    logging.debug("Allocated %s for CSR word-word counts.",
                  byte_size(ww_counts_csr.data.nbytes))

    # Initialize classes from the vocabulary's initial assignment.
    word_to_class = numpy.array(vocabulary.word_id_to_class_id)
    self._word_to_class = theano.shared(word_to_class, 'word_to_class')
    logging.debug("Allocated %s for word-to-class mapping.",
                  byte_size(word_to_class.nbytes))

    # Compute class counts from word counts.
    logging.info("Computing class and class/word statistics.")
    class_counts, cc_counts, cw_counts, wc_counts = \
        self._compute_class_statistics(word_counts, ww_counts_csc,
                                       word_to_class)
    self._class_counts = theano.shared(class_counts, 'class_counts')
    self._cc_counts = theano.shared(cc_counts, 'cc_counts')
    self._cw_counts = theano.shared(cw_counts, 'cw_counts')
    self._wc_counts = theano.shared(wc_counts, 'wc_counts')

    # Create Theano functions.
    self._create_get_word_prob_function()
    self._create_evaluate_function()
    self._create_move_function()
    self._create_log_likelihood_function()
    self._create_class_size_function()
def _compute_class_statistics(self, word_counts, ww_counts, word_to_class):
    """Computes class statistics from word statistics given the
    word-to-class mapping.

    :type word_counts: numpy.ndarray
    :param word_counts: word unigram counts

    :type ww_counts: scipy.sparse.csc_matrix
    :param ww_counts: word bigram counts

    :type word_to_class: numpy.ndarray
    :param word_to_class: gives the class ID of each word ID

    :rtype: tuple of four numpy.ndarrays
    :returns: class unigram counts, and class-class, class-word, and
              word-class bigram counts
    """

    class_counts = numpy.zeros(self.num_classes, dtype=self._count_type)
    logging.debug("Allocated %s for class counts.",
                  byte_size(class_counts.nbytes))
    cc_counts = numpy.zeros(
        (self.num_classes, self.num_classes), dtype=self._count_type)
    logging.debug("Allocated %s for class-class counts.",
                  byte_size(cc_counts.nbytes))
    cw_counts = numpy.zeros(
        (self.num_classes, self.vocabulary_size), dtype=self._count_type)
    logging.debug("Allocated %s for class-word counts.",
                  byte_size(cw_counts.nbytes))
    wc_counts = numpy.zeros(
        (self.vocabulary_size, self.num_classes), dtype=self._count_type)
    logging.debug("Allocated %s for word-class counts.",
                  byte_size(wc_counts.nbytes))

    # Unbuffered accumulation handles repeated class IDs correctly.
    numpy.add.at(class_counts, word_to_class, word_counts)

    # Read the (row, col, value) triplets straight from the COO layout
    # instead of ww_counts[rows, cols] fancy indexing, which performs a
    # sparse lookup per element.  Explicitly stored zeros, if any,
    # contribute nothing to the sums, so the result is identical.
    coo = ww_counts.tocoo()
    left_word_ids = coo.row
    right_word_ids = coo.col
    counts = coo.data
    left_class_ids = word_to_class[left_word_ids]
    right_class_ids = word_to_class[right_word_ids]
    numpy.add.at(cc_counts, (left_class_ids, right_class_ids), counts)
    numpy.add.at(cw_counts, (left_class_ids, right_word_ids), counts)
    numpy.add.at(wc_counts, (left_word_ids, right_class_ids), counts)

    return class_counts, cc_counts, cw_counts, wc_counts
def _compute_class_statistics(self, word_counts, ww_counts, word_to_class):
    """Computes class statistics from word statistics given the
    word-to-class mapping.

    :type word_counts: numpy.ndarray
    :param word_counts: word unigram counts

    :type ww_counts: scipy.sparse.csc_matrix
    :param ww_counts: word bigram counts

    :type word_to_class: numpy.ndarray
    :param word_to_class: gives the class ID of each word ID
    """

    count_type = self._count_type
    num_classes = self.num_classes
    num_words = self.vocabulary_size

    # Allocate the four count tables, logging the memory footprint of each.
    class_counts = numpy.zeros(num_classes, dtype=count_type)
    logging.debug("Allocated %s for class counts.",
                  byte_size(class_counts.nbytes))
    cc_counts = numpy.zeros((num_classes, num_classes), dtype=count_type)
    logging.debug("Allocated %s for class-class counts.",
                  byte_size(cc_counts.nbytes))
    cw_counts = numpy.zeros((num_classes, num_words), dtype=count_type)
    logging.debug("Allocated %s for class-word counts.",
                  byte_size(cw_counts.nbytes))
    wc_counts = numpy.zeros((num_words, num_classes), dtype=count_type)
    logging.debug("Allocated %s for word-class counts.",
                  byte_size(wc_counts.nbytes))

    # add.at accumulates correctly even when a class ID repeats.
    numpy.add.at(class_counts, word_to_class, word_counts)

    # Map each stored bigram (left word, right word, count) to the class
    # IDs of its endpoints and accumulate into every combination.
    rows, cols = ww_counts.nonzero()
    bigram_values = ww_counts[rows, cols].flat
    row_classes = word_to_class[rows]
    col_classes = word_to_class[cols]
    numpy.add.at(cc_counts, (row_classes, col_classes), bigram_values)
    numpy.add.at(cw_counts, (row_classes, cols), bigram_values)
    numpy.add.at(wc_counts, (rows, col_classes), bigram_values)

    return class_counts, cc_counts, cw_counts, wc_counts
def __init__(self, statistics, vocabulary):
    """Computes initial statistics.

    :type statistics: WordStatistics
    :param statistics: word statistics from the training corpus

    :type vocabulary: theanolm.Vocabulary
    :param vocabulary: words to include in the optimization and initial
                       classes
    """

    # count_nonzero() and any() seem to fail on the sparse matrix;
    # emptiness is detected via the dense unigrams and via nnz.
    if not statistics.unigram_counts.any():
        raise ValueError("Empty word unigram statistics.")
    if statistics.bigram_counts.nnz == 0:
        raise ValueError("Empty word bigram statistics.")

    super().__init__(vocabulary)

    # Store word unigram counts and the bigram matrix (CSC layout).
    self._word_counts = statistics.unigram_counts
    logging.debug("Allocated %s for word counts.",
                  byte_size(self._word_counts.nbytes))
    self._ww_counts = statistics.bigram_counts.tocsc()
    logging.debug("Allocated %s for sparse word-word counts.",
                  byte_size(self._ww_counts.data.nbytes))

    # The vocabulary supplies the initial word-to-class mapping.
    self._word_to_class = numpy.array(vocabulary.word_id_to_class_id)
    logging.debug("Allocated %s for word-to-class mapping.",
                  byte_size(self._word_to_class.nbytes))

    # Aggregate word-level counts into class-level counts.
    logging.info("Computing class and class/word statistics.")
    (self._class_counts,
     self._cc_counts,
     self._cw_counts,
     self._wc_counts) = self._compute_class_statistics(self._word_counts,
                                                       self._ww_counts,
                                                       self._word_to_class)