Example #1
    def __init__(self,
                 corpus,
                 vocab_embeddings,
                 vocab,
                 num_tables,
                 alpha,
                 log=None,
                 save_path=None,
                 show_topics=None,
                 cholesky_decomp=False,
                 num_words_for_formatting=None):
        """

        :param corpus: list of documents, each given as a list of int word IDs (indices into the vocabulary)
        :param vocab_embeddings: array of word embeddings, one row per vocabulary item
        :param vocab: list of vocabulary words, used for outputting topics
        :param num_tables: number of tables (topics), K
        :param alpha: Dirichlet hyperparameter
        :param log: logger to use; a default "GLDA" logger is created if not given
        :param save_path: if given, path to save the model to
        :param show_topics:
        :param cholesky_decomp: if True, also maintain a Cholesky lower-triangular decomposition of each
            table's covariance matrix
        :param num_words_for_formatting: By default, each topic is formatted by computing the probability of
            every word in the vocabulary under that topic. This can take a long time for a large vocabulary.
            If given, this limits the number considered to the first
            N in the vocabulary (which makes sense if the vocabulary is ordered with most common words first).
        """
        if log is None:
            log = get_logger("GLDA")
        self.log = log
        # Vocab is used for outputting topics
        self.vocab = vocab
        self.cholesky_decomp = cholesky_decomp
        self.show_topics = show_topics
        self.save_path = save_path

        # Dirichlet hyperparam
        self.alpha = alpha

        # dataVectors
        self.vocab_embeddings = vocab_embeddings
        self.embedding_size = vocab_embeddings.shape[1]
        # List of lists of ints
        self.corpus = corpus
        # numIterations
        # K, num tables
        self.num_tables = num_tables
        # N, num docs
        self.num_documents = len(corpus)
        # In the current iteration, maps each table id to its number of customers. Table ids start from 0.
        self.table_counts = np.zeros(self.num_tables, dtype=np.int32)
        # K x N array. table_counts_per_doc[i][j] is the number of words of document j assigned to topic i.
        self.table_counts_per_doc = np.zeros(
            (self.num_tables, self.num_documents), dtype=np.int32)
        # Stores the table (topic) assignment of each customer in each iteration.
        # table_assignments[i][j] gives the table assignment of customer j of the ith document.
        self.table_assignments = []
        # The following parameters are lists/arrays rather than dicts because the K tables are
        # numbered contiguously from 0 to K-1, so they can be indexed directly.
        # Mean vector associated with each table in the current iteration.
        # This is the Bayesian (posterior) mean, i.e. it includes the prior part too.
        self.table_means = np.zeros((self.num_tables, self.embedding_size),
                                    dtype=np.float64)
        # Inverse of the covariance matrix associated with each table in the current iteration.
        # The covariance matrix is scaled by \frac{k_N + 1}{k_N (v_N - D + 1)} before taking the inverse,
        # since the multivariate t-distribution takes the scaled matrix as input.
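        # (For reference: under the normal-inverse-Wishart posterior, the predictive density of a new
        #  customer at a table is multivariate-t with v_N - D + 1 degrees of freedom, mean mu_N and scale
        #  matrix \frac{k_N + 1}{k_N (v_N - D + 1)} \Psi_N, which is why the scaled matrix is the one whose
        #  inverse is stored. This is the standard NIW predictive result, sketched here for context; it is
        #  not taken from code shown in this excerpt.)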
        self.table_inverse_covariances = np.zeros(
            (self.num_tables, self.embedding_size, self.embedding_size),
            dtype=np.float64)
        # Log-determinant of the covariance matrix for each table.
        # Since 0.5 * logDet is the quantity actually needed (see logMultivariateTDensity), that value is stored.
        self.log_determinants = np.zeros(self.num_tables, dtype=np.float64)
        # Stores the sum of the vectors of customers at a given table
        self.sum_table_customers = np.zeros(
            (self.num_tables, self.embedding_size), dtype=np.float64)
        # Stores the squared sum of the vectors of customers at a given table
        self.sum_squared_table_customers = np.zeros(
            (self.num_tables, self.embedding_size, self.embedding_size),
            dtype=np.float64)

        if self.cholesky_decomp:
            # Cholesky lower-triangular decomposition of the covariance matrix associated with each table.
            self.table_cholesky_ltriangular_mat = np.zeros(
                (self.num_tables, self.embedding_size, self.embedding_size),
                dtype=np.float64)
        else:
            self.table_cholesky_ltriangular_mat = None

        # Normal-inverse-Wishart prior
        self.prior = Wishart(self.vocab_embeddings)

        # Cache k_0\mu_0\mu_0^T, only compute it once
        # Used in calculate_table_params()
        self.k0mu0mu0T = self.prior.kappa * np.outer(self.prior.mu,
                                                     self.prior.mu)
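        # (Sketch of where this cache typically enters, assuming the standard NIW update; calculate_table_params
        #  itself is not shown in this excerpt. With N customers at a table, k_N = k_0 + N and the posterior
        #  scale matrix can be written as
        #      Psi_N = Psi_0 + sum_i x_i x_i^T + k_0 mu_0 mu_0^T - k_N mu_N mu_N^T,
        #  where sum_i x_i x_i^T is sum_squared_table_customers, so the k_0 mu_0 mu_0^T term never changes.)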

        self.num_words_for_formatting = num_words_for_formatting

        self.log.info("Initializing assignments")
        self.initialize()
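
A minimal usage sketch for the constructor above. The enclosing class is not shown in this excerpt, so the
class name GaussianLDATrainer and the toy corpus and embeddings below are placeholders (hypothetical):

    import numpy as np

    # Hypothetical toy data: a 100-word vocabulary with 50-dimensional embeddings,
    # and documents given as lists of word IDs (indices into that vocabulary)
    vocab = ["word%d" % i for i in range(100)]
    vocab_embeddings = np.random.randn(100, 50)
    corpus = [[0, 5, 8, 2], [1, 4, 4, 9], [3, 3, 7], [6, 2, 0, 0, 5]]

    # Placeholder class name for whichever class defines the __init__ above
    trainer = GaussianLDATrainer(
        corpus, vocab_embeddings, vocab,
        num_tables=4,
        alpha=0.25,
        cholesky_decomp=True,
    )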

Example #2

    def __init__(self,
                 corpus,
                 vocab_embeddings,
                 vocab,
                 num_tables,
                 alpha=None,
                 kappa=0.1,
                 log=None,
                 save_path=None,
                 show_topics=None,
                 mh_steps=2,
                 num_words_for_formatting=None,
                 das_normalization=True,
                 show_progress=True):
        """

        :param corpus: list of documents, each given as a list of int word IDs (indices into the vocabulary)
        :param vocab_embeddings: array of word embeddings, one row per vocabulary item
        :param vocab: list of vocabulary words, used for outputting topics
        :param num_tables: number of tables (topics), K
        :param alpha: Dirichlet concentration. Defaults to 1/num_tables
        :param kappa: parameter k_0 of the normal-inverse-Wishart prior (passed to the Wishart prior)
        :param log: logger to use; a default "GLDA" logger is created if not given
        :param save_path: if given, path to save the model to
        :param show_topics:
        :param mh_steps: number of Metropolis-Hastings sampling steps
        :param num_words_for_formatting: By default, each topic is formatted by computing the probability of
            every word in the vocabulary under that topic. This can take a long time for a large vocabulary.
            If given, this limits the number considered to the first
            N in the vocabulary (which makes sense if the vocabulary is ordered with most common words first).
        :param das_normalization: Use the normalization of probability distributions used by Das, Zaheer and Dyer's
            original implementation when computing the sampling probability that chooses between the document
            posterior and the language-model part of the topic posterior. If False, do not normalize in this way,
            but use an alternative that looks to me more mathematically correct.
        :param show_progress: whether to show progress output during training
        """
        if log is None:
            log = get_logger("GLDA")
        self.log = log
        self.show_progress = show_progress
        # Vocab is used for outputting topics
        self.vocab = vocab
        self.show_topics = show_topics
        self.save_path = save_path

        # MH sampling steps
        self.mh_steps = mh_steps

        # Dirichlet hyperparam
        if alpha is None:
            alpha = 1. / num_tables
        self.alpha = alpha

        self.das_normalization = das_normalization

        # dataVectors
        self.vocab_embeddings = vocab_embeddings
        self.embedding_size = vocab_embeddings.shape[1]
        self.num_terms = vocab_embeddings.shape[0]
        # List of lists of ints
        self.corpus = corpus
        # numIterations
        # K, num tables
        self.num_tables = num_tables
        # N, num docs
        self.num_documents = len(corpus)
        # In the current iteration, maps each table id to its number of customers. Table ids start from 0.
        # Use shared memory
        self.table_counts = SharedArray.create(self.num_tables, "int")
        # K x N array. table_counts_per_doc[i][j] is the number of words of document j assigned to topic i.
        self.table_counts_per_doc = np.zeros(
            (self.num_tables, self.num_documents), dtype=np.int32)
        # Stores the table (topic) assignment of each customer in each iteration.
        # table_assignments[i][j] gives the table assignment of customer j of the ith document.
        self.table_assignments = []
        # The following parameters are lists/arrays rather than dicts because the K tables are
        # numbered contiguously from 0 to K-1, so they can be indexed directly.
        # Mean vector associated with each table in the current iteration.
        # This is the Bayesian (posterior) mean, i.e. it includes the prior part too.
        # Use shared memory
        self.table_means = SharedArray.create(
            (self.num_tables, self.embedding_size), "float")
        # Log-determinant of the covariance matrix for each table.
        # Since 0.5 * logDet is the quantity actually needed (see logMultivariateTDensity), that value is stored.
        # Use shared memory
        self.log_determinants = SharedArray.create(self.num_tables, "float")
        # Stores the squared sum of the vectors of customers at a given table
        self.sum_squared_table_customers = np.zeros(
            (self.num_tables, self.embedding_size, self.embedding_size),
            dtype=np.float64)

        # Cholesky lower-triangular decomposition of the covariance matrix associated with each table.
        # Use shared memory
        self.table_cholesky_ltriangular_mat = SharedArray.create(
            (self.num_tables, self.embedding_size, self.embedding_size),
            "float")

        # Normal-inverse-Wishart prior
        self.prior = Wishart(self.vocab_embeddings, kappa=kappa)

        # Cache k_0\mu_0\mu_0^T, only compute it once
        # Used in calculate_table_params()
        self.k0mu0mu0T = self.prior.kappa * np.outer(self.prior.mu,
                                                     self.prior.mu)

        self.num_words_for_formatting = num_words_for_formatting

        self.aliases = VoseAliases.create(self.num_terms, self.num_tables)
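        # (Per-term alias tables, presumably built with Vose's alias method. Their usual role in alias-based
        #  samplers is to cache a discrete proposal distribution over tables for each vocabulary term, so that
        #  Metropolis-Hastings proposals can be drawn in O(1) per sample; what VoseAliases actually stores is
        #  not shown in this excerpt.)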

        self.log.info("Initializing assignments")
        self.initialize()
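
For context, the statistics set up above (table_counts_per_doc, alpha, and the per-table means, covariances
and log-determinants) are those needed for the Gaussian LDA sampling rule of Das, Zaheer and Dyer, in which
the probability of assigning word i of document d to table k is, up to normalization,

    p(z_{d,i} = k | rest)  ∝  (n_{d,k} + alpha) * t_{v_k - D + 1}(x_{d,i}; mu_k, scaled covariance of table k)

where n_{d,k} is the corresponding entry of table_counts_per_doc and x_{d,i} is the embedding of the word.
This is a sketch of the standard rule, given here only for orientation, not code from this excerpt; the
Metropolis-Hastings steps and alias tables above exist to approximate sampling from it cheaply.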