Ejemplo n.º 1
0
 def _compute_coherence(self, model, k, test_data, log_terms=False):
     """Return (npmi, redundancy) for the top-k terms of each topic in `model`.

     NPMI coherence is evaluated against `test_data` (a sparse doc-term
     matrix); redundancy measures how much the topics' top-5 terms overlap
     (0.0 = all distinct, approaching 1.0 = heavily shared).

     Parameters:
         model: Topic model exposing `n_latent` and `get_top_k_terms(k)`.
         k (int): Number of top terms per topic to evaluate.
         test_data: Sparse CSR document-term matrix.
         log_terms (bool): When True, also log the top-k tokens per topic.

     Returns:
         (float, float): NPMI coherence and redundancy scores.
     """
     sorted_ids = model.get_top_k_terms(k)
     # Clamp in case the term matrix has fewer topic columns than n_latent.
     n_topics = min(model.n_latent, sorted_ids.shape[-1])
     topic_top_terms = [[int(w) for w in list(sorted_ids[:k, t])]
                        for t in range(n_topics)]
     npmi = EvaluateNPMI(topic_top_terms).evaluate_csr_mat(test_data)
     top_n = 5  ## only consider the top 5 terms for each topic when looking at degree of redundancy
     distinct_ids = {w for terms in topic_top_terms for w in terms[:top_n]}
     redundancy = (1.0 -
                   (float(len(distinct_ids)) / n_topics / top_n))**2.0
     logging.info("Test Coherence: {}".format(npmi))
     if log_terms:
         for t, terms in enumerate(topic_top_terms):
             tokens = [self.vocabulary.idx_to_token[w] for w in terms]
             logging.info("Topic {}: {}".format(t, tokens))
     return npmi, redundancy
Ejemplo n.º 2
0
    def _npmi(self, X, y, k=10):
        """
        Calculate NPMI (Normalized Pointwise Mutual Information) for data X

        Parameters:
            X (array-like or sparse matrix): Document word matrix. shape [n_samples, vocab_size]
            y: Unused; retained for interface compatibility with callers.
            k (int): Threshold at which to compute npmi. optional (default=10)

        Returns:
            (float, float): NPMI score and topic-redundancy score.
        """
        sorted_ids = self.model.get_ordered_terms()
        num_topics = min(self.n_latent, sorted_ids.shape[-1])
        # Iterate only over the clamped topic count: using range(self.n_latent)
        # would index past the last column of sorted_ids whenever n_latent
        # exceeds sorted_ids.shape[-1].
        top_k_words_per_topic = [[int(i) for i in list(sorted_ids[:k, t])]
                                 for t in range(num_topics)]
        npmi_eval = EvaluateNPMI(top_k_words_per_topic)
        npmi = npmi_eval.evaluate_csr_mat(X)
        unique_term_ids = set()
        unique_limit = 5  ## only consider the top 5 terms for each topic when looking at degree of redundancy
        for topic_terms in top_k_words_per_topic:
            unique_term_ids.update(topic_terms[:unique_limit])
        redundancy = (
            1.0 - (float(len(unique_term_ids)) / num_topics / unique_limit))**2
        return npmi, redundancy
Ejemplo n.º 3
0
 def _npmi_with_dataloader(self, dataloader, k=10):
     """Compute NPMI coherence and topic redundancy over a data loader.

     Parameters:
         dataloader: Iterable of sparse CSR batches used both to (optionally)
             derive the ordered terms via the encoder and to evaluate NPMI.
         k (int): Number of top terms per topic to evaluate (default=10).

     Returns:
         (float, float): NPMI score and topic-redundancy score.
     """
     sorted_ids = self.model.get_ordered_terms_encoder(
         dataloader
     ) if self.coherence_via_encoder else self.model.get_ordered_terms()
     num_topics = min(self.n_latent, sorted_ids.shape[-1])
     # Iterate only over the clamped topic count: using range(self.n_latent)
     # would index past the last column of sorted_ids whenever n_latent
     # exceeds sorted_ids.shape[-1].
     top_k_words_per_topic = [[int(i) for i in list(sorted_ids[:k, t])]
                              for t in range(num_topics)]
     npmi_eval = EvaluateNPMI(top_k_words_per_topic)
     npmi = npmi_eval.evaluate_csr_loader(dataloader)
     unique_term_ids = set()
     unique_limit = 5  ## only consider the top 5 terms for each topic when looking at degree of redundancy
     for topic_terms in top_k_words_per_topic:
         unique_term_ids.update(topic_terms[:unique_limit])
     redundancy = (
         1.0 - (float(len(unique_term_ids)) / num_topics / unique_limit))**2
     return npmi, redundancy
Ejemplo n.º 4
0
    def _npmi_per_covariate(self, X, y, k=10):
        """
        Calculate NPMI (Normalized Pointwise Mutual Information) per covariate for data X

        Parameters:
            X (array-like or sparse matrix): Document word matrix. shape [n_samples, vocab_size]
            y (array-like or sparse matrix): Covariate matrix. shape [n_samples, n_covars]
            k (int): Threshold at which to compute npmi. optional (default=10)

        Returns:
            (float): Mean NPMI score averaged over all distinct covariates.
                NOTE(review): the per-covariate dictionary `covar_npmi` is
                populated but never returned — confirm whether callers
                expect the dict the original docstring promised.
        """

        X_train = X.toarray()
        y_train = y
        # One row per distinct covariate configuration present in y.
        covars = np.unique(y_train, axis=0)
        covar_npmi = {}
        npmi_total = 0
        for covar in covars:
            # Select only the documents tagged with this exact covariate row.
            mask = (y_train == covar).all(axis=1)
            X_covar, y_covar = mx.nd.array(
                X_train[mask], dtype=np.float32), mx.nd.array(y_train[mask],
                                                              dtype=np.float32)
            # Topic-term ordering conditioned on this covariate's documents.
            sorted_ids = self.model.get_ordered_terms_with_covar_at_data(
                X_covar, k, y_covar)
            top_k_words_per_topic = [[
                int(i) for i in list(sorted_ids[:k, t].asnumpy())
            ] for t in range(self.n_latent)]
            npmi_eval = EvaluateNPMI(top_k_words_per_topic)
            npmi = npmi_eval.evaluate_csr_mat(X_covar)

            # Key by the raw label when a label map exists; otherwise by the
            # index of the active position (presumably one-hot — TODO confirm).
            if (self.label_map):
                covar_key = covar[0]
            else:
                covar_key = np.where(covar)[0][0]
            covar_npmi[covar_key] = npmi
            npmi_total += npmi
        return npmi_total / len(covars)
Ejemplo n.º 5
0
os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0"

if __name__ == "__main__":
    parser = setup_parser()
    args = parser.parse_args()

    verbose = False  ### XXX - add as argument
    vocab = load_vocab(args.vocab_file)
    if args.override_top_k_terms:
        top_k_words_per_topic = get_top_k_terms_from_file(
            args.override_top_k_terms)
        tst_csr, _, _, _ = file_to_data(args.test_file, len(vocab))
        top_k_words_per_topic_ids = [[vocab[t] for t in t_set]
                                     for t_set in top_k_words_per_topic]
        npmi_eval = EvaluateNPMI(top_k_words_per_topic_ids)
        test_npmi = npmi_eval.evaluate_csr_mat(tst_csr)
        print("**** Test NPMI = {} *******".format(test_npmi))
        exit(0)

    inference_model = BowVAEInferencer.from_saved(
        model_dir=args.model_dir,
        ctx=mx.cpu() if args.gpu < 0 else mx.gpu(args.gpu))

    if args.plot_file:  # get UMAP embedding visualization
        import matplotlib.pyplot as plt
        encoded, labels = inference_model.encode_vec_file(args.test_file)
        encodings = np.array([doc.asnumpy() for doc in encoded])
        print("There are {0} labels and {1} encodings".format(
            len(labels), len(encodings)))
        umap_model = umap.UMAP(n_neighbors=4, min_dist=0.5, metric='euclidean')
Ejemplo n.º 6
0
# Silence MXNet's sparse-storage fallback log messages.
os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0"

if __name__ == "__main__":
    parser = setup_parser()
    args = parser.parse_args()

    verbose = False ### XXX - add as argument
    # Restore a trained bag-of-words VAE; use the GPU only when one is requested
    # (args.gpu < 0 means CPU).
    inference_model = BowVAEInferencer.from_saved(model_dir=args.model_dir,
                                                  ctx=mx.cpu() if args.gpu < 0 else mx.gpu(args.gpu))
    
    if args.override_top_k_terms:
        # Evaluate NPMI for externally supplied top-k term lists instead of the
        # model's own topics, print the score, and exit.
        top_k_words_per_topic = get_top_k_terms_from_file(args.override_top_k_terms)
        tst_csr, _, _, _ = file_to_data(args.test_file, len(inference_model.vocab))
        # Map surface terms to vocabulary ids for the NPMI evaluator.
        top_k_words_per_topic_ids = [ [ inference_model.vocab[t] for t in t_set ]  for t_set in top_k_words_per_topic ]
        npmi_eval = EvaluateNPMI(top_k_words_per_topic_ids)
        test_npmi = npmi_eval.evaluate_csr_mat(tst_csr)
        print("**** Test NPMI = {} *******".format(test_npmi))
        exit(0)


    if args.plot_file: # get UMAP embedding visualization
        import matplotlib.pyplot as plt
        # Encode the test documents into latent vectors.
        encoded, labels = inference_model.encode_vec_file(args.test_file)
        encodings = np.array([doc.asnumpy() for doc in encoded])
        print("There are {0} labels and {1} encodings".format(len(labels), len(encodings)))
        umap_model = umap.UMAP(n_neighbors=4, min_dist=0.5, metric='euclidean')
        # Project latent encodings to 2-D, color points by label, save to file.
        embeddings = umap_model.fit_transform(encodings)
        plt.scatter(*embeddings.T, c=labels, s=0.2, alpha=0.7, cmap='coolwarm')
        plt.savefig(args.plot_file, dpi=1000)