Example #1
    def expand_node(self, taxo: Taxonomy, node: TaxoNode, level: int,
                    n_children: int, max_level: int):
        """Expand a taxonomy node.

    Args:
      taxo: The Taxonomy object.
      node: The node to expand.
      level: Current level of taxonomy.
      n_children: Number of children to expand to next level.
    """
        logger.info(f"Expand node {node.prefix}")
        logger.info("1. Contrast analysis for term scoring")
        if node.parent is None:
            raise RuntimeError(
                "Contrastive analysis assumes parent node is set.")
        term_scores = contrast.node_contrast_analysis(
            node,
            node.parent,
            taxo.siblings(node),
            self.tf_lift * (config.LEVEL_DECAY**level),
            self.idf_lift * (config.LEVEL_DECAY**level),
        )
        # Blend the contrastive scores with this node's term prior.
        term_scores = {
            term: score * node.term_prior[term]
            for term, score in term_scores.items()
        }
        node.term_scores = term_scores

        logger.debug("Top terms for this node")
        logger.debug(
            str([
                utils.strip_phrase_tags(phrase)
                for phrase in utils.take_topk(term_scores, 20)
            ]))

        logger.info(
            f"Check stopping criterion: level {level} >= {max_level} is {level >= max_level}"
        )
        if level >= max_level:
            return

        logger.info("Generate motif context")
        generate_motif_context(args.data_dir, level, taxo.G, node.terms,
                               node.docs, self.motif_matchers)
        sample_motif_context(args, level, self.conf_motif)

        logger.info("2. Local embedding")
        word_embed, net_embed = loc_emb.local_embedding(node, args.data_dir)
        wv_word = word_embed.syn0
        wv_net = net_embed.syn0

        logger.info("3. Term clustering")
        logger.debug(f"#term_scores {len(term_scores)}")
        topk = min(config.N_TOP_TERMS,
                   int(config.TOP_TERMS_PCT * len(term_scores)))
        topk_terms = utils.take_topk(term_scores, topk)
        clus_labels, aligned_terms = clus.term_clustering(
            topk_terms, wv_word, n_clusters=n_children)
        clus_labels_net, aligned_terms_net = clus.term_clustering(
            topk_terms, wv_net, n_clusters=n_children)
        map_ab = clus.align_clustering(clus_labels, clus_labels_net)

        logger.info("4. Anchor phrase selection")
        term_weights_clusters = []  # per-cluster term weights
        for i in range(n_children):
            # get all terms in the cluster
            clus_terms_word = set(
                clus.get_cluster_terms(clus_labels, aligned_terms, i))
            clus_terms_net = set(
                clus.get_cluster_terms(clus_labels_net, aligned_terms_net,
                                       map_ab[i]))
            clus_terms = clus_terms_word & clus_terms_net
            term_nids = _term2nid(taxo.G, clus_terms)
            # get associated documents
            D_c, weights_c = clus.get_cluster_documents(
                taxo.G, node.docs, term_nids)
            # run contrastive analysis
            tf_c, idf_c = utils.get_tf_idf(D_c, clus_terms, weights=weights_c)
            next_level = level + 1
            term_scores_c = contrast.contrast_analysis(
                tf_c, idf_c, node.tf, node.idf,
                self.tf_lift * (config.LEVEL_DECAY**next_level),
                self.idf_lift * (config.LEVEL_DECAY**next_level))
            term_weights_clusters.append(term_scores_c)
            logger.debug("Cluster {}:: ".format(i) + str([
                utils.strip_phrase_tags(phrase)
                for phrase in utils.take_topk(term_scores_c, 30)
            ]))

        logger.info("5. Motif selection")
        cluster_seeds = []
        n_seed = config.N_ANCHOR_TERMS
        for i in range(n_children):
            seed_phrases = utils.take_topk(term_weights_clusters[i], n_seed)
            cluster_seeds.append(seed_phrases)

        motif_selection_sampling(args,
                                 level,
                                 cluster_seeds,
                                 keep_ratio=config.TOP_MOTIF_PCT)

        logger.info("6. Recompute embedding")
        joint_embed = loc_emb.joint_local_embedding(node, args.data_dir)
        wv_all = joint_embed.syn0

        logger.info("7. Soft clustering")
        clus_labels, aligned_terms, vmf = clus.soft_clustering(
            topk_terms, wv_all, n_clusters=n_children)

        logger.info("8. Generate next level")
        term_prior_clusters = []
        cluster_centers = []
        for i in range(n_children):
            # get all terms in the cluster
            clus_terms = clus.get_cluster_terms(clus_labels, aligned_terms, i)
            term_nids = _term2nid(taxo.G, clus_terms)
            # get associated documents
            D_c, weights_c = clus.get_cluster_documents(
                taxo.G, node.docs, term_nids)
            # run contrastive analysis
            tf_c, idf_c = utils.get_tf_idf(D_c, clus_terms, weights=weights_c)
            term_scores_c = contrast.contrast_analysis(tf_c, idf_c, node.tf,
                                                       node.idf)
            term_prior_clusters.append(term_scores_c)
            logger.debug("Cluster {}:: ".format(i) + str([
                utils.strip_phrase_tags(phrase)
                for phrase in utils.take_topk(term_scores_c, 30)
            ]))

        # generate next level terms and documents
        # compute clustering probability
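        # Embed every term of this node with the joint embedding and compute
        # its posterior probability under each vMF mixture component.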
        X = []
        X_terms = []
        for term in node.terms:
            # Skip terms missing from the joint embedding vocabulary.
            try:
                X.append(wv_all[term])
                X_terms.append(term)
            except KeyError:
                continue
        X = np.vstack(X)
        clustering_probs = clus.get_soft_cluster_probs(X, vmf.cluster_centers_,
                                                       vmf.weights_,
                                                       vmf.concentrations_)
        clustering_probs = clustering_probs.T
        # Propagate the per-cluster term priors over the graph once up front;
        # populate_clustering does not depend on idx_c, so it is hoisted out
        # of the loop below.
        ranking, clustering_probs_net = clus.populate_clustering(
            taxo.G, n_children, term_prior_clusters, damping=0.8)
        for idx_c in range(n_children):
            # Keep a term in cluster idx_c only if its soft-assignment
            # probability exceeds twice the uniform baseline 1 / n_children.
            terms_c = []
            term_prior_c = dict()
            for i in range(X.shape[0]):
                if clustering_probs[i, idx_c] > 2 * (1 / n_children):
                    terms_c.append(X_terms[i])
                    term_prior_c[X_terms[i]] = clustering_probs[i, idx_c]

            # Find documents whose propagated score clears the same threshold.
            doc_prior_c = dict()
            docs_c = dict()
            for paper_id, paper_content in node.docs.items():
                nid = taxo.G.find_by_entity_id("P", paper_id)
                score = clustering_probs_net[nid, idx_c]
                if score <= 2 * (1 / n_children):
                    continue
                docs_c[paper_id] = paper_content
                doc_prior_c[paper_id] = score

            node_c = TaxoNode(node.prefix + "/{}".format(idx_c), docs_c,
                              terms_c, doc_prior_c, term_prior_c)
            node_c.set_parent(node)
            node.add_child(node_c)
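
    # Usage sketch (illustrative only: the builder class name, constructor
    # arguments, and setup calls below are assumptions, not shown in this
    # snippet). expand_node requires node.parent to be set, so it is invoked
    # on a child of the root rather than the root itself:
    #
    #   builder = TaxonomyBuilder(tf_lift=1.0, idf_lift=1.0,
    #                             motif_matchers=motif_matchers)
    #   child = TaxoNode("*/0", docs, terms, doc_prior, term_prior)
    #   child.set_parent(taxo.root)
    #   builder.expand_node(taxo, child, level=1, n_children=5, max_level=3)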