Example #1
    def getHIN(self):
        hin = HIN()
        hin.Ids = self.Ids
        hin.RelationIds = self.RelationIds
        hin.Links = self.Links

        return hin
Example #2
    def getHIN(self):
        hin = HIN()
        hin.Ids = self.Ids
        hin.RelationIds = self.RelationIds
        hin.Links = self.Links
        hin.NodeTypes = self.NodeTypes
        hin.TypeIds = self.TypeIds
        hin.AuthorLabels = self.AuthorLabels
        hin.ConfLabels = self.ConfLabels
        return hin
Example #3
    def match(self, G: HIN, document_ids, terms):
        logger.info(f"Match {self.motif_name}")
        row_labels = []
        P_indices = []
        K_indices = []
        for paper_id in document_ids:
            P_indices.append(G.find_by_entity_id("P", paper_id))

        for term in terms:
            row_labels.append(term)
            K_indices.append(G.find_by_entity_id("K", term))

        # keyword-by-paper slice of the adjacency matrix
        KP = G.A[K_indices, :]
        KP = KP[:, P_indices]

        # boundaries of the 5-year windows
        Y_split = list(range(1965, 2021, 5))

        col_labels = []
        V_indices = G.type_idx["V"]
        n_venues = len(V_indices)
        for y in Y_split:
            for v in V_indices:
                col_labels.append(G.get_info(v) + " " + str(y))

        Y_indices = G.type_idx["Y"]
        PY = G.A[P_indices, :]
        PY = PY[:, Y_indices]
        PV = G.A[P_indices, :]
        PV = PV[:, V_indices]
        # count keyword -> (venue, 5-year window) co-occurrences
        KPVY = sp.dok_matrix((KP.shape[0], n_venues * len(Y_split)))
        for i in tqdm(range(KP.shape[0])):
            for p_idx in KP[i, :].nonzero()[1]:
                try:
                    v_idx = PV[p_idx].nonzero()[1][0]
                    y_idx = PY[p_idx].nonzero()[1][0]
                    year = int(G.get_info(y_idx))
                except Exception:
                    # skip papers without a usable venue/year link
                    continue
                # papers at or beyond the last boundary fall into the final window
                offset = len(Y_split) - 1
                for k in range(len(Y_split)):
                    if Y_split[k] > year:
                        offset = k
                        break
                KPVY[i, v_idx + offset * n_venues] += 1

        KPVY = KPVY.tocsr()
        return KPVY, row_labels, col_labels
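A minimal, self-contained sketch of the binning and counting pattern used above, with toy data standing in for the HIN (the small KP matrix, paper_venue, and paper_year are made-up inputs, not part of the original code):

import numpy as np
import scipy.sparse as sp

# Toy data: 3 keywords x 4 papers; each paper has one venue index and one year.
KP = sp.csr_matrix(np.array([[1, 0, 1, 0],
                             [0, 1, 0, 0],
                             [1, 1, 0, 1]]))
paper_venue = [0, 1, 0, 1]                # venue index per paper (2 venues)
paper_year = [1999, 2003, 2012, 2020]
year_edges = list(range(1995, 2021, 5))   # 5-year window boundaries
n_venues = 2

KPVY = sp.dok_matrix((KP.shape[0], n_venues * len(year_edges)))
for i in range(KP.shape[0]):
    for p in KP[i, :].nonzero()[1]:
        # first window whose boundary exceeds the paper's year, else the last window
        offset = next((k for k, edge in enumerate(year_edges)
                       if edge > paper_year[p]), len(year_edges) - 1)
        KPVY[i, paper_venue[p] + offset * n_venues] += 1

print(KPVY.tocsr().toarray())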
Example #4
    def getHIN(self):
        hin = HIN()
        hin.Ids = self.Ids
        hin.RelationIds = self.RelationIds
        hin.Links = self.Links
        hin.ValidLinks = self.ValidLinks
        hin.TestLinks = self.TestLinks

        return hin
Example #5
def populate_clustering(G: HIN,
                        n_clusters: int,
                        WT_clusters: List[Dict[str, float]],
                        damping=0.8) -> Tuple[np.ndarray, np.ndarray]:
    """Populate clustering results from terms to whole graph by random walk w/ restart.

  Args:
    G: The HIN.
    n_clusters: Number of clusters.
    WT_clusters: A list of initial weights of terms in each cluster. These
      weights will be populated to the whole graph.
    damping: The damping factor for random walk. Larger means more restart
      probability.

  Returns:
    ranking: The ranking distribution over ALL nodes. Shape (n_nodes,
    n_clusters).
    clustering_probs: The clustering distribution of all nodes. Shape (n_nodes,
    n_clusters).
  """
    clustering_probs = np.zeros((G.num_nodes(), n_clusters), dtype=np.float64)
    for k in range(n_clusters):
        # get initial distribution using T_score
        T_score = list(WT_clusters[k].items())  # P_Ti
        # T_score = take_topk(WT_clusters[k], 20, return_tuple=True)
        phrases, scores = list(zip(*T_score))
        z = sum(WT_clusters[k].values())  # normalizer
        dist = np.zeros((G.num_nodes(), ), dtype=np.float64)
        aligned_nids = G.find_by_entity_ids("K", phrases)
        for i in range(len(scores)):
            dist[aligned_nids[i]] = scores[i] / z

        # use random walk to populate clustering probabilities
        pr = G.ppr(damping=damping, init_probs=dist)
        clustering_probs[:, k] = pr
    ranking = clustering_probs
    clustering_probs = utils.row_normalize(clustering_probs)
    return ranking, clustering_probs
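The heavy lifting above happens in G.ppr. A rough, self-contained sketch of random walk with restart on a small dense matrix (the ppr helper and the 4-node toy graph here are illustrative assumptions, not the HIN implementation):

import numpy as np

def ppr(A, init_probs, damping=0.8, n_iter=100):
    """Random walk with restart; damping is the restart probability,
    matching the docstring above (larger damping -> more restart)."""
    P = A / A.sum(axis=1, keepdims=True)   # row-stochastic transition matrix
    r = init_probs.copy()
    for _ in range(n_iter):
        r = damping * init_probs + (1 - damping) * r @ P
    return r

# Toy graph: 4 nodes; put all restart mass on node 0 (a seed "term" node).
A = np.array([[0., 1., 1., 0.],
              [1., 0., 1., 0.],
              [1., 1., 0., 1.],
              [0., 0., 1., 0.]])
seed = np.array([1., 0., 0., 0.])
print(ppr(A, seed))   # a ranking distribution over the 4 nodes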
Example #6
def get_cluster_documents(
        G: HIN, D: Dict[str, List[str]],
        term_nids: List[int]) -> Tuple[Dict[str, List[str]], Dict[str, float]]:
    """Get all documents associated a set of terms.

  The weight on each document is computed by aggregating
  edge weights to all relevant term node neighbors.

  Args:
    G: The HIN.
    D: The set of documents to consider.
    term_nids: The set of terms in node id.

  Returns:
    documents: The set of associated documents.
    weights: The weight on each document indicating how
    much they are relevant.
  """
    documents = dict()
    weights: Dict[str, float] = defaultdict(float)
    for nid in term_nids:
        neighbors = G.neighbors(nid)  # assume P-K links are bi-directional
        for neighbor in neighbors:
            # filter out non paper nodes
            if G.get_type(neighbor) != "P":
                continue
            # filter out docs out of parent cluster
            paper_id = G.get_doc_id(neighbor)
            if paper_id not in D:
                continue
            weights[paper_id] += G.A[nid, neighbor]

    weights = dict(weights)
    for paper_id in weights.keys():
        documents[paper_id] = D[paper_id]
    return documents, weights
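A toy illustration of the same aggregation, with a hand-written term-to-paper edge list standing in for G.neighbors and G.A (all names and numbers below are made up):

from collections import defaultdict

# term -> [(paper_id, edge_weight), ...], standing in for the K-P links in G
term_edges = {
    "deep learning": [("p1", 2.0), ("p2", 1.0)],
    "graph mining": [("p2", 3.0), ("p3", 1.0)],
}
D = {"p1": ["doc text 1"], "p2": ["doc text 2"]}   # parent-cluster documents

weights = defaultdict(float)
for term, edges in term_edges.items():
    for paper_id, w in edges:
        if paper_id not in D:        # drop papers outside the parent cluster
            continue
        weights[paper_id] += w       # aggregate over all matched terms

documents = {pid: D[pid] for pid in weights}
print(dict(weights))                 # {'p1': 2.0, 'p2': 4.0}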
Example #7
    def getHIN(self):
        hin = HIN()
        hin.Ids = self.Ids
        hin.DocIds = self.DocIds
        hin.DocLabels = self.DocLabels
        hin.Links = self.Links
        hin.RelationIds = self.RelationIds
        hin.NodeTypes = self.NodeTypes
        hin.TypeIds = self.TypeIds
        hin.WordIds = self.WordIds
        hin.WordDFs = self.WordDFs
        hin.WordCount = self.WordCount
        hin.EntityDFs = self.EntityDFs
        return hin
Example #8
    def match(self, G: HIN, document_ids, terms):
        logger.info(f"Match {self.motif_name}")
        row_labels = []
        P_indices = []
        K_indices = []
        for paper_id in document_ids:
            P_indices.append(G.find_by_entity_id("P", paper_id))

        keyword2idx = dict()
        for i, term in enumerate(terms):
            row_labels.append(term)
            keyword2idx[term] = i
            K_indices.append(G.find_by_entity_id("K", term))

        KP = G.A[K_indices, :]
        KP = KP[:, P_indices]

        col_labels = []
        author_pairs = list()
        # scan through all papers and get all author pairs
        for nid in tqdm(G.type_idx["P"]):
            neighbors = G.neighbors(nid)
            authors = []
            for i in neighbors:
                if G.get_type(i) == "A":
                    authors.append(i)
            for a1, a2 in itertools.combinations(authors, r=2):
                author_pairs.append(str(a1) + "-" + str(a2))

        # select pairs with more than freq_threshold occurrence
        c = collections.Counter(author_pairs)
        freq_author_pairs = set(
            [pair for pair, cnt in c.items() if cnt >= self.freq_threshold])
        logger.debug(f"frequent author pairs {len(freq_author_pairs)}")

        # scan through all papers and build bipartite graph
        author2idx = {}  # author-pair -> bipartite graph idx
        for i, author_pair in enumerate(freq_author_pairs):
            author2idx[author_pair] = i
            a1, a2 = author_pair.split("-")
            a1, a2 = int(a1), int(a2)
            a1_name, a2_name = G.get_info(a1), G.get_info(a2)
            col_labels.append(a1_name + " - " + a2_name)

        n_keyword = len(terms)
        n_author = len(author2idx)

        keyword_set = set(terms)

        KPAA = sp.dok_matrix((n_keyword, n_author))

        for nid in tqdm(P_indices):
            neighbors = G.neighbors(nid)
            authors = []
            keywords = []
            for i in neighbors:
                if G.get_type(i) == "A":
                    authors.append(i)
                elif G.get_type(i) == "K":
                    keywords.append(G.get_info(i))
            # build this paper's frequent author pairs once, after all neighbors are scanned
            pairs = []
            for a1, a2 in itertools.combinations(authors, r=2):
                if str(a1) + "-" + str(a2) in freq_author_pairs:
                    pairs.append(str(a1) + "-" + str(a2))
            for kwd in keywords:
                if kwd not in keyword_set:
                    continue
                row_id = keyword2idx[kwd]
                for pair in pairs:
                    col_id = author2idx[pair]
                    KPAA[row_id, col_id] += 1

        KPAA = KPAA.tocsr()
        return KPAA, row_labels, col_labels
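The frequent author-pair step can be exercised on its own; a small sketch with made-up papers and authors (pair keys are sorted here for stability, which the matcher above does not do):

import collections
import itertools

# Toy paper -> author list mapping, standing in for the A-type neighbors of each P node.
paper_authors = {
    "p1": ["a1", "a2", "a3"],
    "p2": ["a1", "a2"],
    "p3": ["a2", "a3"],
}

pair_counts = collections.Counter(
    "-".join(sorted(pair))
    for authors in paper_authors.values()
    for pair in itertools.combinations(authors, r=2))

freq_threshold = 2
freq_author_pairs = {p for p, c in pair_counts.items() if c >= freq_threshold}
print(freq_author_pairs)   # {'a1-a2', 'a2-a3'}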
Example #9
def _term2nid(G: HIN, terms: List[str]) -> List[int]:
    return G.find_by_entity_ids("K", terms)
Example #10
def main():
    logger.warning("Start building taxonomy")
    # Load input: this includes reading network, text, and
    # a background corpus for contrastive analysis
    logger.info("Loading graph from file")
    A, node_info = utils.load_graph(args.data_dir,
                                    remove_citation=True,
                                    force_undirected=True)
    logger.info("Create HIN")
    G = HIN(A, node_info)

    logger.info("Load text")
    corpus = utils.load_documents(args.data_dir)

    motif_matchers = [
        Motif_KPV(),
        Motif_KPA(),
        Motif_KP(),
        Motif_KPVY(),
        Motif_KPAA()
    ]

    intermediate_dir = plib.Path(args.data_dir, "intermediate")
    if not intermediate_dir.is_dir():
        logger.warning(f"Creating intermediate dir {intermediate_dir}")
        intermediate_dir.mkdir(parents=False)

    # we collect all phrases
    T = []  # terms / phrases
    for info in node_info.values():
        if info.node_type == "K":
            T.append(info.entity_id)

    D = corpus
    tf_bg, idf_bg = utils.get_tf_idf_from_file(
        plib.Path(args.data_dir, "background_documents.txt"), T)

    taxo = Taxonomy(D, T, G)

    builder = NetTaxo(motif_matchers,
                      tf_lift=args.tf_lift,
                      idf_lift=args.idf_lift,
                      damping=args.damping,
                      conf_motif=Motif_KPA().motif_name)

    # set background corpus for contrastive analysis
    builder.set_background(tf_bg, idf_bg)
    builder.build(taxo, args.levels)

    # save
    output_dir = plib.Path(args.output_dir, config.unique_id)
    if not output_dir.is_dir():
        output_dir.mkdir(parents=True)
    logger.info(f"Saving to {output_dir}")
    taxo.save(output_dir)

    logger.info("Saving complete")

    # generate output
    taxo.visualize(plib.Path(output_dir, "vis.pdf"))
    taxo.save_readable(output_dir)
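main() relies on a module-level args object. A hypothetical argparse setup that would provide the fields it reads (the flag names mirror the attributes used above, but the defaults are placeholders, not values from the original project):

import argparse

parser = argparse.ArgumentParser(
    description="Build a taxonomy from a HIN and a text corpus")
parser.add_argument("--data_dir", required=True)
parser.add_argument("--output_dir", required=True)
parser.add_argument("--levels", type=int, default=2)
parser.add_argument("--tf_lift", type=float, default=1.0)
parser.add_argument("--idf_lift", type=float, default=1.0)
parser.add_argument("--damping", type=float, default=0.8)
args = parser.parse_args()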