def getHIN(self):
    hin = HIN()
    hin.Ids = self.Ids
    hin.RelationIds = self.RelationIds
    hin.Links = self.Links
    return hin
def getHIN(self):
    hin = HIN()
    hin.Ids = self.Ids
    hin.RelationIds = self.RelationIds
    hin.Links = self.Links
    hin.NodeTypes = self.NodeTypes
    hin.TypeIds = self.TypeIds
    hin.AuthorLabels = self.AuthorLabels
    hin.ConfLabels = self.ConfLabels
    return hin
def match(self, G: HIN, document_ids, terms): logger.info(f"Match {self.motif_name}") row_labels = [] P_indices = [] K_indices = [] for paper_id in document_ids: P_indices.append(G.find_by_entity_id("P", paper_id)) for term in terms: row_labels.append(term) K_indices.append(G.find_by_entity_id("K", term)) KP = G.A[K_indices, :] KP = KP[:, P_indices] Y_split = [] for i in range(1965, 2021, 5): Y_split.append(i) col_labels = [] V_indices = G.type_idx["V"] n_venues = len(V_indices) for y in Y_split: for v in V_indices: col_labels.append(G.get_info(v) + " " + str(y)) Y_indices = G.type_idx["Y"] PY = G.A[P_indices, :] PY = PY[:, Y_indices] PV = G.A[P_indices, :] PV = PV[:, V_indices] KPVY = sp.dok_matrix((KP.shape[0], n_venues * len(Y_split))) for i in tqdm(range(KP.shape[0])): for p_idx in KP[i, :].nonzero()[1]: try: v_idx = PV[p_idx].nonzero()[1][0] y_idx = PY[p_idx].nonzero()[1][0] year = int(G.get_info(y_idx)) except: continue for k in range(len(Y_split)): if Y_split[k] > year: offset = k break KPVY[i, v_idx + offset * n_venues] += 1 KPVY = KPVY.tocsr() return KPVY, row_labels, col_labels
def getHIN(self):
    hin = HIN()
    hin.Ids = self.Ids
    hin.RelationIds = self.RelationIds
    hin.Links = self.Links
    hin.ValidLinks = self.ValidLinks
    hin.TestLinks = self.TestLinks
    return hin
def populate_clustering(G: HIN,
                        n_clusters: int,
                        WT_clusters: List[Dict[str, float]],
                        damping=0.8) -> Tuple[np.ndarray, np.ndarray]:
    """Populate clustering results from terms to the whole graph by random walk with restart.

    Args:
        G: The HIN.
        n_clusters: Number of clusters.
        WT_clusters: A list of initial weights of terms in each cluster.
            These weights will be populated to the whole graph.
        damping: The damping factor for the random walk. Larger means more
            restart probability.

    Returns:
        ranking: The ranking distribution over ALL nodes.
            Shape (n_nodes, n_clusters).
        clustering_probs: The clustering distribution of all nodes.
            Shape (n_nodes, n_clusters).
    """
    clustering_probs = np.zeros((G.num_nodes(), n_clusters), dtype=np.float64)
    for k in range(n_clusters):
        # get initial distribution using T_score
        T_score = list(WT_clusters[k].items())  # P_Ti
        # T_score = take_topk(WT_clusters[k], 20, return_tuple=True)
        phrases, scores = list(zip(*T_score))
        z = sum(WT_clusters[k].values())  # normalizer
        dist = np.zeros((G.num_nodes(), ), dtype=np.float64)
        aligned_nids = G.find_by_entity_ids("K", phrases)
        for i in range(len(scores)):
            dist[aligned_nids[i]] = scores[i] / z
        # use random walk to populate clustering probabilities
        pr = G.ppr(damping=damping, init_probs=dist)
        clustering_probs[:, k] = pr
    ranking = clustering_probs
    clustering_probs = utils.row_normalize(clustering_probs)
    return ranking, clustering_probs
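# Hedged usage sketch (added for illustration, not part of the original code):
# populate per-cluster term weights over the whole graph and derive a hard
# assignment per node, assuming `G` is a built HIN and `WT_clusters` holds a
# term -> weight dict for each cluster.
def example_populate_clustering(G: HIN, WT_clusters: List[Dict[str, float]]):
    ranking, clustering_probs = populate_clustering(
        G, len(WT_clusters), WT_clusters, damping=0.8)
    # hard-assign every node to its most probable cluster
    hard_assignment = clustering_probs.argmax(axis=1)
    return ranking, clustering_probs, hard_assignment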
def get_cluster_documents(
        G: HIN, D: Dict[str, List[str]],
        term_nids: List[int]) -> Tuple[Dict[str, List[str]], Dict[str, float]]:
    """Get all documents associated with a set of terms.

    The weight on each document is computed by aggregating edge weights to
    all relevant term node neighbors.

    Args:
        G: The HIN.
        D: The set of documents to consider.
        term_nids: The set of terms given as node ids.

    Returns:
        documents: The set of associated documents.
        weights: The weight on each document indicating how relevant it is.
    """
    documents = dict()
    weights: Dict[str, float] = defaultdict(float)
    for nid in term_nids:
        neighbors = G.neighbors(nid)  # assume P-K links are bi-directional
        for neighbor in neighbors:
            # filter out non-paper nodes
            if G.get_type(neighbor) != "P":
                continue
            # filter out docs outside the parent cluster
            paper_id = G.get_doc_id(neighbor)
            if paper_id not in D:
                continue
            weights[paper_id] += G.A[nid, neighbor]
    weights = dict(weights)
    for paper_id in weights.keys():
        documents[paper_id] = D[paper_id]
    return documents, weights
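# Hedged usage sketch (added for illustration, not part of the original code):
# collect the documents tied to one cluster's terms and keep only the most
# relevant ones, assuming `cluster_terms` are the term strings of that cluster.
def example_get_cluster_documents(G: HIN, D: Dict[str, List[str]],
                                  cluster_terms: List[str], topk: int = 100):
    term_nids = G.find_by_entity_ids("K", cluster_terms)
    documents, weights = get_cluster_documents(G, D, term_nids)
    # keep only the topk documents by aggregated edge weight
    top_ids = sorted(weights, key=weights.get, reverse=True)[:topk]
    return {pid: documents[pid] for pid in top_ids}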
def getHIN(self):
    hin = HIN()
    hin.Ids = self.Ids
    hin.DocIds = self.DocIds
    hin.DocLabels = self.DocLabels
    hin.Links = self.Links
    hin.RelationIds = self.RelationIds
    hin.NodeTypes = self.NodeTypes
    hin.TypeIds = self.TypeIds
    hin.WordIds = self.WordIds
    hin.WordDFs = self.WordDFs
    hin.WordCount = self.WordCount
    hin.EntityDFs = self.EntityDFs
    return hin
def match(self, G: HIN, document_ids, terms): logger.info(f"Match {self.motif_name}") row_labels = [] P_indices = [] K_indices = [] for paper_id in document_ids: P_indices.append(G.find_by_entity_id("P", paper_id)) keyword2idx = dict() for i, term in enumerate(terms): row_labels.append(term) keyword2idx[term] = i K_indices.append(G.find_by_entity_id("K", term)) KP = G.A[K_indices, :] KP = KP[:, P_indices] col_labels = [] author_pairs = list() # scan through all papers and get all author pairs for nid in tqdm(G.type_idx["P"]): neighbors = G.neighbors(nid) authors = [] for i in neighbors: if G.get_type(i) == "A": authors.append(i) for a1, a2 in itertools.combinations(authors, r=2): author_pairs.append(str(a1) + "-" + str(a2)) # select pairs with more than freq_threshold occurrence c = collections.Counter(author_pairs) freq_author_pairs = set( [pair for pair, cnt in c.items() if cnt >= self.freq_threshold]) logger.debug(f"frequent author pairs {len(freq_author_pairs)}") # scan through all papers and build bipartite graph author2idx = {} # author-pair -> bipartite graph idx for i, author_pair in enumerate(freq_author_pairs): author2idx[author_pair] = i a1, a2 = author_pair.split("-") a1, a2 = int(a1), int(a2) a1_name, a2_name = G.get_info(a1), G.get_info(a2) col_labels.append(a1_name + " - " + a2_name) n_keyword = len(terms) n_author = len(author2idx) keyword_set = set(terms) KPAA = sp.dok_matrix((n_keyword, n_author)) for nid in tqdm(P_indices): neighbors = G.neighbors(nid) authors = [] keywords = [] for i in neighbors: if G.get_type(i) == "A": authors.append(i) elif G.get_type(i) == "K": keywords.append(G.get_info(i)) pairs = [] for a1, a2 in itertools.combinations(authors, r=2): if str(a1) + "-" + str(a2) in freq_author_pairs: pairs.append(str(a1) + "-" + str(a2)) for kwd in keywords: if kwd not in keyword_set: continue row_id = keyword2idx[kwd] for pair in pairs: col_id = author2idx[pair] KPAA[row_id, col_id] += 1 KPAA = KPAA.tocsr() return KPAA, row_labels, col_labels
def _term2nid(G: HIN, terms: List[str]) -> List[int]:
    return G.find_by_entity_ids("K", terms)
def main(): logger.warning("Start building taxonomy") # Load input: this includes reading network, text, and # a background corpus for contrastive analysis logger.info("Loading graph from file") A, node_info = utils.load_graph(args.data_dir, remove_citation=True, force_undirected=True) logger.info("Create HIN") G = HIN(A, node_info) logger.info("Load text") corpus = utils.load_documents(args.data_dir) motif_matchers = [ Motif_KPV(), Motif_KPA(), Motif_KP(), Motif_KPVY(), Motif_KPAA() ] intermediate_dir = plib.Path(args.data_dir, "intermediate") if not intermediate_dir.is_dir(): logger.warning(f"Creating intermediate dir {intermediate_dir}") intermediate_dir.mkdir(parents=False) # we collect all phrases T = [] # terms / phrases for info in node_info.values(): if info.node_type == "K": T.append(info.entity_id) D = corpus tf_bg, idf_bg = utils.get_tf_idf_from_file( plib.Path(args.data_dir, "background_documents.txt"), T) taxo = Taxonomy(D, T, G) builder = NetTaxo(motif_matchers, tf_lift=args.tf_lift, idf_lift=args.idf_lift, damping=args.damping, conf_motif=Motif_KPA().motif_name) # set background corpus for contrastive analysis builder.set_background(tf_bg, idf_bg) builder.build(taxo, args.levels) # save output_dir = plib.Path(args.output_dir, config.unique_id) if not output_dir.is_dir(): output_dir.mkdir(parents=True) logger.info(f"Saving to {output_dir}") taxo.save(output_dir) logger.info("Saving complete") # generate output taxo.visualize(plib.Path(output_dir, f"vis.pdf")) taxo.save_readable(output_dir)