def main():
    """Build a taxonomy from the input data and write it to disk.

    Reads its configuration from the module-level ``args`` and ``config``
    objects and reports progress through the module-level ``logger``.

    Steps, in order:

    1. Load the graph and documents from ``args.data_dir`` and wrap the
       graph in a ``HIN``.
    2. Ensure ``<args.data_dir>/intermediate`` exists.
    3. Collect the ``entity_id`` of every node whose ``node_type`` is
       ``"K"`` and compute background TF/IDF statistics for those terms
       from ``<args.data_dir>/background_documents.txt``.
    4. Build the taxonomy with ``NetTaxo`` for ``args.levels`` levels.
    5. Save the taxonomy, a ``vis.pdf`` visualization and a readable dump
       under ``<args.output_dir>/<config.unique_id>``.

    Raises:
        FileExistsError: if the intermediate or output path already exists
            but is not a directory.
    """
    logger.warning("Start building taxonomy")

    # Load input: this includes reading network, text, and
    # a background corpus for contrastive analysis
    logger.info("Loading graph from file")
    A, node_info = utils.load_graph(
        args.data_dir,
        remove_citation=True,
        force_undirected=True,
    )

    logger.info("Create HIN")
    G = HIN(A, node_info)

    logger.info("Load text")
    corpus = utils.load_documents(args.data_dir)

    motif_matchers = [
        Motif_KPV(),
        Motif_KPA(),
        Motif_KP(),
        Motif_KPVY(),
        Motif_KPAA(),
    ]

    # NOTE(review): this directory is not used again in this function;
    # presumably other components write into it - confirm.
    intermediate_dir = plib.Path(args.data_dir, "intermediate")
    if not intermediate_dir.is_dir():
        logger.warning(f"Creating intermediate dir {intermediate_dir}")
        # exist_ok=True tolerates another process creating the directory
        # between the check above and this call. parents=False is kept on
        # purpose: a missing data_dir is an error, not something to create.
        intermediate_dir.mkdir(parents=False, exist_ok=True)

    # we collect all phrases: ids of every node typed "K"
    T = [  # terms / phrases
        info.entity_id
        for info in node_info.values()
        if info.node_type == "K"
    ]

    tf_bg, idf_bg = utils.get_tf_idf_from_file(
        plib.Path(args.data_dir, "background_documents.txt"),
        T,
    )

    taxo = Taxonomy(corpus, T, G)

    builder = NetTaxo(
        motif_matchers,
        tf_lift=args.tf_lift,
        idf_lift=args.idf_lift,
        damping=args.damping,
        conf_motif=Motif_KPA().motif_name,
    )

    # set background corpus for contrastive analysis
    builder.set_background(tf_bg, idf_bg)
    builder.build(taxo, args.levels)

    # save
    output_dir = plib.Path(args.output_dir, config.unique_id)
    # Race-free equivalent of "create if it is not already a directory";
    # still raises FileExistsError when the path exists as a regular file.
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Saving to {output_dir}")
    taxo.save(output_dir)
    logger.info("Saving complete")

    # generate output
    taxo.visualize(plib.Path(output_dir, "vis.pdf"))
    taxo.save_readable(output_dir)