Example #1
0
def main():
    """Build a taxonomy from the input data and write it to disk.

    Reads its configuration from the module-level ``args`` and ``config``
    objects and reports progress through the module-level ``logger``.

    Steps, in order:

    1. Load the graph from ``args.data_dir`` (citations removed, forced
       undirected) and wrap it in a ``HIN``.
    2. Load the documents from ``args.data_dir``.
    3. Ensure ``<args.data_dir>/intermediate`` exists.
    4. Collect the ``entity_id`` of every node whose ``node_type`` is "K"
       (the terms / phrases).
    5. Read background tf / idf statistics for those terms from
       ``<args.data_dir>/background_documents.txt``.
    6. Build the taxonomy with ``NetTaxo`` for ``args.levels`` levels.
    7. Save, visualize (``vis.pdf``) and write a readable dump into
       ``<args.output_dir>/<config.unique_id>``.

    Raises:
        FileExistsError: if the intermediate or output path exists but is
            not a directory.
    """
    logger.warning("Start building taxonomy")
    # Load input: this includes reading network, text, and
    # a background corpus for contrastive analysis
    logger.info("Loading graph from file")
    A, node_info = utils.load_graph(args.data_dir,
                                    remove_citation=True,
                                    force_undirected=True)
    logger.info("Create HIN")
    G = HIN(A, node_info)

    logger.info("Load text")
    corpus = utils.load_documents(args.data_dir)

    motif_matchers = [
        Motif_KPV(),
        Motif_KPA(),
        Motif_KP(),
        Motif_KPVY(),
        Motif_KPAA(),
    ]

    intermediate_dir = plib.Path(args.data_dir, "intermediate")
    if not intermediate_dir.is_dir():
        logger.warning(f"Creating intermediate dir {intermediate_dir}")
    # exist_ok=True closes the gap between the check above and the creation:
    # a directory that appears in between no longer raises FileExistsError.
    # parents=False is kept so a missing data dir still fails loudly.
    intermediate_dir.mkdir(parents=False, exist_ok=True)

    # we collect all phrases (nodes of type "K")
    terms = [
        info.entity_id
        for info in node_info.values()
        if info.node_type == "K"
    ]

    tf_bg, idf_bg = utils.get_tf_idf_from_file(
        plib.Path(args.data_dir, "background_documents.txt"), terms)

    taxo = Taxonomy(corpus, terms, G)

    builder = NetTaxo(motif_matchers,
                      tf_lift=args.tf_lift,
                      idf_lift=args.idf_lift,
                      damping=args.damping,
                      conf_motif=Motif_KPA().motif_name)

    # set background corpus for contrastive analysis
    builder.set_background(tf_bg, idf_bg)
    builder.build(taxo, args.levels)

    # save
    output_dir = plib.Path(args.output_dir, config.unique_id)
    # Create the output dir (and any missing parents) without a
    # check-then-create race.
    output_dir.mkdir(parents=True, exist_ok=True)
    logger.info(f"Saving to {output_dir}")
    taxo.save(output_dir)

    logger.info("Saving complete")

    # generate output
    taxo.visualize(plib.Path(output_dir, "vis.pdf"))
    taxo.save_readable(output_dir)