Example 1
def n2v(graph: str, output_dir: str, directed: bool, tag: str,
        params: dict) -> None:
    """Runs the SNAP implementation of Node2Vec on a NetworkX graph

    Args:
        graph (str): Path to a pickled NetworkX Graph
        output_dir (str): The directory that will save Node2Vec Model.
        directed (bool): If True, process as directed graph
        tag (str): The tag that will be appended to output files, useful for IDing 
        params (dict): Dictionary of Node2Vec/Word2Vec Parameters
    """

    # Ensure directories exist
    directory_check(output_dir)
    directory_check(output_dir + "/models")
    directory_check(output_dir + "/embeddings")
    temp_dir = output_dir + "/temp"
    directory_check(temp_dir)

    node2vec_init = n2v_init(temp_folder=temp_dir, **params)
    node2vec_fit = n2v_fit(**params)

    print("Beginning node2vec script")
    print("Graph: %s" % graph)
    for key, value in node2vec_init.items():
        print("%s: %s" % (key, value))
    for key, value in node2vec_fit.items():
        print("%s: %s" % (key, value))

    G = nx.read_gpickle(graph)

    if not directed:
        G = G.to_undirected()

    try:
        node2vec = Node2Vec(G, **node2vec_init)
        model = node2vec.fit(**node2vec_fit)
    except Exception as e:
        logging.error("Failed to run Node2Vec on graph")
        logging.error(e)
        raise  # model is undefined past this point, so do not continue

    embedding_file = generate_out_file("embeddings.pkl",
                                       output_dir + "/embeddings", tag)
    model_file = generate_out_file("model.pkl", output_dir + "/models", tag)

    # Save embeddings
    model.wv.save_word2vec_format(embedding_file)
    print("Embeddings saved to %s" % embedding_file)

    # Save model
    model.save(model_file)
    print("Model saved to %s" % model_file)

    print("Completed n2v.py")
Example 2
    def fit(self, save: bool = True):
        """Trains the Keras model, optionally saving it to disk."""

        # Initiate early stopping
        callbacks = []
        if self.early_stopping:
            early_stopping_monitor = EarlyStopping(patience=5)
            callbacks.append(early_stopping_monitor)

        # TODO: Fix tensorboard
        # Add TensorBoard
        # log_dir = config.TRAIN_LOGS + "/" + self.tag
        # tensorboard_callback = tf.keras.callbacks.TensorBoard(
        #     log_dir=log_dir, histogram_freq=1
        # )
        # callbacks.append(tensorboard_callback)

        self.model.fit(
            self.X_train,
            self.y_train,
            callbacks=callbacks,
            validation_data=(self.X_valid, self.y_valid),
            **self.training_params,
        )

        if save:
            outfile = generate_out_file("sp_model.h5", self.save_dir, self.tag)
            self.model.save(outfile)
            print(f"Model saved to {outfile}")
Example 3
def build(fred: bool, append_ids: bool, rdf_dir: str, out_dir: str, tag: str) -> None:
    """
    rdf_dir: Directory where RDFs are located
    out_dir: Directory to output graph and log
    fred: If True, leave FRED nodes intact
    append_ids: If True, append file ids to FRED nodes
    tag: a unique tag for the output files. Defaults to current time
    """
    tag_addend = "ontol" if append_ids else "full"
    tag = tag_addend + "-" + tag

    now = datetime.datetime.now()
    print("build_corpus_graph.py")
    print("----------------------")
    print(f"Now: {now}")
    print(f"RDF Dir: {rdf_dir}")
    print(f"Output Dir: {out_dir}")
    print(f"Keep Fred Nodes?: {fred}")
    print(f"Stitch only on non-FRED nodes?: {append_ids}")
    print(f"Experiment tag: {tag}")

    rdf_sub_dirs = [str(rdf_dir + "/" + x + "/") for x in os.listdir(rdf_dir)]
    # Compute the output path once; every pass after the first appends to it
    corpus_graph = generate_out_file("corpus_graph.pkl", out_dir, tag)
    for i, entry in enumerate(rdf_sub_dirs):
        print(f"Appending subgraphs from {entry}")
        if i == 0:
            create_graph(entry, out_dir, fred, append_ids, tag)
        else:
            create_graph(entry, out_dir, fred, append_ids, tag, existing=corpus_graph)

    now = datetime.datetime.now()
    print(f"Finished creating corpus graph {corpus_graph}")
    print(now)
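
A minimal invocation sketch (the paths and tag below are placeholders):

# Hypothetical call: stitch every RDF subdirectory under data/rdfs into one
# corpus graph, collapsing FRED nodes (fred=False) and appending file ids.
build(fred=False,
      append_ids=True,
      rdf_dir="data/rdfs",
      out_dir="output/graphs",
      tag="demo")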
Example 4
    def report(self):
        print(self.classification_report)
        print(f"AUC: {self.auc}")
        heatmap_file = generate_out_file("confusion.png", self.save_dir,
                                         self.tag)
        self.heatmap.figure.savefig(heatmap_file)
        print(f"Confusion matrix saved to {heatmap_file}")
Example 5
    def fit(self, save: bool = True):
        """Trains the model, optionally saving it to disk with joblib."""

        self.model.fit(self.X_train, self.y_train)

        if save:
            # Note: despite the .h5 extension, this file is a joblib pickle,
            # not a Keras/HDF5 model
            outfile = generate_out_file("sp_model.h5", self.save_dir, self.tag)
            joblib.dump(self.model, outfile)
            print(f"Model saved to {outfile}")
Example 6
def nodevec(graph: str, output_dir: str, directed: bool, tag: str,
            params: dict) -> None:
    """Runs the nodevectors implementation of Node2Vec on a NetworkX graph.

    Args:
        graph (str): Path to a pickled NetworkX graph
        output_dir (str): Directory where the model and embeddings will be saved
        directed (bool): If True, process as a directed graph
        tag (str): Tag appended to output file names
        params (dict): Dictionary of Node2Vec/Word2Vec parameters
    """

    # Ensure directories exist
    directory_check(output_dir)
    directory_check(output_dir + "/models")
    directory_check(output_dir + "/embeddings")
    temp_dir = output_dir + "/temp"
    directory_check(temp_dir)

    w2vparams = get_w2vparams(**params)
    node2vec_init = get_n2vparams(w2vparams=w2vparams, **params)

    print("Beginning node2vec script")
    print("File: %s" % graph)
    for key, value in node2vec_init.items():
        print("%s: %s" % (key, value))
    for key, value in w2vparams.items():
        print("%s: %s" % (key, value))

    G = nx.read_gpickle(graph)
    G = uri_to_str(G)

    if not directed:
        G = G.to_undirected()

    n2v_model = Node2Vec(**node2vec_init)
    n2v_model.fit(G)

    embedding_file = generate_out_file("embeddings.pkl",
                                       output_dir + "/embeddings", tag)
    model_file = generate_out_file("model.pkl", output_dir + "/models", tag)

    # Save embeddings
    n2v_model.model.wv.save_word2vec_format(embedding_file)
    print("Embeddings saved to %s" % embedding_file)

    # Save model
    n2v_model.model.save(model_file)
    print("Model saved to %s" % embedding_file)

    print("Completed nodevectors.py")
Example 7
    X_train, X_test, y_train, y_test = train_test_split(
        df,
        y,
        test_size=args.test_size,
        stratify=y,
        random_state=config.RANDOM_SEED)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train,
        y_train,
        test_size=args.test_size,
        stratify=y_train,
        random_state=config.RANDOM_SEED,
    )

    out_dir = config.SP_SPLITS_DIR + '/train'
    out_file = "X_train.pkl"
    out_file = generate_out_file(out_file, out_dir, tag)
    X_train.to_pickle(out_file)
    out_file = "y_train.npy"
    out_file = generate_out_file(out_file, out_dir, tag)
    np.save(out_file, y_train)

    out_dir = config.SP_SPLITS_DIR + '/valid'
    out_file = "X_valid.pkl"
    out_file = generate_out_file(out_file, out_dir, tag)
    X_valid.to_pickle(out_file)
    out_file = "y_valid.npy"
    out_file = generate_out_file(out_file, out_dir, tag)
    np.save(out_file, y_valid)

    out_dir = config.SP_SPLITS_DIR + '/test'
    out_file = "X_test.pkl"
    out_file = generate_out_file(out_file, out_dir, tag)
    X_test.to_pickle(out_file)
    out_file = "y_test.npy"
    out_file = generate_out_file(out_file, out_dir, tag)
    np.save(out_file, y_test)
Example 8
def create_graph(
    rdf_dir: str,
    out_dir: str,
    fred: bool,
    append: bool,
    tag: str = NOW,
    existing: str = None,
) -> None:
    """
    rdf_dir: Directory where RDFs are located
    out_dir: Directory to output graph and log
    fred: If True, leave FRED nodes intact
    append: If True, append file ids to FRED nodes
    tag (optional): a unique tag for the output files. Defaults to current time
    existing (option): path to an existing networkx graph to append to. Should be pickled format. Default None
    """

    full_graph = None

    # Initialize structure that will become final output graph
    if existing:
        full_graph = nx.read_gpickle(existing)
    else:
        full_graph = nx.MultiGraph()

    rdf_files = [x for x in os.listdir(rdf_dir) if x.endswith(".rdf")]

    # for every rdf file
    for rdf_file in tqdm(rdf_files):

        print(f"\nParsing {rdf_file}")

        rdf_path = rdf_dir + rdf_file

        graph = None
        ### Parse RDF Graph
        try:
            graph = get_rdfGraph(rdf_path)
        except Exception as e:
            logging.error("Failed to parse: %s" % rdf_file)
            logging.error(e)
            continue

        # Append unique RDF ids to FRED nodes (limits how much graph is combined)
        if append:
            uid = get_filename(rdf_path)
            graph = append_rdf_ids(graph, uid)

        # Make NetworkX Graph
        try:
            nx_graph = rdflib_to_networkx_multidigraph(graph)  # rdf -> networkx
        except Exception as e:
            logging.error("Failed to parse RDF to NetworkX: %s" % rdf_file)
            logging.error(e)
            continue

        # Collapse out FRED nodes
        if not fred:
            try:
                nx_graph = collapse_fred_nodes(nx_graph)
            except Exception as e:
                logging.error("Failed to collapse FRED Nodes: %s" % rdf_file)
                logging.error(e)
                continue

        # Add new graph to corpus graph
        try:
            full_graph = nx.compose(full_graph, nx_graph)
        except Exception as e:
            logging.error("Failed to append %s to corpus graph" % rdf_file)
            logging.error(e)
            continue

    out_graph = generate_out_file("corpus_graph.pkl", out_dir, tag)
    nx.write_gpickle(full_graph, out_graph)

    print(f"Completed appending {rdf_dir}")
Example 9
        # Compute the dataframe path once; later passes append to this file
        sp_df_file = generate_out_file("sp_df.pkl", args.out_dir, tag)
        for i, pair in enumerate(pairs):
            json_file, rdf_dir = pair
            if i == 0:
                generate_sp_df(
                    n2v_model_file=n2v_model_file,
                    snippets=json_file,
                    rdf_dir=rdf_dir,
                    out_dir=args.out_dir,
                    node_file=args.node_file,
                    tag=tag,
                    weighted=args.weighted,
                    directed=args.directed,
                )
            else:
                generate_sp_df(
                    n2v_model_file=n2v_model_file,
                    snippets=json_file,
                    rdf_dir=rdf_dir,
                    out_dir=args.out_dir,
                    node_file=args.node_file,
                    tag=tag,
                    weighted=args.weighted,
                    directed=args.directed,
                    existing=sp_df_file,
                )

now = datetime.datetime.now()
print(f"Finished creating shortest path dataframe {existing_df}")
print(now)
Example 10
def generate_sp_df(
    n2v_model_file: str,
    snippets: str,
    rdf_dir: str,
    out_dir: str,
    node_file: str,
    tag: str,
    weighted: bool = False,
    directed: bool = False,
    existing: str = None,
) -> pd.DataFrame:
    """Generates a dataframe of shortest path vectors between two nodes.

    Args:
        n2v_model_file (str): Path to Node2Vec model
        snippets (str): Path to a .json containing snippets containing relations
        rdf_dir (str): Path to the directory of RDFs corresponding to the snippets
        out_dir (str): The directory that the dataframe will be written to
        node_file (str): Path to pickled dataframe that contains terminal nodes for all relations
        tag (str): The experimental tag, to be appended to the output file name
        weighted (bool, optional): Process as weighted graph. Defaults to False.
        directed (bool, optional): Process as directed graph. Defaults to False. 
        existing (str, optional): Filepath to an existing dataframe to append to. Defaults to None.
    """

    now = datetime.datetime.now()
    print("-" * 30)
    print("Beginning shortest_path.generate_sp_df()")
    print("-" * 30)
    print(f"N2V Model: {n2v_model_file}")
    print(f"Snippet File: {snippets}")
    print(f"RDF Dir: {rdf_dir}")

    n2v_model = None
    nv = False
    if "nv" in n2v_model_file:
        n2v_model = load_nodevectors_model(n2v_model_file)
        nv = True
    else:
        n2v_model = load_n2v_model(n2v_model_file)

    data = []

    # Get list of .rdf files in directory
    rdfs = [x for x in os.listdir(rdf_dir) if x.endswith(".rdf")]
    relations = None
    # Derive the relation type from the snippet filename (very GREC specific)
    relation_type = snippets.split("/")[-1].split("_")[0].split(".")[0]

    # load terminal nodes into <nodes> df
    nodes_df = pd.read_pickle(node_file)

    # load snippets into <relations> variable
    with open(snippets, "r") as f_grec:
        relations = json.loads(f_grec.read())

    # for every .rdf in directory
    for rdf in rdfs:
        # generate path
        rdf_path = rdf_dir + "/" + rdf

        # set variables to retrieve from grec .json
        rating = None
        subj = None
        obj = None
        db_subj = None
        db_obj = None
        uid = rdf.split(".")[0]

        # get variables from grec .json
        for relation in relations:
            if relation["UID"] == uid:
                rating = relation["maj_vote"]
                subj = relation["sub"]
                obj = relation["obj"]
                db_subj = relation["dbpedia_sub"]
                db_obj = relation["dbpedia_obj"]
                break

        print(
            f"Processing {uid}: rating: {rating}, subject: {subj}, object: {obj}"
        )

        sub_node = nodes_df.loc[uid]["sub"]
        obj_node = nodes_df.loc[uid]["obj"]

        # if bad subject/object, skip to next rdf
        if "Not Found" == sub_node or "Not Found" == obj_node:
            print(f"ERROR: Bad subject or object, skipping {uid}")
            continue

        # Parse graphs, remove VN tags, collapse nodes, and undirect graph
        try:
            graph = get_rdfGraph(rdf_path)
            # graph = remove_vn_tags(graph)
            # graph = append_rdf_ids(graph, uid)
            nx_graph = rdflib_to_networkx_multidigraph(graph)
            nx_graph = collapse_fred_nodes(nx_graph)
            nx_graph = nx_graph.to_undirected()  # returns Multigraph object
            if directed:
                nx_graph = nx.DiGraph(nx_graph)
            else:
                nx_graph = nx.Graph(nx_graph)
        except Exception as e:
            print(f"ERROR: Could not generate graphs for {uid}.")
            print(e)
            continue

        if weighted:
            # Calculate weight for all edges
            try:
                nx_graph = to_weighted_graph(nx_graph, n2v_model, nv)
            except Exception as e:
                print(f"ERROR: Could not weight graph {uid}")
                print(e)
                continue

        # shortest path between subject and object (as a list)
        try:
            if weighted:
                shortest_path = nx.dijkstra_path(nx_graph, obj_node, sub_node)
            else:
                shortest_path = nx.shortest_path(nx_graph, obj_node, sub_node)
        except Exception:
            print(
                f"ERROR: No path found between {obj_node} and {sub_node}. Relation: {uid}"
            )
            continue

        # Calculate normalized vectors for path
        ## vector_final holds sum of all vectors in path
        vector_final = None

        ## get vector for every node and add them
        for node in shortest_path:
            vector = (get_nodevectors_vector(n2v_model, node)
                      if nv else get_n2v_vector(n2v_model, node))
            if vector_final is None:
                # for first vector
                vector_final = vector
            else:
                vector_final = vector_final + vector

        # if this is None, there was an error; skip
        if vector_final is None:
            print("ERROR: Issue with producing embeddings...")
            continue

        # Normalize vector
        n2v_norm = np.linalg.norm(vector_final)
        vector_final = vector_final / n2v_norm

        # append new entry to list
        new_entry = [uid, subj, obj, relation_type, rating, vector_final]
        data.append(new_entry)

        print(f"Finished processing {uid}")

    df = pd.DataFrame(data,
                      columns=[
                          "UID", "Subject", "Object", "Relation", "Maj_Vote",
                          "Short_Path"
                      ])
    out_file = generate_out_file("sp_df.pkl", out_dir, tag)

    if existing:
        df_existing = pd.read_pickle(existing)
        df = pd.concat([df_existing, df], ignore_index=True)

    df.to_pickle(out_file)
    print(f"Shortest paths written to {out_file}")
    print("Completed shortest_path.py execution")
    print("-" * 30)

    return df
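
A minimal invocation sketch (all paths are placeholders; a model filename containing "nv" routes loading through load_nodevectors_model instead of load_n2v_model):

# Hypothetical call producing the unweighted, undirected shortest-path dataframe.
df = generate_sp_df(
    n2v_model_file="output/n2v/models/model-demo.pkl",
    snippets="data/grec/institution.json",
    rdf_dir="data/rdfs/institution",
    out_dir="output/sp",
    node_file="output/nodes/terminal_nodes-demo.pkl",
    tag="demo",
)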
Example 11
    rdf_sub_dirs = [
        str(args.rdf_dir + "/" + x) for x in os.listdir(args.rdf_dir)
    ]
    jsons = [str(args.grec_dir + "/" + x) for x in os.listdir(args.grec_dir)]

    pairs = []
    for json_file in jsons:
        for rdf_sub_dir in rdf_sub_dirs:
            if json_relation_tag(json_file) in rdf_sub_dir:
                pairs.append([json_file, rdf_sub_dir])
                break  # pair each json with its first matching RDF subdirectory

    # Compute the dataframe path once; later passes append to this file
    node_df_file = generate_out_file("terminal_nodes.pkl", args.out_dir, tag)
    for i, pair in enumerate(pairs):
        json_file, rdf_dir = pair
        if i == 0:
            generate_terminal_node_df(snippets=json_file,
                                      rdf_dir=rdf_dir,
                                      out_dir=args.out_dir,
                                      tag=tag)
        else:
            generate_terminal_node_df(snippets=json_file,
                                      rdf_dir=rdf_dir,
                                      out_dir=args.out_dir,
                                      tag=tag,
                                      existing=node_df_file)

now = datetime.datetime.now()
print(f"Finished creating shortest path dataframe {existing_df}")
print(now)