Example #1
def n2v(graph: str, output_dir: str, directed: bool, tag: str,
        params: dict) -> None:
    """Runs the SNAP implementation of Node2Vec on a NetworkX graph

    Args:
        graph (str): Path to a pickled NetworkX Graph
        output_dir (str): The directory that will save Node2Vec Model.
        directed (bool): If True, process as directed graph
        tag (str): The tag that will be appended to output files, useful for IDing 
        params (dict): Dictionary of Node2Vec/Word2Vec Parameters
    """

    # Ensure directories exist
    directory_check(output_dir)
    directory_check(output_dir + "/models")
    directory_check(output_dir + "/embeddings")
    temp_dir = output_dir + "/temp"
    directory_check(temp_dir)

    node2vec_init = n2v_init(temp_folder=temp_dir, **params)
    node2vec_fit = n2v_fit(**params)

    print("Beginning node2vec script")
    print("Graph: %s" % graph)
    for key, value in node2vec_init.items():
        print("%s: %s" % (key, value))
    for key, value in node2vec_fit.items():
        print("%s: %s" % (key, value))

    G = nx.read_gpickle(graph)

    if not directed:
        G = G.to_undirected()

    try:
        node2vec = Node2Vec(G, **node2vec_init)
        model = node2vec.fit(**node2vec_fit)
    except Exception as e:
        logging.error("Failed to run Node2Vec on graph")
        logging.error(e)
        raise  # without a fitted model there is nothing to save below

    embedding_file = generate_out_file("embeddings.pkl",
                                       output_dir + "/embeddings", tag)
    model_file = generate_out_file("model.pkl", output_dir + "/models", tag)

    # Save embeddings
    model.wv.save_word2vec_format(embedding_file)
    print("Embeddings saved to %s" % embedding_file)

    # Save model
    model.save(model_file)
    print("Model saved to %s" % model_file)

    print("Completed n2v.py")
Example #2
def generate_rdfs(
    api_key_file: str, snippets: dict, out_dir: str, api_method: str = "limited"
) -> None:
    """Generates a directory of RDF files that have been passed to the FRED API from a set of strings

    Args:
        api_key_file (str): The file path to the api key
        snippets (dict): A dictionary of snippets such that UID: snippet
        out_dir (str): The directory to store generated RDFs within
        api_method ([type], optional): Limited or unlimited api behavior. Defaults to 'limited':str.
    """
    # Check for directory and extract api_key
    directory_check(out_dir)
    api_key = get_api_key(api_key_file)

    # For every snippet, call FRED API
    for uid, snip in tqdm(snippets.items()):
        out_file = out_dir + uid + ".rdf"

        # Based on FRED API Limits with generic API key
        if api_method == "limited":
            with RL_DAY:
                with RL_MINUTE:
                    try:
                        checkFredSentence(snip, api_key, out_file)
                        print("Successfully parsed ", uid)
                    except Exception as e:
                        print("Failed to parse ", uid, " Snippet: ", snip)
                        print(e)
        # If a special key has been provided by FRED research team...
        elif api_method == "unlimited":
            try:
                checkFredSentence(snip, api_key, out_file)
                print("Successfully parsed ", uid)
            except Exception as e:
                print("Failed to parse ", uid, " Snippet: ", snip)
                print(e)
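
A usage sketch with a made-up UID and key path (both illustrative). On success this writes rdfs/q42_0.rdf, pacing requests through the RL_DAY/RL_MINUTE rate limiters.

snippets = {"q42_0": "Marie Curie was born in Warsaw."}
generate_rdfs("keys/fred.key", snippets, "rdfs/", api_method="limited")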
Example #3
def nodevec(graph: str, output_dir: str, directed: bool, tag: str,
            params: dict) -> None:
    """Runs the nodevectors implementation of Node2Vec on a pickled NetworkX graph."""

    # Ensure directories exist
    directory_check(output_dir)
    directory_check(output_dir + "/models")
    directory_check(output_dir + "/embeddings")
    temp_dir = output_dir + "/temp"
    directory_check(temp_dir)

    w2vparams = get_w2vparams(**params)
    node2vec_init = get_n2vparams(w2vparams=w2vparams, **params)

    print("Beginning node2vec script")
    print("File: %s" % graph)
    for key, value in node2vec_init.items():
        print("%s: %s" % (key, value))
    for key, value in w2vparams.items():
        print("%s: %s" % (key, value))

    G = nx.read_gpickle(graph)
    G = uri_to_str(G)

    if not directed:
        G = G.to_undirected()

    n2v_model = Node2Vec(**node2vec_init)
    n2v_model.fit(G)

    embedding_file = generate_out_file("embeddings.pkl",
                                       output_dir + "/embeddings", tag)
    model_file = generate_out_file("model.pkl", output_dir + "/models", tag)

    # Save embeddings
    n2v_model.model.wv.save_word2vec_format(embedding_file)
    print("Embeddings saved to %s" % embedding_file)

    # Save model
    n2v_model.model.save(model_file)
    print("Model saved to %s" % embedding_file)

    print("Completed nodevectors.py")
Example #4
        # Read the relations JSON
        with open(j_file) as f:
            relations = json.load(f)
        # Build the UID -> snippet dictionary
        snips = dict()
        for relation in relations:
            uid = relation["UID"]
            snippet = get_snippet(relation)
            len_snip = len(snippet.split())
            # Skip snippets whose word count is an outlier (min/max come from
            # the enclosing scope), unless the UID contains "_cr"
            if (len_snip > max or len_snip < min) and "_cr" not in uid:
                continue
            snips[uid] = snippet

        generate_rdfs(api_key_file, snips, out_dir, api_method)


if __name__ == "__main__":
    args = arg_parse()

    api_method = args.api
    api_key_file = args.api_key
    grec_dir = args.grec
    rdf_dir = args.rdf

    directory_check(rdf_dir)

    main(api_method, api_key_file, grec_dir, rdf_dir)
Example #5
    X_train = X_train["Short_Path"].apply(pd.Series)
    X_valid = X_valid["Short_Path"].apply(pd.Series)
    X_test = X_test["Short_Path"].apply(pd.Series)

    return (X_train, y_train), (X_valid, y_valid), (X_test, y_test)
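
# Note: .apply(pd.Series) above expands a column of sequences into one
# column per element. A tiny hypothetical frame makes the effect concrete:
#   pd.DataFrame({"Short_Path": [[0.1, 0.2]]})["Short_Path"].apply(pd.Series)
#   =>      0    1
#      0  0.1  0.2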


if __name__ == "__main__":
    now = datetime.datetime.now().strftime("%y%m%d")

    args = arg_parse()
    assert args.model_name is not None, "Must provide name of Model"
    assert args.in_tag is not None, "Must provide tag for Training Data"

    directory_check(args.in_dir, create=False)
    directory_check(args.out_dir)
    directory_check(config.TRAIN_LOGS)

    tag = f"{args.model_name}-"
    if args.cv:
        tag += "cv-"
    if args.no_early_stopping:
        tag += "nes-"
    tag += f"{args.model_name}-"
    tag += f"{remove_tag_date(args.in_tag)}-{now}"

    print("train.py")
    print("-" * 30)
    print(f"Now: {now}")
    print(f"Model: {args.model_name}")
Example #6

def add_negative_samples(df):
    """Duplicates rows voted "no" as negative ("none"-relation) examples."""
    # .copy() avoids mutating a view of df (SettingWithCopyWarning)
    df_no = df.loc[df["Maj_Vote"] == "no"].copy()
    df_no["Relation"] = "none"
    df_no["Maj_Vote"] = "yes"
    # pd.concat replaces the deprecated DataFrame.append (assumes pandas as pd)
    df = pd.concat([df, df_no])
    return df
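
# Hypothetical illustration: a row with Relation="born_in", Maj_Vote="no"
# gains a duplicate with Relation="none", Maj_Vote="yes", so annotator
# rejections become explicit negative training examples.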


if __name__ == "__main__":
    args = arg_parse()

    now = datetime.datetime.now().strftime("%y%m%d")

    directory_check(args.in_dir)
    directory_check(args.out_dir)

    # Can manually write in path to shortest path pickles, if deviating too far from standard execution
    sp_files = [(args.in_dir + '/' + x) for x in os.listdir(args.in_dir)
                if x.endswith('.pkl')]
    assert len(sp_files) <= 2, "Only a maximum of two dataframes can be processed"

    tag = ""
    if args.neg:
        tag += "neg-"
    if args.unbalanced:
        tag += "unbal-"
    tag += get_experiment_tag(sp_files[0]) + f"-{now}"
Example #7
    if arg_list:
        return parser.parse_args(args=arg_list)
    else:
        return parser.parse_args()


if __name__ == "__main__":

    now = datetime.datetime.now().strftime("%y%m%d")

    args = arg_parse()
    arg_dict = vars(args)

    directory_check(args.n2v_model_dir, create=False)
    directory_check(args.grec_dir, create=False)
    directory_check(args.rdf_dir, create=False)
    directory_check(args.out_dir)

    n2v_model_files = []
    tags = None

    if args.custom != "None":
        tags = [sp_params.sp_param_dict[args.custom]]
    else:
        tags = [sp_params.sp_param_dict["best"]]

    models = os.listdir(args.n2v_model_dir)

    for tag_i in tags:
Example #8
    else:
        return parser.parse_args()


def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)


def download_file(url, destination):
    session = requests.Session()
    # stream=True avoids loading the whole file into memory before chunking
    response = session.get(url, stream=True)
    destination += "/" + pathlib.Path(url).name
    save_response_content(response, destination)


args = arg_parse()
out_dir = args.output_dir  # "dir" would shadow the built-in

directory_check(out_dir)

for url in GREC_URLS:
    print(f"Downloading {pathlib.Path(url).name} ...")
    download_file(url, out_dir)

print(f"Copying ./data/cr.json to {out_dir}")
copyfile('./data/cr.json', out_dir + '/cr.json')
Example #9
    if arg_list:
        return parser.parse_args(args=arg_list)
    else:
        return parser.parse_args()


if __name__ == "__main__":

    now = datetime.datetime.now().strftime("%y%m%d")

    args = arg_parse()
    arg_dict = vars(args)
    assert args.in_tag is not None, "Must provide tag for corpus graph"

    directory_check(args.graph_dir, create=False)
    in_graph = args.graph_dir + "/corpus_graph-" + args.in_tag + ".pkl"
    itag = args.in_tag.split("-")[0]
    tag = None

    nv = arg_dict["nodevectors"]

    params = None

    out_dir = arg_dict.pop("out_dir")
    directed = arg_dict.pop("directed")

    if args.custom != "None":
        params = [n2v_params.n2v_param_dict[args.custom]]
    else:
        params = [n2v_params.n2v_param_dict["best"]]
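
The layout of n2v_params.n2v_param_dict is not shown in this snippet; a purely hypothetical shape consistent with how it is indexed above:

n2v_param_dict = {
    "best": {"dimensions": 128, "walk_length": 30, "num_walks": 200},
    "small": {"dimensions": 32, "walk_length": 10, "num_walks": 50},
}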