Example #1
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

from cocube_beta import modify_phrases
# NOTE: get_label_term_json (used below) is assumed to come from the project's
# utility module; its import is not shown in this snippet.

if __name__ == "__main__":
    basepath = "../data/"
    dataset = "dblp/"
    pkl_dump_dir = basepath + dataset

    with open(
            pkl_dump_dir +
            "df_mapped_labels_phrase_removed_stopwords_baseline_metadata.pkl",
            "rb") as handler:
        df = pickle.load(handler)

    phrase_id_map = pickle.load(open(pkl_dump_dir + "phrase_id_map.pkl", "rb"))

    label_term_dict = get_label_term_json(pkl_dump_dir + "seedwords_run3.json")
    label_term_dict = modify_phrases(label_term_dict, phrase_id_map)

    print(label_term_dict)

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df["abstract"])
    X_arr = X.toarray()
    # NOTE: get_feature_names() was removed in scikit-learn 1.2; newer versions
    # need list(vectorizer.get_feature_names_out()) instead.
    names = vectorizer.get_feature_names()

    label_term_index_dict = {}
    for i in label_term_dict:
        label_term_index_dict[i] = []
        for w in label_term_dict[i]:
            try:
                label_term_index_dict[i].append(names.index(w))
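
The modify_phrases call above presumably rewrites multi-word seed terms into the single phrase-id tokens produced during phrase segmentation. A minimal sketch of that idea, assuming phrase_id_map maps a phrase string to its token id (the real implementation lives in cocube_beta and may differ):

def modify_phrases_sketch(label_term_dict, phrase_id_map):
    # Illustration only: replace any seed term found in phrase_id_map with its
    # phrase-id token and keep single-word seeds unchanged.
    return {
        label: [phrase_id_map.get(term, term) for term in terms]
        for label, terms in label_term_dict.items()
    }
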
Example #2
# The top of this snippet is cut off on the page; the enclosing function presumably
# begins roughly as follows (reconstructed from the body below; imports of pickle,
# numpy as np and gensim's Doc2Vec are also assumed to sit above the cut):
def get_label_w2v_dict(label_term_dict):
    label_w2v_dict = {}
    for l in label_term_dict:
        temp = np.zeros(model.vector_size)  # assumed accumulator for this label's seed-word vectors
        for w in label_term_dict[l]:
            try:
                temp += model.infer_vector([w])
            except Exception as e:
                print("Word ", w, e)
        label_w2v_dict[l] = temp
    return label_w2v_dict


if __name__ == "__main__":
    basepath = "../data/"
    dataset = "nyt/"
    pkl_dump_dir = basepath + dataset

    with open(pkl_dump_dir + "df_tokenized_clean_child.pkl", "rb") as handler:
        df = pickle.load(handler)

    label_term_dict = get_label_term_json(pkl_dump_dir + "seedwords_child_uncon.json")

    # documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df["sentence"])]
    # vec_size = 100
    # alpha = 0.025
    #
    # model = Doc2Vec(documents, vector_size=vec_size, window=2, min_count=1, workers=4, iter=100)
    # model.save(pkl_dump_dir + "d2v.model")
    # print("Model Saved")

    model = Doc2Vec.load(pkl_dump_dir + "d2v.model")

    label_w2v_dict = get_label_w2v_dict(label_term_dict)

    pred = []
    label_count, term_count = A_LT.shape
    for i in range(label_count):
        print("*" * 80)
        print("For Label ", index_to_label[i] + " : ")
        for j in range(term_count):
            if A_LT[i][j]:
                print(index_to_word[j])
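
The pred list above suggests that documents are subsequently labelled by comparing each document's inferred vector with the label vectors. A minimal sketch of one such scoring step, assuming cosine similarity between Doc2Vec vectors (illustrative only, not necessarily the scoring used in the original script):

import numpy as np

def predict_labels_sketch(df, model, label_w2v_dict):
    # Illustration only: score each document against every label vector with
    # cosine similarity and keep the best-scoring label.
    preds = []
    for tokens in df["sentence"]:
        doc_vec = model.infer_vector(tokens)
        best_label, best_score = None, float("-inf")
        for label, label_vec in label_w2v_dict.items():
            denom = np.linalg.norm(doc_vec) * np.linalg.norm(label_vec) + 1e-12
            score = float(np.dot(doc_vec, label_vec)) / denom
            if score > best_score:
                best_label, best_score = label, score
        preds.append(best_label)
    return preds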


Example #3
if __name__ == "__main__":
    basepath = "/data3/jingbo/dheeraj/"
    dataset = "20news/"
    pkl_dump_dir = basepath + dataset

    df = pickle.load(open(pkl_dump_dir + "df_tokenized_limit_clean_parent.pkl", "rb"))
    label_term_dict = get_label_term_json(pkl_dump_dir + "seedwords_parent_uncon.json")

    word_vec = Word2Vec.load(pkl_dump_dir + "w2v.model_parent")
    # word_vec = pickle.load(open(pkl_dump_dir + "word_vec_tokenized_clean_removed_stopwords.pkl", "rb"))
    labels, label_to_index, index_to_label = get_distinct_labels(df)

    # U_D = get_UD(df, word_vec)
    # U_D = pickle.load(open(pkl_dump_dir + "U_D.pkl", "rb"))
    word_to_index, index_to_word, U_T = get_UT(word_vec)
    A_LT = get_ALT(index_to_label, word_to_index, label_term_dict)
    U_L = get_UL(A_LT, U_T)
    A_TD = get_ATD(df, word_to_index)
    U_D = np.transpose(A_TD)
    docfreq = get_doc_freq(df)
    t = 20
    threshold = 0.4
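
get_ALT and get_UL are not shown on this page. One plausible reading, given how they are used above, is that A_LT is a binary label-by-term indicator built from the seed words and U_L averages the seed-term embeddings per label; a sketch under those assumptions (the helper names below are hypothetical):

import numpy as np

def get_ALT_sketch(index_to_label, word_to_index, label_term_dict):
    # Assumed behaviour: A_LT[i, j] = 1 when term j is a seed word of label i.
    # index_to_label is assumed to be a dict {index: label name}.
    label_to_index = {label: i for i, label in index_to_label.items()}
    A_LT = np.zeros((len(index_to_label), len(word_to_index)))
    for label, terms in label_term_dict.items():
        for term in terms:
            if term in word_to_index:
                A_LT[label_to_index[label], word_to_index[term]] = 1.0
    return A_LT

def get_UL_sketch(A_LT, U_T):
    # Assumed behaviour: each label embedding is the mean of its seed-term
    # embeddings, where U_T holds one term embedding per row.
    counts = np.maximum(A_LT.sum(axis=1, keepdims=True), 1.0)
    return (A_LT @ U_T) / counts
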
Example #4
    return X, y, y_true


if __name__ == "__main__":
    # base_path = "./data/"
    base_path = "/data4/dheeraj/metaguide/"
    dataset = "books/"

    data_path = base_path + dataset
    df = pickle.load(open(data_path + "df_phrase_removed_stopwords.pkl", "rb"))
    tokenizer = pickle.load(open(data_path + "tokenizer.pkl", "rb"))
    phrase_id_map = pickle.load(open(data_path + "phrase_id_map.pkl", "rb"))
    id_phrase_map = pickle.load(open(data_path + "id_phrase_map.pkl", "rb"))

    labels, label_to_index, index_to_label = get_distinct_labels(df)
    label_term_dict = get_label_term_json(data_path + "seedwords.json")
    label_term_dict = modify_phrases(label_term_dict, phrase_id_map)

    graph, metapaths = get_book_graph_metapaths(df, tokenizer, id_phrase_map)

    print(
        "Number of nodes {} and number of edges {} in graph.".format(
            graph.number_of_nodes(), graph.number_of_edges()
        )
    )

    rw = UniformRandomMetaPathWalk(graph)
    walks = rw.run(
        nodes=list(graph.nodes()),  # root nodes
        length=5,  # maximum length of a random walk
        n=5,  # number of random walks per root node
        metapaths=metapaths,  # the page cuts the call off here; closing arguments assumed
    )
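
With stellargraph, metapaths are given as lists of node-type sequences, and the generated walks are typically fed into a skip-gram model to obtain node embeddings (metapath2vec style). A hedged sketch of that follow-up step, which is not part of the snippet above (node-type names and hyperparameters are illustrative):

from gensim.models import Word2Vec

# A metapath list of the form expected by UniformRandomMetaPathWalk; the actual
# node types come from get_book_graph_metapaths above and may differ:
# metapaths = [["book", "author", "book"], ["book", "phrase", "book"]]

# Train a skip-gram model on the walks to get one embedding per graph node.
node_model = Word2Vec(
    [[str(node) for node in walk] for walk in walks],
    vector_size=128,
    window=5,
    min_count=0,
    sg=1,  # skip-gram
    workers=4,
)
node_embeddings = {node: node_model.wv[node] for node in node_model.wv.index_to_key}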