import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

from cocube_beta import modify_phrases
# get_label_term_json is a project helper that loads the seed-word JSON into a
# {label: [seed terms]} dict; it is assumed to be defined or imported elsewhere in the repo.

if __name__ == "__main__":
    basepath = "../data/"
    dataset = "dblp/"
    pkl_dump_dir = basepath + dataset

    with open(
        pkl_dump_dir + "df_mapped_labels_phrase_removed_stopwords_baseline_metadata.pkl",
        "rb",
    ) as handler:
        df = pickle.load(handler)
    phrase_id_map = pickle.load(open(pkl_dump_dir + "phrase_id_map.pkl", "rb"))

    # Load the per-label seed words and replace multi-word seeds with their phrase ids.
    label_term_dict = get_label_term_json(pkl_dump_dir + "seedwords_run3.json")
    label_term_dict = modify_phrases(label_term_dict, phrase_id_map)
    print(label_term_dict)

    # TF-IDF over the abstracts; `names` is the learned vocabulary in column order
    # (get_feature_names was renamed get_feature_names_out in newer scikit-learn).
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df["abstract"])
    X_arr = X.toarray()
    names = vectorizer.get_feature_names()

    # Map every label's seed words to their TF-IDF column indices.
    label_term_index_dict = {}
    for i in label_term_dict:
        label_term_index_dict[i] = []
        for w in label_term_dict[i]:
            try:
                label_term_index_dict[i].append(names.index(w))
            except Exception as e:
                # Assumed handler (the snippet breaks off inside the try); it mirrors the
                # pattern used elsewhere in this code: skip seeds missing from the vocabulary.
                print("Word ", w, e)
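
    # A minimal, hypothetical continuation (not in the original script): one natural use of
    # `label_term_index_dict` is to score every abstract by the summed TF-IDF weight of each
    # label's seed-word columns and keep the best-scoring label per document. The names
    # `label_names`, `label_scores`, and `pred_labels` are illustrative only.
    label_names = list(label_term_index_dict)
    label_scores = [X_arr[:, label_term_index_dict[l]].sum(axis=1) for l in label_names]
    pred_labels = [
        label_names[max(range(len(label_names)), key=lambda k: label_scores[k][d])]
        for d in range(X_arr.shape[0])
    ]
    print(pred_labels[:10])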
import pickle

import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def get_label_w2v_dict(label_term_dict):
    # Only the tail of this function was present in the snippet; the signature, the two
    # loops, and the zero-initialised accumulator below are assumed. For every label, it
    # sums the Doc2Vec vectors inferred for its seed words, using the globally loaded `model`.
    label_w2v_dict = {}
    for l in label_term_dict:
        temp = np.zeros(model.vector_size)
        for w in label_term_dict[l]:
            try:
                temp += model.infer_vector([w])
            except Exception as e:
                print("Word ", w, e)
        label_w2v_dict[l] = temp
    return label_w2v_dict


if __name__ == "__main__":
    basepath = "../data/"
    dataset = "nyt/"
    pkl_dump_dir = basepath + dataset

    with open(pkl_dump_dir + "df_tokenized_clean_child.pkl", "rb") as handler:
        df = pickle.load(handler)
    label_term_dict = get_label_term_json(pkl_dump_dir + "seedwords_child_uncon.json")

    # One-off Doc2Vec training, kept commented out; the saved model is loaded below instead.
    # documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df["sentence"])]
    # vec_size = 100
    # alpha = 0.025
    #
    # model = Doc2Vec(documents, vector_size=vec_size, window=2, min_count=1, workers=4, iter=100)
    # model.save(pkl_dump_dir + "d2v.model")
    # print("Model Saved")
    model = Doc2Vec.load(pkl_dump_dir + "d2v.model")

    label_w2v_dict = get_label_w2v_dict(label_term_dict)
    pred = []
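
    # A minimal, hypothetical continuation (the original snippet stops at `pred = []`):
    # infer a Doc2Vec vector per document and assign the label whose seed-word vector is
    # most cosine-similar. `cosine_similarity` is scikit-learn's; `tokens`, `doc_vec`,
    # and `best` are illustrative names, and df["sentence"] is assumed to hold token lists.
    from sklearn.metrics.pairwise import cosine_similarity

    for tokens in df["sentence"]:
        doc_vec = model.infer_vector(tokens).reshape(1, -1)
        best = max(
            label_w2v_dict,
            key=lambda l: cosine_similarity(doc_vec, label_w2v_dict[l].reshape(1, -1))[0][0],
        )
        pred.append(best)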
import pickle

import numpy as np
from gensim.models import Word2Vec
# get_label_term_json, get_distinct_labels, get_UT, get_ALT, get_UL, get_ATD, and
# get_doc_freq are project helpers assumed to be defined earlier in this file or imported.


def print_label_seed_terms(A_LT, index_to_label, index_to_word):
    # Only the body of this helper survived in the snippet; the wrapping function and its
    # (hypothetical) name are assumed. It prints, per label, every vocabulary term with a
    # nonzero entry in the label-term matrix A_LT.
    label_count, term_count = A_LT.shape
    for i in range(label_count):
        print("*" * 80)
        print("For Label ", index_to_label[i] + " : ")
        for j in range(term_count):
            if A_LT[i][j]:
                print(index_to_word[j])


if __name__ == "__main__":
    basepath = "/data3/jingbo/dheeraj/"
    dataset = "20news/"
    pkl_dump_dir = basepath + dataset

    df = pickle.load(open(pkl_dump_dir + "df_tokenized_limit_clean_parent.pkl", "rb"))
    label_term_dict = get_label_term_json(pkl_dump_dir + "seedwords_parent_uncon.json")

    word_vec = Word2Vec.load(pkl_dump_dir + "w2v.model_parent")
    # word_vec = pickle.load(open(pkl_dump_dir + "word_vec_tokenized_clean_removed_stopwords.pkl", "rb"))

    labels, label_to_index, index_to_label = get_distinct_labels(df)
    # U_D = get_UD(df, word_vec)
    # U_D = pickle.load(open(pkl_dump_dir + "U_D.pkl", "rb"))

    # Build the term embeddings U_T, the label-term seed matrix A_LT, the label
    # embeddings U_L, and the term-document matrix A_TD; U_D is its transpose.
    word_to_index, index_to_word, U_T = get_UT(word_vec)
    A_LT = get_ALT(index_to_label, word_to_index, label_term_dict)
    U_L = get_UL(A_LT, U_T)
    A_TD = get_ATD(df, word_to_index)
    U_D = np.transpose(A_TD)

    docfreq = get_doc_freq(df)
    t = 20
    threshold = 0.4
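
    # Illustrative call (not in the original snippet): dump the nonzero entries of A_LT to
    # sanity-check which vocabulary terms were matched to each label's seed words.
    # `print_label_seed_terms` is the hypothetically named helper reconstructed above.
    print_label_seed_terms(A_LT, index_to_label, index_to_word)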
import pickle

from stellargraph.data import UniformRandomMetaPathWalk
# get_distinct_labels, get_label_term_json, modify_phrases, and get_book_graph_metapaths
# are project helpers assumed to be defined earlier in this file or imported.

    return X, y, y_true  # tail of a preceding data-preparation helper whose body is not shown


if __name__ == "__main__":
    # base_path = "./data/"
    base_path = "/data4/dheeraj/metaguide/"
    dataset = "books/"
    data_path = base_path + dataset

    df = pickle.load(open(data_path + "df_phrase_removed_stopwords.pkl", "rb"))
    tokenizer = pickle.load(open(data_path + "tokenizer.pkl", "rb"))
    phrase_id_map = pickle.load(open(data_path + "phrase_id_map.pkl", "rb"))
    id_phrase_map = pickle.load(open(data_path + "id_phrase_map.pkl", "rb"))

    labels, label_to_index, index_to_label = get_distinct_labels(df)
    label_term_dict = get_label_term_json(data_path + "seedwords.json")
    label_term_dict = modify_phrases(label_term_dict, phrase_id_map)

    # Build the heterogeneous book graph and its metapaths, then run uniform random
    # metapath walks over it with StellarGraph's UniformRandomMetaPathWalk.
    graph, metapaths = get_book_graph_metapaths(df, tokenizer, id_phrase_map)
    print(
        "Number of nodes {} and number of edges {} in graph.".format(
            graph.number_of_nodes(), graph.number_of_edges()
        )
    )

    rw = UniformRandomMetaPathWalk(graph)
    walks = rw.run(
        nodes=list(graph.nodes()),  # root nodes
        length=5,  # maximum length of a random walk
        n=5,  # number of random walks per root node