def main(layer, edge_path, edge_filename, output_path, walk_filename,
         n, length, p, q, is_weighted, is_directed, job_id):
    weighted = is_weighted == 1
    directed = is_directed == 1

    file_name = os.path.join(edge_path, layer, edge_filename)
    tmp_edge = pd.read_csv(file_name, index_col="Unnamed: 0")
    tmp_edge[['source', 'target']] = tmp_edge[['source', 'target']].astype(str)

    # 1) for each layer, first create an nx DiGraph
    nxg = graph_utils.Build_nx_Graph(source_target_weight=tmp_edge, directed=True)

    # 2) create the StellarDiGraph
    sdg = StellarDiGraph(nxg)

    # 3) initialize the walk and do the begin checks
    BDWW.BeginWalk(sdg, begin_checks=True, weighted=True, directed=True)
    rw = BDWW.BiasedDirectedRandomWalk(sdg, directed=True, weighted=True,
                                       begin_checks=False)

    nodes = list(sdg.nodes())
    walks = rw.run(nodes=nodes, length=length, n=n, p=p, q=q,
                   weighted=weighted, directed=directed)

    result_path = os.path.join(output_path, layer)
    if not os.path.isdir(result_path):
        print("making a new directory for the output")
        os.mkdir(result_path)

    if job_id is not None:
        walk_file_name = walk_filename.split(".")[0] + "_" + str(job_id) + ".csv"
    else:
        walk_file_name = walk_filename

    utils.Write_List_of_Lists_from_CSV(result_path, walk_file_name, walks)
def test_tie_breaking(tie_breaking):
    pred_scores = np.array(
        [
            [1, 5, 8],  # true_modified_node_ilocs:
            [1, 3, 8],  # 1
            [1, 2, 7],  # 2
            [1, 2, 6],  # 3
        ]
    )
    known_edges_graph = StellarDiGraph(
        nodes=pd.DataFrame(index=["a", "b", "c", "d"]),
        edges=pd.DataFrame(
            [
                # preds[0, :]: edge being predicted, checking it's counted properly for 'filtered'
                ("a", "b"),
                # preds[1, :]: the other tied edge, to see the 'bottom' score move up
                ("b", "d"),
            ],
            columns=["source", "target"],
        ),
    )

    copies = 100
    rankings = [
        _ranks_from_score_columns(
            pred_scores,
            true_modified_node_ilocs=np.array([1, 2, 3]),
            unmodified_node_ilocs=np.array([0, 1, 2]),
            true_rel_ilocs=np.array([0, 0, 0]),
            modified_object=True,
            known_edges_graph=known_edges_graph,
            tie_breaking=tie_breaking,
        )
        for _ in range(copies)
    ]
    all_rankings = np.array(rankings)
    assert all_rankings.shape == (copies, 2, 3)

    top_expected = np.repeat([[[1, 3, 4], [1, 3, 4]]], copies, axis=0)
    bottom_expected = np.repeat([[[4, 4, 4], [4, 3, 4]]], copies, axis=0)

    if tie_breaking == "top":
        np.testing.assert_array_equal(all_rankings, top_expected)
    elif tie_breaking == "bottom":
        np.testing.assert_array_equal(all_rankings, bottom_expected)
    elif tie_breaking == "random":
        assert (all_rankings >= top_expected).all()
        assert (all_rankings <= bottom_expected).all()
        # check both raw and filtered results (independently) have some variation in them
        for i in range(all_rankings.shape[1]):
            raw_or_filtered = all_rankings[:, i, :]
            assert (raw_or_filtered != top_expected[:, i, :]).any()
            assert (raw_or_filtered != bottom_expected[:, i, :]).any()
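# The `tie_breaking` argument above is presumably supplied by pytest
# parametrization; the decorator is not part of this excerpt, but a driver
# covering the three branches the test checks would plausibly look like:
#
#     @pytest.mark.parametrize("tie_breaking", ["top", "bottom", "random"])
#     def test_tie_breaking(tie_breaking):
#         ...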
def tree_graph() -> StellarGraph:
    nodes = pd.DataFrame(index=["root", "0", 1, 2, "c1.1", "c2.1", "c2.2"])
    edges = pd.DataFrame(
        [
            ("root", 2),
            ("root", 1),
            ("root", "0"),
            (2, "c2.1"),
            (2, "c2.2"),
            (1, "c1.1"),
        ],
        columns=["source", "target"],
    )
    return StellarDiGraph(nodes, edges)
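# Structure of the fixture above, for reference (the mix of string and integer
# node IDs is deliberate):
#
#              root
#            /  |   \
#          "0"  1     2
#               |    / \
#            c1.1 c2.1 c2.2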
def knowledge_graph():
    nodes = ["a", "b", "c", "d"]

    edge_counter = 0

    def edge_df(*elements):
        nonlocal edge_counter
        end = edge_counter + len(elements)
        index = range(edge_counter, end)
        edge_counter = end
        return pd.DataFrame(elements, columns=["source", "target"], index=index)

    edges = {
        "W": edge_df(("a", "b")),
        "X": edge_df(("a", "b"), ("b", "c")),
        "Y": edge_df(("b", "a")),
        "Z": edge_df(("d", "b")),
    }
    return StellarDiGraph(nodes=pd.DataFrame(index=nodes), edges=edges)
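# The next test calls a `triple_df` helper that is not defined in this excerpt;
# a minimal sketch consistent with how it is used there (a DataFrame of
# (source, label, target) triples) would be:

def triple_df(*triples):
    return pd.DataFrame(triples, columns=["source", "label", "target"])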
def test_model_rankings(model_maker):
    nodes = pd.DataFrame(index=["a", "b", "c", "d"])
    rels = ["W", "X", "Y", "Z"]
    empty = pd.DataFrame(columns=["source", "target"])
    every_edge = itertools.product(nodes.index, rels, nodes.index)
    every_edge_df = triple_df(*every_edge)

    no_edges = StellarDiGraph(nodes, {name: empty for name in rels})

    # the filtering is most interesting when there's a smattering of edges, somewhere between none
    # and all; this does a stratified sample by label, to make sure there's at least one edge from
    # each label.
    one_per_label_df = (
        every_edge_df.groupby("label").apply(lambda df: df.sample(n=1)).droplevel(0)
    )
    others_df = every_edge_df.sample(frac=0.25)
    some_edges_df = pd.concat([one_per_label_df, others_df], ignore_index=True)

    some_edges = StellarDiGraph(
        nodes,
        {name: df.drop(columns="label") for name, df in some_edges_df.groupby("label")},
    )
    all_edges = StellarDiGraph(
        nodes=nodes,
        edges={
            name: df.drop(columns="label") for name, df in every_edge_df.groupby("label")
        },
    )

    gen = KGTripleGenerator(all_edges, 3)
    sg_model = model_maker(gen, embedding_dimension=5)
    x_inp, x_out = sg_model.in_out_tensors()
    model = Model(x_inp, x_out)

    raw_some, filtered_some = sg_model.rank_edges_against_all_nodes(
        gen.flow(every_edge_df), some_edges
    )

    # basic check that the ranks are formed correctly
    assert raw_some.dtype == int
    assert np.all(raw_some >= 1)
    # filtered ranks are never greater, and sometimes less
    assert np.all(filtered_some <= raw_some)
    assert np.any(filtered_some < raw_some)

    raw_no, filtered_no = sg_model.rank_edges_against_all_nodes(
        gen.flow(every_edge_df), no_edges
    )
    np.testing.assert_array_equal(raw_no, raw_some)
    # with no edges, filtering does nothing
    np.testing.assert_array_equal(raw_no, filtered_no)

    raw_all, filtered_all = sg_model.rank_edges_against_all_nodes(
        gen.flow(every_edge_df), all_edges
    )
    np.testing.assert_array_equal(raw_all, raw_some)
    # when every edge is known, the filtering should eliminate every possibility
    assert np.all(filtered_all == 1)

    # check the ranks against computing them from the model predictions directly. That is, for each
    # edge, compare the rank against one computed by counting the predictions. This computes the
    # filtered ranks naively too.
    predictions = model.predict(gen.flow(every_edge_df))
    for (source, rel, target), score, raw, filtered in zip(
        every_edge_df.itertuples(index=False), predictions, raw_some, filtered_some
    ):
        # rank for the subset specified by the given selector
        def rank(compare_selector):
            return 1 + (predictions[compare_selector] > score).sum()

        same_r = every_edge_df.label == rel
        same_s_r = (every_edge_df.source == source) & same_r

        expected_raw_mod_o_rank = rank(same_s_r)
        assert raw[0] == expected_raw_mod_o_rank

        known_objects = some_edges_df[
            (some_edges_df.source == source) & (some_edges_df.label == rel)
        ]
        object_is_unknown = ~every_edge_df.target.isin(known_objects.target)
        expected_filt_mod_o_rank = rank(same_s_r & object_is_unknown)
        assert filtered[0] == expected_filt_mod_o_rank

        same_r_o = same_r & (every_edge_df.target == target)
        expected_raw_mod_s_rank = rank(same_r_o)
        assert raw[1] == expected_raw_mod_s_rank

        known_subjects = some_edges_df[
            (some_edges_df.label == rel) & (some_edges_df.target == target)
        ]
        subject_is_unknown = ~every_edge_df.source.isin(known_subjects.source)
        expected_filt_mod_s_rank = rank(subject_is_unknown & same_r_o)
        assert filtered[1] == expected_filt_mod_s_rank
def main(edge_path, edge_filename, output_path, walk_filename,
         n, length, p, q, is_weighted, is_directed, job_id):
    start_time = timeit.default_timer()

    weighted = is_weighted == 1
    directed = is_directed == 1

    layers = os.listdir(edge_path)
    nx_graphs = {}  # keep all the nx DiGraphs
    stellar_Di_graphs = {}  # keep all the StellarDiGraphs
    node_importance = {}  # keep the node importance per layer

    for layer in layers:
        file_name = os.path.join(edge_path, layer, edge_filename)
        tmp_edge = pd.read_csv(file_name, index_col="Unnamed: 0")
        tmp_edge[['source', 'target']] = tmp_edge[['source', 'target']].astype(str)

        # 1) for each layer, first create an nx DiGraph
        nxg = graph_utils.Build_nx_Graph(source_target_weight=tmp_edge, directed=True)
        nx_graphs[layer] = nxg

        # 2) create the StellarDiGraph
        sdg = StellarDiGraph(nxg)
        stellar_Di_graphs[layer] = sdg

        # 3) initialize the walk and record the layer's node importance
        obj = BDWW.BeginWalk(sdg, begin_checks=True, weighted=True, directed=True)
        node_importance[layer] = obj.node_importance
        print("for layer:", layer, "these are the end nodes:", obj.end_nodes)

    # 4) find all the nodes across all the graphs
    base_nodes = BDWW.get_all_nodes(stellar_Di_graphs)

    # 5) find the node importance per layer
    layer_importance = BDWW.get_layer_importance(base_nodes, node_importance)

    # 6) finally, walk
    walks = BDWW.biased_directed_multi_walk(
        stellar_multi_graph_dict=stellar_Di_graphs,
        nodes=base_nodes,
        layer_importance=layer_importance,
        n=n,
        length=length,
        p=p,
        q=q,
        tol=10 ** -6,
        weighted=weighted,
        directed=directed,
    )[0]

    if job_id is not None:
        walk_file_name = walk_filename.split(".")[0] + "_" + str(job_id) + ".csv"
    else:
        walk_file_name = walk_filename

    utils.Write_List_of_Lists_from_CSV(output_path, walk_file_name, walks)

    elapsed = timeit.default_timer() - start_time
    print('-------------------------------')
    print('Walk time:', elapsed)
    print('-------------------------------')
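# Hedged sketch of a command-line entry point for the multi-layer driver above.
# The argument names mirror the `main(...)` signature, but the original
# script's actual CLI (if any) is not shown in this excerpt, and the defaults
# here are illustrative only.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--edge_path", required=True)
    parser.add_argument("--edge_filename", required=True)
    parser.add_argument("--output_path", required=True)
    parser.add_argument("--walk_filename", required=True)
    parser.add_argument("--n", type=int, default=10)
    parser.add_argument("--length", type=int, default=80)
    parser.add_argument("--p", type=float, default=1.0)
    parser.add_argument("--q", type=float, default=1.0)
    parser.add_argument("--is_weighted", type=int, default=1)
    parser.add_argument("--is_directed", type=int, default=1)
    parser.add_argument("--job_id", default=None)
    args = parser.parse_args()

    main(args.edge_path, args.edge_filename, args.output_path,
         args.walk_filename, args.n, args.length, args.p, args.q,
         args.is_weighted, args.is_directed, args.job_id)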
# Imports inferred from usage in the body below; `clean_text` and
# `Sentence2Vec` are project-local helpers not shown in this excerpt.
import pickle
import re
from collections import defaultdict

import networkx as nx
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
from stellargraph import StellarDiGraph
from stellargraph.data import UniformRandomMetaPathWalk
from tensorflow.keras.layers import (Concatenate, Dense, Dot, Embedding,
                                     Flatten, Input)
from tensorflow.keras.optimizers import Adagrad
from tqdm import tqdm


def model_train():
    print("Loading files..")
    cluster = MongoClient(
        'mongodb+srv://nirmal:[email protected]/<dbname>?retryWrites=true&w=majority'
    )
    db = cluster.Dataset
    pcol = db.posts
    vcol = db.views
    fcol = db.follows
    favcol = db.favourites

    # posts = pd.read_csv('./posts.csv', engine='python')
    # users = pd.read_csv('./users.csv')
    # views = pd.read_csv('./views.csv')
    # favorites = pd.read_csv('./favourites.csv')
    # userPosts = pd.read_csv('./usersPosts.csv')
    # print("Files loaded..")

    '''Create DataFrames for preprocessing'''
    posts = pd.DataFrame(list(pcol.find()))
    views = pd.DataFrame(list(vcol.find()))
    favorites = pd.DataFrame(list(favcol.find()))
    # rename to match the column names used throughout the rest of the function
    userPosts = posts[['_id', 'postedBy']].rename(
        columns={'_id': 'post_id', 'postedBy': 'user_id'})
    follows = pd.DataFrame(list(fcol.find()))
    print("Collections loaded..")

    print("Started preprocessing..")
    views = views[views['user_id'] != 'anonymous']
    posts = posts.dropna(subset=['title', ' post_type', 'tags'])
    posts['category'] = posts['category'].fillna(posts['tags'])
    posts['tags'] = posts['tags'].apply(clean_text)

    """Splitting on '|' and '#' to get the categories"""
    uniq_category = dict()
    uniq_post_type = dict()
    i = 0
    j = 0
    for cats, pt in zip(posts['category'].values, posts[' post_type'].values):
        for cat in re.split('[#|]', cats):
            if cat not in uniq_category.keys():
                uniq_category[cat] = i
                i += 1
        if pt not in uniq_post_type.keys():
            uniq_post_type[pt] = j
            j += 1

    category_ohe = np.zeros((len(posts), 513))
    for i, cats in enumerate(posts['category'].values):
        for cat in re.split('[#|]', cats):
            category_ohe[i][uniq_category[cat]] = 1

    token_tag = [word_tokenize(tag) for tag in posts['tags'].values.tolist()]
    tag_model = Word2Vec(token_tag, sg=1, size=100, window=5,
                         min_count=5, workers=4, iter=100)
    tag_model.save('./tag.model')
    tag_model = Sentence2Vec('./tag.model')

    processed_title = posts['title'].apply(clean_text)
    token_title = [word_tokenize(tag) for tag in processed_title]
    title_model = Word2Vec(token_title, sg=1, size=100, window=5,
                           min_count=5, workers=4, iter=100)
    title_model.save('./title.model')
    title_model = Sentence2Vec('./title.model')

    posts_info = dict()
    for pid, title, cat, tag in zip(posts['_id'], posts['title'].values,
                                    category_ohe, posts['tags'].values):
        posts_info[pid] = dict()
        posts_info[pid]['title'] = title_model.get_vector(title)
        posts_info[pid]['tag'] = tag_model.get_vector(tag)
        posts_info[pid]['cat'] = cat

    """Removing rows in views, favorites and userPosts whose post_id is not
    present in posts"""
    pidr = set()
    for pid in views['post_id']:
        if posts_info.get(pid, 0) == 0:
            pidr.add(pid)
    for pid in favorites['post_id']:
        if posts_info.get(pid, 0) == 0:
            pidr.add(pid)
    for pid in userPosts['post_id']:
        if posts_info.get(pid, 0) == 0:
            pidr.add(pid)
    for pid in list(pidr):
        views = views[views['post_id'] != pid]
        userPosts = userPosts[userPosts['post_id'] != pid]
        favorites = favorites[favorites['post_id'] != pid]

    """Representing each user by the categories of the posts they have seen"""
    users_info = defaultdict(lambda: np.zeros((513)))
    for uid, pid in zip(views['user_id'], views['post_id']):
        a = posts_info[pid]['cat']
        users_info[uid] = np.add(users_info[uid], a)
        assert np.sum(users_info[uid]) != 0

    """Increasing the weight of categories by 100% for posts posted by the user"""
    for uid, pid in zip(userPosts['user_id'], userPosts['post_id']):
        a = posts_info[pid]['cat']
        users_info[uid] = np.add(users_info[uid], a)
        assert np.sum(users_info[uid]) != 0

    """Increasing the weight of categories by 50% for favourited posts"""
    for uid, pid in zip(favorites['user_id'], favorites['post_id']):
        a = 1 / 2 * posts_info[pid]['cat']
        users_info[uid] = np.add(users_info[uid], a)
        assert np.sum(users_info[uid]) != 0

    """## MODEL

    Generating negative datapoints for each user, where the posts chosen have
    categories the user has not seen
    """
    def gen_pseudoDP(uid):
        cat_user = users_info[uid]
        arr = []
        k = 0
        for pid in posts_info.keys():
            cat = posts_info[pid]['cat']
            flag = 0
            for i in range(len(cat)):
                if cat[i] != 0 and cat_user[i] != 0:
                    flag = 1
                    break
            if flag == 0:
                arr.append([uid, pid, 0])
                k += 1
            if k == 4:
                break
        return arr

    pseudo = pd.DataFrame(np.zeros((len(users_info) * 4, 3)),
                          columns=['user_id', 'post_id', 'view'])
    i = 0
    for uid in list(users_info.keys()):
        arr = gen_pseudoDP(uid)
        if len(arr):
            pseudo[i:i + len(arr)] = arr
        i += 4

    views['view'] = np.ones((len(views)))
    views = views.drop(columns=['timestamp'])
    data = views.append(pseudo)
    print("Preprocessing done!")

    class Datagenerator(tf.keras.utils.Sequence):
        def __init__(self, X, y=None, batch_size=1, shuffle=True):
            super().__init__()
            self.X = X
            self.y = y
            self.batch_size = batch_size
            self.on_epoch_end()

        def __getitem__(self, index):
            indices = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
            batch = self.X.iloc[indices]
            y = self.y.iloc[indices]
            user = np.zeros((self.batch_size, len(uniq_category)))
            title = np.zeros((self.batch_size, 100))
            tag = np.zeros((self.batch_size, 100))
            category = np.zeros((self.batch_size, len(uniq_category)))
            for i in range(self.batch_size):
                title[i] = posts_info[batch.post_id.values[i]]['title']
                tag[i] = posts_info[batch.post_id.values[i]]['tag']
                category[i] = posts_info[batch.post_id.values[i]]['cat']
                user[i] = users_info[batch['user_id'].values[i]]
            return [user, title, tag, category], y.values.reshape(-1, 1)

        def __len__(self):
            return int(np.floor(len(self.X) / self.batch_size))

        def on_epoch_end(self):
            self.indexes = np.arange(len(self.X))
            np.random.shuffle(self.indexes)

    y = data['view']
    X = data.drop(['view'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    train_dg = Datagenerator(X_train, y_train, 128)
    test_dg = Datagenerator(X_test, y_test, 128)

    """The model predicts whether a user will view a post. The user embeddings
    learnt this way are then used for recommendation."""
    def create_model():
        user_inp = Input((len(uniq_category),))
        embed = Embedding(input_dim=len(uniq_category), output_dim=50)(user_inp)
        dense = Dense(2056)(Flatten()(embed))
        user = Dense(500, activation='relu')(dense)
        user = Dense(400, activation='relu')(user)

        cat = Input((len(uniq_category),))
        cat_ = Dense(300, activation='relu')(cat)
        title = Input((100,))
        title_ = Dense(50, activation='relu')(title)
        tag = Input((100,))
        tag_ = Dense(50, activation='relu')(tag)
        post_concat = Concatenate()([cat_, title_, tag_])

        output = Dot(axes=[-1, -1], normalize=True)([user, post_concat])
        model = tf.keras.Model([user_inp, title, tag, cat], output)
        return model

    model = create_model()
    model.compile(optimizer=Adagrad(lr=0.0001),
                  loss='binary_crossentropy', metrics=['accuracy'])
    print("model started training...")
    model.fit_generator(train_dg, validation_data=test_dg, epochs=1)
    print("Model trained")

    """Retrieving the trained user embeddings"""
    user_embeddings = model.get_layer('embedding').get_weights()[0]

    follows = pd.read_csv('./follows.csv')
    follows = follows.drop(['timestamp'], axis=1)

    """Users present in follows.csv"""
    uids = np.concatenate((follows['user_id'].values, follows['follower_id'].values))
    uids = set(uids)

    """Creating edges (follower -> followed)"""
    edges = [(y, x) for x, y in zip(follows['user_id'], follows['follower_id'])]

    """Creating a directed graph and adding the edges"""
    G = nx.DiGraph()
    G.add_edges_from(edges)
    edges_dict = dict()
    for edge in edges:
        edges_dict[edge] = 1

    rw = UniformRandomMetaPathWalk(StellarDiGraph(G))

    """Creating random walks. Each walk can be seen as a chain
    uid -> uid -> uid ... of length 100"""
    walks = rw.run(nodes=list(uids), length=100, n=2,
                   metapaths=[['default', 'default']])

    """Word2Vec on those chains"""
    user_model = Word2Vec(walks, size=128, window=5)
    user_model.wv.vectors.shape

    """Each user is represented by a 128-dim vector"""
    node_ids = user_model.wv.index2word
    node_embed = user_model.wv.vectors

    print("Pushing to database...")
    userCollection = cluster.Users.User_Embeddings
    userCollection.delete_many({})
    followCollection = cluster.Users.Follows
    followCollection.delete_many({})
    posted = cluster.Users.Posted
    posted.delete_many({})
    catCol = cluster.Users.Categories
    catCol.delete_many({})
    embedCol = cluster.Users.Embedding_Matrix
    embedCol.delete_many({})

    folDict = dict()
    for i, id in enumerate(node_ids):
        folDict[id] = i

    user_ins = []
    for user in tqdm(users_info.keys()):
        embed = list(np.matmul(users_info[user], user_embeddings))
        if folDict.get(user, -1) == -1:
            user_ins.append({'user_id': user, 'user_embed': embed})
        else:
            yo = node_embed[folDict[user]].tolist()
            user_ins.append({'user_id': user, 'user_embed': embed, 'node_embed': yo})
    userCollection.insert_many(user_ins)

    fol = []
    for uid, fid in tqdm(zip(follows['user_id'], follows['follower_id'])):
        d = dict()
        d['user_id'] = uid
        d['follower_id'] = fid
        fol.append(d)
    followCollection.insert_many(fol)

    categories = pickle.dumps(uniq_category)
    user_embed = pickle.dumps(user_embeddings)
    catCol.insert_one({"Categories": categories})
    embedCol.insert_one({"Matrix": user_embed})

    uids = set()
    for uid in userPosts['user_id']:
        uids.add(uid)
    to_ins = []
    for uid in uids:
        noob = dict()
        noob['user_id'] = uid
        to_ins.append(noob)
    posted.insert_many(to_ins)

    requests.get('http://3.7.185.166/train')
    print("Done!")
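# Hedged sketch of the downstream use alluded to above ("user embeddings will
# then be used for recommendation"): rank posts for a user by the dot product
# between the user's category profile and each post's category vector.
# `users_info` and `posts_info` are the structures built in model_train; the
# helper itself is illustrative and not part of the original code.
def recommend_posts(uid, users_info, posts_info, top_k=10):
    """Return the ids of the top_k posts whose category vector best matches
    the user's accumulated 513-dim category profile."""
    user_vec = users_info[uid]
    scores = {pid: float(np.dot(user_vec, info['cat']))
              for pid, info in posts_info.items()}
    return sorted(scores, key=scores.get, reverse=True)[:top_k]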
def load_graph_data(dataframe, embeddings, name="default", testing=False,
                    num_test=100, using_start=False):
    actor_indeces = []
    actor_features = []
    utterance_indeces = []
    utterance_features = []
    source_edges = []
    target_edges = []

    if testing:
        num_dialogues = num_test
    else:
        num_dialogues = len(dataframe['Dialogue ID'].unique())

    print("Building graph, 1 dialogue at a time...")
    for dialogueID in tqdm(dataframe['Dialogue ID'].unique()[0:num_dialogues]):
        dialogue = dataframe[dataframe["Dialogue ID"] == dialogueID]
        # Loop through all utterances of the dialogue
        for rowidx in range(len(dialogue)):
            row = dialogue.iloc[rowidx]

            # 0. Add actor index-feature if it does not already exist
            actor_idx = f"{row.Actor}_{dialogueID}"
            if actor_idx not in actor_indeces:
                actor_indeces.append(actor_idx)
                if len(actor_features) == 0:
                    # Create new numpy array of actor features
                    actor_features = np.random.normal(0.0, 1.0, [1, 1024])
                else:
                    # Concatenate features to already existing array
                    actor_features = np.concatenate(
                        (actor_features, np.random.normal(0.0, 1.0, [1, 1024])),
                        axis=0)

            # 1. Add utterance index-feature (ELMo embeddings)
            utt_idx = f"u_dID{dialogueID}_#{rowidx}"
            utterance_indeces.append(utt_idx)
            # To iterate over the ELMo embeddings we use the index list of the
            # dataset, indexed by the row of the dialogue we are currently parsing
            if len(utterance_features) == 0:
                utterance_features = np.array([embeddings[dialogue.index[rowidx]]])
            else:
                utterance_features = np.concatenate(
                    (utterance_features,
                     np.array([embeddings[dialogue.index[rowidx]]])),
                    axis=0)

            # 2. Build edges. If this is the first row of a dialogue,
            # begin by drawing an edge from the "START-Node" (source)
            # to the current utterance index (target)
            if using_start and rowidx == 0:
                source_edges.append("START-Node")
                target_edges.append(utt_idx)

            # 3. Construct remaining edges.
            # 3.1 Actor to the utterance
            source_edges.append(actor_idx)
            target_edges.append(utt_idx)

            # 3.2 Utterance to the next utterance
            if (rowidx + 1) != len(dialogue):
                source_edges.append(utt_idx)
                target_edges.append(f"u_dID{dialogueID}_#{rowidx + 1}")

            # 3.3 Utterance to all actors
            for actor in dialogue['Actor'].unique():
                all_actor_idx = f"{actor}_{dialogueID}"
                source_edges.append(utt_idx)
                target_edges.append(all_actor_idx)

    # GraphSAGE does not support modelling nodes of different kinds, so actor,
    # utterance and (optionally) START nodes share one feature matrix
    if using_start:
        start_features = np.random.normal(0.0, 1.0, [1, 1024])
        start_index = "START-Node"
        node_features = np.concatenate(
            (actor_features, utterance_features, start_features), axis=0)
        node_indeces = actor_indeces + utterance_indeces + [start_index]
    else:
        node_features = np.concatenate((actor_features, utterance_features), axis=0)
        node_indeces = actor_indeces + utterance_indeces

    nodes = IndexedArray(node_features, node_indeces)
    edges = pd.DataFrame({"source": source_edges, "target": target_edges})

    # GraphSAGE:
    full_graph = StellarDiGraph(nodes, edges)
    targets = pd.Series(
        dataframe['Dialogue Act'].tolist()[0:len(utterance_indeces)],
        index=utterance_indeces)

    print("Check if graph has all properties required for ML/Inference...")
    full_graph.check_graph_for_ml(expensive_check=True)
    print("Check successful.")
    print(full_graph.info())
    print("---- Graph Creation Finished ----")

    netx_graph = full_graph.to_networkx(feature_attr='utterance_embedding')

    # Save graphs for later use.
    if testing:
        pickle.dump((netx_graph, targets),
                    open(f"visualizeGraph/test_{name}_netx.pickle", "wb"))
        pickle.dump((full_graph, targets),
                    open(f"createdGraphs/test_{name}_graph.pickle", "wb"))
    else:
        pickle.dump((netx_graph, targets),
                    open(f"visualizeGraph/{name}_netx.pickle", "wb"))
        pickle.dump((full_graph, targets),
                    open(f"createdGraphs/{name}_graph.pickle", "wb"))
    return full_graph, targets
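# Example of loading a saved graph back for later training or visualization
# (assumes the default `name` argument used above; the path is illustrative):
import pickle

with open("createdGraphs/default_graph.pickle", "rb") as f:
    full_graph, targets = pickle.load(f)
print(full_graph.info())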
uids = set(uids)

"""Creating edges"""
edges = [(y, x) for x, y in zip(follows['followed'], follows['follower'])]

"""Creating a directed graph and adding the edges"""
G = nx.DiGraph()
G.add_edges_from(edges)
edges_dict = dict()
for edge in edges:
    edges_dict[edge] = 1

rw = UniformRandomMetaPathWalk(StellarDiGraph(G))

"""
Creating random walks. Each walk can be seen as a chain: uid -> uid -> uid ...
They are of length 100
"""
walks = rw.run(nodes=list(uids), length=100, n=2, metapaths=[['default', 'default']])

"""Word2Vec on those chains"""
user_model = Word2Vec(walks, size=128, window=5)
user_model.wv.vectors.shape
# add features for renamed genes
feature_rename = feature_df.loc[gene_list]
feature_rename_gm = feature_rename.copy(deep=True)
feature_rename_k = feature_rename.copy(deep=True)

feature_rename_gm.index = feature_rename_gm.index.map(lambda name: name + '_gm')
feature_rename_k.index = feature_rename_k.index.map(lambda name: name + '_k')

feature_df = pd.concat([feature_df, feature_rename_gm, feature_rename_k], axis=0)

# %% [markdown]
# ## Read graph

# %%
G = StellarDiGraph(edges=df[['source', 'target']], nodes=feature_df)
print(G.info())

# %% [markdown]
# ## Data Generators
#
# Now we create the data generators using `CorruptedGenerator`. `CorruptedGenerator`
# returns shuffled node features along with the regular node features, and we train
# our model to discriminate between the two.
#
# Note that:
#
# - We typically pass all nodes to `corrupted_generator.flow` because this is an
#   unsupervised task
# - We don't pass `targets` to `corrupted_generator.flow` because these are binary
#   labels (true nodes, false nodes) that are created by `CorruptedGenerator`

# %%
# Directed GraphSAGE node generator
graphsage_generator = DirectedGraphSAGENodeGenerator(
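# %%
# Hedged sketch of the `CorruptedGenerator` wiring the markdown above describes
# (the original cell is truncated mid-call, so the base generator's arguments
# are unknown; this only shows the wrapping and flow steps):
from stellargraph.mapper import CorruptedGenerator

corrupted_generator = CorruptedGenerator(graphsage_generator)
gen = corrupted_generator.flow(G.nodes())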
def create_test_graph(is_directed=False):
    # used by: biased random walker, breadth-first walker, directed
    # breadth-first walker, uniform random walker
    if is_directed:
        return StellarDiGraph(create_test_graph_nx(is_directed))
    else:
        return StellarGraph(create_test_graph_nx(is_directed))
import os

import pandas as pd

from cell import graph_utils
import cell.BiasedDirectedWeightedWalk as BDWW
from stellargraph import StellarDiGraph

layer = 'base_unnormalized_allcombined'
edge_path = "/Users/fahimehb/Documents/NPP_GNN_project/dat/edgelists/VISp/"
edge_filename = "selfconnection_added_edges_node21_32_removed.csv"

file_name = os.path.join(edge_path, layer, edge_filename)
tmp_edge = pd.read_csv(file_name, index_col="Unnamed: 0")
tmp_edge[['source', 'target']] = tmp_edge[['source', 'target']].astype(str)

nxg = graph_utils.build_nx_graph(source_target_weight=tmp_edge, directed=True)
sdg = StellarDiGraph(nxg)

BDWW.BeginWalk(sdg, begin_checks=True, weighted=True, directed=True)
rw = BDWW.BiasedDirectedRandomWalk(sdg, directed=True, weighted=True,
                                   begin_checks=False)

nodes = list(sdg.nodes())
walks = rw.run(nodes=nodes, length=2, n=1, p=1, q=1, weighted=True, directed=True)
print(walks)