Example #1
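The listing omits this function's imports. Judging from Example #12 at the end of the page, which exercises the same modules, they are likely the following (the `utils` module used for `Write_List_of_Lists_from_CSV` is project-local, so its path is an assumption):

import os
import pandas as pd
from cell import graph_utils
import cell.BiasedDirectedWeightedWalk as BDWW
from stellargraph import StellarDiGraph
# assumed project-local module providing Write_List_of_Lists_from_CSV:
# from cell import utils
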
def main(layer, edge_path, edge_filename, output_path, walk_filename, n,
         length, p, q, is_weighted, is_directed, job_id):

    weighted = (is_weighted == 1)
    directed = (is_directed == 1)

    file_name = os.path.join(edge_path, layer, edge_filename)
    tmp_edge = pd.read_csv(file_name, index_col="Unnamed: 0")
    tmp_edge[['source', 'target']] = tmp_edge[['source', 'target']].astype(str)

    # 1) For each layer, first create a NetworkX DiGraph
    nxg = graph_utils.Build_nx_Graph(source_target_weight=tmp_edge,
                                     directed=True)

    # 2) Create a StellarDiGraph
    sdg = StellarDiGraph(nxg)

    # 3) Initialize the walk and run the initial checks
    BDWW.BeginWalk(sdg, begin_checks=True, weighted=True, directed=True)

    rw = BDWW.BiasedDirectedRandomWalk(sdg,
                                       directed=True,
                                       weighted=True,
                                       begin_checks=False)

    nodes = list(sdg.nodes())
    walks = rw.run(nodes=nodes,
                   length=length,
                   n=n,
                   p=p,
                   q=q,
                   weighted=weighted,
                   directed=directed)

    result_path = os.path.join(output_path, layer)
    if not os.path.isdir(result_path):
        print("making a new directory for the output")
        os.mkdir(result_path)

    if job_id is not None:
        walk_file_name = walk_filename.split(".")[0] + "_" + str(job_id) + ".csv"
    else:
        walk_file_name = walk_filename
    utils.Write_List_of_Lists_from_CSV(result_path, walk_file_name, walks)
Example #2
def test_tie_breaking(tie_breaking):
    pred_scores = np.array(
        [
            [1, 5, 8],  # true_modified_node_ilocs:
            [1, 3, 8],  # 1
            [1, 2, 7],  # 2
            [1, 2, 6],  # 3
        ]
    )
    known_edges_graph = StellarDiGraph(
        nodes=pd.DataFrame(index=["a", "b", "c", "d"]),
        edges=pd.DataFrame(
            [
                # preds[0, :]: edge being predicted, checking it's counted properly for 'filtered'
                ("a", "b"),
                # preds[1, :]: the other tied edge, to see the 'bottom' score move up
                ("b", "d"),
            ],
            columns=["source", "target"],
        ),
    )

    copies = 100

    rankings = [
        _ranks_from_score_columns(
            pred_scores,
            true_modified_node_ilocs=np.array([1, 2, 3]),
            unmodified_node_ilocs=np.array([0, 1, 2]),
            true_rel_ilocs=np.array([0, 0, 0]),
            modified_object=True,
            known_edges_graph=known_edges_graph,
            tie_breaking=tie_breaking,
        )
        for _ in range(copies)
    ]

    all_rankings = np.array(rankings)
    assert all_rankings.shape == (copies, 2, 3)

    top_expected = np.repeat([[[1, 3, 4], [1, 3, 4]]], copies, axis=0)
    bottom_expected = np.repeat([[[4, 4, 4], [4, 3, 4]]], copies, axis=0)

    if tie_breaking == "top":
        np.testing.assert_array_equal(all_rankings, top_expected)

    elif tie_breaking == "bottom":
        np.testing.assert_array_equal(all_rankings, bottom_expected)

    elif tie_breaking == "random":
        assert (all_rankings >= top_expected).all()
        assert (all_rankings <= bottom_expected).all()

        # check both raw and filtered results (independently) have some variation in them
        for i in range(all_rankings.shape[1]):
            raw_or_filtered = all_rankings[:, i, :]
            assert (raw_or_filtered != top_expected[:, i, :]).any()
            assert (raw_or_filtered != bottom_expected[:, i, :]).any()
Example #3
def tree_graph() -> StellarGraph:
    nodes = pd.DataFrame(index=["root", "0", 1, 2, "c1.1", "c2.1", "c2.2"])
    edges = pd.DataFrame(
        [
            ("root", 2),
            ("root", 1),
            ("root", "0"),
            (2, "c2.1"),
            (2, "c2.2"),
            (1, "c1.1"),
        ],
        columns=["source", "target"],
    )

    return StellarDiGraph(nodes, edges)
Example #4
def knowledge_graph():
    nodes = ["a", "b", "c", "d"]

    edge_counter = 0

    def edge_df(*elements):
        nonlocal edge_counter
        end = edge_counter + len(elements)
        index = range(edge_counter, end)
        edge_counter = end
        return pd.DataFrame(elements, columns=["source", "target"], index=index)

    edges = {
        "W": edge_df(("a", "b")),
        "X": edge_df(("a", "b"), ("b", "c")),
        "Y": edge_df(("b", "a")),
        "Z": edge_df(("d", "b")),
    }

    return StellarDiGraph(nodes=pd.DataFrame(index=nodes), edges=edges)
def test_model_rankings(model_maker):
    nodes = pd.DataFrame(index=["a", "b", "c", "d"])
    rels = ["W", "X", "Y", "Z"]
    empty = pd.DataFrame(columns=["source", "target"])

    every_edge = itertools.product(nodes.index, rels, nodes.index)
    every_edge_df = triple_df(*every_edge)

    no_edges = StellarDiGraph(nodes, {name: empty for name in rels})

    # the filtering is most interesting when there's a smattering of edges, somewhere between none
    # and all; this does a stratified sample by label, to make sure there's at least one edge from
    # each label.
    one_per_label_df = (every_edge_df.groupby("label").apply(
        lambda df: df.sample(n=1)).droplevel(0))
    others_df = every_edge_df.sample(frac=0.25)
    some_edges_df = pd.concat([one_per_label_df, others_df], ignore_index=True)

    some_edges = StellarDiGraph(
        nodes,
        {
            name: df.drop(columns="label")
            for name, df in some_edges_df.groupby("label")
        },
    )

    all_edges = StellarDiGraph(
        nodes=nodes,
        edges={
            name: df.drop(columns="label")
            for name, df in every_edge_df.groupby("label")
        },
    )

    gen = KGTripleGenerator(all_edges, 3)
    sg_model = model_maker(gen, embedding_dimension=5)
    x_inp, x_out = sg_model.in_out_tensors()
    model = Model(x_inp, x_out)

    raw_some, filtered_some = sg_model.rank_edges_against_all_nodes(
        gen.flow(every_edge_df), some_edges)
    # basic check that the ranks are formed correctly
    assert raw_some.dtype == int
    assert np.all(raw_some >= 1)
    # filtered ranks are never greater, and sometimes less
    assert np.all(filtered_some <= raw_some)
    assert np.any(filtered_some < raw_some)

    raw_no, filtered_no = sg_model.rank_edges_against_all_nodes(
        gen.flow(every_edge_df), no_edges)
    np.testing.assert_array_equal(raw_no, raw_some)
    # with no edges, filtering does nothing
    np.testing.assert_array_equal(raw_no, filtered_no)

    raw_all, filtered_all = sg_model.rank_edges_against_all_nodes(
        gen.flow(every_edge_df), all_edges)
    np.testing.assert_array_equal(raw_all, raw_some)
    # when every edge is known, the filtering should eliminate every possibility
    assert np.all(filtered_all == 1)

    # check the ranks against computing them from the model predictions directly. That is, for each
    # edge, compare the rank against one computed by counting the predictions. This computes the
    # filtered ranks naively too.
    predictions = model.predict(gen.flow(every_edge_df))

    for (source, rel, target), score, raw, filtered in zip(
            every_edge_df.itertuples(index=False), predictions, raw_some,
            filtered_some):
        # rank for the subset specified by the given selector
        def rank(compare_selector):
            return 1 + (predictions[compare_selector] > score).sum()

        same_r = every_edge_df.label == rel

        same_s_r = (every_edge_df.source == source) & same_r

        expected_raw_mod_o_rank = rank(same_s_r)
        assert raw[0] == expected_raw_mod_o_rank

        known_objects = some_edges_df[(some_edges_df.source == source)
                                      & (some_edges_df.label == rel)]
        object_is_unknown = ~every_edge_df.target.isin(known_objects.target)
        expected_filt_mod_o_rank = rank(same_s_r & object_is_unknown)
        assert filtered[0] == expected_filt_mod_o_rank

        same_r_o = same_r & (every_edge_df.target == target)

        expected_raw_mod_s_rank = rank(same_r_o)
        assert raw[1] == expected_raw_mod_s_rank

        known_subjects = some_edges_df[(some_edges_df.label == rel)
                                       & (some_edges_df.target == target)]
        subject_is_unknown = ~every_edge_df.source.isin(known_subjects.source)
        expected_filt_mod_s_rank = rank(subject_is_unknown & same_r_o)
        assert filtered[1] == expected_filt_mod_s_rank
def main(edge_path, edge_filename, output_path, walk_filename, n, length, p, q,
         is_weighted, is_directed, job_id):

    start_time = timeit.default_timer()
    weighted = (is_weighted == 1)
    directed = (is_directed == 1)

    layers = os.listdir(edge_path)
    nx_graphs = {}  # keep all the nxDigraphs
    stellar_Di_graphs = {}  # keep all the stellarDigraphs
    node_importance = {}  # keep all the node_importance per layers

    for layer in layers:
        file_name = os.path.join(edge_path, layer, edge_filename)
        tmp_edge = pd.read_csv(file_name, index_col="Unnamed: 0")
        tmp_edge[['source', 'target']] = tmp_edge[['source',
                                                   'target']].astype(str)

        # 1) For each layer, first create a NetworkX DiGraph
        nxg = graph_utils.Build_nx_Graph(source_target_weight=tmp_edge,
                                         directed=True)
        nx_graphs[layer] = nxg

        # 2) Create a StellarDiGraph
        sdg = StellarDiGraph(nxg)
        stellar_Di_graphs[layer] = sdg

        # 3) Initialize the walk and return the layer_node_importance
        obj = BDWW.BeginWalk(sdg,
                             begin_checks=True,
                             weighted=True,
                             directed=True)
        node_importance[layer] = obj.node_importance
        print("for layer:", layer, "end nodes:", obj.end_nodes)

    # 4) Find all the nodes in all the graphs
    base_nodes = BDWW.get_all_nodes(stellar_Di_graphs)

    # 5) Find the node importance
    layer_importance = BDWW.get_layer_importance(base_nodes, node_importance)

    # 6) Finally, run the walks
    walks = BDWW.biased_directed_multi_walk(
        stellar_multi_graph_dict=stellar_Di_graphs,
        nodes=base_nodes,
        layer_importance=layer_importance,
        n=n,
        length=length,
        p=p,
        q=q,
        tol=10**-6,
        weighted=weighted,
        directed=directed)[0]

    if job_id is not None:
        walk_file_name = walk_filename.split(".")[0] + "_" + str(job_id) + ".csv"
    else:
        walk_file_name = walk_filename
    utils.Write_List_of_Lists_from_CSV(output_path, walk_file_name, walks)

    elapsed = timeit.default_timer() - start_time

    print('-------------------------------')
    print('Training time:', elapsed)
    print('-------------------------------')
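The next function references many names whose imports the listing omits. A plausible set is sketched below; `clean_text` and `Sentence2Vec` are project-local or third-party helpers whose exact modules are an assumption:

import re
import pickle
import requests
import numpy as np
import pandas as pd
import networkx as nx
import tensorflow as tf
from collections import defaultdict
from pymongo import MongoClient
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dot
from tensorflow.keras.optimizers import Adagrad
from stellargraph import StellarDiGraph
from stellargraph.data import UniformRandomMetaPathWalk
# assumed project-local / third-party helpers:
# from sentence2vec import Sentence2Vec
# from text_utils import clean_text
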
def model_train():
  print("Loading files..")
  cluster = MongoClient('mongodb+srv://nirmal:[email protected]/<dbname>?retryWrites=true&w=majority')

  db = cluster.Dataset
  pcol = db.posts  
  vcol= db.views
  fcol= db.follows
  favcol = db.favourites  


  # posts = pd.read_csv('./posts.csv',engine='python')
  # users = pd.read_csv('./users.csv')
  # views = pd.read_csv('./views.csv')
  # favorites = pd.read_csv('./favourites.csv')
  # userPosts = pd.read_csv('./usersPosts.csv')
  # print("Files loaded..")

  ''' Create DataFrame for preprocessing '''
  posts = pd.DataFrame(list(pcol.find()))  
  views = pd.DataFrame(list(vcol.find()))
  favorites = pd.DataFrame(list(favcol.find()))
  # rename to the column names used below (post_id / user_id)
  userPosts = posts[['_id','postedBy']].rename(columns={'_id': 'post_id', 'postedBy': 'user_id'})
  follows = pd.DataFrame(list(fcol.find()))
  print("Collections loaded..")

  print("Started preprocessing..")

  views = views[views['user_id']!='anonymous']
  posts = posts.dropna(subset=['title',' post_type','tags'])

  posts['category'] = posts['category'].fillna(posts['tags'])
  posts['tags'] = posts['tags'].apply(clean_text)

  """Splitting on '|' and '#' for getting categories"""

  uniq_category = dict()
  uniq_post_type = dict()
  i=0
  j=0
  for cats,pt in zip(posts['category'].values,posts[' post_type'].values):
    for cat in re.split('[#|]',cats):
      if cat not in uniq_category.keys():
        uniq_category[cat]=i
        i+=1
    if pt not in uniq_post_type.keys():
      uniq_post_type[pt]=j
      j+=1

  # multi-hot category matrix, sized to the number of unique categories
  category_ohe = np.zeros((len(posts), len(uniq_category)))

  for i,cats in enumerate(posts['category'].values):
    for cat in re.split('[#|]',cats):
      category_ohe[i][uniq_category[cat]]=1

  token_tag = [word_tokenize(tag) for tag in posts['tags'].values.tolist()]
  tag_model = Word2Vec(token_tag,sg=1,size=100,window=5, min_count=5, workers=4,iter=100)
  tag_model.save('./tag.model')

  tag_model = Sentence2Vec('./tag.model')

  processed_title = posts['title'].apply(clean_text)
  token_title = [word_tokenize(tag) for tag in processed_title]
  title_model = Word2Vec(token_title,sg=1,size=100,window=5, min_count=5, workers=4,iter=100)
  title_model.save('./title.model')

  title_model = Sentence2Vec('./title.model')

  posts_info = dict()
  for pid,title,cat,tag in zip(posts['_id'],posts['title'].values,category_ohe,posts['tags'].values):
    posts_info[pid] = dict()
    posts_info[pid]['title'] = title_model.get_vector(title)
    posts_info[pid]['tag'] = tag_model.get_vector(tag)
    posts_info[pid]['cat'] = cat

  """Removing rows in views.csv, favorites.csv and usrPosts.csv
  that has pid not present in posts.csv
  """

  pidr=set()
  for pid in views['post_id']:
    if posts_info.get(pid,0) == 0:
      pidr.add(pid)
  for pid in favorites['post_id']:
    if posts_info.get(pid,0) == 0:
      pidr.add(pid)
  for pid in userPosts['post_id']:
    if posts_info.get(pid,0) == 0:
      pidr.add(pid)
  
  # drop views/userPosts/favorites whose post_id is not present in posts_info
  views = views[~views['post_id'].isin(pidr)]
  userPosts = userPosts[~userPosts['post_id'].isin(pidr)]
  favorites = favorites[~favorites['post_id'].isin(pidr)]

  """Representing the user based on the categories seen by the user"""

  users_info = defaultdict(lambda: np.zeros(len(uniq_category)))
  for uid,pid in zip(views['user_id'],views['post_id']):    
    a = posts_info[pid]['cat'] #,posts_info[pid]['pt']))#,posts_info[pid]['title_ohe']))
    users_info[uid] = np.add(users_info[uid],a)
    assert(np.sum(users_info[uid])!=0)

  """Increasing the weightage for categories by 100% for posts posted by user"""

  for uid,pid in zip(userPosts['user_id'],userPosts['post_id']):    
    a = posts_info[pid]['cat'] #,posts_info[pid]['pt']))#,posts_info[pid]['title_ohe']))
    users_info[uid] = np.add(users_info[uid],a)
    assert(np.sum(users_info[uid])!=0)

  """Increasing weightage for categories by 50% for favorite posts"""

  for uid,pid in zip(favorites['user_id'],favorites['post_id']):    
    a = 1/2*posts_info[pid]['cat'] #,posts_info[pid]['pt'])))#,posts_info[pid]['title_ohe'])))
    users_info[uid] = np.add(users_info[uid],a)
    assert(np.sum(users_info[uid])!=0)

  """## MODEL 

  Generating -ive datapoints for each user where the posts chosen have categories that are not seen by the user
  """

  def gen_pseudoDP(user_id):
    cat_user = users_info[user_id]
    arr=[]
    k=0
    for pid in posts_info.keys():
      cat = posts_info[pid]['cat']
      flag=0
      for i in range(len(cat)):
        if (cat[i]!=0 and cat_user[i] != 0):
          flag=1
          break
      if flag==0:
        arr.append([user_id,pid,0])
        k+=1
      if k==4:
        break
    return arr

  pseudo = pd.DataFrame(np.zeros((len(users_info)*4,3)),columns=['user_id','post_id','view'])
  i=0
  for uid in list(users_info.keys()):
    arr = gen_pseudoDP(uid)
    if len(arr):
      pseudo[i:i+len(arr)] = arr
      i+=len(arr)
  # drop the unused pre-allocated rows so no all-zero datapoints remain
  pseudo = pseudo.iloc[:i]

  views['view'] = np.ones(len(views))
  views = views.drop(columns=['timestamp'])
  data = pd.concat([views, pseudo], ignore_index=True)

  print("Preprocessing done!")

  class Datagenerator(tf.keras.utils.Sequence):
    def __init__(self,X,y=None,batch_size=1,shuffle=True):
      super().__init__()
      self.X = X
      self.y = y
      self.batch_size = batch_size    
      self.on_epoch_end()
      

    def __getitem__(self,index):
      
      indices = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]    
      batch = self.X.iloc[indices]    
      y = self.y.iloc[indices]
      
      user=np.zeros((self.batch_size,len(uniq_category)))    
      title=np.zeros((self.batch_size,100))
      tag=np.zeros((self.batch_size,100))
      category = np.zeros((self.batch_size,len(uniq_category)))
      
      for i in range(self.batch_size): 
        title[i] = posts_info[batch.post_id.values[i]]['title']
        tag[i] = posts_info[batch.post_id.values[i]]['tag']
        category[i] = posts_info[batch.post_id.values[i]]['cat']
        user[i] = users_info[batch['user_id'].values[i]]
      
      return [user,title,tag,category],y.values.reshape(-1,1)
      

    def __len__(self):
      return int(np.floor(len(self.X) / self.batch_size))

    def on_epoch_end(self):    
      self.indexes = np.arange(len(self.X))
      np.random.shuffle(self.indexes)

  y = data['view']
  X = data.drop(['view'],axis=1)

  X_train, X_test, y_train, y_test = train_test_split(X,y)

  train_dg = Datagenerator(X_train,y_train,128)
  test_dg = Datagenerator(X_test,y_test,128)

  """Model predicts whether a user will see a post or not. Based on that user embeddings will be learnt which will then be used for recommendation"""

  def create_model():

    user_inp = Input((len(uniq_category),))
    embed = Embedding(input_dim=len(uniq_category),output_dim=50)(user_inp)
    dense = Dense(2056)(Flatten()(embed))
    user = Dense(500,activation='relu')(dense)
    user = Dense(400,activation='relu')(user)

    cat = Input((len(uniq_category),))
    cat_ = Dense(300,activation='relu')(cat)

    title = Input((100,))
    title_ = Dense(50,activation='relu')(title)
    tag = Input((100,))
    tag_ = Dense(50,activation='relu')(tag)

    post_concat = Concatenate()([cat_,title_,tag_])

    output = Dot(axes=[-1,-1],normalize=True)([user,post_concat])    

    model = tf.keras.Model([user_inp,title,tag,cat],output)

    return model

  model = create_model()

  model.compile(optimizer=Adagrad(lr=0.0001), loss='binary_crossentropy',metrics=['accuracy'])

  print("model started training...")
  model.fit_generator(train_dg,validation_data=test_dg,epochs=1)
  print("Model trained")

  """Retrieving trained user embeddings"""

  user_embeddings = model.get_layer('embedding').get_weights()[0]

  follows = pd.read_csv('./follows.csv')

  follows = follows.drop(['timestamp'],axis=1)

  """Users present in follows.csv"""

  uids = np.concatenate((follows['user_id'].values,follows['follower_id'].values))
  uids = set(uids)

  """Creating Edges"""

  edges = [(y,x) for x,y in zip(follows['user_id'],follows['follower_id'])]

  """Creating Directional Graph and adding the edges"""

  G = nx.DiGraph()
  G.add_edges_from(edges)

  edges_dict = dict()
  for edge in edges:
    edges_dict[edge]=1

  rw = UniformRandomMetaPathWalk(StellarDiGraph(G))

  """Creating random walks.

  Each walk can be seen as a chain:  uid->uid->uid ... 

  They are of length 100
  """

  walks = rw.run(nodes=list(uids),length=100,n=2,metapaths=[['default','default']])

  """Word2Vec on those chains"""

  user_model =  Word2Vec(walks,size=128,window=5)
  user_model.wv.vectors.shape

  """Each user represented by 128 dim vector"""

  node_ids = user_model.wv.index2word
  node_embed = user_model.wv.vectors

  print("Pushing to database...")
  userCollection = cluster.Users.User_Embeddings
  userCollection.delete_many({})
  followCollection = cluster.Users.Follows
  followCollection.delete_many({})
  posted = cluster.Users.Posted
  posted.delete_many({})
  catCol = cluster.Users.Categories
  catCol.delete_many({})
  embedCol = cluster.Users.Embedding_Matrix
  embedCol.delete_many({})

  folDict = dict()
  for i,id in enumerate(node_ids):
    folDict[id]=i

  user_ins=[]
  for user in tqdm(users_info.keys()):
    embed = list(np.matmul(users_info[user],user_embeddings))
    if folDict.get(user,-1) == -1:
      #userCollection.insert_one({'user_id':user, 'user_embed':embed})
      user_ins.append({'user_id':user, 'user_embed':embed})
    else:
      yo = node_embed[folDict[user]].tolist()
      #userCollection.insert_one({'user_id':user, 'user_embed':embed, 'node_embed':yo})
      user_ins.append({'user_id':user, 'user_embed':embed, 'node_embed':yo})

  userCollection.insert_many(user_ins)

  fol=[]
  for uid,fid in tqdm(zip(follows['user_id'],follows['follower_id'])):
      d = dict()
      d['user_id'] = uid
      d['follower_id'] = fid
      fol.append(d)

  followCollection.insert_many(fol)

  categories = pickle.dumps(uniq_category)
  user_embed = pickle.dumps(user_embeddings)

  catCol.insert_one({"Categories":categories})
  embedCol.insert_one({"Matrix":user_embed})


  uids = set()
  for uid in userPosts['user_id']:
      uids.add(uid)
  to_ins=[]
  for uid in uids:
      noob = dict()
      noob['user_id']=uid    
      to_ins.append(noob)

  posted.insert_many(to_ins)
  requests.get('http://3.7.185.166/train')
  print("Done!")
Example #8
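This snippet's imports are not shown; a hedged guess at what it assumes (`StellarDiGraph` and `IndexedArray` are both exported from the top-level `stellargraph` package in 1.x):

import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from stellargraph import StellarDiGraph, IndexedArray
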
def load_graph_data(dataframe,
                    embeddings,
                    name="default",
                    testing=False,
                    num_test=100,
                    using_start=False):

    actor_indeces = []
    actor_features = []
    utterance_indeces = []
    utterance_features = []
    source_edges = []
    target_edges = []

    if testing:
        num_dialogues = num_test
    else:
        num_dialogues = len(dataframe['Dialogue ID'].unique())

    print("Building graph, 1 dialogue at a time...")
    for dialogueID in tqdm(dataframe['Dialogue ID'].unique()[0:num_dialogues]):
        dialogue = dataframe[dataframe["Dialogue ID"] == dialogueID]

        # Loop through all utterances of the dialogue
        for rowidx in range(len(dialogue)):
            row = dialogue.iloc[rowidx]

            # 0. Add actor index-feature if it does not already exist
            actor_idx = f"{row.Actor}_{dialogueID}"
            if actor_idx not in actor_indeces:
                actor_indeces.append(actor_idx)
                if len(actor_features) == 0:
                    # Create new numpy array of actor features
                    actor_features = np.random.normal(0.0, 1.0, [1, 1024])
                else:
                    # Concatenate features to already existing array
                    actor_features = np.concatenate(
                        (actor_features, np.random.normal(0.0, 1.0,
                                                          [1, 1024])),
                        axis=0)
            # 1. Add utterance index-feature (ELMo embeddings)
            utt_idx = f"u_dID{dialogueID}_#{rowidx}"
            utterance_indeces.append(utt_idx)
            # The ELMo embeddings are aligned with the dataset's original row
            # index, so look the embedding up via the index of the dialogue row
            # currently being parsed
            if len(utterance_features) == 0:
                utterance_features = np.array(
                    [embeddings[dialogue.index[rowidx]]])
            else:
                utterance_features = np.concatenate(
                    (utterance_features,
                     np.array([embeddings[dialogue.index[rowidx]]])),
                    axis=0)

            # 2. Build edges. If this is the first row of a dialogue,
            # begin by drawing an edge from the "START-Node" (source)
            # to the current utterance index (target)
            if using_start and rowidx == 0:
                source_edges.append("START-Node")
                target_edges.append(utt_idx)

            # 3. Construct remaining edges.
            # 3.1 Actor to the utterance
            source_edges.append(actor_idx)
            target_edges.append(utt_idx)
            # 3.2 Utterance to the next utterance
            if (rowidx + 1) != len(dialogue):
                source_edges.append(utt_idx)
                target_edges.append(f"u_dID{dialogueID}_#{rowidx + 1}")
            # 3.3 Utterance to all actors
            for actor in dialogue['Actor'].unique():
                all_actor_idx = f"{actor}_{dialogueID}"
                source_edges.append(utt_idx)
                target_edges.append(all_actor_idx)

    # GraphSAGE does not support nodes of different types, so actor and
    # utterance features are stacked into a single homogeneous node set

    if using_start:
        start_features = np.random.normal(0.0, 1.0, [1, 1024])
        start_index = "START-Node"
        node_features = np.concatenate(
            (actor_features, utterance_features, start_features), axis=0)
        node_indeces = actor_indeces + utterance_indeces + [start_index]
    else:
        node_features = np.concatenate((actor_features, utterance_features),
                                       axis=0)
        node_indeces = actor_indeces + utterance_indeces

    nodes = IndexedArray(node_features, node_indeces)

    edges = pd.DataFrame({"source": source_edges, "target": target_edges})

    # GraphSAGE:
    full_graph = StellarDiGraph(nodes, edges)

    targets = pd.Series(
        dataframe['Dialogue Act'].tolist()[0:len(utterance_indeces)],
        index=utterance_indeces)

    print("Check if graph has all properties required for ML/Inference...")
    full_graph.check_graph_for_ml(expensive_check=True)
    print("Check successful.")
    print(full_graph.info())
    print("---- Graph Creation Finished ----")

    netx_graph = full_graph.to_networkx(feature_attr='utterance_embedding')
    # Save graphs for later use.
    if testing:
        pickle.dump((netx_graph, targets),
                    open(f"visualizeGraph/test_{name}_netx.pickle", "wb"))
        pickle.dump((full_graph, targets),
                    open(f"createdGraphs/test_{name}_graph.pickle", "wb"))
    else:
        pickle.dump((netx_graph, targets),
                    open(f"visualizeGraph/{name}_netx.pickle", "wb"))
        pickle.dump((full_graph, targets),
                    open(f"createdGraphs/{name}_graph.pickle", "wb"))

    return full_graph, targets
Example #9
  uids = set(uids)

  """Creating Edges"""

  edges = [(y,x) for x,y in zip(follows['followed'],follows['follower'])]

  """Creating Directional Graph and adding the edges"""

  G = nx.DiGraph()
  G.add_edges_from(edges)

  edges_dict = dict()
  for edge in edges:
    edges_dict[edge]=1

  rw = UniformRandomMetaPathWalk(StellarDiGraph(G))

  """		Creating random walks.

  Each walk can be seen as a chain:  uid->uid->uid ... 
  They are of length 100

  """

  walks = rw.run(nodes=list(uids),length=100,n=2,metapaths=[['default','default']])

  """Word2Vec on those chains"""

  user_model =  Word2Vec(walks,size=128,window=5)
  user_model.wv.vectors.shape
Example #10

# add features for renamed genes
feature_rename = feature_df.loc[gene_list]
feature_rename_gm = feature_rename.copy(deep=True)
feature_rename_k = feature_rename.copy(deep=True)

feature_rename_gm.index = feature_rename_gm.index.map(lambda name: name + '_gm')
feature_rename_k.index = feature_rename_k.index.map(lambda name: name + '_k')
feature_df = pd.concat([feature_df, feature_rename_gm, feature_rename_k], axis=0)

# %% [markdown]
# ## Read graph

# %%
G = StellarDiGraph(edges=df[['source', 'target']], nodes=feature_df)
print(G.info())

# %% [markdown]
# ## Data Generators
# 
# Now we create the data generators using `CorruptedGenerator`. `CorruptedGenerator` returns shuffled node features along with the regular node features, and we train our model to discriminate between the two.
# 
# Note that:
# 
# - We typically pass all nodes to `corrupted_generator.flow` because this is an unsupervised task
# - We don't pass `targets` to `corrupted_generator.flow` because these are binary labels (true nodes, false nodes) that are created by `CorruptedGenerator`

# %%
# DirectedGraphSAGE model
graphsage_generator = DirectedGraphSAGENodeGenerator(
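The listing cuts off mid-call here. Below is a minimal sketch of how the DirectedGraphSAGE + `CorruptedGenerator` pipeline described in the markdown cell above might continue; the batch size, neighbour sample counts, layer sizes, and optimizer settings are assumptions, not values from the original notebook:

from stellargraph.mapper import DirectedGraphSAGENodeGenerator, CorruptedGenerator
from stellargraph.layer import DirectedGraphSAGE, DeepGraphInfomax
from tensorflow.keras import Model, losses, optimizers

# assumed hyper-parameters
graphsage_generator = DirectedGraphSAGENodeGenerator(
    G, batch_size=50, in_samples=[5, 5], out_samples=[5, 5]
)
graphsage_model = DirectedGraphSAGE(
    layer_sizes=[128, 128], activations=["relu", "relu"], generator=graphsage_generator
)

# CorruptedGenerator pairs each batch of real node features with a shuffled copy
corrupted_generator = CorruptedGenerator(graphsage_generator)
gen = corrupted_generator.flow(G.nodes())  # all nodes, no targets (unsupervised)

# Deep Graph Infomax trains the encoder to distinguish real from corrupted features
infomax = DeepGraphInfomax(graphsage_model, corrupted_generator)
x_in, x_out = infomax.in_out_tensors()

model = Model(inputs=x_in, outputs=x_out)
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer=optimizers.Adam(learning_rate=1e-3))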
Example #11
def create_test_graph(is_directed=False):
    # biased random walker, breadth first walker, directed breadth first walker, uniform random walker
    if is_directed:
        return StellarDiGraph(create_test_graph_nx(is_directed))
    else:
        return StellarGraph(create_test_graph_nx(is_directed))
Example #12
import os
import pandas as pd
from cell import graph_utils
import cell.BiasedDirectedWeightedWalk as BDWW
from stellargraph import StellarDiGraph

layer = 'base_unnormalized_allcombined'

edge_path = "/Users/fahimehb/Documents/NPP_GNN_project/dat/edgelists/VISp/"
edge_filename = "selfconnection_added_edges_node21_32_removed.csv"

file_name = os.path.join(edge_path, layer, edge_filename)
tmp_edge = pd.read_csv(file_name, index_col="Unnamed: 0")
tmp_edge[['source', 'target']] = tmp_edge[['source', 'target']].astype(str)
nxg = graph_utils.build_nx_graph(source_target_weight=tmp_edge, directed=True)
sdg = StellarDiGraph(nxg)
BDWW.BeginWalk(sdg, begin_checks=True, weighted=True, directed=True)
rw = BDWW.BiasedDirectedRandomWalk(sdg,
                                   directed=True,
                                   weighted=True,
                                   begin_checks=False)

nodes = list(sdg.nodes())
walks = rw.run(nodes=nodes,
               length=2,
               n=1,
               p=1,
               q=1,
               weighted=True,
               directed=True)
print(walks)