Code example #1
    def fit(self, metapaths=None, d=128, r=10, l=80, k=10):
        """
        Pipeline for representational learning for all nodes in a graph.

        :param k:
        :return:
        """
        self._assert_positive_int(d, msg="d should be a positive integer")
        self._assert_positive_int(r, msg="r should be a positive integer")
        self._assert_positive_int(l, msg="l should be a positive integer")
        self._assert_positive_int(k, msg="k should be a positive integer")

        start_time_fit = time.time()
        metapath_walker = UniformRandomMetaPathWalk(self.nxG)
        time_b = time.time()
        walks = metapath_walker.run(
            nodes=list(self.nxG.nodes()),
            metapaths=metapaths,
            length=l,
            n=r,
            node_type_attribute="label",
            seed=None,
        )
        print("({}) Time for random walks {:.0f} seconds.".format(
            type(self).__name__,
            time.time() - time_b))
        self.learn_embeddings(walks, d, k)
        print("Total time for fit() was {:.0f}".format(time.time() -
                                                       start_time_fit))
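
learn_embeddings is not shown in this snippet; a minimal sketch of what it might do, assuming gensim's skip-gram Word2Vec with d as the embedding size and k as the context window (gensim < 4.0 keyword names, matching the other examples on this page):

# Hypothetical sketch of learn_embeddings (not part of the original snippet);
# assumed to be a method of the same class as fit().
from gensim.models import Word2Vec

def learn_embeddings(self, walks, d, k):
    # Word2Vec treats each walk as a "sentence" of node IDs.
    sentences = [[str(node) for node in walk] for walk in walks]
    self.model = Word2Vec(sentences, size=d, window=k, min_count=0, sg=1, workers=4)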
Code example #2
def metapath2vec(G, walk_length, metapaths):
    """
    performs metapath2vec and returns representations with labels
    
    G --> stellargraph object of the graph
    label --> (0 or 1) benign or not
    walk_length --> int, defines how long the sentences should be
    """
    rw = UniformRandomMetaPathWalk(G)
    walks = rw.run(nodes=list(G.nodes()),
                   length=walk_length,
                   n=10,
                   metapaths=metapaths)
    return walks
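
A hedged usage sketch for this helper; the metapath schema and the Word2Vec settings below are assumptions, not part of the original snippet:

# Hypothetical usage (node types and parameters are placeholders).
from gensim.models import Word2Vec

metapaths = [["app", "api", "app"]]
walks = metapath2vec(G, walk_length=100, metapaths=metapaths)
model = Word2Vec(walks, size=128, window=5, min_count=0, sg=1)  # gensim < 4.0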
Code example #3
def _metapath_randomwalk(graph):
    # Create the random walker
    rw = UniformRandomMetaPathWalk(StellarGraph(graph))

    # METAPATHS (a module-level constant) specifies the metapath schemas
    # as a list of lists of node types; WALK_DISTANCE caps the walk length.

    walks = rw.run(nodes=list(graph.nodes()),  # root nodes
                   length=WALK_DISTANCE,  # maximum length of a random walk
                   n=1,        # number of random walks per root node
                   metapaths=METAPATHS  # the metapaths
                   )

    print("Number of random walks: {}".format(len(walks)))

    return walks
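
WALK_DISTANCE and METAPATHS are module-level constants not shown in the snippet; plausible definitions, with placeholder node types:

# Assumed constants (not in the original snippet); node types are placeholders.
WALK_DISTANCE = 100
METAPATHS = [
    ["user", "movie", "user"],
    ["user", "movie", "genre", "movie", "user"],
]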
Code example #4
def metapath2vec_walk(G, params):
    """Performs uniform random metapath walks using StellarGraph to generate corpus used in metapath2vec and writes corpus to a txt file 
    
    :param G : StellarGraph graph 
    Nodes consist of apps, api calls, packages, and invoke methods
    
    :param params : dict
    dict["key"] where dict is global parameter dictionary and key returns metapath2vec parameter sub-dictionary
    """
    fp=os.path.join(params["save_dir"],params["filename"])
    os.makedirs(params["save_dir"], exist_ok=True)
    # Create the random walker (walk parameters are passed to run() below)
    rw = UniformRandomMetaPathWalk(G)
    print("Starting MetaPath Walks")
    
    start_walks = time.time()
    walks = rw.run(
        nodes=list(G.nodes_of_type("app_nodes")),  # root nodes (app_nodes)
        length=params["walk_length"],  # maximum length of a random walk
        n=params["n"],  # number of random walks per root node
        metapaths=params["metapaths"]  # the metapaths
    )
    
    print("--- Done Walking in " + str(int(time.time() - start_walks)) + " Seconds ---")
    print()
    print("Number of metapath walks: {}".format(len(walks)))

    # save walks to file, one walk per line with space-separated node IDs
    with open(fp, 'w') as f:
        for walk in walks:
            for node in walk:
                f.write(str(node) + ' ')
            f.write('\n')
    
    if params["verbose"]:
        print("Saved %s to %s" %(params["filename"], params["save_dir"]))
    
    return
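
The params sub-dictionary is supplied by the caller; a plausible shape, with keys inferred from the function body and placeholder values (the "api_nodes" node type is an assumption):

# Assumed parameter sub-dictionary (values are placeholders).
params = {
    "save_dir": "data/out/m2v",
    "filename": "walks.txt",
    "walk_length": 100,
    "n": 10,
    "metapaths": [["app_nodes", "api_nodes", "app_nodes"]],
    "verbose": True,
}
metapath2vec_walk(G, params)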
Code example #5
    def load4graph(self):
        """
        Load all edge and node information into the graph and build the
        random-walk object rw.
        :return: the NetworkX graph and the metapath walker
        """
        g_nx = self.load_dataset_SMDB(self.location, self.graph_infos)
        print("Number of nodes {} and number of edges {} in graph.".format(g_nx.number_of_nodes(),
                                                                           g_nx.number_of_edges()))
        from stellargraph.data import UniformRandomMetaPathWalk
        rw = UniformRandomMetaPathWalk(StellarGraph(g_nx))
        return g_nx, rw
Code example #6
def common_metapath2vec(metapaths, commongraph, root_nodes, walk_length):
    """
    from the filepath, returned a combined list of all metapaths
    
    metapathsFP --> filepath to the directory containing all metapaths
                    should be: "/teams/DSC180A_FA20_A00/a04malware/personal-group03/actualdroid_intermediate_files/metapath2vec_metapaths"
                    
    common_graph_txts --> path to a folder containing elements of the common graph
                    should be: "/teams/DSC180A_FA20_A00/a04malware/personal-group03/common_graph/common_graph_txts"
    """

    # start traversal (walk_length comes from the function argument)
    rw = UniformRandomMetaPathWalk(commongraph)
    walks = rw.run(
        nodes=root_nodes,  # root nodes
        length=walk_length,  # maximum length of a random walk
        n=1,  # number of random walks per root node
        metapaths=metapaths,  # the metapaths
    )
    print("Number of random walks: {}".format(len(walks)))

    return walks
Code example #7
import multiprocessing

from joblib import Parallel, delayed


def generate_random_walks(graph_obj, num_walks_per_node, walk_length, metapaths):

    random_walk_object = UniformRandomMetaPathWalk(graph_obj)
    cpu_count = multiprocessing.cpu_count()
    list_nodes = list(graph_obj.nodes())
    num_chunks = cpu_count
    # ceil-divide so every node lands in one of num_chunks chunks
    # (floor division could drop the trailing remainder of nodes)
    chunk_len = -(-len(list_nodes) // num_chunks)
    chunks = [
        list_nodes[i * chunk_len: (i + 1) * chunk_len] for i in range(num_chunks)
    ]

    res = Parallel(n_jobs=cpu_count)(
        delayed(aux_gen_walks)(
            node_chunk, walk_length, random_walk_object, metapaths, num_walks_per_node
        )
        for node_chunk in chunks
    )

    all_walks = []
    for r in res:
        all_walks.extend(r)
    return all_walks
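
aux_gen_walks is referenced but not shown; a minimal sketch of what it presumably does, assuming it simply runs the walker over its chunk of root nodes:

# Assumed implementation of the aux_gen_walks helper (not in the original snippet).
def aux_gen_walks(node_chunk, walk_length, random_walk_object, metapaths, num_walks_per_node):
    # Run the metapath walker with this chunk as the root nodes.
    return random_walk_object.run(
        nodes=node_chunk,
        length=walk_length,
        n=num_walks_per_node,
        metapaths=metapaths,
    )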
Code example #8
    # add the edges, labelled by relation type
    g_nx.add_edges_from(u_m_edges, label="rating")
    g_nx.add_edges_from(u_u_edges, label="friend")
    g_nx.add_edges_from(m_m_edges, label="same")
    g_nx.add_edges_from(m_a_edges, weight=1, label="include")

    print(g_nx.number_of_nodes())
    print(g_nx.number_of_edges())
    return g_nx


g_nx = construct_hin_pandas()

# Create the random walker
rw = UniformRandomMetaPathWalk(StellarGraph(g_nx))

# specify the metapath schemas as a list of lists of node types.
metapaths = [
    ["user", "user", "movie"],
    #["user", "movie", "aspect", "movie"],
]

walks = rw.run(
    nodes=list(g_nx.nodes()),  # root nodes
    length=3,  # maximum length of a random walk
    n=50,  # number of random walks per root node
    metapaths=metapaths  # the metapaths
)

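The snippet is truncated at this point; in a typical metapath2vec pipeline the walks would next be fed to Word2Vec, e.g. (an assumption, following the pattern of the other examples on this page):

# Hypothetical continuation (not in the original snippet).
from gensim.models import Word2Vec

model = Word2Vec(walks, size=128, window=5, min_count=0, sg=1)  # gensim < 4.0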
Code example #9
def model_train():
  print("Loading files..")
  cluster = MongoClient('mongodb+srv://nirmal:[email protected]/<dbname>?retryWrites=true&w=majority')

  db = cluster.Dataset
  pcol = db.posts
  vcol = db.views
  fcol = db.follows
  favcol = db.favourites


  # posts = pd.read_csv('./posts.csv',engine='python')
  # users = pd.read_csv('./users.csv')
  # views = pd.read_csv('./views.csv')
  # favorites = pd.read_csv('./favourites.csv')
  # userPosts = pd.read_csv('./usersPosts.csv')
  # print("Files loaded..")

  ''' Create DataFrame for preprocessing '''
  posts = pd.DataFrame(list(pcol.find()))  
  views = pd.DataFrame(list(vcol.find()))
  favorites = pd.DataFrame(list(favcol.find()))
  # rename so the later lookups on 'post_id'/'user_id' work
  userPosts = posts[['_id','postedBy']].rename(columns={'_id':'post_id','postedBy':'user_id'})
  follows = pd.DataFrame(list(fcol.find()))
  print("Collections loaded..")

  print("Started preprocessing..")

  views = views[views['user_id']!='anonymous']
  posts = posts.dropna(subset=['title',' post_type','tags'])

  posts['category'] = posts['category'].fillna(posts['tags'])
  posts['tags'] = posts['tags'].apply(clean_text)

  """Splitting on '|' and '#' for getting categories"""

  uniq_category = dict()
  uniq_post_type = dict()
  i=0
  j=0
  for cats,pt in zip(posts['category'].values,posts[' post_type'].values):
    for cat in re.split('[#|]',cats):
      if cat not in uniq_category.keys():
        uniq_category[cat]=i
        i+=1
    if pt not in uniq_post_type.keys():
      uniq_post_type[pt]=j
      j+=1

  category_ohe = np.zeros((len(posts),len(uniq_category)))

  for i,cats in enumerate(posts['category'].values):
    for cat in re.split('[#|]',cats):
      category_ohe[i][uniq_category[cat]]=1

  token_tag = [word_tokenize(tag) for tag in posts['tags'].values.tolist()]
  tag_model = Word2Vec(token_tag,sg=1,size=100,window=5, min_count=5, workers=4,iter=100)
  tag_model.save('./tag.model')

  tag_model = Sentence2Vec('./tag.model')

  processed_title = posts['title'].apply(clean_text)
  token_title = [word_tokenize(tag) for tag in processed_title]
  title_model = Word2Vec(token_title,sg=1,size=100,window=5, min_count=5, workers=4,iter=100)
  title_model.save('./title.model')

  title_model = Sentence2Vec('./title.model')

  posts_info = dict()
  for pid,title,cat,tag in zip(posts['_id'],posts['title'].values,category_ohe,posts['tags'].values):
    posts_info[pid] = dict()
    posts_info[pid]['title'] = title_model.get_vector(title)
    posts_info[pid]['tag'] = tag_model.get_vector(tag)
    posts_info[pid]['cat'] = cat

  """Removing rows in views.csv, favorites.csv and usrPosts.csv
  that has pid not present in posts.csv
  """

  pidr=set()
  for pid in views['post_id']:
    if posts_info.get(pid,0) == 0:
      pidr.add(pid)
  for pid in favorites['post_id']:
    if posts_info.get(pid,0) == 0:
      pidr.add(pid)
  for pid in userPosts['post_id']:
    if posts_info.get(pid,0) == 0:
      pidr.add(pid)
  
  for pid in list(pidr):  
    views = views[views['post_id']!=pid]
    userPosts = userPosts[userPosts['post_id']!=pid]
    favorites = favorites[favorites['post_id']!=pid]

  """Representing the user based on the categories seen by the user"""

  users_info = defaultdict(lambda :np.zeros((len(uniq_category))))
  for uid,pid in zip(views['user_id'],views['post_id']):
    a = posts_info[pid]['cat']
    users_info[uid] = np.add(users_info[uid],a)
    assert(np.sum(users_info[uid])!=0)

  """Increasing the weightage for categories by 100% for posts posted by user"""

  for uid,pid in zip(userPosts['user_id'],userPosts['post_id']):    
    a = posts_info[pid]['cat']
    users_info[uid] = np.add(users_info[uid],a)
    assert(np.sum(users_info[uid])!=0)

  """Increasing weightage for categories by 50% for favorite posts"""

  for uid,pid in zip(favorites['user_id'],favorites['post_id']):    
    a = 1/2*posts_info[pid]['cat']
    users_info[uid] = np.add(users_info[uid],a)
    assert(np.sum(users_info[uid])!=0)

  """## MODEL 

  Generating -ive datapoints for each user where the posts chosen have categories that are not seen by the user
  """

  def gen_pseudoDP(user_id):
    cat_user = users_info[user_id]
    arr=[]
    k=0
    for pid in posts_info.keys():
      cat = posts_info[pid]['cat']
      flag=0
      for i in range(len(cat)):
        if (cat[i]!=0 and cat_user[i] != 0):
          flag=1
          break
      if flag==0:
        arr.append([user_id,pid,0])
        k+=1
      if k==4:
        break
    return arr

  # build the negative datapoints, then assemble the DataFrame in one go
  # (avoids leftover all-zero rows when a user yields fewer than 4 negatives)
  pseudo_rows = []
  for uid in list(users_info.keys()):
    pseudo_rows.extend(gen_pseudoDP(uid))
  pseudo = pd.DataFrame(pseudo_rows,columns=['user_id','post_id','view'])

  views['view'] = np.ones((len(views)))
  views = views.drop(columns=['timestamp'],axis=1)
  data = views.append(pseudo)

  print("Preprocessing done!")

  class Datagenerator(tf.keras.utils.Sequence):
    def __init__(self,X,y=None,batch_size=1,shuffle=True):
      super().__init__()
      self.X = X
      self.y = y
      self.batch_size = batch_size
      self.shuffle = shuffle
      self.on_epoch_end()

    def __getitem__(self,index):
      
      indices = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]    
      batch = self.X.iloc[indices]    
      y = self.y.iloc[indices]
      
      user=np.zeros((self.batch_size,len(uniq_category)))    
      title=np.zeros((self.batch_size,100))
      tag=np.zeros((self.batch_size,100))
      category = np.zeros((self.batch_size,len(uniq_category)))
      
      for i in range(self.batch_size): 
        title[i] = posts_info[batch.post_id.values[i]]['title']
        tag[i] = posts_info[batch.post_id.values[i]]['tag']
        category[i] = posts_info[batch.post_id.values[i]]['cat']
        user[i] = users_info[batch['user_id'].values[i]]
      
      return [user,title,tag,category],y.values.reshape(-1,1)
      

    def __len__(self):
      return int(np.floor(len(self.X) / self.batch_size))

    def on_epoch_end(self):
      self.indexes = np.arange(len(self.X))
      if self.shuffle:
        np.random.shuffle(self.indexes)

  y = data['view']
  X = data.drop(['view'],axis=1)

  X_train, X_test, y_train, y_test = train_test_split(X,y)

  train_dg = Datagenerator(X_train,y_train,128)
  test_dg = Datagenerator(X_test,y_test,128)

  """Model predicts whether a user will see a post or not. Based on that user embeddings will be learnt which will then be used for recommendation"""

  def create_model():

    user_inp = Input((len(uniq_category),))
    embed = Embedding(input_dim=len(uniq_category),output_dim=50)(user_inp)
    dense = Dense(2056)(Flatten()(embed))
    user = Dense(500,activation='relu')(dense)
    user = Dense(400,activation='relu')(user)

    cat = Input((len(uniq_category),))
    cat_ = Dense(300,activation='relu')(cat)

    title = Input((100,))
    title_ = Dense(50,activation='relu')(title)
    tag = Input((100,))
    tag_ = Dense(50,activation='relu')(tag)

    post_concat = Concatenate()([cat_,title_,tag_])

    output = Dot(axes=[-1,-1],normalize=True)([user,post_concat])    

    model = tf.keras.Model([user_inp,title,tag,cat],output)

    return model

  model = create_model()

  model.compile(optimizer=Adagrad(lr=0.0001), loss='binary_crossentropy',metrics=['accuracy'])

  print("model started training...")
  model.fit_generator(train_dg,validation_data=test_dg,epochs=1)
  print("Model trained")

  """Retrieving trained user embeddings"""

  user_embeddings = model.get_layer('embedding').get_weights()[0]

  # follows was already loaded from MongoDB above; drop the timestamp if present
  follows = follows.drop(columns=['timestamp'], errors='ignore')

  """Users present in follows.csv"""

  uids = np.concatenate((follows['user_id'].values,follows['follower_id'].values))
  uids = set(uids)

  """Creating Edges"""

  edges = [(y,x) for x,y in zip(follows['user_id'],follows['follower_id'])]

  """Creating Directional Graph and adding the edges"""

  G = nx.DiGraph()
  G.add_edges_from(edges)

  rw = UniformRandomMetaPathWalk(StellarDiGraph(G))

  """Creating random walks.

  Each walk can be seen as a chain:  uid->uid->uid ... 

  They are of length 100
  """

  walks = rw.run(nodes=list(uids),length=100,n=2,metapaths=[['default','default']])

  """Word2Vec on those chains"""

  user_model =  Word2Vec(walks,size=128,window=5)
  print(user_model.wv.vectors.shape)

  """Each user represented by 128 dim vector"""

  node_ids = user_model.wv.index2word
  node_embed = user_model.wv.vectors

  print("Pushing to database...")
  userCollection = cluster.Users.User_Embeddings
  userCollection.delete_many({})
  followCollection = cluster.Users.Follows
  followCollection.delete_many({})
  posted = cluster.Users.Posted
  posted.delete_many({})
  catCol = cluster.Users.Categories
  catCol.delete_many({})
  embedCol = cluster.Users.Embedding_Matrix
  embedCol.delete_many({})

  folDict = dict()
  for i,id in enumerate(node_ids):
    folDict[id]=i

  user_ins=[]
  for user in tqdm(users_info.keys()):
    embed = list(np.matmul(users_info[user],user_embeddings))
    if folDict.get(user,-1) == -1:
      #userCollection.insert_one({'user_id':user, 'user_embed':embed})
      user_ins.append({'user_id':user, 'user_embed':embed})
    else:
      yo = node_embed[folDict[user]].tolist()
      #userCollection.insert_one({'user_id':user, 'user_embed':embed, 'node_embed':yo})
      user_ins.append({'user_id':user, 'user_embed':embed, 'node_embed':yo})

  userCollection.insert_many(user_ins)

  fol=[]
  for uid,fid in tqdm(zip(follows['user_id'],follows['follower_id'])):
      d = dict()
      d['user_id'] = uid
      d['follower_id'] = fid
      fol.append(d)

  followCollection.insert_many(fol)

  categories = pickle.dumps(uniq_category)
  user_embed = pickle.dumps(user_embeddings)

  catCol.insert_one({"Categories":categories})
  embedCol.insert_one({"Matrix":user_embed})


  uids = set()
  for uid in userPosts['user_id']:
      uids.add(uid)
  to_ins=[]
  for uid in uids:
      noob = dict()
      noob['user_id']=uid    
      to_ins.append(noob)

  posted.insert_many(to_ins)
  requests.get('http://3.7.185.166/train')
  print("Done!")
Code example #10
  uids = set(uids)

  """Creating Edges"""

  edges = [(y,x) for x,y in zip(follows['followed'],follows['follower'])]

  """Creating Directional Graph and adding the edges"""

  G = nx.DiGraph()
  G.add_edges_from(edges)

  edges_dict = dict()
  for edge in edges:
    edges_dict[edge]=1

  rw = UniformRandomMetaPathWalk(StellarDiGraph(G))

  """		Creating random walks.

  Each walk can be seen as a chain:  uid->uid->uid ... 
  They are of length 100

  """

  walks = rw.run(nodes=list(uids),length=100,n=2,metapaths=[['default','default']])

  """Word2Vec on those chains"""

  user_model =  Word2Vec(walks,size=128,window=5)
  print(user_model.wv.vectors.shape)
Code example #11
def get_features(outfolder, walk_args=None, w2v_args=None, redo=False):
    '''
    Implements metapath2vec by:
    1. Building a graph,
    2. Performing a random metapath walk, then
    3. Applying word2vec on the walks generated.
    ---------
    Parameters:
    outfolder:      Path to directory where output will be saved; should contain app_list.csv
    walk_args:      Arguments for stellargraph.data.UniformRandomMetaPathWalk
    w2v_args:       Arguments for gensim.models.Word2Vec
    '''
    # save parameters to outfolder
    params = {
        "outfolder": outfolder,
        "walk_args": walk_args,
        "w2v_args": w2v_args 
    }
    with open(os.path.join(outfolder, 'params.json'), 'w') as param_file:
        json.dump(params, param_file)
    
    # define paths
    app_list_path = os.path.join(outfolder, 'app_list.csv')
    nodes_path = os.path.join(outfolder, 'nodes.json')
    edge_path = os.path.join(outfolder, 'edges.csv')
    graph_path = os.path.join(outfolder, 'graph.pkl')
    feature_path = os.path.join(outfolder, 'features.csv')
    app_heap_path = os.path.join('data', 'out', 'all-apps', 'app-data/')
    metapath_walk_outpath = os.path.join(outfolder, 'metapath_walk.json')
    
    # generate app list
    apps_df = pd.read_csv(app_list_path)
    app_data_list = app_heap_path + apps_df.app + '.csv'
    
    if os.path.exists(graph_path) and not redo:  # load graph from file if present
        with open(graph_path, 'rb') as file:
            g = pickle.load(file)
    else:                                        # otherwise compute from data
        g = build_graph(outfolder, app_data_list, nodes_path, edge_path)

        # save the newly built graph to file
        with open(graph_path, 'wb') as file:
            pickle.dump(g, file)

    if os.path.exists(metapath_walk_outpath) and not redo:  # load walks from file if present
        with open(metapath_walk_outpath, 'r') as file:
            metapath_walks = json.load(file)
    else:                                        # otherwise compute from data
        # random walk on all apps, save to metapath_walk.json
        print('Performing random walks')
        rw = UniformRandomMetaPathWalk(g)
        app_nodes = list(
            apps_df.app.map(
                pd.read_csv(os.path.join(outfolder, 'app_map.csv'), index_col='app').uid
            )
        )
        metapath_walks = rw.run(app_nodes, n=walk_args['n'], length=walk_args['length'], metapaths=walk_args['metapaths'])
        
        with open(metapath_walk_outpath, 'w') as file:
            json.dump(metapath_walks, file)
    
    print('Running Word2vec')
    w2v = Word2Vec(metapath_walks, **w2v_args)
    
    features = pd.DataFrame(w2v.wv.vectors)
    features['uid'] = w2v.wv.index2word
    features['app'] = features['uid'].map(
        pd.read_csv(os.path.join(outfolder, 'app_map.csv'), index_col='uid').app
    )
    features = features[features.uid.str.contains('app')].set_index('uid')
    features.to_csv(feature_path)
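
walk_args and w2v_args are supplied by the caller; a plausible shape for both, with keys inferred from the function body and placeholder values (the metapath node types are assumptions):

# Assumed argument dictionaries (values and node types are placeholders).
walk_args = {
    "n": 10,
    "length": 100,
    "metapaths": [["app", "api", "app"]],
}
w2v_args = {"size": 128, "window": 5, "min_count": 0, "sg": 1, "workers": 4}  # gensim < 4.0
get_features("data/out/m2v", walk_args=walk_args, w2v_args=w2v_args)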
Code example #12
    def fit_predict(self, path):
        outpath = os.path.join(path, f'm2v-{self.name}')
        os.makedirs(outpath, exist_ok=True)
        # get app data, compute unique apis
        apps = pd.read_csv(os.path.join(path, 'app_list.csv'),
                           usecols=['app'],
                           squeeze=True,
                           dtype=str)
        #         apps = set(apps)
        app_data_list = os.path.join('data', 'out', 'all-apps',
                                     'app-data/') + apps + '.csv'

        print('Computing new edges')
        data = dd.read_csv(list(app_data_list),
                           dtype=str,
                           usecols=['app', 'api']).drop_duplicates().compute()
        data.api = data.api.map(self.api_map)
        data.columns = ['source', 'target']
        data = data.dropna()

        nodes = self.nodes.copy()
        nodes['app'] = IndexedArray(
            index=np.array(list(nodes['app'].index) + list(apps)))
        edges = pd.concat([pd.read_csv(self.edges_path, dtype=str), data],
                          ignore_index=True).reset_index(drop=True)
        g = StellarGraph(nodes=nodes, edges=edges)
        print(g)

        print('Running random walk')
        rw = UniformRandomMetaPathWalk(g)
        walk_args = self.params['walk_args']
        new_walks = rw.run(list(apps),
                           n=walk_args['n'],
                           length=walk_args['length'],
                           metapaths=walk_args['metapaths'])
        metapath_walks = (self.metapath_walks + new_walks)

        print('Running Word2Vec')
        # make features with word2vec
        w2v = Word2Vec(metapath_walks, **self.params['w2v_args'])

        print('Fitting model')
        features = pd.DataFrame(w2v.wv.vectors)
        features['app'] = w2v.wv.index2word
        # map each uid back to its app name where a mapping exists
        features['app'] = features['app'].map(
            lambda uid: self.inverse_app_map.get(uid, uid))
        features = features.set_index('app')
        X_train = features.loc[self.app_map.keys()]
        #         X_train = X_train.uid.map(self.inverse_app_map)
        X_test = features.loc[apps]

        # train model and predict new apps
        labels = pd.read_csv('data/out/all-apps/app_list.csv',
                             usecols=['app', 'malware'],
                             index_col='app',
                             squeeze=True)
        y_test = labels[X_test.index]
        y_train = labels[X_train.index]

        mdl = self.classifier(**self.classifier_args)
        mdl.fit(X_train, y_train)
        pred = mdl.predict(X_test)

        print(classification_report(y_test, pred))

        results = X_test.assign(m2vDroid=pred, true=y_test)

        # save results and training data
        results.to_csv(os.path.join(outpath, 'predictions.csv'))
        X_train.assign(m2vDroid=mdl.predict(X_train), true=y_train).to_csv(
            os.path.join(outpath, 'training_data.csv'))

        return results
Code example #13
    phrase_id_map = pickle.load(open(data_path + "phrase_id_map.pkl", "rb"))
    id_phrase_map = pickle.load(open(data_path + "id_phrase_map.pkl", "rb"))

    labels, label_to_index, index_to_label = get_distinct_labels(df)
    label_term_dict = get_label_term_json(data_path + "seedwords.json")
    label_term_dict = modify_phrases(label_term_dict, phrase_id_map)

    graph, metapaths = get_book_graph_metapaths(df, tokenizer, id_phrase_map)

    print(
        "Number of nodes {} and number of edges {} in graph.".format(
            graph.number_of_nodes(), graph.number_of_edges()
        )
    )

    rw = UniformRandomMetaPathWalk(graph)
    walks = rw.run(
        nodes=list(graph.nodes()),  # root nodes
        length=5,  # maximum length of a random walk
        n=5,  # number of random walks per root node
        metapaths=metapaths,  # the metapaths
    )
    print("Number of random walks: {}".format(len(walks)))

    model = Word2Vec(walks, size=128, window=5, min_count=0, sg=1, workers=2, iter=10)
    print("Embeddings shape: ", model.wv.vectors.shape)

    node_ids = model.wv.index2word  # list of node IDs
    node_embeddings = model.wv.vectors  # numpy.ndarray of shape (number of nodes, embedding dimensionality)
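
A short follow-up sketch (not in the original snippet) showing one way to line the node IDs up with their vectors:

# Hypothetical follow-up: index the embeddings by node ID for easy lookup.
import pandas as pd

embedding_df = pd.DataFrame(node_embeddings, index=node_ids)
print(embedding_df.head())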