def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()
    # print(args)

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    # YOUR CODE HERE
    # print(args.number_walks)
    # walk = graph.build_deepwalk_corpus(G, 2, 4, alpha=0,rand=random.Random(0))
    walk = graph.build_deepwalk_corpus(G, args.number_walks, args.walk_length, alpha=0,rand=random.Random(0))
    print(len(walk))
    model = Word2Vec(walk, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    print(model)
    # Namespace(csv_to_graph=True, loo=True, max_memory_data_size=1000000000, number_walks=10, representation_size=64, seed=0, walk_length=40, window_size=5, workers=1)
    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        next(fin)  # skip the header line
        groundtruth = [line.strip().split("\t")[:3] for line in fin]    # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    # print(groundtruth)
    pr = [predict_rating(model, nodedict, "u"+g[0], "m"+g[1]) for g in groundtruth]
    # print(pr)
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1,6))
    print cm
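predict_rating is not defined anywhere in these snippets. Below is a minimal sketch of one plausible implementation, assuming user and movie vectors are compared by cosine similarity and the score is rescaled onto the integer rating range 1-5; the signature matches the call site above, but the scoring rule itself is an assumption, not the original author's method.

import numpy as np

def predict_rating(model, nodedict, user_key, movie_key):
    # Hypothetical sketch: cosine similarity of the two node embeddings,
    # rescaled from [-1, 1] onto [1, 5]. nodedict is accepted only to
    # match the call site; this sketch does not use it.
    u = model.wv[user_key]
    m = model.wv[movie_key]
    sim = float(np.dot(u, m) / (np.linalg.norm(u) * np.linalg.norm(m)))
    return int(round(1 + 2 * (sim + 1)))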
Example #2
def process(args):
    G = graph.load_adjacencylist_npy(args.input, undirected=args.undirected)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=args.number_walks,
                                        path_length=args.walk_length,
                                        alpha=0,
                                        rand=random.Random(args.seed))
    print("Training...")
    model = Word2Vec(walks,
                     size=args.dimensions,
                     window=args.window_size,
                     min_count=0,
                     sg=1,
                     hs=1,
                     workers=args.workers)

    model.wv.save_word2vec_format(args.output)
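The embeddings written by save_word2vec_format can be read back with gensim's KeyedVectors. A short usage sketch; the filename stands in for whatever args.output held, and node ids are assumed to be stored as strings:

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format("graph.embeddings")  # text format
print(wv.most_similar("0", topn=5))  # five nearest neighbours of node "0"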
Example #3
def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=args.number_walks,
                                        path_length=args.walk_length,
                                        alpha=0,
                                        rand=random.Random(args.seed))
    model = Word2Vec(walks,
                     size=args.representation_size,
                     window=args.window_size,
                     min_count=0,
                     workers=args.workers)

    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        fin.readline()
        groundtruth = [line.strip().split("\t")[:3]
                       for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    pr = [
        predict_rating(model, nodedict, "u" + g[0], "m" + g[1])
        for g in groundtruth
    ]

    print("MSE = %f" % mean_squared_error(tr, pr))
    print("accuracy = %f" % accuracy_score(tr, pr))
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print(cm)
Example #4
    def graph2walks(self, method="", params=None):

        # Avoid a mutable default argument; fall back to an empty dict.
        self.params = params if params is not None else {}

        if method == "deepwalk":
            number_of_walks = self.params['number_of_walks']
            walk_length = self.params['walk_length']
            alpha = self.params['alpha']

            # Temporarily generate the edge list
            with open("./temp/graph.edgelist", 'w') as f:
                for line in nx.generate_edgelist(self.graph, data=False):
                    f.write("{}\n".format(line))

            dwg = deepwalk.load_edgelist("./temp/graph.edgelist",
                                         undirected=True)
            corpus = deepwalk.build_deepwalk_corpus(G=dwg,
                                                    num_paths=number_of_walks,
                                                    path_length=walk_length,
                                                    alpha=alpha,
                                                    rand=random.Random(0))

        elif method == "node2vec":

            number_of_walks = self.params['number_of_walks']
            walk_length = self.params['walk_length']
            p = self.params['p']
            q = self.params['q']

            for edge in self.graph.edges():
                self.graph[edge[0]][edge[1]]['weight'] = 1
            G = node2vec.Graph(nx_G=self.graph, p=p, q=q, is_directed=False)
            G.preprocess_transition_probs()
            corpus = G.simulate_walks(num_walks=number_of_walks,
                                      walk_length=walk_length)

        else:
            raise ValueError("Invalid method name!")
        """
        new_corpus = []
        line_counter = 0
        line = []
        for walk in corpus:
            if line_counter < self.params['number_of_walks']:
                line.extend(walk)
                line_counter += 1
            else:
                line_counter = 0
                new_corpus.append(line)
                line = []

        corpus = new_corpus
        """
        self.corpus = corpus

        return self.corpus
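A usage sketch for graph2walks; "Walker" is an assumed name for the enclosing class (not shown in this snippet), which is expected to hold a NetworkX graph on self.graph, and a ./temp directory must exist for the deepwalk branch:

import networkx as nx

walker = Walker(nx.karate_club_graph())  # hypothetical constructor
corpus = walker.graph2walks(method="deepwalk",
                            params={"number_of_walks": 10,
                                    "walk_length": 40,
                                    "alpha": 0.0})
print(len(corpus))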
Example #5
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1,
                         workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size,
                                                                                                             args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output, binary=False)
    print('saved!')
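All of these process functions are driven by an argparse namespace. A minimal sketch of the attributes this variant reads; the values mirror the DeepWalk defaults quoted in the Namespace comment of the first example and are otherwise assumptions:

from argparse import Namespace

args = Namespace(format="adjlist", input="karate.adjlist",
                 output="karate.embeddings", undirected=True,
                 number_walks=10, walk_length=40, seed=0,
                 max_memory_data_size=1000000000,
                 representation_size=64, window_size=5, workers=1,
                 vertex_freq_degree=False,
                 matfile_variable_name="network")
process(args)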
Example #6
def construct_rpr_matrix(G, INDUCTIVE=False):
    '''
    Construct the Rooted PageRank matrix.
    '''
    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * FLAGS.walk_times
    num_nodes = len(G.nodes())

    print("Number of walks: {}".format(num_walks))
    print("Walking...")
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=FLAGS.walk_times,
                                        path_length=FLAGS.walk_length,
                                        alpha=FLAGS.alpha,
                                        rand=random.Random(FLAGS.seed))
    all_counts = {}
    for node in walks.keys():
        walks_n = walks[node]
        all_counts[node] = Counter()
        for walk in walks_n:
            all_counts[node].update(walk)

    print("Normal random walks started...")
    pairs = graph.write_normal_randomwalks(
        G,
        file_='./var/' + FLAGS.train_prefix + '_normal_walks.txt',
        rand=random.Random(FLAGS.seed))

    print("Normal random walks dumped.")

    rpr_matrix = []
    rpr_arg = []
    for node in tqdm(range(num_nodes)):
        if node not in all_counts:
            raise NotImplementedError
        temp = all_counts[node].most_common(FLAGS.k_RPR)
        temp_arg = [i[0] for i in temp]
        temp_value = [i[1] for i in temp]
        if len(temp) < FLAGS.k_RPR:
            for _ in range(FLAGS.k_RPR - len(temp)):
                temp_value.append(0.0)
                temp_arg.append(node)
        temp_value = np.asarray(temp_value, dtype='double')
        temp_value = temp_value / sum(temp_value)
        rpr_matrix.append(temp_value)
        rpr_arg.append(temp_arg)
    rpr_matrix = np.asarray(rpr_matrix, dtype='double')
    rpr_arg = np.asarray(rpr_arg, dtype='double')
    rpr_file = './var/' + FLAGS.train_prefix + '_rpr.mat'

    sio.savemat(rpr_file, {'rpr_matrix': rpr_matrix})
    return rpr_matrix, pairs, rpr_arg
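The Counter logic above approximates each node's rooted-PageRank row by the normalized frequency with which nodes occur in walks rooted at it. A toy illustration of just that counting-and-normalizing step, with made-up walks:

from collections import Counter

walks_from_0 = [[0, 1, 2, 1], [0, 2, 0, 3]]  # two short walks rooted at node 0
counts = Counter()
for walk in walks_from_0:
    counts.update(walk)
total = sum(counts.values())
rpr_row = {n: c / total for n, c in counts.items()}
print(rpr_row)  # {0: 0.375, 1: 0.25, 2: 0.25, 3: 0.125}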
Example #7
def process(args):

  if args.format == "adjlist":
      G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
      G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
      G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
      raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

#   G = graphConstruction.buildGraphAPA()


  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    print("Walking...")
    walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                        path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    walks_filebase = args.output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                         num_workers=args.workers)

    print("Counting vertex frequency...")
    if not args.vertex_freq_degree:
      vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
    else:
      # use degree distribution for frequency in tree
      vertex_counts = G.degree(nodes=G.iterkeys())

    print("Training...")
    model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                     size=args.representation_size,
                     window=args.window_size, min_count=0, workers=args.workers)

  model.save_word2vec_format(args.output)
Example #8
def graph_walk_data(data_path=None):
    p_train_path = os.path.join(data_path, "train.txt")
    p_valid_path = os.path.join(data_path, "valid.txt")
    p_test_path = os.path.join(data_path, "test.txt")
    train_path = os.path.join(data_path, "train.edgelist")
    valid_path = os.path.join(data_path, "valid.edgelist")
    test_path = os.path.join(data_path, "test.edgelist")

    word2id = _build_vocab(p_train_path)
    _graph_to_edgelist(p_train_path, train_path, word2id)
    _graph_to_edgelist(p_valid_path, valid_path, word2id)
    _graph_to_edgelist(p_test_path, test_path, word2id)
    json.dump(word2id, open(os.path.join(data_path, "word2id.json"), 'w'))

    G_train = graph.load_edgelist(train_path)
    G_valid = graph.load_edgelist(valid_path)
    G_test = graph.load_edgelist(test_path)

    train_walks = graph.build_deepwalk_corpus(G_train, list_exclud=[], num_paths=30, path_length=50)
    valid_walks = graph.build_deepwalk_corpus(G_valid, list_exclud=[], num_paths=30, path_length=50)
    test_walks = graph.build_deepwalk_corpus(G_test, list_exclud=[], num_paths=30, path_length=50)
    vocabulary = len(word2id)

    return train_walks, valid_walks, test_walks, vocabulary
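_build_vocab and _graph_to_edgelist are not shown. The sketches below are plausible reconstructions consistent with how they are called above; the whitespace-separated "source target" line format is an assumption:

def _build_vocab(path):
    # Assign each token seen in the edge file a unique integer id.
    word2id = {}
    with open(path) as f:
        for line in f:
            for tok in line.split():
                word2id.setdefault(tok, len(word2id))
    return word2id

def _graph_to_edgelist(src_path, dst_path, word2id):
    # Rewrite "a b" token pairs as integer-id edge lines.
    with open(src_path) as fin, open(dst_path, 'w') as fout:
        for line in fin:
            toks = line.split()
            if len(toks) >= 2:
                fout.write("{} {}\n".format(word2id[toks[0]], word2id[toks[1]]))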
Example #9
def graph_walk_data(data_path=None):
    p_train_path = os.path.join(data_path, "train.txt")
    train_path = os.path.join(data_path, "train.edgelist")

    word2id = _build_vocab(p_train_path)
    _graph_to_edgelist(p_train_path, train_path, word2id)

    print("export word2id file....")
    json.dump(word2id, open(os.path.join(data_path, "word2id.json"), 'w'))
    print("word2id file exported.")

    G_train = graph.load_edgelist(train_path)
    train_walks = graph.build_deepwalk_corpus(G_train, list_exclud=[], num_paths=RWConf["num_paths"], path_length=RWConf["path_length"])

    vocabulary = len(word2id)

    return train_walks, vocabulary
Example #10
    def perform_random_walks(self, output_node_corpus_file):

        # Note: ('a' and 'b') in d only tests 'b'; check each key explicitly.
        if ('number_of_walks' not in self.params) or ('walk_length' not in self.params) or self.graph is None:
            raise ValueError("Missing parameter!")

        self.number_of_nodes = self.graph.number_of_nodes()
        self.N = self.number_of_nodes * self.params['number_of_walks']
        self.L = self.params['walk_length']

        initial_time = time.time()
        # Generate a corpus

        if self.params['random_walk'] == "deepwalk":
            if 'dw_alpha' not in self.params:
                raise ValueError("A parameter is missing!")

            # Temporarily generate the edge list
            with open(os.path.join(self.temp_folder,  "graph_deepwalk.edgelist"), 'w') as f:
                for line in nx.generate_edgelist(self.graph, data=False):
                    f.write("{}\n".format(line))

            dwg = deepwalk.load_edgelist(os.path.join(self.temp_folder, "graph_deepwalk.edgelist"), undirected=True)
            self.corpus = deepwalk.build_deepwalk_corpus(G=dwg, num_paths=self.params['number_of_walks'],
                                                         path_length=self.params['walk_length'],
                                                         alpha=self.params['dw_alpha'])

        elif self.params['random_walk'] == "node2vec":

            if ('n2v_p' not in self.params) or ('n2v_q' not in self.params):
                raise ValueError("A missing parameter exists!")

            for edge in self.graph.edges():
                self.graph[edge[0]][edge[1]]['weight'] = 1
            G = node2vec.Graph(nx_G=self.graph, p=self.params['n2v_p'], q=self.params['n2v_q'], is_directed=False)
            G.preprocess_transition_probs()
            self.corpus = G.simulate_walks(num_walks=self.params['number_of_walks'],
                                           walk_length=self.params['walk_length'])

        else:
            raise ValueError("Invalid method name!")

        self.save_corpus(output_node_corpus_file, with_title=False)

        print("The corpus was generated in {:.2f} secs.".format(time.time() - initial_time))
Example #11
def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()
    # print(args)

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    # YOUR CODE HERE
    # print(args.number_walks)
    # walk = graph.build_deepwalk_corpus(G, 2, 4, alpha=0,rand=random.Random(0))
    walk = graph.build_deepwalk_corpus(G,
                                       args.number_walks,
                                       args.walk_length,
                                       alpha=0,
                                       rand=random.Random(0))
    print(len(walk))
    model = Word2Vec(walk,
                     size=args.representation_size,
                     window=args.window_size,
                     min_count=0,
                     workers=args.workers)
    print(model)
    # Namespace(csv_to_graph=True, loo=True, max_memory_data_size=1000000000, number_walks=10, representation_size=64, seed=0, walk_length=40, window_size=5, workers=1)
    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        next(fin)  # skip the header line
        groundtruth = [line.strip().split("\t")[:3]
                       for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    # print(groundtruth)
    pr = [
        predict_rating(model, nodedict, "u" + g[0], "m" + g[1])
        for g in groundtruth
    ]
    # print(pr)
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print cm
Example #12
def process(args):

  #if args.format == "adjlist":
  #  G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  #elif args.format == "edgelist":
  #  G = graph.load_edgelist(args.input, undirected=args.undirected)
  #elif args.format == "mat":
  #  G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  if args.format == "w_edgelist":
    G = graph.load_weighted_edgelist(args.input, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  This version supports only 'w_edgelist'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if True:  # the in-memory branch is forced on; the on-disk 'else' below is unreachable

    print("Initializing...")
    
    vertex_counts = G.degree(nodes=G.iterkeys())
    #model = Word2Vec(None, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    model = Skipgram(sentences=None, vocabulary_counts=vertex_counts,
                     size=args.representation_size,
                     window=args.window_size, min_count=0, workers=args.workers, sg=args.sg)

    print("Walking & Training...")
    sys.stderr.write("\rprogress: 0.00 [0/%d] %%" % (args.number_walks+1))

    for i in xrange(args.number_walks):
        
        sys.stderr.write("\rprogress: %.2f %% [%d/%d] (walk step) " % ((i)*100./(args.number_walks+1), i+1, args.number_walks+1))
        sys.stderr.flush()
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0., rand=random.Random(args.seed), workers=args.workers)

        sys.stderr.write("\rprogress: %.2f %% [%d/%d] (train step) " % ((i+.5)*100./(args.number_walks+1), i+1, args.number_walks+1))
        sys.stderr.flush()

        #model.build_vocab(walks)
        model.train(walks)
    sys.stderr.write("\rprogress: 100.00 %%\n")
    sys.stderr.flush()

  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    walks_filebase = args.output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0.1, rand=random.Random(args.seed),
                                         num_workers=args.workers)

    print("Counting vertex frequency...")
    if not args.vertex_freq_degree:
      vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
    else:
      # use degree distribution for frequency in tree
      vertex_counts = G.degree(nodes=G.iterkeys())

    print("Training...")
    model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                     size=args.representation_size,
                     window=args.window_size, min_count=0, workers=args.workers)

  model.save_word2vec_format(args.output)
Example #13
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected, attr_file_name=args.sensitive_attr_file, 
                test_links_ratio=args.test_links, test_links_file=args.test_links_file,
                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return


    if (args.weighted is not None) and (args.weighted != 'unweighted'):
      G = graph.set_weights(G, args.weighted)

    if args.just_write_graph:
        with open('wgraph.out', 'w') as fout:
            if args.weighted == 'unweighted':
                for v in G:
                    s = len(G[v])
                    for u in G[v]:
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1/s) + '\n')
            elif args.weighted.startswith('random_walk'):
                for v in G:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
            else:
                raise Exception('just-write-graph is not supported for this weighting method')
        return None

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                             path_length=args.walk_length, p_modified=args.pmodified,
                                             alpha=0, rand=random.Random(args.seed),
                                             num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
          vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
          # use degree distribution for frequency in tree
          vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
Example #14
rg = randomgraph.RanGraphGen()
rg.set_model(model=lfr_params)
g = rg.lfr_model()

graph_path = "./outputs/lfr_synthetic_n1000.gml"
nx.write_gml(g, graph_path)

# Find the embedding of the generated graph
temp_edgelist_file = "./temp/graph.edgelist"
embedding_file = "./outputs/output.embedding"
nx.write_edgelist(g, temp_edgelist_file)

dwg = dw.load_edgelist(temp_edgelist_file, undirected=True)
walks = dw.build_deepwalk_corpus(dwg,
                                 num_paths=dw_params['n'],
                                 path_length=dw_params['l'],
                                 alpha=0)
model = Word2Vec(walks,
                 size=dw_params['d'],
                 window=dw_params['w'],
                 min_count=0,
                 sg=1,
                 hs=1,
                 workers=dw_params['workers'])
model.wv.save_word2vec_format(embedding_file)

comdetect = CommunityDetection(embedding_file,
                               graph_path,
                               params={'directed': False})
score = comdetect.evaluate(num_of_communities=kmeans_num_of_communities)
print("Score: {}".format(score))
Example #15
try:
    f = open(path + 'sbm_node_labels.pickle', 'rb')
    node_colors = pickle.load(f)
except UnicodeDecodeError:
    f.seek(0)
    node_colors = pickle.load(f, encoding='latin1')
node_colors_arr = [None] * node_colors.shape[0]
for idx in range(node_colors.shape[0]):
    node_colors_arr[idx] = np.where(node_colors[idx, :].toarray() == 1)[1][0]

models = ['manela']

for model in models:
    if model == 'deepwalk':
        gr = graph.from_networkx(G, undirected=True)
        walks = graph.build_deepwalk_corpus(gr, 10, 80, 0)
        model = Word2Vec(walks,
                         size=128,
                         window=10,
                         min_count=0,
                         sg=1,
                         hs=0,
                         negative=5,
                         workers=4,
                         iter=1)
        emb_matrix = np.zeros((len(gr), 128))
        for key in range(len(gr)):
            emb_matrix[key] = model.wv.get_vector(str(key))

    elif model == 'manela':
        gr = graph.from_networkx(G, undirected=True)
Example #16
def evaluatePrediction(ori_graph,
                       emb_name=['dnela'],
                       train_ratio=0.8,
                       sample_nodes=None,
                       v1=[None],
                       v2=[None]):
    # 1. Split the original graph into train and test: remove edges from the
    #    original graph to create the train graph; the complementary part left
    #    over is the test graph. If the train graph is not connected, keep its
    #    largest connected component.
    print(ori_graph.order())
    print(str(ori_graph.is_connected()))
    train_graph, test_graph = graph.graph_splitter(ori_graph, train_ratio)

    if not train_graph.is_connected():
        train_graph = max(graph.weak_connected_components(train_graph),
                          key=len)
        train_nodes = list(train_graph.keys())
        train_nodes_dict = dict(zip(train_nodes, range(len(train_nodes))))
        train_graph = graph.re_label_nodes(train_graph, train_nodes_dict)
        test_graph = test_graph.subgraph(train_nodes)
        test_graph = graph.re_label_nodes(test_graph, train_nodes_dict)
    node_num = train_graph.order()
    print(node_num)

    MAP = [None] * len(emb_name)
    precision_curve = [None] * len(emb_name)
    auc = [None] * len(emb_name)
    if sample_nodes:
        if sample_nodes < node_num:
            # Note: node_l is only defined on this path; the steps below
            # assume sample_nodes was given and is smaller than node_num.
            trimed_test_graph, node_l = graph.sample_graph(
                test_graph, sample_nodes)

    for k, name in enumerate(emb_name):

        #2. train embeddings using methods specified
        if name == 'manela':
            emb = ds.Distributed(train_graph)
            emb.setArgs(numUpdates=v1[k],
                        outputPath='temp_emb.embeddings',
                        ratio=v2[k])
            emb.process()
            emb_matrix = emb.getEmbeddings()
        elif name == 'deepwalk':
            walks = graph.build_deepwalk_corpus(train_graph, 10, 80, 0)
            model = Word2Vec(walks,
                             size=128,
                             window=10,
                             min_count=0,
                             sg=1,
                             hs=0,
                             negative=5,
                             workers=4,
                             iter=1)
            emb_matrix = zeros((node_num, 128))
            for key in range(node_num):
                emb_matrix[key] = model.wv.get_vector(str(key))

        elif name == 'node2vec':
            #1. transform graph format from graph to nx.Graph()
            ngraph = nx.Graph()
            for key, value in train_graph.items():
                for adj in value:
                    ngraph.add_edge(key, adj, weight=1)
            ngraph = ngraph.to_undirected()  # to_undirected() returns a copy
            G = n2v.Graph(ngraph, False, v1[k], v2[k])
            G.preprocess_transition_probs()
            walks = G.simulate_walks(10, 80)
            walks = [list(map(str, walk)) for walk in walks]
            model = Word2Vec(walks,
                             size=128,
                             window=10,
                             min_count=0,
                             sg=1,
                             hs=0,
                             negative=5,
                             workers=4,
                             iter=1)
            emb_matrix = zeros((node_num, 128))
            for key in range(node_num):
                emb_matrix[key] = model.wv.get_vector(str(key))

        else:
            pass

        #3. sample some nodes for validation
        if name == 'common_neighbors':
            ori_test_graph = copy.deepcopy(test_graph)
        if name == 'manela' or name == 'deepwalk' or name == 'node2vec':
            emb_matrix = emb_matrix[node_l]

        #4. construct node weights from embeddings
        if name == 'common_neighbors':
            result_pair_list = eu.get_edge_list_from_cn(node_l,
                                                        ori_test_graph,
                                                        threshold=-1)
        else:
            adj_matrix = eu.get_recontructed_adj(emb_matrix)
            result_pair_list = eu.get_edge_list_from_adj(adj_matrix,
                                                         threshold=-100000)
        # Filter the result edge list of edges that already appear in train_graph.
        # NOTE: this step is important because train_graph here is complete (not
        # sampled) while test_graph is sampled, so they use different node
        # labels; the node_l list translates between them.
        filtered_pair_list = [
            pair for pair in result_pair_list
            if not train_graph.has_edge(node_l[pair[0]], node_l[pair[1]])
        ]
        #5. compute MAP and precision curve
        MAP[k] = eu.compute_map(filtered_pair_list,
                                trimed_test_graph,
                                max_k=-1)
        precision_curve[k], _, auc[k] = eu.compute_precision_curves(
            filtered_pair_list, trimed_test_graph, max_k=1024, a=True)

    return MAP, precision_curve, auc
Example #17
    def getEmbeddings(self, relationships):

        G = graph.load_py4jclient(relationships)

        print("Number of nodes: {}".format(len(G.nodes())))

        num_walks = len(G.nodes()) * self.args.number_walks

        print("Number of walks: {}".format(num_walks))

        data_size = num_walks * self.args.walk_length

        print("Data size (walks*length): {}".format(data_size))

        if data_size < self.args.max_memory_data_size:
            print("Walking...")
            walks = graph.build_deepwalk_corpus(
                G,
                num_paths=self.args.number_walks,
                path_length=self.args.walk_length,
                alpha=0,
                rand=random.Random(self.args.seed))
            print("Training...")
            model = Word2Vec(walks,
                             size=self.args.representation_size,
                             window=self.args.window_size,
                             min_count=0,
                             sg=1,
                             hs=1,
                             workers=self.args.workers)
        else:
            print(
                "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
                .format(data_size, self.args.max_memory_data_size))
            print("Walking...")

            walks_filebase = self.args.output + ".walks"
            walk_files = serialized_walks.write_walks_to_disk(
                G,
                walks_filebase,
                num_paths=self.args.number_walks,
                path_length=self.args.walk_length,
                alpha=0,
                rand=random.Random(self.args.seed),
                num_workers=self.args.workers)

            print("Counting vertex frequency...")
            if not self.args.vertex_freq_degree:
                vertex_counts = serialized_walks.count_textfiles(
                    walk_files, self.args.workers)
            else:
                # use degree distribution for frequency in tree
                vertex_counts = G.degree(nodes=G.iterkeys())

            print("Training...")
            walks_corpus = serialized_walks.WalksCorpus(walk_files)
            model = Skipgram(sentences=walks_corpus,
                             vocabulary_counts=vertex_counts,
                             size=self.args.representation_size,
                             window=self.args.window_size,
                             min_count=0,
                             trim_rule=None,
                             workers=self.args.workers)

        to_return = ""
        for word, vec in zip(model.wv.vocab, model.wv.vectors):
            vector_str = " ".join([str(x) for x in vec])
            to_return = to_return + word + "\t" + vector_str + "\n"

        print(to_return)

        return to_return
Example #18
                       lr=learning_rate,
                       weight_decay=weight_decay)
#print(model.parameters())
model.cuda()
features = features.cuda()
adj = adj.cuda()
labels = labels.cuda()
idx_train = idx_train.cuda()
idx_val = idx_val.cuda()
idx_test = idx_test.cuda()

G = graph.from_numpy(adjn, undirected=True)

walks_sq = graph.build_deepwalk_corpus(G,
                                       num_paths=1,
                                       path_length=20,
                                       alpha=0,
                                       rand=random.Random(0))

#print(walks.__next__())
walks_sq = np.array(walks_sq)
#print(walks.shape)

#inputs = np.empty([2708, 20,1433], dtype = int)
walks = torch.empty([2708, 20, 1433], dtype=torch.float)
walks = walks.cuda()
#print(features[walks_sq[0][0]])
for i in range(0, 2708):  # fill all 2708 rows of the walk tensor
    for j in range(0, 20):
        walks[i][j] = features[walks_sq[i][j]]
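The nested loop above can be collapsed into a single advanced-indexing gather, assuming walks_sq is an integer array of shape [2708, 20] (i.e., every walk reached its full length):

walk_idx = torch.as_tensor(walks_sq, dtype=torch.long, device=features.device)
walks = features[walk_idx]  # gathers feature rows; result shape [2708, 20, 1433]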
Example #19
def deepwalk_get_feature(args, adj_indices, result_path):
    model_path = result_path + '.model'
    if os.path.exists(model_path):
        return Word2Vec.load(model_path)
    G = graph.load_edgelist(adj_indices, undirected=args.undirected)

    print(G)
    if len(G) < 10:
        print('Too few nodes to run random walks on')
        return []
    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.dataset + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         trim_rule=None,
                         workers=args.workers)

    model.wv.save_word2vec_format(result_path + '.feature')
    model.save(model_path)
    return model
Example #20
def process(args):

    #if args.format == "adjlist":
    #  G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    #elif args.format == "edgelist":
    #  G = graph.load_edgelist(args.input, undirected=args.undirected)
    #elif args.format == "mat":
    #  G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    if args.format == "w_edgelist":
        G = graph.load_weighted_edgelist(args.input,
                                         undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  This version supports only 'w_edgelist'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if True:  # the in-memory branch is forced on; the on-disk 'else' below is unreachable

        print("Initializing...")

        vertex_counts = G.degree(nodes=G.iterkeys())
        #model = Word2Vec(None, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
        model = Skipgram(sentences=None,
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         workers=args.workers,
                         sg=args.sg)

        print("Walking & Training...")
        sys.stderr.write("\rprogress: 0.00 [0/%d] %%" %
                         (args.number_walks + 1))

        for i in xrange(args.number_walks):

            sys.stderr.write(
                "\rprogress: %.2f %% [%d/%d] (walk step) " %
                ((i) * 100. /
                 (args.number_walks + 1), i + 1, args.number_walks + 1))
            sys.stderr.flush()
            walks = graph.build_deepwalk_corpus(G,
                                                num_paths=args.number_walks,
                                                path_length=args.walk_length,
                                                alpha=0.,
                                                rand=random.Random(args.seed),
                                                workers=args.workers)

            sys.stderr.write(
                "\rprogress: %.2f %% [%d/%d] (train step) " %
                ((i + .5) * 100. /
                 (args.number_walks + 1), i + 1, args.number_walks + 1))
            sys.stderr.flush()

            #model.build_vocab(walks)
            model.train(walks)
        sys.stderr.write("\rprogress: 100.00 %%\n")
        sys.stderr.flush()

    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0.1,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers)

    model.save_word2vec_format(args.output)
Example #21
def main(graph_fname, node_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname>

    graph_fname: the graph file
        It can be a file contained edges per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    '''

    print('Loading road graph...')
    # g = loader.load_a_HIN(graph_fname)
    G = graph.load_edgelist(graph_fname, undirected=True)
    print('Generating random walks...')

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * options.walk_num

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * options.walk_length

    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=options.walk_num,
                                        path_length=options.walk_length,
                                        alpha=0,
                                        rand=random.Random(0))

    tmp_walk_fname = "tmp_walk_fname.txt"
    tmp_walk_json = "tmp_walk_fname.json"

    with open(tmp_walk_json, 'w+') as tmp_walks:
        tmp_walks.write(json.dumps(walks))

    with open(tmp_walk_fname, 'w') as f:
        for walk in walks:
            f.write('%s\n' % ' '.join(map(str, walk)))

    print("Walking done...")

    model = MP2Vec(
        size=options.dim,
        window=options.window,
        neg=options.neg,
        num_processes=options.num_processes,
        alpha=options.alpha,
        same_w=True,
        normed=False,
    )

    neighbors = None  # {node_osmid: [<node_osmid>, <node_osmid>, ...]}
    if options.correct_neg:
        for id_ in G:
            G._get_k_hop_neighborhood(id_, options.window)

        neighbors = G.k_hop_neighbors[options.window]

    model.train(G, walks, k_hop_neighbors=neighbors)

    print('Dumping vectors...')
    model.dump_to_file(node_vec_fname, type_='node')
    return 0
Example #22
def process(args):

    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    if os.path.isfile(args.excludlist):
        list_exclud = open(args.excludlist).readlines()
        list_exclud = set(int(x) for x in list_exclud)
        num_exlud = len(list_exclud)
    else:
        num_exlud = 0
        list_exclud = []
    if (num_exlud > 0):
        print("Number of nodes excluded from the walk: {}".format(num_exlud))

    num_walks = (len(G.nodes()) - num_exlud) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            list_exclud=list_exclud,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         workers=args.workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            list_exclud,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers)

    model.wv.save_word2vec_format(args.output)