Exemple #1
0
 def homogeneous_graph_random_walks_for_large_bipartite_graph(
         self, datafile, percentage, maxT, minT):
     """Project the bipartite graph onto each side and random-walk both projections.

     The biadjacency matrix ``A`` is built from ``self.G``, ``self.node_u`` and
     ``self.node_v``. ``A * A^T`` and ``A^T * A`` are handed to
     ``self.get_homogenous_graph`` together with ``self.fw_u`` / ``self.fw_v``
     and the inverted index maps. The results are walked with
     ``self.get_random_walks_restart_for_large_bipartite_graph`` and stored in
     ``self.G_u`` / ``self.walks_u`` and ``self.G_v`` / ``self.walks_v``.

     :param datafile: edge-list path passed to ``graph.load_edgelist``.
     :param percentage: forwarded unchanged to the walk generator.
     :param maxT: forwarded unchanged to the walk generator.
     :param minT: forwarded unchanged to the walk generator.
     """
     # NOTE(review): the graph loaded here is never read below (the matrix is
     # built from self.G). The call is kept so that a bad ``datafile`` still
     # fails at the same point -- confirm whether it can be dropped.
     G = graph.load_edgelist(datafile, undirected=True)
     # ``np.float`` was only an alias of the builtin ``float`` and has been
     # removed from NumPy; the builtin yields the same dtype.
     A, row_index, item_index = bi.biadjacency_matrix(self.G,
                                                      self.node_u,
                                                      self.node_v,
                                                      dtype=float,
                                                      weight='weight',
                                                      format='csr')
     # Invert the returned maps (value -> key) for get_homogenous_graph.
     index_row = dict(zip(row_index.values(), row_index.keys()))
     index_item = dict(zip(item_index.values(), item_index.keys()))
     AT = A.transpose()
     matrix_u = self.get_homogenous_graph(A.dot(AT), self.fw_u, index_row,
                                          index_row)
     matrix_v = self.get_homogenous_graph(AT.dot(A), self.fw_v, index_item,
                                          index_item)
     self.G_u, self.walks_u = self.get_random_walks_restart_for_large_bipartite_graph(
         matrix_u,
         self.authority_u,
         percentage=percentage,
         maxT=maxT,
         minT=minT)
     self.G_v, self.walks_v = self.get_random_walks_restart_for_large_bipartite_graph(
         matrix_v,
         self.authority_v,
         percentage=percentage,
         maxT=maxT,
         minT=minT)
Exemple #2
0
def read_graph():
    """Load the undirected network stored at ``FLAGS.input`` and return it."""
    print(" - Loading graph...")
    network = graph.load_edgelist(FLAGS.input, undirected=True)
    print(" - Graph loaded.")
    return network
 def read_sentences_and_homogeneous_graph(self, filesentences=None, datafile=None):
     """Load an undirected graph and a file of pre-computed walks.

     :param filesentences: text file with one walk per line, tokens separated
         by single spaces.
     :param datafile: edge-list path passed to ``graph.load_edgelist``.
     :return: ``(graph, walks)`` where ``walks`` is a list of token lists.
     """
     network = graph.load_edgelist(datafile, undirected=True)
     with open(filesentences, "r") as sentence_file:
         sentences = [row.strip().split(" ") for row in sentence_file]
     return network, sentences
Exemple #4
0
def read_graph():
    """Load the undirected network stored at ``args.input`` and return it."""
    logging.info(" - Loading graph...")
    network = graph.load_edgelist(args.input, undirected=True)
    logging.info(" - Graph loaded.")
    return network
 def get_random_walks_restart(self, datafile, hits_dict, percentage, maxT, minT):
     """Load an undirected edge list and build a random-walk corpus over it.

     :param datafile: edge-list path; when ``None`` the file
         ``rating_train.dat`` under ``self.model_path`` is used.
     :param hits_dict: forwarded to ``graph.build_deepwalk_corpus_random``.
     :param percentage: forwarded to the corpus builder.
     :param maxT: forwarded to the corpus builder.
     :param minT: forwarded to the corpus builder.
     :return: ``(graph, walks)``.
     """
     source = (os.path.join(self.model_path, "rating_train.dat")
               if datafile is None else datafile)
     network = graph.load_edgelist(source, undirected=True)
     print("number of nodes: {}".format(len(network.nodes())))
     print("walking...")
     corpus = graph.build_deepwalk_corpus_random(
         network,
         hits_dict,
         percentage=percentage,
         maxT=maxT,
         minT=minT,
         alpha=0)
     print("walking...ok")
     return network, corpus
Exemple #6
0
def read_graph(args):
    """Load the input network described by ``args``.

    ``args.input``, ``args.directed`` and ``args.weighted`` are passed
    positionally to ``graph.load_edgelist``.

    :return: the ``(graph_dict, in_degrees, out_degrees)`` triple it yields.
    """
    logging.info(" - Loading graph...")
    loaded = graph.load_edgelist(args.input, args.directed, args.weighted)
    # Unpack explicitly so an unexpected result shape still fails here.
    adjacency, indeg, outdeg = loaded
    logging.info(" - Graph loaded.")
    return adjacency, indeg, outdeg
Exemple #7
0
    def graph2walks(self, method="", params=None):
        """Generate a walk corpus from ``self.graph`` and store it in ``self.corpus``.

        :param method: ``"deepwalk"`` or ``"node2vec"``.
        :param params: walk settings. Both methods read ``number_of_walks``
            and ``walk_length``; deepwalk also reads ``alpha``, node2vec reads
            ``p`` and ``q``. Defaults to a fresh empty dict.
        :return: the generated corpus (also kept as ``self.corpus``).
        :raises ValueError: if ``method`` is not one of the two supported names.
        """
        # A ``{}`` default would be one dict object shared by every call and
        # stored on every instance; create a new one per call instead.
        self.params = {} if params is None else params

        if method == "deepwalk":
            number_of_walks = self.params['number_of_walks']
            walk_length = self.params['walk_length']
            alpha = self.params['alpha']

            # deepwalk loads graphs from disk, so dump the edge list first.
            with open("./temp/graph.edgelist", 'w') as f:
                for line in nx.generate_edgelist(self.graph, data=False):
                    f.write("{}\n".format(line))

            dwg = deepwalk.load_edgelist("./temp/graph.edgelist",
                                         undirected=True)
            corpus = deepwalk.build_deepwalk_corpus(G=dwg,
                                                    num_paths=number_of_walks,
                                                    path_length=walk_length,
                                                    alpha=alpha,
                                                    rand=random.Random(0))

        elif method == "node2vec":

            number_of_walks = self.params['number_of_walks']
            walk_length = self.params['walk_length']
            p = self.params['p']
            q = self.params['q']

            # Every edge gets unit weight before the node2vec graph is built.
            for edge in self.graph.edges():
                self.graph[edge[0]][edge[1]]['weight'] = 1
            G = node2vec.Graph(nx_G=self.graph, p=p, q=q, is_directed=False)
            G.preprocess_transition_probs()
            corpus = G.simulate_walks(num_walks=number_of_walks,
                                      walk_length=walk_length)

        else:
            raise ValueError("Invalid method name!")

        self.corpus = corpus

        return self.corpus
Exemple #8
0
def process(args):
    """Load a graph, generate random walks and save the trained embeddings.

    The graph is read from ``args.input`` according to ``args.format``. When
    the estimated corpus size (nodes * walks per node * walk length) is below
    ``args.max_memory_data_size`` the walks are built in memory and fed to
    ``Word2Vec``. Otherwise they are written to disk and a ``Skipgram`` model
    is trained from the serialized files. The vectors are written to
    ``args.output`` in word2vec text format.

    :param args: parsed command-line options.
    :raises Exception: if ``args.format`` is not a supported format.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1,
                         workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size,
                                                                                                             args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # Use the degree distribution for frequency in the tree.
            # ``iterkeys()`` exists only on Python 2 mappings; ``iter(G)``
            # yields the same keys on both Python 2 and 3.
            vertex_counts = G.degree(nodes=iter(G))

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output, binary=False)
    print('saved!')
Exemple #9
0
def process(args):
    """Load a graph, generate random walks and save the trained embeddings.

    The graph is read from ``args.input`` according to ``args.format``. When
    the estimated corpus size (nodes * walks per node * walk length) is below
    ``args.max_memory_data_size`` the walks are built in memory and fed to
    ``Word2Vec``. Otherwise they are written to disk and a ``Skipgram`` model
    is trained from the combined walk files. The vectors are written to
    ``args.output`` in word2vec format.

    :param args: parsed command-line options.
    :raises Exception: if ``args.format`` is not a supported format.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # Use the degree distribution for frequency in the tree.
            # ``iterkeys()`` exists only on Python 2 mappings; ``iter(G)``
            # yields the same keys on both Python 2 and 3.
            vertex_counts = G.degree(nodes=iter(G))

        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    # Prefer the keyed-vectors object when the model exposes one (as the
    # sibling variants of this function do); fall back to the model itself
    # for gensim releases that predate ``model.wv``.
    getattr(model, "wv", model).save_word2vec_format(args.output)
Exemple #10
0
def read_graph(args):
    """Load the input network and persist its density value.

    ``graph.load_edgelist`` is called with ``args.input``, ``args.directed``
    and ``args.weighted``. The fourth element of its result is saved under
    the name ``"density"`` via ``save_variable_on_disk``.

    :return: ``(graph_dict, in_degrees, out_degrees, val_density)``.
    """
    from utils import save_variable_on_disk

    logging.info(" - Loading graph...")
    graph_dict, in_degrees, out_degrees, val_density = graph.load_edgelist(
        args.input, args.directed, args.weighted)
    save_variable_on_disk(val_density, "density")
    logging.info(" - Graph Density Save.")
    # The former read-back through ``restore_variable_from_disk`` was dropped:
    # that helper is not imported in this function and its result was unused.
    logging.info(" - Graph loaded.")
    return graph_dict, in_degrees, out_degrees, val_density
 def get_random_walks_restart_for_large_bipartite_graph_without_generating(self, datafile, hits_dict, percentage, maxT, minT, node_type='u'):
     """Load an undirected edge list and walk the nodes of one type.

     :param datafile: edge-list path; when ``None`` the file
         ``rating_train.dat`` under ``self.model_path`` is used.
     :param hits_dict: forwarded to the corpus builder.
     :param percentage: forwarded to the corpus builder.
     :param maxT: forwarded to the corpus builder.
     :param minT: forwarded to the corpus builder.
     :param node_type: nodes whose first element equals this value are
         counted for the progress message; also forwarded to the builder.
     :return: ``(graph, walks)``.
     """
     source = (os.path.join(self.model_path, "rating_train.dat")
               if datafile is None else datafile)
     network = graph.load_edgelist(source, undirected=True)
     matching = sum(1 for node in network.nodes() if node[0] == node_type)
     print("number of nodes: {}".format(matching))
     print("walking...")
     corpus = graph.build_deepwalk_corpus_random_for_large_bibartite_graph(
         network,
         hits_dict,
         percentage=percentage,
         maxT=maxT,
         minT=minT,
         alpha=0,
         node_type=node_type)
     print("walking...ok")
     return network, corpus
Exemple #12
0
def graph_walk_data(data_path=None):
    """Convert the train/valid/test graphs to edge lists and walk each of them.

    A vocabulary is built from ``train.txt``. All three ``*.txt`` files under
    ``data_path`` are converted to ``*.edgelist`` with that vocabulary, and
    the vocabulary is written to ``word2id.json``. Each edge list is then
    loaded and walked with 30 paths of length 50.

    :param data_path: directory holding ``train.txt``, ``valid.txt`` and
        ``test.txt``; the generated files are written there as well.
    :return: ``(train_walks, valid_walks, test_walks, vocabulary_size)``.
    """
    p_train_path = os.path.join(data_path, "train.txt")
    p_valid_path = os.path.join(data_path, "valid.txt")
    p_test_path = os.path.join(data_path, "test.txt")
    train_path = os.path.join(data_path, "train.edgelist")
    valid_path = os.path.join(data_path, "valid.edgelist")
    test_path = os.path.join(data_path, "test.edgelist")

    word2id = _build_vocab(p_train_path)
    _graph_to_edgelist(p_train_path, train_path, word2id)
    _graph_to_edgelist(p_valid_path, valid_path, word2id)
    _graph_to_edgelist(p_test_path, test_path, word2id)
    # Close the file deterministically so the JSON is flushed to disk.
    with open(os.path.join(data_path, "word2id.json"), 'w') as vocab_file:
        json.dump(word2id, vocab_file)

    G_train = graph.load_edgelist(train_path)
    G_valid = graph.load_edgelist(valid_path)
    G_test = graph.load_edgelist(test_path)

    train_walks = graph.build_deepwalk_corpus(G_train, list_exclud=[], num_paths=30, path_length=50)
    valid_walks = graph.build_deepwalk_corpus(G_valid, list_exclud=[], num_paths=30, path_length=50)
    test_walks = graph.build_deepwalk_corpus(G_test, list_exclud=[], num_paths=30, path_length=50)
    vocabulary = len(word2id)

    return train_walks, valid_walks, test_walks, vocabulary
Exemple #13
0
def process(args):
    """Load a graph, dump its random walks to disk and report the elapsed time.

    The graph is read from ``args.input`` according to ``args.format``. The
    walks are written by ``serialized_walks.write_walks_to_disk`` using the
    file base ``args.output + ".txt"``.

    :param args: parsed command-line options.
    :raises Exception: if ``args.format`` is not a supported format.
    """
    input_format = args.format
    if input_format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    elif input_format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif input_format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    else:
        message = ("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
                   % args.format)
        raise Exception(message)

    print("Number of nodes: {}".format(len(G.nodes())))

    walk_total = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(walk_total))

    corpus_size = walk_total * args.walk_length
    print("Data size (walks*length): {}".format(corpus_size))

    print("Walking...")

    started = time.time()
    # The returned list of walk files is not needed here; only the files
    # written to disk and the time taken matter.
    serialized_walks.write_walks_to_disk(
        G,
        args.output + ".txt",
        num_paths=args.number_walks,
        path_length=args.walk_length,
        alpha=0,
        rand=random.Random(args.seed),
        num_workers=args.workers)
    elapsed = time.time() - started

    print("--------- walking time: {:.5f} -----------".format(elapsed))
Exemple #14
0
def graph_walk_data(data_path=None):
    """Convert the training graph to an edge list and walk it.

    A vocabulary is built from ``train.txt`` under ``data_path``. The graph
    is converted to ``train.edgelist`` and the vocabulary is exported to
    ``word2id.json``. The walk count and length come from
    ``RWConf["num_paths"]`` and ``RWConf["path_length"]``.

    :param data_path: directory holding ``train.txt``; generated files are
        written there as well.
    :return: ``(train_walks, vocabulary_size)``.
    """
    p_train_path = os.path.join(data_path, "train.txt")
    train_path = os.path.join(data_path, "train.edgelist")

    word2id = _build_vocab(p_train_path)
    _graph_to_edgelist(p_train_path, train_path, word2id)

    print("export word2id file....")
    # Close the file before announcing success so the JSON is really on disk.
    with open(os.path.join(data_path, "word2id.json"), 'w') as vocab_file:
        json.dump(word2id, vocab_file)
    print("word2id file exported.")

    G_train = graph.load_edgelist(train_path)
    train_walks = graph.build_deepwalk_corpus(G_train, list_exclud=[], num_paths=RWConf["num_paths"], path_length=RWConf["path_length"])

    vocabulary = len(word2id)

    return train_walks, vocabulary
Exemple #15
0
    def perform_random_walks(self, output_node_corpus_file):
        """Generate a walk corpus from ``self.graph`` and save it.

        ``self.params['random_walk']`` selects the generator (``"deepwalk"``
        or ``"node2vec"``). The corpus is stored in ``self.corpus`` and
        written through ``self.save_corpus``.

        :param output_node_corpus_file: destination passed to ``save_corpus``.
        :raises ValueError: if a required parameter is missing, the graph is
            ``None`` or the walk method is unknown.
        """
        # ``('a' and 'b') in d`` only tests 'b'; check every key explicitly.
        required = ('number_of_walks', 'walk_length')
        if any(key not in self.params for key in required) or self.graph is None:
            raise ValueError("Missing parameter !")

        self.number_of_nodes = self.graph.number_of_nodes()
        self.N = self.number_of_nodes * self.params['number_of_walks']
        self.L = self.params['walk_length']

        initial_time = time.time()
        # Generate a corpus

        if self.params['random_walk'] == "deepwalk":
            if 'dw_alpha' not in self.params:
                raise ValueError("A parameter is missing!")

            # deepwalk loads graphs from disk, so dump the edge list first.
            edgelist_path = os.path.join(self.temp_folder, "graph_deepwalk.edgelist")
            with open(edgelist_path, 'w') as f:
                for line in nx.generate_edgelist(self.graph, data=False):
                    f.write("{}\n".format(line))

            dwg = deepwalk.load_edgelist(edgelist_path, undirected=True)
            self.corpus = deepwalk.build_deepwalk_corpus(G=dwg, num_paths=self.params['number_of_walks'],
                                                         path_length=self.params['walk_length'],
                                                         alpha=self.params['dw_alpha'])

        elif self.params['random_walk'] == "node2vec":

            if any(key not in self.params for key in ('n2v_p', 'n2v_q')):
                raise ValueError("A missing parameter exists!")

            # Every edge gets unit weight before the node2vec graph is built.
            for edge in self.graph.edges():
                self.graph[edge[0]][edge[1]]['weight'] = 1
            G = node2vec.Graph(nx_G=self.graph, p=self.params['n2v_p'], q=self.params['n2v_q'], is_directed=False)
            G.preprocess_transition_probs()
            self.corpus = G.simulate_walks(num_walks=self.params['number_of_walks'],
                                           walk_length=self.params['walk_length'])

        else:
            raise ValueError("Invalid method name!")

        self.save_corpus(output_node_corpus_file, with_title=False)

        print("The corpus was generated in {:.2f} secs.".format(time.time() - initial_time))
Exemple #16
0
    def get_random_walks_restart_for_large_bipartite_graph_without_generating(self, datafile, hits_dict, percentage, maxT, minT, node_type='u'):
        """Load an undirected edge list and walk the nodes of one type.

        :param datafile: edge-list path; when ``None`` the file
            ``rating_train.dat`` under ``self.model_path`` is used.
        :param hits_dict: forwarded to the corpus builder.
        :param percentage: forwarded to the corpus builder.
        :param maxT: forwarded to the corpus builder.
        :param minT: forwarded to the corpus builder.
        :param node_type: nodes whose first element equals this value are
            counted for the progress message; also forwarded to the builder.
        :return: ``(graph, walks)``.
        """
        source = datafile
        if source is None:
            source = os.path.join(self.model_path, "rating_train.dat")

        # TODO 8: change the data file so that gene ids start with 'u' and
        # disease ids start with 'i'.
        network = graph.load_edgelist(source, undirected=True)
        matching = len([node for node in network.nodes() if node[0] == node_type])
        print("number of nodes: {}".format(matching))
        print("walking...")

        # TODO 5: hits_dict arrives empty because authority_u / authority_v
        # are empty, and the resulting walks are empty too. Clarify what the
        # authority values are meant to provide.
        walk_options = dict(percentage=percentage,
                            maxT=maxT,
                            minT=minT,
                            alpha=0,
                            node_type=node_type)
        corpus = graph.build_deepwalk_corpus_random_for_large_bibartite_graph(
            network, hits_dict, **walk_options)
        print("walking...ok")
        return network, corpus
Exemple #17
0
def process(args):
    """Load a graph, walk it while skipping excluded nodes and save embeddings.

    The graph is read from ``args.input`` according to ``args.format``. If
    ``args.excludlist`` names an existing file, it is read as one integer
    node id per line and those ids are passed to the walk generators. When
    the estimated corpus size is below ``args.max_memory_data_size`` the
    walks are built in memory and fed to ``Word2Vec``. Otherwise they are
    written to disk and a ``Skipgram`` model is trained from the combined
    walk files. The vectors are written to ``args.output``.

    :param args: parsed command-line options.
    :raises Exception: if ``args.format`` is not a supported format.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    # str() keeps an unset (None) option from raising inside isfile(); it
    # simply names a file that does not exist.
    if os.path.isfile(str(args.excludlist)):
        # Close the file deterministically and tolerate blank lines, which
        # int() would otherwise reject.
        with open(args.excludlist) as exclude_file:
            list_exclud = {int(line) for line in exclude_file if line.strip()}
        num_exlud = len(list_exclud)
    else:
        num_exlud = 0
        list_exclud = []
    if num_exlud > 0:
        print("Number of nodes excluded from the walk: {}".format(num_exlud))

    num_walks = (len(G.nodes()) - num_exlud) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            list_exclud=list_exclud,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         workers=args.workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            list_exclud,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # Use the degree distribution for frequency in the tree.
            # ``iterkeys()`` exists only on Python 2 mappings; ``iter(G)``
            # yields the same keys on both Python 2 and 3.
            vertex_counts = G.degree(nodes=iter(G))

        print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=args.representation_size,
            window=args.window_size,
            min_count=0,
            workers=args.workers)

    model.wv.save_word2vec_format(args.output)
Exemple #18
0
def process(args):
    """Load a graph, optionally weight or dump it, then walk it and save embeddings.

    Steps, in order:

    1. Read the graph from ``args.input`` according to ``args.format``.
    2. If ``args.heuristic_wrb_for_wbr`` is set, print the result of
       ``graph.compute_heuristic_wrb`` and return.
    3. If ``args.weighted`` names a weighting method, apply it with
       ``graph.set_weights``.
    4. If ``args.just_write_graph`` is set, write the weighted edges to
       ``wgraph.out`` and return.
    5. Otherwise generate walks (in memory or on disk, depending on
       ``args.max_memory_data_size``), train the model and write the vectors
       to ``args.output``.

    :param args: parsed command-line options.
    :raises Exception: for an unknown file format, or when
        ``just_write_graph`` is requested for an unsupported weighting method.
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected, attr_file_name=args.sensitive_attr_file,
                                test_links_ratio=args.test_links, test_links_file=args.test_links_file,
                                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return

    # An unset weighting option is treated the same as 'unweighted'.
    unweighted = args.weighted is None or args.weighted == 'unweighted'

    if not unweighted:
        G = graph.set_weights(G, args.weighted)

    if args.just_write_graph:
        with open('wgraph.out', 'w') as fout:
            if unweighted:
                for v in G:
                    s = len(G[v])
                    for u in G[v]:
                        # 1.0 / s keeps true division on Python 2 as well.
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1.0 / s) + '\n')
            elif args.weighted.startswith('random_walk'):
                for v in G:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
            else:
                raise Exception('just-write-graph is not supported for this weighting method')
        return None

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, p_modified=args.pmodified,
                                                          alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # Use the degree distribution for frequency in the tree.
            # ``iterkeys()`` exists only on Python 2 mappings; ``iter(G)``
            # yields the same keys on both Python 2 and 3.
            vertex_counts = G.degree(nodes=iter(G))

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
Exemple #19
0
# Embedding dimension and number of Word2Vec worker threads.
dw_params['d'] = 128
dw_params['workers'] = 3

# Generate a synthetic graph from the LFR parameters.
rg = randomgraph.RanGraphGen()
rg.set_model(model=lfr_params)
g = rg.lfr_model()

# Keep a GML copy of the generated graph.
graph_path = "./outputs/lfr_synthetic_n1000.gml"
nx.write_gml(g, graph_path)

# Find the embedding of the generated graph.
# NOTE: despite its name, this temporary file holds an edge list
# (written with nx.write_edgelist and read back with load_edgelist).
temp_adjlist_file = "./temp/graph.adjlist"
embedding_file = "./outputs/output.embedding"
nx.write_edgelist(g, temp_adjlist_file)

# Build the walk corpus and train the embedding on it.
dwg = dw.load_edgelist(temp_adjlist_file, undirected=True)
walks = dw.build_deepwalk_corpus(dwg,
                                 num_paths=dw_params['n'],
                                 path_length=dw_params['l'],
                                 alpha=0)
model = Word2Vec(walks,
                 size=dw_params['d'],
                 window=dw_params['w'],
                 min_count=0,
                 sg=1,
                 hs=1,
                 workers=dw_params['workers'])
model.wv.save_word2vec_format(embedding_file)

comdetect = CommunityDetection(embedding_file,
                               graph_path,
Exemple #20
0
def main(graph_fname, node_vec_fname, options):
    '''\
    %prog [options] <graph_fname> <node_vec_fname> <path_vec_fname>

    graph_fname: the graph file
        It can be a file contained edges per line (e.g., res/karate_club_edges.txt)
        or a pickled graph file.
    node_vec_fname: the output file for nodes' vectors
    '''
    # NOTE: the docstring above is left byte-for-byte unchanged because it
    # looks like a usage template (``%prog``) that may be read at runtime.
    #
    # Flow: load the undirected graph, generate random walks, dump them to a
    # JSON file and a text file, train an MP2Vec model and write the node
    # vectors to ``node_vec_fname``. Returns 0.
    #
    # The bare ``print '...'`` statements were rewritten as calls: with a
    # single argument they print the same text on Python 2 and no longer
    # break parsing on Python 3.

    print('Load a road Graph...')
    # g = loader.load_a_HIN(graph_fname)
    G = graph.load_edgelist(graph_fname, undirected=True)
    print('Generate random walks...')

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * options.walk_num

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * options.walk_length

    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=options.walk_num,
                                        path_length=options.walk_length,
                                        alpha=0,
                                        rand=random.Random(0))

    tmp_walk_fname = "tmp_walk_fname.txt"
    tmp_walk_json = "tmp_walk_fname.json"

    # Keep the walks both as JSON and as one space-separated walk per line.
    with open(tmp_walk_json, 'w+') as tmp_walks:
        tmp_walks.write(json.dumps(walks))

    with open(tmp_walk_fname, 'w') as f:
        for walk in walks:
            f.write('%s\n' % ' '.join(map(str, walk)))

    print("Walking done...")

    model = MP2Vec(
        size=options.dim,
        window=options.window,
        neg=options.neg,
        num_processes=options.num_processes,
        alpha=options.alpha,
        same_w=True,
        normed=False,
    )

    neighbors = None  # {node_osmid: [<node_osmid>, <node_osmid>, ...]}
    if options.correct_neg:
        # Fill G.k_hop_neighbors for every node before reading it below.
        for id_ in G:
            G._get_k_hop_neighborhood(id_, options.window)

        neighbors = G.k_hop_neighbors[options.window]

    model.train(G, walks, k_hop_neighbors=neighbors)

    print('Dump vectors...')
    model.dump_to_file(node_vec_fname, type_='node')
    return 0
Exemple #21
0
def read_graph(args):
    """Return the undirected network loaded from ``args.input``."""
    return graph.load_edgelist(args.input, undirected=True)
Exemple #22
0
def deepwalk_get_feature(args, adj_indices, result_path):
    """Train (or reload) a DeepWalk embedding for the given edge list.

    If ``result_path + '.model'`` already exists it is loaded and returned.
    Otherwise the graph is loaded from ``adj_indices``, walks are generated
    (in memory or on disk, depending on ``args.max_memory_data_size``), and a
    model is trained. The vectors are then saved to
    ``result_path + '.feature'`` and the model to ``result_path + '.model'``.

    :param args: options object providing the walk and training settings.
    :param adj_indices: value passed to ``graph.load_edgelist``.
    :param result_path: path prefix for the saved model and feature files.
    :return: the trained or reloaded model, or ``[]`` when the graph has
        fewer than 10 entries.
    """
    model_path = result_path + '.model'
    if os.path.exists(model_path):
        return Word2Vec.load(model_path)
    G = graph.load_edgelist(adj_indices, undirected=args.undirected)

    print(G)
    if len(G) < 10:
        # Too few nodes to produce useful random walks.
        print('输出随机游走点太少')
        return []
    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.dataset + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # Use the degree distribution for frequency in the tree.
            # ``iterkeys()`` exists only on Python 2 mappings; ``iter(G)``
            # yields the same keys on both Python 2 and 3.
            vertex_counts = G.degree(nodes=iter(G))

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         trim_rule=None,
                         workers=args.workers)

    model.wv.save_word2vec_format(result_path + '.feature')
    model.save(model_path)
    return model