Beispiel #1
0
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1,
                         workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size,
                                                                                                             args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output, binary=False)
    print('saved!')
Beispiel #2
0
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected, attr_file_name=args.sensitive_attr_file, 
                test_links_ratio=args.test_links, test_links_file=args.test_links_file,
                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return


    if (args.weighted is not None) and (args.weighted != 'unweighted'):
      G = graph.set_weights(G, args.weighted)

    if args.just_write_graph:
        with open('wgraph.out', 'w') as fout:
            if args.weighted == 'unweighted':
                for v in G:
                    s = len(G[v])
                    for u in G[v]:
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1/s) + '\n')
            elif args.weighted.startswith('random_walk'):
                for v in G:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
            else:
                raise Exception('just-write-graph is not supported for this weighting method')
        return None




    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                             path_length=args.walk_length, p_modified=args.pmodified,
                                             alpha=0, rand=random.Random(args.seed),
                                             num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
          vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
          # use degree distribution for frequency in tree
          vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
Beispiel #3
0
def deepwalk_get_feature(args, adj_indices, result_path):
    model_path = result_path + '.model'
    if os.path.exists(model_path):
        return Word2Vec.load(model_path)
    G = graph.load_edgelist(adj_indices, undirected=args.undirected)

    print(G)
    if len(G) < 10:
        print('输出随机游走点太少')
        return []
    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.dataset + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         trim_rule=None,
                         workers=args.workers)

    model.wv.save_word2vec_format(result_path + '.feature')
    model.save(model_path)
    return model
Beispiel #4
0
    def getEmbeddings(self, relationships):

        G = graph.load_py4jclient(relationships)

        print("Number of nodes: {}".format(len(G.nodes())))

        num_walks = len(G.nodes()) * self.args.number_walks

        print("Number of walks: {}".format(num_walks))

        data_size = num_walks * self.args.walk_length

        print("Data size (walks*length): {}".format(data_size))

        if data_size < self.args.max_memory_data_size:
            print("Walking...")
            walks = graph.build_deepwalk_corpus(
                G,
                num_paths=self.args.number_walks,
                path_length=self.args.walk_length,
                alpha=0,
                rand=random.Random(self.args.seed))
            print("Training...")
            model = Word2Vec(walks,
                             size=self.args.representation_size,
                             window=self.args.window_size,
                             min_count=0,
                             sg=1,
                             hs=1,
                             workers=self.args.workers)
        else:
            print(
                "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
                .format(data_size, self.args.max_memory_data_size))
            print("Walking...")

            walks_filebase = self.args.output + ".walks"
            walk_files = serialized_walks.write_walks_to_disk(
                G,
                walks_filebase,
                num_paths=self.args.number_walks,
                path_length=self.args.walk_length,
                alpha=0,
                rand=random.Random(self.args.seed),
                num_workers=self.args.workers)

            print("Counting vertex frequency...")
            if not self.args.vertex_freq_degree:
                vertex_counts = serialized_walks.count_textfiles(
                    walk_files, self.args.workers)
            else:
                # use degree distribution for frequency in tree
                vertex_counts = G.degree(nodes=G.iterkeys())

            print("Training...")
            walks_corpus = serialized_walks.WalksCorpus(walk_files)
            model = Skipgram(sentences=walks_corpus,
                             vocabulary_counts=vertex_counts,
                             size=self.args.representation_size,
                             window=self.args.window_size,
                             min_count=0,
                             trim_rule=None,
                             workers=self.args.workers)

        # to_return = {}
        # for word, vec in zip(model.wv.vocab, model.wv.vectors):
        #   to_return[word] = " ".join([for str(x) in vec])
        to_return = ""
        for word, vec in zip(model.wv.vocab, model.wv.vectors):
            vector_str = " ".join([str(x) for x in vec])
            to_return = to_return + word + "\t" + vector_str + "\n"

        print(to_return)
        # from py4j.java_collections import SetConverter, MapConverter, ListConverter
        # to_return = MapConverter().convert(to_return, client)
        # to_return = D()
        # for word, vec in zip(model.wv.vocab, model.wv.vectors):
        #   to_return.word = str(vec)

        return to_return