Example 1
    def load_graph(self, input_address, output_name="g1_out.embeddings", number_walks=10, walk_length=40,
                   max_memory_data_size=1000000000, matfile_variable_name="network", format='adjlist', undirected=True,
                   representation_size=16, workers=1, window_size=5, vertex_freq_degree=False, seed=0):
        if format == "adjlist":
            G = graph.load_adjacencylist(input_address, undirected=undirected)
        elif format == "edgelist":
            G = graph.load_edgelist(input_address, undirected=undirected)
        elif format == "mat":
            G = graph.load_matfile(input_address, variable_name=matfile_variable_name, undirected=undirected)
        else:
            raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % format)

        print("Number of nodes: {}".format(len(G.nodes())))

        num_walks = len(G.nodes()) * number_walks

        print("Number of walks: {}".format(num_walks))

        data_size = num_walks * walk_length

        print("Data size (walks*length): {}".format(data_size))

        if data_size < max_memory_data_size:
            print("Walking...")
            walks = graph.build_deepwalk_corpus(G, num_paths=number_walks,
                                                path_length=walk_length, alpha=0, rand=random.Random(seed))
            print("Training...")
            model = Word2Vec(walks, size=representation_size, window=window_size, min_count=0, sg=1, hs=1,
                             workers=workers)
        else:
            print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(
                data_size,
                max_memory_data_size))
            print("Walking...")

            walks_filebase = output_name + ".walks"
            walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=number_walks,
                                                              path_length=walk_length, alpha=0,
                                                              rand=random.Random(seed),
                                                              num_workers=workers)

            print("Counting vertex frequency...")
            if not vertex_freq_degree:
                vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
            else:
                # use degree distribution for frequency in tree
                vertex_counts = G.degree(nodes=G.iterkeys())

            print("Training...")
            walks_corpus = serialized_walks.WalksCorpus(walk_files)
            model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                             size=representation_size,
                             window=window_size, min_count=0, trim_rule=None, workers=workers)

        model.wv.save_word2vec_format("./dataset/{}".format(output_name))
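Since self is never referenced in the body, the method can be smoke-tested as a plain function once lifted out of its class; a minimal sketch, assuming the DeepWalk example graphs are on disk and that a ./dataset/ directory already exists (save_word2vec_format will not create it):

load_graph(None, "example_graphs/karate.adjlist",
           output_name="karate.embeddings",
           number_walks=10, walk_length=40)
# writes ./dataset/karate.embeddings in word2vec text format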
Example 2

def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    walks_filebase = args.output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(
        G,
        walks_filebase,
        num_paths=args.number_walks,
        path_length=args.walk_length,
        alpha=0,
        rand=random.Random(args.seed),
        num_workers=args.workers)
    walks = serialized_walks.WalksCorpus(walk_files)

    print("Training...")
    model = Word2Vec(walks,
                     size=args.representation_size,
                     window=args.window_size,
                     min_count=0,
                     sg=1,
                     hs=1,
                     workers=args.workers)

    model.wv.save_word2vec_format(args.output)
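For completeness, a sketch of the argparse namespace this function expects; the flag names mirror the DeepWalk CLI, but the defaults here are illustrative assumptions:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--format", default="adjlist")
parser.add_argument("--input", required=True)
parser.add_argument("--matfile-variable-name", default="network")
parser.add_argument("--undirected", default=True,
                    type=lambda s: s.lower() == "true")  # bool("False") is True, so parse explicitly
parser.add_argument("--number-walks", default=10, type=int)
parser.add_argument("--walk-length", default=40, type=int)
parser.add_argument("--representation-size", default=64, type=int)
parser.add_argument("--window-size", default=5, type=int)
parser.add_argument("--seed", default=0, type=int)
parser.add_argument("--workers", default=1, type=int)
parser.add_argument("--output", required=True)

process(parser.parse_args())  # argparse maps --number-walks to args.number_walks, etc.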
Example 3
def run_dw(matrix,
           num_walks=100,
           walk_length=5,
           representation_size=32,
           window_size=2,
           undirected=True,
           seed=0,
           workers=1):
    random.seed(seed)
    np.random.seed(seed)
    adj_list = []
    for n, edges in enumerate(matrix):
        adj_list.append([n] + edges.nonzero()[0].tolist())

    print(adj_list)

    G = graph.from_adjlist(adj_list)
    if undirected:
        G.make_undirected()

    print("Number of nodes: {}".format(len(G.nodes())))
    # Keep the per-node walk count intact; num_paths below must receive the
    # number of walks per node, not the inflated total.
    total_walks = len(G.nodes()) * num_walks

    print("Number of walks: {}".format(total_walks))

    data_size = total_walks * walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < 1000000000:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=num_walks,
                                            path_length=walk_length,
                                            alpha=0,
                                            rand=random.Random(seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=representation_size,
                         window=window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, 1000000000))
        print("Walking...")

        # NB: the stringified adjacency list is used as the file-name base,
        # which is fragile (it can exceed OS file-name limits on larger graphs).
        walks_filebase = str(adj_list) + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=num_walks,
            path_length=walk_length,
            alpha=0,
            rand=random.Random(seed),
            num_workers=workers)

        print("Counting vertex frequency...")
        #if not args.vertex_freq_degree:
        vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
        #else:
        #  # use degree distribution for frequency in tree
        #  vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=representation_size,
                         window=window_size,
                         min_count=0,
                         trim_rule=None,
                         workers=workers,
                         seed=seed)

    embeddings = np.zeros((len(G.nodes()), representation_size))

    for i in range(len(G.nodes())):
        embeddings[i] = model.wv.get_vector(str(i))

    return embeddings
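A quick smoke test on a made-up 4-node path graph (numpy assumed; the loop above prepends node n to its own adjacency row, which graph.from_adjlist expects):

import numpy as np

adj = np.array([[0, 1, 0, 0],
                [1, 0, 1, 0],
                [0, 1, 0, 1],
                [0, 0, 1, 0]])

emb = run_dw(adj, num_walks=10, walk_length=5, representation_size=8)
print(emb.shape)  # (4, 8): one 8-dimensional embedding per node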
Example 4
def process(params, save=True):
    """
    :param params:  传入参数用于训练
    :param save:   是否保存 训练的数据
    :return:
    """
    if params["format"] == "adjlist":
        G = graph.load_adjacencylist(params["input"],
                                     undirected=params["undirected"])
    elif params["format"] == "edgelist":
        G = graph.load_edgelist(params["input"],
                                undirected=params["undirected"])
    elif params["format"] == "mat":
        G = graph.load_matfile(params["input"],
                               variable_name=params["matfile_variable_name"],
                               undirected=params["undirected"])
    else:
        print("输入格式有误,当前输入格式为 %s" % (params["format"]))
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', "
            "mat" % params["format"])
    print("Number of node :{}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * params["number_walks"]

    print("Number of walks:{}".format(num_walks))

    data_size = num_walks * params["walk_length"]

    print("Data size (walks*length):{}".format(data_size))

    if data_size < params["max_memory_data_size"]:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(
            G,
            num_paths=params.get("number_walks", 10),
            path_length=params.get("walk_length", 40),
            alpha=params.get("alpha", 0),
            rand=random.Random(params.get("seed", 0)))

        print("Training...")
        model = Word2Vec(walks,
                         size=params.get("representation_size", 64),
                         window=params.get("window_siz", 5),
                         min_count=params.get("min_count", 0),
                         sg=params.get("sg", 1),
                         hs=params.get("hs", 1),
                         workers=params.get("workers", 1))
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, params.get("max_memory_data_size")))

        print("Walking...")

        walks_filebase = params["output"] + ".walks"
        walks_files = wk.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=params.get("number_walks", 10),
            path_length=params.get("walk_length", 40),
            alpha=params.get("alpha", 0),
            rand=random.Random(params.get("seed", 0)),
            num_workers=params.get("workers", 1))

        print("Counting vertex frequecy...")  # 统计节点频次

        if params["vertex_freq_degree"]:
            vertex_counts = wk.count_textfiles(walks_files, params["workers"])

        else:
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")

        walks_corpus = wk.WalksCorpus(walks_files)  # the corpus of walks

        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=params.get("representation_size"),
                         window=params.get("windows_size", 80),
                         min_count=params.get("min_count", 0),
                         trim_rule=params.get("trim_rule", None),
                         workers=params.get("workers", 8))
    if save:
        model.wv.save_word2vec_format(params["output"])  # save the embeddings
    else:
        models = model.wv.load_word2vec_format(params["output"])  # load previously saved embeddings
        return models
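A minimal params dictionary for the in-memory path, assembled from the keys the function reads; the values are illustrative defaults rather than anything from the original:

params = {
    "format": "adjlist",
    "input": "example_graphs/karate.adjlist",
    "undirected": True,
    "matfile_variable_name": "network",
    "number_walks": 10,
    "walk_length": 40,
    "max_memory_data_size": 1000000000,
    "representation_size": 64,
    "window_size": 5,
    "vertex_freq_degree": False,
    "output": "karate.embeddings",
    "workers": 1,
    "seed": 0,
}
process(params, save=True)  # trains and writes karate.embeddings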
Example 5

def process(args):
    """Section1:
      deepwalk的输入可以是三种格式的数据,因此针对这三种格式的数据分别构建图类对象G
      我们这里主要关心的是karae.adjlist和p2p-Gnutella08.edgelist两个文件对应的格式,
      这两个文件都在文件夹example_graphs里面
      对于.mat格式我们不关心,也不使用
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)
    """Section2:
    下面打印的都是一些基本信息
    """
    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))
    """Section3:
      这里的data_size不是实际的内存大小,只是节点使用总次数的计数。
      在这个if-else代码段中,更常出现的情况是if,而else里的代码几乎从来不会出现,除非data_size超过了max_memory_data_size
      max_memory_data_size默认的是10亿次。
    """
    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        """
        这里直接调用图对象里的build_deepwalk_corpus,它的主要作用是,对每一个点进行多次随机游走,最后的到全部的walks。
        相当于deepwalk论文里两层for循环randomwalk,
        也相当于node2vec里两层for循环node2vecWalk,
        也相当于你算法里的两层for循环下的adawalk
        num_paths=args.number_walks:是采样的轮数,即每一个起点采样的次数,相当于你论文里的$\gamma$
        path_length=args.walk_length, 
        alpha=0, 指的是以概率1-alpha从当前节点继续走下去(可以回到上一个节点),或者以alpha的概率停止继续走,回到路径的起点重新走。
                 请注意:这里的随机游走路径未必是连续的,有可能是走着走着突然回到起点接着走
                 即便alpha=0,也不能保证以后的路径不会出现起点,因为有可能从起点开始走到第二个点时,这第二个点以1的概率继续走下去,
                 这时候它可能会回到起点。
                 但是如果alpha=1,那么就意味着它不可能继续走下去,只能每次都从起点开始重新走,所以这个时候打印出来的路径序列是一连串的起点。
        rand=random.Random(args.seed)
        """

        print("Training...")

        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
        """
        这段代码几乎不需要改动,它就是你算法里的SkipGram,你算发里有三个输入:
        random walks---->walks
        context size---->window=args.window_size
        embedding size---->size=args.representation_size
        如果你想对这个模型了解的更多,或者是做更加人性化的定制,需要继续了解from gensim.models import Word2Vec下的源码。
        我们会在下一篇博文中专门解读这个源码
        Word2Vec:

        Initialize the model from an iterable of `sentences`. Each sentence is a
        list of words (unicode strings) that will be used for training.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
            If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
            in some other way.

        sg : int {1, 0}
            Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used.
        size : int
            Dimensionality of the feature vectors.
        window : int
            The maximum distance between the current and predicted word within a sentence.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        seed : int
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        min_count : int
            Ignores all words with total frequency lower than this.
        max_vocab_size : int
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        workers : int
            Use these many worker threads to train the model (=faster training with multicore machines).
        hs : int {1,0}
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        cbow_mean : int {1,0}
            If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
        hashfxn : function
            Hash function to use to randomly initialize weights, for increased training reproducibility.
        iter : int
            Number of iterations (epochs) over the corpus.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        sorted_vocab : int {1,0}
            If 1, sort the vocabulary by descending frequency before assigning word indexes.
        batch_words : int
            Target size (in words) for batches of examples passed to worker threads (and
            thus cython routines).(Larger batches will be passed if individual
            texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
        compute_loss: bool
            If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        Initialize and train a `Word2Vec` model

        >>> from gensim.models import Word2Vec
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = Word2Vec(sentences, min_count=1)
        >>> say_vector = model['say']  # get vector for word
        """

    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")
        """
        在这种情况下,游走路径会被存入到一系列文件output.walks.x中,即如果你在命令行指定了--output file_path
        那么程序结束后会出现两个文件,一个是file_path,一个是file_path.walks.0,file_path.walks.1,...file_path.walks.n。
        file_path存的是各个节点的embedding,
        output.walks.x存的是采样的路径,x表示这个文件是第x个处理器存入的
        x的个数与处理器个数相同
        """
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)
        """
        在目前的层次上,你只需要知道serialized_walks.write_walks_to_disk本质上也是在调用graph里的randwalk,只不过包装一下
        加入了并行化代码和写到磁盘的程序。
        这个函数具体的细节需要稍后剖析另一个文件walk.py
        函数里各个参数的含义可以参考if语句里面的代码
        """

        # print("Counting vertex frequency...")
        # if not args.vertex_freq_degree:
        #     """Use vertex degree to estimate the frequency of nodes
        #     in the random walks. This option is faster than
        #     calculating the vocabulary."""
        #     vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        # else:
        #     # use degree distribution for frequency in tree
        #     vertex_counts = G.degree(nodes=G.iterkeys())
        #
        # print("Training...")
        # walks_corpus = serialized_walks.WalksCorpus(walk_files)
        # model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
        #                  size=args.representation_size,
        #                  window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
        """
        上面注释掉的代码是原来的代码,这里没有使用Word2Vec而是使用了自己编写的Skipgram,
        但是Skipgram仅仅是对Word2Vec的输入参数做了一些保装,里面仍然在调用word2vec,
        word2vec本身也可以实现并行化运算,不知道这里为什么要这么做。
        需要说明的是:从现有的情况类看vertex_counts似乎并没什么卵用
        直接向下面这样写就可以:         
        """
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Word2Vec(sentences=walks_corpus,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
    """Section?:
      训练完成之后,将模型输出
    """
    model.wv.save_word2vec_format(args.output)
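Once saved, the embedding file can be read back with gensim's KeyedVectors; a short sketch (the file name is whatever was passed as args.output):

from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format("karate.embeddings")
print(vectors["1"])  # vector for node 1; node IDs are stored as strings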
Example 6
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))

        print(walks)
        print("Training...")
        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         trim_rule=None,
                         workers=args.workers)

    model.wv.save_word2vec_format(args.output)
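The Skipgram used in the else branch is DeepWalk's thin wrapper around Word2Vec; a sketch paraphrased from the upstream deepwalk/skipgram.py (treat the details as approximate and check your installed version):

from gensim.models import Word2Vec

class Skipgram(Word2Vec):
    """Word2Vec with skip-gram and hierarchical softmax forced on, plus an
    optional precomputed vocabulary count carried alongside the model."""

    def __init__(self, vocabulary_counts=None, **kwargs):
        self.vocabulary_counts = vocabulary_counts
        kwargs["sg"] = 1  # skip-gram
        kwargs["hs"] = 1  # hierarchical softmax
        super(Skipgram, self).__init__(**kwargs)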