def generate_vocab(walks):
    """Build a frequency-ranked vocabulary from a collection of walks.

    Args:
        walks: Iterable of walks; each walk is an iterable of hashable
            words (nodes).

    Returns:
        A ``(vocab, index2word)`` tuple. ``vocab`` maps every word to a
        ``Vocab`` record carrying its occurrence ``count`` and its ``index``
        into ``index2word``. ``index2word`` lists the words from most to
        least frequent; words with equal counts keep first-seen order
        because the sort is stable.
    """
    # Tally how often each word occurs across all walks.
    frequencies = defaultdict(int)
    for token in (w for path in walks for w in path):
        frequencies[token] += 1

    # Register each word with a provisional index (its first-seen position).
    vocab = {}
    index2word = []
    for position, (token, occurrences) in enumerate(iteritems(frequencies)):
        vocab[token] = Vocab(count=occurrences, index=position)
        index2word.append(token)

    # Rank by descending count, then rewrite the indices to match the ranking.
    index2word.sort(key=lambda token: vocab[token].count, reverse=True)
    for rank, token in enumerate(index2word):
        vocab[token].index = rank
    return vocab, index2word
def generate_vocab(all_walks):
    """Build a frequency-ranked vocabulary from several groups of walks.

    Args:
        all_walks: Iterable of walk groups (one group per layer, according
            to the original comments); each group is an iterable of walks
            and each walk an iterable of hashable words (nodes).

    Returns:
        A ``(vocab, index2word)`` tuple. ``vocab`` maps each word to a
        ``Vocab`` record holding its ``count`` and its ``index`` into
        ``index2word``; ``index2word`` is ordered by descending count.
    """
    # Count how many times each word (node) appears in any walk.
    counts = defaultdict(int)
    for layer_walks in all_walks:  # visit the walks of every layer
        for path in layer_walks:
            for node in path:
                counts[node] += 1

    # Build the vocabulary, giving every word a provisional id.
    vocab = {}
    ordering = []
    for node, freq in iteritems(counts):
        entry = Vocab(count=freq, index=len(ordering))
        vocab[node] = entry
        ordering.append(node)

    def _count_of(node):
        return vocab[node].count

    # Order the words from the most to the least frequent.
    index2word = sorted(ordering, key=_count_of, reverse=True)

    # Renumber the vocabulary entries so they follow the sorted order.
    for new_index, node in enumerate(index2word):
        vocab[node].index = new_index
    return vocab, index2word
def generate_vocab(all_walks):
    """Count the words in nested walks and rank them by frequency.

    Args:
        all_walks: Iterable of walk collections; each collection is an
            iterable of walks and each walk an iterable of hashable words.

    Returns:
        A ``(vocab, index2word)`` tuple. ``vocab`` is the word table: it
        maps each word to a ``Vocab`` record whose ``index`` reflects the
        word's position after sorting all words by descending frequency.
        ``index2word`` is the list of words (nodes) in that same order.
    """

    def _iter_words():
        # Flatten the three nesting levels into a single stream of words.
        for walks in all_walks:
            for walk in walks:
                for word in walk:
                    yield word

    tally = defaultdict(int)
    for word in _iter_words():
        tally[word] += 1

    index2word = []
    vocab = {}
    for word, total in iteritems(tally):
        provisional = len(index2word)
        vocab[word] = Vocab(count=total, index=provisional)
        index2word.append(word)

    # Sort the words by how often they occur, most frequent first.
    index2word.sort(key=lambda item: vocab[item].count, reverse=True)

    # Make each entry's index agree with its position in the sorted list.
    for slot, word in enumerate(index2word):
        record = vocab[word]
        record.index = slot

    return vocab, index2word
def generate_vocab(all_walks):
    """Create the word table and the index-to-word list for nested walks.

    Args:
        all_walks: Iterable of walk collections; each collection is an
            iterable of walks and each walk an iterable of hashable words.

    Returns:
        A ``(vocab, index2word)`` tuple where ``vocab[word]`` is a ``Vocab``
        record with the word's ``count`` and ``index``, and
        ``index2word[index]`` is the word at that index. Indices follow
        descending count order.
    """
    # Step 1: count the occurrences of each word across all walks.
    occurrences = defaultdict(int)
    flattened = (
        word
        for walks in all_walks
        for walk in walks
        for word in walk
    )
    for word in flattened:
        occurrences[word] += 1

    # Step 2: create one record per word, with a temporary index.
    vocab = {}
    index2word = []
    for temp_index, pair in enumerate(iteritems(occurrences)):
        word, count = pair
        vocab[word] = Vocab(count=count, index=temp_index)
        index2word.append(word)

    # Step 3: sort in descending order of count (in place, stable).
    index2word[:] = sorted(
        index2word,
        key=lambda word: vocab[word].count,
        reverse=True,
    )

    # Step 4: store each word's final index (the word -> index mapping).
    for final_index, word in enumerate(index2word):
        vocab[word].index = final_index

    return vocab, index2word