def __call__(self):
    """Yield skip-gram training batches with node features attached.

    For every batch of walks produced by ``self.walk_generator()`` the
    (source, positive) pairs are extracted with ``skip_gram_gen_pair``,
    negatives are drawn according to ``self.neg_sample_type`` and the rows
    of ``self.node_feat`` belonging to each node id are concatenated after
    the id itself.

    Yields:
        tuple: ``(src_feat, dst_feat)``; both are truncated to at most
        ``max_len`` entries along the first axis and carry one extra
        trailing axis of size 1.

    Raises:
        ValueError: when a non-empty batch is reached and
            ``self.neg_sample_type`` is neither ``"average"`` nor
            ``"outdegree"``.
    """
    # The random state is seeded with the process id
    # (presumably so that separate reader processes draw different samples).
    np.random.seed(os.getpid())

    if self.neg_sample_type == "outdegree":
        # Negatives are drawn proportionally to the out-degree of each node.
        outdegree = self.graph.outdegree()
        distribution = 1. * outdegree / outdegree.sum()
        alias, events = alias_sample_build_table(distribution)

    max_len = int(self.batch_size * self.walk_len * (
        (1 + self.win_size) - 0.3))

    for walks in self.walk_generator():
        src, pos = [], []
        for walk in walks:
            s, p = skip_gram_gen_pair(walk, self.win_size)
            src.extend(s)
            pos.extend(p)

        # Build the id arrays directly; the previous code wrapped ``src`` in
        # an accidental 1-tuple through a stray trailing comma.
        src = np.reshape(np.array(src, dtype=np.int64), [-1, 1, 1])
        pos = np.reshape(np.array(pos, dtype=np.int64), [-1, 1, 1])
        if src.shape[0] == 0:
            continue

        neg_sample_size = [len(pos), self.neg_num, 1]
        if self.neg_sample_type == "average":
            negs = self.graph.sample_nodes(neg_sample_size)
        elif self.neg_sample_type == "outdegree":
            negs = alias_sample(neg_sample_size, alias, events)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # ``negs`` a few lines below.
            raise ValueError("unsupported neg_sample_type: %r" %
                             (self.neg_sample_type, ))

        # [batch_size, 1, 1] [batch_size, neg_num+1, 1]
        dst = np.concatenate([pos, negs], 1)
        src_feat = np.concatenate([src, self.node_feat[src[:, :, 0]]], -1)
        dst_feat = np.concatenate([dst, self.node_feat[dst[:, :, 0]]], -1)
        src_feat = np.expand_dims(src_feat, -1)
        dst_feat = np.expand_dims(dst_feat, -1)
        yield src_feat[:max_len], dst_feat[:max_len]
def dump_graph(args):
    """Parse a tab-separated edge file and dump the graph plus sampling tables.

    Every line of ``args.inpath`` is split on tabs. Each column is truncated
    to ``args.max_seqlen`` characters and mapped to an integer id; the first
    two columns form an edge (stored in both directions) and the remaining
    columns are kept as that line's negative samples.

    Files written below ``args.outpath``:
        * ``terms.txt``: ``<column index>\\t<untruncated column text>`` for
          the first occurrence of every distinct truncated column.
        * whatever ``graph.dump`` writes.
        * ``alias.npy`` / ``events.npy``: alias table built from the square
          root of how often each id appeared as a destination.
        * ``neg_samples.npy``: the per-line negative sample ids.

    Args:
        args: object providing ``inpath``, ``outpath``, ``encoding`` and
            ``max_seqlen``.
    """
    if not os.path.exists(args.outpath):
        os.makedirs(args.outpath)

    neg_samples = []
    str2id = dict()
    count = 0
    item_distribution = []
    edges = []

    term_path = os.path.join(args.outpath, "terms.txt")
    # Context managers guarantee both files are closed even when a malformed
    # line raises in the middle of parsing.
    with io.open(term_path, "w", encoding=args.encoding) as term_file:
        with io.open(args.inpath, encoding=args.encoding) as f:
            for idx, line in enumerate(f):
                if idx % 100000 == 0:
                    log.info("%s readed %s lines" % (args.inpath, idx))
                slots = []
                for col_idx, col in enumerate(line.strip("\n").split("\t")):
                    # The truncated text is the lookup key; the full text is
                    # what gets written to terms.txt.
                    s = col[:args.max_seqlen]
                    if s not in str2id:
                        str2id[s] = count
                        count += 1
                        term_file.write(str(col_idx) + "\t" + col + "\n")
                        item_distribution.append(0)
                    slots.append(str2id[s])

                src = slots[0]
                dst = slots[1]
                neg_samples.append(slots[2:])
                edges.append((src, dst))
                edges.append((dst, src))
                item_distribution[dst] += 1

    edges = np.array(edges, dtype="int64")
    num_nodes = len(str2id)
    str2id.clear()

    log.info("building graph...")
    graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges)
    # NOTE(review): the return values are unused; these calls presumably
    # materialise the graph's degree/adjacency indexes before dumping -
    # confirm against the pgl Graph implementation.
    graph.indegree()
    graph.outdegree()
    graph.dump(args.outpath)

    # dump alias sample table
    item_distribution = np.sqrt(np.array(item_distribution))
    distribution = 1. * item_distribution / item_distribution.sum()
    alias, events = alias_sample_build_table(distribution)
    np.save(os.path.join(args.outpath, "alias.npy"), alias)
    np.save(os.path.join(args.outpath, "events.npy"), events)
    np.save(
        os.path.join(args.outpath, "neg_samples.npy"), np.array(neg_samples))
    log.info("End Build Graph")
def test_resut(self):
    """Sampled frequencies must be ordered like the input probabilities.

    The probabilities grow with the index, so sorting the sampled values by
    how often they were drawn has to give 0, 1, ..., num - 2.
    """
    num = 10
    size = [450000]

    probs = np.arange(1, num).astype(np.float64)
    probs /= np.sum(probs)

    alias, events = alias_sample_build_table(probs)
    samples = alias_sample(size, alias, events)

    frequency = Counter(samples)
    # Sort by (count, value) so that the least frequent value comes first.
    ranked = sorted((hits, value) for value, hits in frequency.items())
    sort_cnt_keys = [value for _, value in ranked]

    expected = np.arange(0, num - 1).tolist()
    self.assertEqual(sort_cnt_keys, expected)
def graph_alias_sample_table(graph, edge_weight_name):
    """Build alias sample table for weighted deepwalk.

    For every entry returned by ``graph.successor(return_eids=True)`` the
    weights of its edges are normalised to sum to one and turned into an
    alias table with ``graph_kernel.alias_sample_build_table``.

    Args:
        graph: The input graph.
        edge_weight_name: The name of edge weight in edge_feat.

    Return:
        A tuple ``(alias_array, events_array)`` with one table per entry.
        When all tables have the same length the arrays are 2-D; otherwise
        they are 1-D object arrays holding one table each.
    """
    edge_weight = graph.edge_feat[edge_weight_name]
    _, eids_array = graph.successor(return_eids=True)

    alias_tables, events_tables = [], []
    for eids in eids_array:
        probs = edge_weight[eids]
        # Out-of-place true division: works for integer weights too (the
        # in-place form raises a casting error) and never writes back into
        # ``edge_weight``.
        probs = probs / np.sum(probs)
        alias, events = graph_kernel.alias_sample_build_table(probs)
        alias_tables.append(alias)
        events_tables.append(events)

    return _stack_tables(alias_tables), _stack_tables(events_tables)


def _stack_tables(tables):
    """Pack per-node tables into one array, tolerating unequal lengths."""
    if len(set(len(table) for table in tables)) <= 1:
        return np.array(tables)
    # Recent NumPy versions refuse to build an array from ragged sequences
    # implicitly, so fill an object array one table at a time.
    packed = np.empty(len(tables), dtype=object)
    for idx, table in enumerate(tables):
        packed[idx] = table
    return packed
def test_speed(self):
    """Alias sampling has to be faster than ``np.random.choice``.

    Both samplers draw the same number of batches from the same random
    distribution; the alias timing also includes building the table.
    """
    num = 1000
    size = [10240, 1, 5]
    rounds = 100

    probs = np.random.uniform(0.0, 1.0, [num])
    probs /= np.sum(probs)

    # Time the alias sampler, table construction included.
    alias_begin = time.time()
    alias, events = alias_sample_build_table(probs)
    for _ in range(rounds):
        alias_sample(size, alias, events)
    alias_sample_time = time.time() - alias_begin

    # Time numpy's sampler on the same workload.
    numpy_begin = time.time()
    for _ in range(rounds):
        np.random.choice(num, size, p=probs)
    np_sample_time = time.time() - numpy_begin

    self.assertTrue(alias_sample_time < np_sample_time)
def dump_graph(config):
    """Load graph and training data, then dump the graph and alias tables.

    ``load_graph`` and the task specific loader fill ``str2id``, ``terms``
    and ``item_distribution`` and write to ``terms.txt``. Afterwards the
    graph is built and dumped, and an alias table derived from the square
    root of ``item_distribution`` is saved as ``alias.npy`` / ``events.npy``.
    All output goes below ``config.graph_work_path``.

    Args:
        config: object providing ``graph_work_path``, ``encoding`` and
            ``task`` (plus whatever the loader functions read from it).

    Raises:
        ValueError: if ``config.task`` is neither ``"link_predict"`` nor
            ``"node_classification"``.
    """
    if not os.path.exists(config.graph_work_path):
        os.makedirs(config.graph_work_path)

    str2id = dict()
    terms = []
    item_distribution = []

    term_path = os.path.join(config.graph_work_path, "terms.txt")
    # The context manager closes terms.txt on every exit path, including the
    # unsupported-task error and any exception raised by a loader.
    with io.open(term_path, "w", encoding=config.encoding) as term_file:
        edges = load_graph(config, str2id, term_file, terms,
                           item_distribution)
        if config.task == "link_predict":
            load_link_predict_train_data(config, str2id, term_file, terms,
                                         item_distribution)
        elif config.task == "node_classification":
            load_node_classification_train_data(config, str2id, term_file,
                                                terms, item_distribution)
        else:
            raise ValueError("unsupported task: %r" % (config.task, ))

    num_nodes = len(str2id)
    str2id.clear()

    log.info("building graph...")
    graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges)
    # NOTE(review): the return values are unused; these calls presumably
    # materialise the graph's degree/adjacency indexes before dumping -
    # confirm against the pgl Graph implementation.
    graph.indegree()
    graph.outdegree()
    graph.dump(config.graph_work_path)

    # dump alias sample table
    item_distribution = np.sqrt(np.array(item_distribution))
    distribution = 1. * item_distribution / item_distribution.sum()
    alias, events = alias_sample_build_table(distribution)
    np.save(os.path.join(config.graph_work_path, "alias.npy"), alias)
    np.save(os.path.join(config.graph_work_path, "events.npy"), events)
    log.info("End Build Graph")
def __call__(self):
    """Yield skip-gram ``(src, dst)`` id batches built from random walks.

    For every batch of walks produced by ``self.walk_generator()`` the
    (source, positive) pairs are extracted with ``skip_gram_gen_pair``
    (at most ``max_len`` pairs per walk) and negatives are drawn according
    to ``self.neg_sample_type``.

    Any exception raised while building a batch is logged with
    ``log.exception`` and the batch is skipped, so one bad batch does not
    stop the reader.

    Yields:
        tuple: ``(src, dst)`` truncated to ``max_len`` entries along the
        first axis; ``dst`` holds the positive id followed by the sampled
        negative ids.
    """
    # The random state is seeded with the process id
    # (presumably so that separate reader processes draw different samples).
    np.random.seed(os.getpid())

    if self.neg_sample_type == "outdegree":
        # Negatives are drawn proportionally to the out-degree of each node.
        outdegree = self.graph.outdegree()
        distribution = 1. * outdegree / outdegree.sum()
        alias, events = alias_sample_build_table(distribution)

    max_len = int(self.batch_size * self.walk_len * (
        (1 + self.win_size) - 0.3))

    for walks in self.walk_generator():
        try:
            src, pos = [], []
            for walk in walks:
                s, p = skip_gram_gen_pair(walk, self.win_size)
                src.extend(s[:max_len])
                pos.extend(p[:max_len])

            # Build the id arrays directly; the previous code wrapped
            # ``src`` in an accidental 1-tuple through a stray trailing
            # comma.
            src = np.reshape(np.array(src, dtype=np.int64), [-1, 1, 1])
            pos = np.reshape(np.array(pos, dtype=np.int64), [-1, 1, 1])
            if src.shape[0] == 0:
                continue

            neg_sample_size = [len(pos), self.neg_num, 1]
            if self.neg_sample_type == "average":
                negs = np.random.randint(
                    low=0, high=self.graph.num_nodes, size=neg_sample_size)
            elif self.neg_sample_type == "outdegree":
                negs = alias_sample(neg_sample_size, alias, events)
            elif self.neg_sample_type == "inbatch":
                # This branch never produced negatives, so the batch used to
                # die with an UnboundLocalError on ``negs``. Raise something
                # readable instead; it is logged below exactly as before.
                raise NotImplementedError(
                    "neg_sample_type 'inbatch' does not produce negatives "
                    "in this reader")
            else:
                raise ValueError("unsupported neg_sample_type: %r" %
                                 (self.neg_sample_type, ))

            dst = np.concatenate([pos, negs], 1)
            # [batch_size, 1, 1] [batch_size, neg_num+1, 1]
            yield src[:max_len], dst[:max_len]
        except Exception as e:
            log.exception(e)
def normlization_layer_weight(self):
    """
    Normalize the per-layer distances between nodes and build sampling tables.

    ``self.distance`` maps a node pair to ``{layer: distance}``. The method
    first regroups it into ``self.layer_distance[layer][pair]`` and records
    the neighbours of every node in ``self.layer_message[layer]``. Then, for
    each layer below ``self.depth`` and each node of ``self.nodes`` present
    in that layer, the neighbour weights ``exp(-distance)`` are divided by
    their sum and stored in ``self.layer_norm_distance``; the matching alias
    tables go to ``self.sample_alias`` / ``self.sample_events``. Finally
    ``self.layer_node_weight_count`` stores how many normalized weights of
    a node exceed the layer average.
    """
    # Regroup pairwise distances by layer and collect neighbour lists.
    for (src, dist), per_layer in self.distance.items():
        for layer, weight in per_layer.items():
            pair_weights = self.layer_distance.setdefault(layer, {})
            neighbours = self.layer_message.setdefault(layer, {})
            pair_weights[src, dist] = weight

            neighbours.setdefault(src, [])
            neighbours.setdefault(dist, [])
            neighbours[src].append(dist)
            neighbours[dist].append(src)

    # normalization the layer weight
    for i in range(self.depth):
        self.layer_norm_distance.setdefault(i, {})
        self.sample_alias.setdefault(i, {})
        self.sample_events.setdefault(i, {})
        if i not in self.layer_message:
            continue

        layer_weight = 0.0
        layer_count = 0
        for node in self.nodes:
            if node not in self.layer_message[i]:
                continue

            # A pair may be stored as (node, nbr) or as (nbr, node).
            weights = []
            for nbr in self.layer_message[i][node]:
                if (node, nbr) in self.layer_distance[i]:
                    raw = self.layer_distance[i][node, nbr]
                else:
                    raw = self.layer_distance[i][nbr, node]
                weights.append(np.exp(-float(raw)))

            # norm the weight; a zero sum is replaced by one to avoid a
            # division by zero.
            sum_weight = sum(weights)
            if sum_weight == 0.0:
                sum_weight = 1.0
            weight_list = [value / sum_weight for value in weights]
            self.layer_norm_distance[i][node] = weight_list

            alias, events = alias_sample_build_table(np.array(weight_list))
            self.sample_alias[i][node] = alias
            self.sample_events[i][node] = events

            # Every processed node adds 1.0 to the layer total and its
            # neighbour count to the number of weights.
            layer_weight += 1.0
            layer_count += len(weights)
        layer_avg_weight = layer_weight / (1.0 * layer_count)

        # Count, per node, the weights lying above the layer average.
        above_average = dict()
        self.layer_node_weight_count[i] = above_average
        for node in self.nodes:
            if node not in self.layer_norm_distance[i]:
                continue
            above_average[node] = sum(
                1 for value in self.layer_norm_distance[i][node]
                if value > layer_avg_weight)