def _restart_random_walk(self, restart_node=None, walk_times=0):
    """Random walk with restart, rooted at restart_node."""
    # if restart_node is None:
    #     # Sampling is uniform w.r.t. V, not w.r.t. E
    #     restart_node = random.choice(self._walk_nodes)
    context_list = []
    except_set = {restart_node}
    for _ in range(walk_times):
        start_node = restart_node
        context = []
        while len(context) < self._walk_length:
            # With probability _walk_restart, jump back to the root node.
            if random.random() < self._walk_restart:
                start_node = restart_node
            adj_list = self._net._nodes_adjlist[start_node]
            if len(adj_list) > 0:
                # Move to a uniformly sampled neighbor.
                start_node = random.choice(adj_list)
            else:
                # Dead end: restart the walk from the root.
                start_node = restart_node
            context.append(start_node)
            except_set.add(start_node)
        context_list.extend(context)
    # Negatives are drawn from all walk nodes, excluding every visited node,
    # weighted by node degree via the alias table.
    neg_nodes = utils.neg_sample(self._walk_nodes,
                                 except_set,
                                 num=self._neg_sampled,
                                 alias_table=self._alias_nodesdegrees)
    # input (center node), targets (context nodes), neg targets (neg nodes)
    return restart_node, context_list, neg_nodes
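# The walk above relies on utils.neg_sample(population, except_set, num,
# alias_table). A minimal sketch of what that helper is assumed to do:
# a rejection loop over alias-method draws. The (prob, alias) layout of
# alias_table is an assumption here, not a confirmed format.
import random


def neg_sample(population, except_set, num, alias_table=None):
    samples = []
    while len(samples) < num:
        if alias_table is not None:
            prob, alias = alias_table  # assumed alias-method arrays
            i = random.randrange(len(population))
            # Standard alias draw: keep bucket i or fall through to its alias.
            node = population[i] if random.random() < prob[i] else population[alias[i]]
        else:
            node = random.choice(population)  # uniform fallback
        if node not in except_set:
            samples.append(node)
    return samples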
def next_batch(self):
    """Sample a batch of (source, target, negatives) triples per edge type."""
    data_list = []
    labels_list = []
    neg_labels_list = []
    for edge_type, edge_list in self._edges_type_dict.items():
        edges = random.sample(edge_list, k=self._batch_size)
        data = []
        labels = []
        neg_labels = []
        for source, target in edges:
            target_type = edge_type[1]
            # Negatives are sampled from nodes of the target's type,
            # excluding the positive pair itself.
            neg_nodes = utils.neg_sample(
                self._nodes_type_dict[target_type][0],
                {source, target},
                num=self._neg_sampled,
                alias_table=self._nodes_type_dict[target_type][1])
            data.append(source)
            labels.append(target)
            neg_labels.append(neg_nodes)
        data_list.append(data)
        labels_list.append(labels)
        neg_labels_list.append(neg_labels)
    return np.asarray(data_list), np.asarray(labels_list), np.asarray(
        neg_labels_list)
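# Hypothetical driver showing the shapes next_batch produces; `sampler`
# stands in for an instance of the class above and is not defined here.
#
#     data, labels, neg_labels = sampler.next_batch()
#     # data:       (num_edge_types, batch_size)              source node ids
#     # labels:     (num_edge_types, batch_size)              positive target ids
#     # neg_labels: (num_edge_types, batch_size, neg_sampled) negative target ids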
def test_raw_xgb(logs_raw, module_name, time_interval=300, is_bin=False):
    '''
    params:
        logs_raw: iterable of raw log lines
        module_name: str, module name such as "aaa" (case-insensitive)
        time_interval: int, prediction interval
        is_bin: bool, ignore this parameter and keep it False;
                True for binary, False for multi-class classification
    out:
        JSON string of test-set metrics, e.g.
        {"accuracy": ac, "recall": rc, "confusion_matrix": cf.tolist()}
    '''
    module_name = module_name.upper()
    module_log_pos = utils.mod_logpos[module_name]
    logs = []
    data = []
    # Keep only lines that match the module's log pattern.
    for line in tqdm(logs_raw):
        match = utils.match_log(line, module_name)
        if match is not None and match.group() != '':
            data.append(match.group())
            logs.append(match.group(module_log_pos))

    # Parse logs: one class per template, plus the shift and negative classes.
    num_labels = len(utils.mod_template[module_name]) + 2
    labels = utils.label_logs(logs, module_name)
    timestamps, weekdays = utils.parse_data_time(data, module_name, True)
    timediffs = utils.timediff(timestamps)
    # Negative samples: time gaps where no log occurred.
    neg_diffs, neg_week = utils.neg_sample(timestamps, True)
    neg_label = [0] * len(neg_diffs)
    weekdays.extend(neg_week)
    x = timediffs
    x.extend(neg_diffs)
    x = [[x[i], weekdays[i]] for i in range(len(x))]
    # Shift real labels by one so that class 0 is the negative class.
    y = [label + 1 for label in labels]
    y.extend(neg_label)
    if is_bin:
        num_labels = 2
        y = [0 if label == 0 else 1 for label in y]

    # Shuffle features and labels together, then split 50/50.
    xy = np.hstack([np.array(x), np.array(y)[:, np.newaxis]])
    np.random.shuffle(xy)
    x = xy[:, 0:2]
    y = xy[:, 2]
    train_idx_end = int(len(x) * 0.5)
    x_train = x[:train_idx_end]
    y_train = y[:train_idx_end]
    x_test = x[train_idx_end:]
    y_test = y[train_idx_end:]
    # np.bincount needs integers; the hstack above cast labels to float.
    print('label distribution', np.bincount(y_train.astype(int)))

    lr = 1
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test)
    params = {
        'num_class': num_labels,
        'max_depth': 20,
        'min_child_weight': 4,
        'eta': lr,
        'objective': 'multi:softprob',
        'verbosity': 3
    }
    print('training...', file=sys.stderr)
    bst = xgb.train(params, data_train, 100)
    pkl.dump(bst, open(f'xgb.{module_name}.{time_interval}.bat', 'wb'))

    print('testing...', file=sys.stderr)
    p_list = bst.predict(data_test)  # (n_samples, num_class) probabilities
    y_list = np.argmax(p_list, axis=1)
    ac = metrics.accuracy_score(y_test, y_list)
    rc = metrics.recall_score(y_test, y_list, average='macro')
    cf = metrics.confusion_matrix(y_test, y_list)
    print(cf)
    print(f'accuracy: {ac}, recall: {rc}')
    res = {"accuracy": ac, "recall": rc, "confusion_matrix": cf.tolist()}
    return json.dumps(res)
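# Hypothetical usage of test_raw_xgb; the log path and module name are
# placeholders, not files or modules shipped with this code.
if __name__ == '__main__':
    with open('logs/aaa.log') as f:
        raw_lines = f.readlines()
    report = json.loads(test_raw_xgb(raw_lines, 'aaa', time_interval=300))
    print(report['accuracy'], report['recall'])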
def _spacey_metatree_random_walk(self, root_node, walk_times=0):
    # Supports metapath, multi-metapath, and metagraph schemas.
    root_type = self._net.get_node_type(root_node)
    if root_type not in self._metatree_type_id_dict:
        root_type = random.choice(list(self._metatree_type_id_dict.keys()))
        root_node = random.choice(self._nodes_type_dict[root_type][0])
    root_id = random.choice(self._metatree_type_id_dict[root_type])
    context_nodes_dict = {}
    if self._history_position == "local":
        history = np.ones([len(self._metagraph.nodes())], dtype=np.float64)
    elif self._history_position == "global":
        history = self._history
    for _ in range(walk_times):
        if self._history_position == "local_walktime":
            history = np.ones([len(self._metagraph.nodes())], dtype=np.float64)
        # current node
        cur_node = root_node
        cur_type = root_type
        cur_id = root_id
        for __ in range(self._walk_length):
            # With probability _walk_restart, jump back to the root.
            if random.random() < self._walk_restart:
                cur_node = root_node
                cur_type = root_type
                cur_id = root_id
            cur_node_adj_typelist = self._adj_lookupdict[cur_node].keys()
            # Candidate metatree ids whose type has neighbors of cur_node.
            next_id_list = [
                v for v in self._metagraph[cur_id]
                if self._metagraph.nodes[v]["type"] in cur_node_adj_typelist
            ]
            if len(next_id_list) == 0:
                cur_type = root_type
                cur_node = root_node
                cur_id = root_id
            elif len(next_id_list) == 1:
                cur_id = next_id_list[0]
                cur_type = self._metagraph.nodes[cur_id]["type"]
                cur_node = random.choice(self._adj_lookupdict[cur_node][cur_type])
                history[cur_id] += 1
            else:
                # Spacey choice: weight candidates by their occupancy history.
                occupancy = history[next_id_list]
                cur_id = utils.unigram_sample(population=next_id_list,
                                              size=1,
                                              weight=occupancy)[0]
                cur_type = self._metagraph.nodes[cur_id]["type"]
                cur_node = random.choice(self._adj_lookupdict[cur_node][cur_type])
                history[cur_id] += 1
            # Re-sample the metatree id among all ids of the current type,
            # again weighted by occupancy history.
            cur_id = utils.unigram_sample(
                population=self._metatree_type_id_dict[cur_type],
                size=1,
                weight=history[self._metatree_type_id_dict[cur_type]])[0]
            if cur_type in context_nodes_dict:
                context_nodes_dict[cur_type][0].append(cur_node)  # context list
                context_nodes_dict[cur_type][1].add(cur_node)  # except set
            else:
                context_nodes_dict[cur_type] = [[cur_node], {cur_node, root_node}]
    # Pack per-type outputs; absent types get a [0] placeholder and mask 0.
    type_context_nodes_list = []
    type_neg_nodes_list = []
    type_mask_list = []
    for k in range(self.node_types_size):
        if k in context_nodes_dict:
            context_nodes = context_nodes_dict[k][0]
            except_set = context_nodes_dict[k][1]
            type_mask_list.append(1)
            type_context_nodes_list.append(context_nodes)
            type_neg_nodes_list.append(
                utils.neg_sample(self._nodes_type_dict[k][0],
                                 except_set,
                                 num=self._neg_sampled,
                                 alias_table=self._nodes_type_dict[k][1]))
        else:
            type_mask_list.append(0)
            type_context_nodes_list.append([0])
            type_neg_nodes_list.append([0])
    return root_node, type_context_nodes_list, type_mask_list, type_neg_nodes_list
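# Both spacey walks call utils.unigram_sample(population, size, weight).
# A minimal sketch of the assumed behavior: draw `size` items with
# probability proportional to `weight` (uniform when weight is None).
import random


def unigram_sample(population, size=1, weight=None):
    population = list(population)
    if weight is None:
        return [random.choice(population) for _ in range(size)]
    return random.choices(population, weights=list(weight), k=size)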
def _spacey_metaschema_random_walk(self, root_node, walk_times=0):
    root_type = self._net.get_node_type(root_node)
    context_nodes_dict = {}
    if self._history_position == "local":
        history = np.ones([self.node_types_size], dtype=np.float64)
    elif self._history_position == "global":
        history = self._history
    for _ in range(walk_times):
        if self._history_position == "local_walktime":
            history = np.ones([self.node_types_size], dtype=np.float64)
        # current node
        cur_node = root_node
        cur_type = root_type
        for __ in range(self._walk_length):
            # With probability _walk_restart, jump back to the root.
            if random.random() < self._walk_restart:
                cur_node = root_node
                cur_type = root_type
            next_type_list = list(self._adj_lookupdict[cur_node].keys())
            if len(next_type_list) == 0:
                cur_type = root_type
                cur_node = root_node
            elif len(next_type_list) == 1:
                cur_type = next_type_list[0]
                cur_node = random.choice(self._adj_lookupdict[cur_node][cur_type])
                history[cur_type] += 1
            else:
                # Spacey choice: pick the next type in proportion to how
                # often each type has been occupied so far.
                occupancy = history[next_type_list]
                cur_type = utils.unigram_sample(population=next_type_list,
                                                size=1,
                                                weight=occupancy)[0]
                cur_node = random.choice(self._adj_lookupdict[cur_node][cur_type])
                history[cur_type] += 1
            if cur_type in context_nodes_dict:
                context_nodes_dict[cur_type][0].append(cur_node)  # context list
                context_nodes_dict[cur_type][1].add(cur_node)  # except set
            else:
                context_nodes_dict[cur_type] = [[cur_node], {cur_node, root_node}]
    # Pack per-type outputs; absent types get a [0] placeholder and mask 0.
    type_context_nodes_list = []
    type_neg_nodes_list = []
    type_mask_list = []
    for k in range(self.node_types_size):
        if k in context_nodes_dict:
            context_nodes = context_nodes_dict[k][0]
            except_set = context_nodes_dict[k][1]
            type_mask_list.append(1)
            type_context_nodes_list.append(context_nodes)
            type_neg_nodes_list.append(
                utils.neg_sample(self._nodes_type_dict[k][0],
                                 except_set,
                                 num=self._neg_sampled,
                                 alias_table=self._nodes_type_dict[k][1]))
        else:
            type_mask_list.append(0)
            type_context_nodes_list.append([0])
            type_neg_nodes_list.append([0])
    return root_node, type_context_nodes_list, type_mask_list, type_neg_nodes_list
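# Hypothetical consumption of the per-type outputs above; `walker` stands
# in for an instance of the class and is not defined here.
#
#     root, ctx_by_type, mask, neg_by_type = \
#         walker._spacey_metaschema_random_walk(root_node, walk_times=2)
#     for t, m in enumerate(mask):
#         if m == 1:
#             # real context/negative nodes for type t
#             pairs = [(root, c) for c in ctx_by_type[t]]
#         # mask == 0 slots hold the [0] placeholders and should be skipped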
def __getitem__(self, index):
    sequence = self.part_sequence[index]  # pos_items

    # Masked Item Prediction: sample a negative item for every masked item.
    masked_item_sequence = []
    neg_items = []
    item_set = set(sequence)
    for item in sequence[:-1]:
        prob = random.random()
        if prob < self.args.mask_p:
            masked_item_sequence.append(self.args.mask_id)
            neg_items.append(neg_sample(item_set, self.args.item_size))
        else:
            masked_item_sequence.append(item)
            neg_items.append(item)
    # Always mask the last position.
    masked_item_sequence.append(self.args.mask_id)
    neg_items.append(neg_sample(item_set, self.args.item_size))

    # Segment Prediction: mask a random segment and pair it with a positive
    # segment (from the same sequence) and a negative segment (from the
    # global long_sequence).
    if len(sequence) < 2:
        masked_segment_sequence = sequence
        pos_segment = sequence
        neg_segment = sequence
    else:
        sample_length = random.randint(1, len(sequence) // 2)
        start_id = random.randint(0, len(sequence) - sample_length)
        neg_start_id = random.randint(0, len(self.long_sequence) - sample_length)
        pos_segment = sequence[start_id:start_id + sample_length]
        neg_segment = self.long_sequence[neg_start_id:neg_start_id + sample_length]
        masked_segment_sequence = (sequence[:start_id] +
                                   [self.args.mask_id] * sample_length +
                                   sequence[start_id + sample_length:])
        pos_segment = ([self.args.mask_id] * start_id + pos_segment +
                       [self.args.mask_id] * (len(sequence) - (start_id + sample_length)))
        neg_segment = ([self.args.mask_id] * start_id + neg_segment +
                       [self.args.mask_id] * (len(sequence) - (start_id + sample_length)))

    assert len(masked_segment_sequence) == len(sequence)
    assert len(pos_segment) == len(sequence)
    assert len(neg_segment) == len(sequence)

    # Left-pad every sequence with 0 and truncate to max_len.
    pad_len = self.max_len - len(sequence)
    masked_item_sequence = [0] * pad_len + masked_item_sequence
    pos_items = [0] * pad_len + sequence
    neg_items = [0] * pad_len + neg_items
    masked_segment_sequence = [0] * pad_len + masked_segment_sequence
    pos_segment = [0] * pad_len + pos_segment
    neg_segment = [0] * pad_len + neg_segment

    masked_item_sequence = masked_item_sequence[-self.max_len:]
    pos_items = pos_items[-self.max_len:]
    neg_items = neg_items[-self.max_len:]
    masked_segment_sequence = masked_segment_sequence[-self.max_len:]
    pos_segment = pos_segment[-self.max_len:]
    neg_segment = neg_segment[-self.max_len:]

    # Associated Attribute Prediction / Masked Attribute Prediction:
    # multi-hot attribute vector for each (padded) positive item.
    attributes = []
    for item in pos_items:
        attribute = [0] * self.args.attribute_size
        try:
            now_attribute = self.args.item2attribute[str(item)]
            for a in now_attribute:
                attribute[a] = 1
        except KeyError:
            # Padding and mask ids have no attribute entry.
            pass
        attributes.append(attribute)

    assert len(attributes) == self.max_len
    assert len(masked_item_sequence) == self.max_len
    assert len(pos_items) == self.max_len
    assert len(neg_items) == self.max_len
    assert len(masked_segment_sequence) == self.max_len
    assert len(pos_segment) == self.max_len
    assert len(neg_segment) == self.max_len

    cur_tensors = (
        torch.tensor(attributes, dtype=torch.long),
        torch.tensor(masked_item_sequence, dtype=torch.long),
        torch.tensor(pos_items, dtype=torch.long),
        torch.tensor(neg_items, dtype=torch.long),
        torch.tensor(masked_segment_sequence, dtype=torch.long),
        torch.tensor(pos_segment, dtype=torch.long),
        torch.tensor(neg_segment, dtype=torch.long),
    )
    return cur_tensors
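# The __getitem__ methods here call neg_sample(item_set, item_size). A
# minimal sketch of the assumed helper: rejection-sample an item id between
# 1 and item_size - 1 that the user has not interacted with; the real
# helper may differ.
import random


def neg_sample(item_set, item_size):
    item = random.randint(1, item_size - 1)
    while item in item_set:
        item = random.randint(1, item_size - 1)
    return item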
def __getitem__(self, index):
    user_id = index
    items = self.user_seq[index]

    assert self.data_type in {"train", "valid", "test"}

    # For items = [0, 1, 2, 3, 4, 5, 6]:
    # train:  input [0, 1, 2, 3],       target [1, 2, 3, 4]
    # valid:  input [0, 1, 2, 3, 4],    answer [5]
    # test:   input [0, 1, 2, 3, 4, 5], answer [6]
    if self.data_type == "train":
        input_ids = items[:-3]
        target_pos = items[1:-2]
        answer = [0]  # not used during training
    elif self.data_type == 'valid':
        input_ids = items[:-2]
        target_pos = items[1:-1]
        answer = [items[-2]]
    else:
        input_ids = items[:-1]
        target_pos = items[1:]
        answer = [items[-1]]

    # One negative item per input position, sampled outside the user's set.
    target_neg = []
    seq_set = set(items)
    for _ in input_ids:
        target_neg.append(neg_sample(seq_set, self.args.item_size))

    # Left-pad with 0 and truncate to max_len.
    pad_len = self.max_len - len(input_ids)
    input_ids = [0] * pad_len + input_ids
    target_pos = [0] * pad_len + target_pos
    target_neg = [0] * pad_len + target_neg

    input_ids = input_ids[-self.max_len:]
    target_pos = target_pos[-self.max_len:]
    target_neg = target_neg[-self.max_len:]

    assert len(input_ids) == self.max_len
    assert len(target_pos) == self.max_len
    assert len(target_neg) == self.max_len

    if self.test_neg_items is not None:
        test_samples = self.test_neg_items[index]
        cur_tensors = (
            torch.tensor(user_id, dtype=torch.long),  # user_id for testing
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(target_pos, dtype=torch.long),
            torch.tensor(target_neg, dtype=torch.long),
            torch.tensor(answer, dtype=torch.long),
            torch.tensor(test_samples, dtype=torch.long),
        )
    else:
        cur_tensors = (
            torch.tensor(user_id, dtype=torch.long),  # user_id for testing
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(target_pos, dtype=torch.long),
            torch.tensor(target_neg, dtype=torch.long),
            torch.tensor(answer, dtype=torch.long),
        )
    return cur_tensors
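# Worked example of the train/valid/test slicing above for
# items = [0, 1, 2, 3, 4, 5, 6]; runnable as-is.
items = [0, 1, 2, 3, 4, 5, 6]
assert items[:-3] == [0, 1, 2, 3]          # train inputs
assert items[1:-2] == [1, 2, 3, 4]         # train targets
assert items[:-2] == [0, 1, 2, 3, 4]       # valid inputs
assert [items[-2]] == [5]                  # valid answer
assert items[:-1] == [0, 1, 2, 3, 4, 5]    # test inputs
assert [items[-1]] == [6]                  # test answer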