Example #1
 def _restart_random_walk(self, restart_node=None, walk_times=0):
     """Random walk with restart, rooted at `restart_node`."""
     context_list = []
     except_set = {restart_node}
     for _ in range(walk_times):
         start_node = restart_node
         context = []
         while len(context) < self._walk_length:
             # With probability `_walk_restart`, jump back to the root.
             if random.random() < self._walk_restart:
                 start_node = restart_node
             adj_list = self._net._nodes_adjlist[start_node]
             if len(adj_list) > 0:
                 # Step to a uniformly sampled neighbor.
                 start_node = random.choice(adj_list)
             else:
                 # Dead end: restart the walk from the root.
                 start_node = restart_node
             context.append(start_node)
             except_set.add(start_node)
         context_list.extend(context)
     # Negatives are drawn from outside the walk's visited set.
     neg_nodes = utils.neg_sample(self._walk_nodes,
                                  except_set,
                                  num=self._neg_sampled,
                                  alias_table=self._alias_nodesdegrees)
     # Returns: input (center node), targets (context nodes), negatives.
     return restart_node, context_list, neg_nodes
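Both this walker and the batch sampler in Example #2 delegate negative sampling to utils.neg_sample, passing a candidate pool, an exclusion set, a sample count, and an optional alias table. The utils module is not shown here, so the following is only a minimal sketch of what such a helper plausibly looks like, falling back to uniform rejection sampling when no alias table is given.

import random

def neg_sample(population, except_set, num, alias_table=None):
    # Hypothetical sketch; the real utils.neg_sample may differ.
    # With an alias table the draw would be degree-proportional in O(1);
    # here we keep it uniform for brevity.
    negs = []
    while len(negs) < num:
        candidate = random.choice(population)
        if candidate not in except_set:
            negs.append(candidate)
    return negs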
Example #2
 def next_batch(self):
     """Sample one training batch per edge type: (sources, targets, negatives)."""
     data_list = []
     labels_list = []
     neg_labels_list = []
     for edge_type, edge_list in self._edges_type_dict.items():
         edges = random.sample(edge_list, k=self._batch_size)
         data = []
         labels = []
         neg_labels = []
         for source, target in edges:
             target_type = edge_type[1]
             # Negatives share the target's type and exclude the edge itself.
             neg_nodes = utils.neg_sample(
                 self._nodes_type_dict[target_type][0], {source, target},
                 num=self._neg_sampled,
                 alias_table=self._nodes_type_dict[target_type][1])
             data.append(source)
             labels.append(target)
             neg_labels.append(neg_nodes)
         data_list.append(data)
         labels_list.append(labels)
         neg_labels_list.append(neg_labels)
     return np.asarray(data_list), np.asarray(labels_list), np.asarray(
         neg_labels_list)
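The alias_table arguments here and in Example #1 (self._alias_nodesdegrees) suggest negatives are drawn degree-proportionally via Walker's alias method, with tables precomputed once per node-type population. A textbook sketch of that construction, included for context only since the project's own implementation is not shown:

import random

def alias_setup(probs):
    # Build Walker alias tables for O(1) draws from a discrete distribution.
    n = len(probs)
    prob, alias = [0.0] * n, [0] * n
    scaled = [p * n for p in probs]
    small = [i for i, p in enumerate(scaled) if p < 1.0]
    large = [i for i, p in enumerate(scaled) if p >= 1.0]
    while small and large:
        s, l = small.pop(), large.pop()
        prob[s], alias[s] = scaled[s], l
        scaled[l] -= 1.0 - scaled[s]
        (small if scaled[l] < 1.0 else large).append(l)
    for i in small + large:  # leftovers are (numerically) certain
        prob[i] = 1.0
    return prob, alias

def alias_draw(prob, alias):
    # One O(1) draw: pick a column, then flip its biased coin.
    i = random.randrange(len(prob))
    return i if random.random() < prob[i] else alias[i]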
Example #3
def test_raw_xgb(logs_raw, module_name, time_interval=300, is_bin=False):
    '''
    params:
        module_name: string, module name such as "aaa" (case-insensitive)
        time_interval: int, prediction interval
        is_bin: bool, keep False; True for binary, False for multi-class
    out:
        res: dict of test-set metrics, e.g.
             {"accuracy": ac, "recall": rc, "confusion_matrix": cf.tolist()}
    '''
    module_name = module_name.upper()
    module_log_pos = utils.mod_logpos[module_name]

    # Keep only the lines that match this module's log pattern.
    logs = []
    data = []
    for line in tqdm(logs_raw):
        match = utils.match_log(line, module_name)
        if match is not None and match.group() != '':
            data.append(match.group())
            logs.append(match.group(module_log_pos))

    # Label the parsed logs; two extra classes cover negatives and "other".
    num_labels = len(utils.mod_template[module_name]) + 2
    labels = utils.label_logs(logs, module_name)
    timestamps, weekdays = utils.parse_data_time(data, module_name, True)
    timediffs = utils.timediff(timestamps)
    # Note: this neg_sample draws negative (time-diff, weekday) pairs,
    # unlike the graph samplers in the other examples.
    neg_diffs, neg_week = utils.neg_sample(timestamps, True)
    neg_label = [0] * len(neg_diffs)

    # Features are [time difference, weekday]; label 0 is the negative class.
    weekdays.extend(neg_week)
    feats = timediffs
    feats.extend(neg_diffs)
    x = [[feats[i], weekdays[i]] for i in range(len(feats))]
    y = [label + 1 for label in labels]
    y.extend(neg_label)
    if is_bin:
        num_labels = 2
        y = [0 if label == 0 else 1 for label in y]

    # Shuffle features and labels together, then split 50/50.
    xy = np.hstack([np.array(x), np.array(y)[:, np.newaxis]])
    np.random.shuffle(xy)
    x = xy[:, 0:2]
    y = xy[:, 2][:, np.newaxis]

    train_idx_end = int(len(x) * 0.5)
    x_train = x[:train_idx_end]
    y_train = y[:train_idx_end]
    x_test = x[train_idx_end:]
    y_test = y[train_idx_end:]
    # bincount needs integer labels; xy is float after np.hstack.
    print('label distribution', np.bincount(y_train.flatten().astype(int)))

    lr = 1
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test)
    params = {
        'num_class': num_labels,
        'max_depth': 20,
        'min_child_weight': 4,
        'eta': lr,
        'objective': 'multi:softprob',
        'verbosity': 3
    }
    print('training...', file=sys.stderr)
    bst = xgb.train(params, data_train, 100)
    with open(f'xgb.{module_name}.{time_interval}.bat', 'wb') as f:
        pkl.dump(bst, f)
    print('testing...', file=sys.stderr)
    p_list = bst.predict(data_test)
    y_list = np.argmax(p_list, axis=1)
    ac = metrics.accuracy_score(y_test, y_list)
    rc = metrics.recall_score(y_test, y_list, average='macro')
    cf = metrics.confusion_matrix(y_test, y_list)
    print(cf)
    print(f'accuracy: {ac}, recall: {rc}')

    res = {"accuracy": ac, "recall": rc, "confusion_matrix": cf.tolist()}

    return json.dumps(res)
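A minimal invocation sketch for the function above; the log file path and the module name "aaa" are illustrative assumptions taken from the docstring, not from the original code.

with open('logs/aaa.log') as f:          # hypothetical input file
    raw_lines = f.readlines()
metrics_json = test_raw_xgb(raw_lines, 'aaa', time_interval=300, is_bin=False)
print(metrics_json)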
Example #4
 def _spacey_metatree_random_walk(
         self,
         root_node,
         walk_times=0):  # metapath, multi-metapath, metagraph
     """Spacey random walk guided by a meta-tree, restarting at `root_node`."""
     root_type = self._net.get_node_type(root_node)
     if root_type not in self._metatree_type_id_dict:
         # The root's type is absent from the meta-tree: re-root the walk.
         root_type = random.choice(list(self._metatree_type_id_dict.keys()))
         root_node = random.choice(self._nodes_type_dict[root_type][0])
     root_id = random.choice(self._metatree_type_id_dict[root_type])
     context_nodes_dict = {}  # type -> [context_list, except_set]
     if self._history_position == "local":
         history = np.ones([len(self._metagraph.nodes())], dtype=np.float64)
     elif self._history_position == "global":
         history = self._history
     for _ in range(walk_times):
         if self._history_position == "local_walktime":
             history = np.ones([len(self._metagraph.nodes())],
                               dtype=np.float64)
         # current node
         cur_node = root_node
         cur_type = root_type
         cur_id = root_id
         for __ in range(self._walk_length):
             # With probability `_walk_restart`, jump back to the root.
             if random.random() < self._walk_restart:
                 cur_node = root_node
                 cur_type = root_type
                 cur_id = root_id
             # Candidate meta-tree successors whose type has real neighbors.
             cur_node_adj_typelist = self._adj_lookupdict[cur_node].keys()
             next_id_list = [
                 v for v in self._metagraph[cur_id] if
                 self._metagraph.nodes[v]["type"] in cur_node_adj_typelist
             ]
             if len(next_id_list) == 0:
                 # Dead end: restart from the root.
                 cur_type = root_type
                 cur_node = root_node
                 cur_id = root_id
             elif len(next_id_list) == 1:
                 cur_id = next_id_list[0]
                 cur_type = self._metagraph.nodes[cur_id]["type"]
                 cur_node = random.choice(
                     self._adj_lookupdict[cur_node][cur_type])
                 history[cur_id] += 1
             else:
                 # Spacey step: weight candidate meta-nodes by occupancy.
                 occupancy = history[next_id_list]
                 cur_id = utils.unigram_sample(population=next_id_list,
                                               size=1,
                                               weight=occupancy)[0]
                 cur_type = self._metagraph.nodes[cur_id]["type"]
                 cur_node = random.choice(
                     self._adj_lookupdict[cur_node][cur_type])
                 history[cur_id] += 1
             # Re-sample the meta-tree id of the current type by occupancy.
             cur_id = utils.unigram_sample(
                 population=self._metatree_type_id_dict[cur_type],
                 size=1,
                 weight=history[self._metatree_type_id_dict[cur_type]])[0]
             if cur_type in context_nodes_dict:
                 context_nodes_dict[cur_type][0].append(
                     cur_node)  # context_list
                 context_nodes_dict[cur_type][1].add(cur_node)  # except_set
             else:
                 context_nodes_dict[cur_type] = [[cur_node],
                                                 {cur_node, root_node}]
     # Pack per-type contexts, negatives, and a presence mask.
     type_context_nodes_list = []
     type_neg_nodes_list = []
     type_mask_list = []
     for k in range(self.node_types_size):
         if k in context_nodes_dict:
             context_nodes = context_nodes_dict[k][0]
             except_set = context_nodes_dict[k][1]
             type_mask_list.append(1)
             type_context_nodes_list.append(context_nodes)
             type_neg_nodes_list.append(
                 utils.neg_sample(self._nodes_type_dict[k][0],
                                  except_set,
                                  num=self._neg_sampled,
                                  alias_table=self._nodes_type_dict[k][1]))
         else:
             type_mask_list.append(0)
             type_context_nodes_list.append([0])
             type_neg_nodes_list.append([0])
     return root_node, type_context_nodes_list, type_mask_list, type_neg_nodes_list
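The occupancy-weighted choice above (also used in Example #5) is what makes the walk "spacey": the next meta-node is drawn in proportion to how often each candidate has been visited. utils.unigram_sample is not shown; a plausible sketch under that reading, using cumulative-sum inversion:

import random

def unigram_sample(population, size=1, weight=None):
    # Hypothetical sketch of utils.unigram_sample: draw `size` items from
    # `population` with probability proportional to `weight` (here, the
    # occupancy history of each candidate).
    total = float(sum(weight))
    picks = []
    for _ in range(size):
        r = random.random() * total
        acc = 0.0
        for item, w in zip(population, weight):
            acc += w
            if acc >= r:
                picks.append(item)
                break
    return picks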
Example #5
 def _spacey_metaschema_random_walk(self, root_node, walk_times=0):
     """Spacey random walk over the network schema, restarting at `root_node`."""
     root_type = self._net.get_node_type(root_node)
     context_nodes_dict = {}  # type -> [context_list, except_set]
     if self._history_position == "local":
         history = np.ones([self.node_types_size], dtype=np.float64)
     elif self._history_position == "global":
         history = self._history
     for _ in range(walk_times):
         if self._history_position == "local_walktime":
             history = np.ones([self.node_types_size], dtype=np.float64)
         # current node
         cur_node = root_node
         cur_type = root_type
         for __ in range(self._walk_length):
             # With probability `_walk_restart`, jump back to the root.
             if random.random() < self._walk_restart:
                 cur_node = root_node
                 cur_type = root_type
             next_type_list = list(self._adj_lookupdict[cur_node].keys())
             if len(next_type_list) == 0:
                 # Dead end: restart from the root.
                 cur_type = root_type
                 cur_node = root_node
             elif len(next_type_list) == 1:
                 cur_type = next_type_list[0]
                 cur_node = random.choice(
                     self._adj_lookupdict[cur_node][cur_type])
                 history[cur_type] += 1
             else:
                 # Spacey step: choose the next type by its occupancy so far.
                 occupancy = history[next_type_list]
                 cur_type = utils.unigram_sample(population=next_type_list,
                                                 size=1,
                                                 weight=occupancy)[0]
                 cur_node = random.choice(
                     self._adj_lookupdict[cur_node][cur_type])
                 history[cur_type] += 1
             if cur_type in context_nodes_dict:
                 context_nodes_dict[cur_type][0].append(
                     cur_node)  # context_list
                 context_nodes_dict[cur_type][1].add(cur_node)  # except_set
             else:
                 context_nodes_dict[cur_type] = [[cur_node],
                                                 {cur_node, root_node}]
     # Pack per-type contexts, negatives, and a presence mask.
     type_context_nodes_list = []
     type_neg_nodes_list = []
     type_mask_list = []
     for k in range(self.node_types_size):
         if k in context_nodes_dict:
             context_nodes = context_nodes_dict[k][0]
             except_set = context_nodes_dict[k][1]
             type_mask_list.append(1)
             type_context_nodes_list.append(context_nodes)
             type_neg_nodes_list.append(
                 utils.neg_sample(self._nodes_type_dict[k][0],
                                  except_set,
                                  num=self._neg_sampled,
                                  alias_table=self._nodes_type_dict[k][1]))
         else:
             type_mask_list.append(0)
             type_context_nodes_list.append([0])
             type_neg_nodes_list.append([0])
     return root_node, type_context_nodes_list, type_mask_list, type_neg_nodes_list
Example #6
    def __getitem__(self, index):

        sequence = self.part_sequence[index]  # pos_items
        # sample neg item for every masked item
        masked_item_sequence = []
        neg_items = []
        # Masked Item Prediction
        item_set = set(sequence)
        for item in sequence[:-1]:
            prob = random.random()
            if prob < self.args.mask_p:
                masked_item_sequence.append(self.args.mask_id)
                neg_items.append(neg_sample(item_set, self.args.item_size))
            else:
                masked_item_sequence.append(item)
                neg_items.append(item)

        # add mask at the last position
        masked_item_sequence.append(self.args.mask_id)
        neg_items.append(neg_sample(item_set, self.args.item_size))

        # Segment Prediction
        if len(sequence) < 2:
            masked_segment_sequence = sequence
            pos_segment = sequence
            neg_segment = sequence
        else:
            sample_length = random.randint(1, len(sequence) // 2)
            start_id = random.randint(0, len(sequence) - sample_length)
            neg_start_id = random.randint(
                0,
                len(self.long_sequence) - sample_length)
            pos_segment = sequence[start_id:start_id + sample_length]
            neg_segment = self.long_sequence[neg_start_id:neg_start_id +
                                             sample_length]
            masked_segment_sequence = sequence[:start_id] + [
                self.args.mask_id
            ] * sample_length + sequence[start_id + sample_length:]
            pos_segment = [self.args.mask_id] * start_id + pos_segment + [
                self.args.mask_id
            ] * (len(sequence) - (start_id + sample_length))
            neg_segment = [self.args.mask_id] * start_id + neg_segment + [
                self.args.mask_id
            ] * (len(sequence) - (start_id + sample_length))

        assert len(masked_segment_sequence) == len(sequence)
        assert len(pos_segment) == len(sequence)
        assert len(neg_segment) == len(sequence)

        # padding sequence
        pad_len = self.max_len - len(sequence)
        masked_item_sequence = [0] * pad_len + masked_item_sequence
        pos_items = [0] * pad_len + sequence
        neg_items = [0] * pad_len + neg_items
        masked_segment_sequence = [0] * pad_len + masked_segment_sequence
        pos_segment = [0] * pad_len + pos_segment
        neg_segment = [0] * pad_len + neg_segment

        masked_item_sequence = masked_item_sequence[-self.max_len:]
        pos_items = pos_items[-self.max_len:]
        neg_items = neg_items[-self.max_len:]

        masked_segment_sequence = masked_segment_sequence[-self.max_len:]
        pos_segment = pos_segment[-self.max_len:]
        neg_segment = neg_segment[-self.max_len:]

        # Associated Attribute Prediction
        # Masked Attribute Prediction
        attributes = []
        for item in pos_items:
            attribute = [0] * self.args.attribute_size
            try:
                now_attribute = self.args.item2attribute[str(item)]
                for a in now_attribute:
                    attribute[a] = 1
            except KeyError:
                # Padding id 0 and unseen items have no attributes.
                pass
            attributes.append(attribute)

        assert len(attributes) == self.max_len
        assert len(masked_item_sequence) == self.max_len
        assert len(pos_items) == self.max_len
        assert len(neg_items) == self.max_len
        assert len(masked_segment_sequence) == self.max_len
        assert len(pos_segment) == self.max_len
        assert len(neg_segment) == self.max_len

        cur_tensors = (
            torch.tensor(attributes, dtype=torch.long),
            torch.tensor(masked_item_sequence, dtype=torch.long),
            torch.tensor(pos_items, dtype=torch.long),
            torch.tensor(neg_items, dtype=torch.long),
            torch.tensor(masked_segment_sequence, dtype=torch.long),
            torch.tensor(pos_segment, dtype=torch.long),
            torch.tensor(neg_segment, dtype=torch.long),
        )
        return cur_tensors
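In this example and the next, neg_sample is a standalone helper (not the utils.neg_sample of Examples #1-#5) returning a single item id absent from the user's sequence. Sequential-recommendation codebases commonly implement it roughly as below, though the version imported here may differ:

import random

def neg_sample(item_set, item_size):
    # Resample until the id falls outside the user's interacted set.
    # Ids start at 1 because 0 is reserved for padding.
    item = random.randint(1, item_size - 1)
    while item in item_set:
        item = random.randint(1, item_size - 1)
    return item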
Example #7
    def __getitem__(self, index):

        user_id = index
        items = self.user_seq[index]

        assert self.data_type in {"train", "valid", "test"}

        # [0, 1, 2, 3, 4, 5, 6]
        # train [0, 1, 2, 3]
        # target [1, 2, 3, 4]

        # valid [0, 1, 2, 3, 4]
        # answer [5]

        # test [0, 1, 2, 3, 4, 5]
        # answer [6]
        if self.data_type == "train":
            input_ids = items[:-3]
            target_pos = items[1:-2]
            answer = [0]  # unused during training

        elif self.data_type == 'valid':
            input_ids = items[:-2]
            target_pos = items[1:-1]
            answer = [items[-2]]

        else:
            input_ids = items[:-1]
            target_pos = items[1:]
            answer = [items[-1]]

        target_neg = []
        seq_set = set(items)
        for _ in input_ids:
            target_neg.append(neg_sample(seq_set, self.args.item_size))

        pad_len = self.max_len - len(input_ids)
        input_ids = [0] * pad_len + input_ids
        target_pos = [0] * pad_len + target_pos
        target_neg = [0] * pad_len + target_neg

        input_ids = input_ids[-self.max_len:]
        target_pos = target_pos[-self.max_len:]
        target_neg = target_neg[-self.max_len:]

        assert len(input_ids) == self.max_len
        assert len(target_pos) == self.max_len
        assert len(target_neg) == self.max_len

        if self.test_neg_items is not None:
            test_samples = self.test_neg_items[index]

            cur_tensors = (
                torch.tensor(user_id, dtype=torch.long),  # user_id for testing
                torch.tensor(input_ids, dtype=torch.long),
                torch.tensor(target_pos, dtype=torch.long),
                torch.tensor(target_neg, dtype=torch.long),
                torch.tensor(answer, dtype=torch.long),
                torch.tensor(test_samples, dtype=torch.long),
            )
        else:
            cur_tensors = (
                torch.tensor(user_id, dtype=torch.long),  # user_id for testing
                torch.tensor(input_ids, dtype=torch.long),
                torch.tensor(target_pos, dtype=torch.long),
                torch.tensor(target_neg, dtype=torch.long),
                torch.tensor(answer, dtype=torch.long),
            )

        return cur_tensors
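A sketch of how such a dataset is typically wired into training; the class name SASRecDataset and the surrounding variables are illustrative assumptions, not names from the excerpt:

from torch.utils.data import DataLoader

# Hypothetical wiring around the __getitem__ above.
train_dataset = SASRecDataset(args, user_seq, data_type='train')
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
for user_id, input_ids, target_pos, target_neg, answer in train_loader:
    # input_ids, target_pos, target_neg are [batch_size, max_len] tensors
    break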