Esempio n. 1
0
    def __init__(self, args):
        """Build the SBM dataset: edges, edge classes and random node features.

        Args:
            args: configuration object (the parameters parsed from the YAML
                file). ``args.task`` must be ``'link_pred'``. ``args.sbm_args``
                is replaced in place by a ``u.Namespace`` wrapping it and must
                provide ``aggr_time`` and ``feats_per_node`` (plus whatever
                ``self.load_edges`` reads).

        Raises:
            AssertionError: if ``args.task`` is not ``'link_pred'``.
        """
        assert args.task in ['link_pred'], 'sbm only implements link_pred'

        # Meaning of each column of the edge tensor.
        self.ecols = u.Namespace({
            'FromNodeId': 0,
            'ToNodeId': 1,
            'Weight': 2,
            'TimeStep': 3
        })
        args.sbm_args = u.Namespace(args.sbm_args)
        sbm_cfg = args.sbm_args
        time_col = self.ecols.TimeStep
        weight_col = self.ecols.Weight

        # Build the edge data structure; rows are
        # [source, target, weight, time].
        edge_table = self.load_edges(sbm_cfg)

        # Replace the raw time column by its aggregated version and remember
        # the resulting time range.
        aggregated = u.aggregate_by_time(edge_table[:, time_col],
                                         sbm_cfg.aggr_time)
        self.max_time = aggregated.max()
        self.min_time = aggregated.min()
        print('TIME', self.max_time, self.min_time)
        edge_table[:, time_col] = aggregated

        # The weight column is rewritten by cluster_negs_and_positives; the
        # number of distinct values left in it is the number of classes.
        edge_table[:, weight_col] = self.cluster_negs_and_positives(
            edge_table[:, weight_col])
        self.num_classes = edge_table[:, weight_col].unique().size(0)

        self.edges = self.edges_to_sp_dict(edge_table)

        # Random node features.
        self.num_nodes = int(self.get_num_nodes(edge_table))
        self.feats_per_node = sbm_cfg.feats_per_node
        self.nodes_feats = torch.rand((self.num_nodes, self.feats_per_node))

        # Number of (source, target) slots minus the number of edge rows.
        self.num_non_existing = self.num_nodes**2 - edge_table.size(0)
Esempio n. 2
0
    def load_edges(self, args, tar_archive):
        """Read the UC-IRC edge list from ``tar_archive`` and symmetrize it.

        Besides returning the edges, this sets ``self.num_nodes``,
        ``self.num_non_existing``, ``self.max_time`` and ``self.min_time``.

        Args:
            args: configuration; ``args.uc_irc_args.edges_file`` and
                ``args.uc_irc_args.aggr_time`` are read.
            tar_archive: archive object forwarded to ``u.load_data_from_tar``.

        Returns:
            dict: ``'idx'`` holds rows of ``[source, target, time]`` and
            ``'vals'`` is a tensor of ones with one entry per row.
        """
        cols = u.Namespace({'source': 0, 'target': 1, 'weight': 2, 'time': 3})
        endpoints = [cols.source, cols.target]
        uc_cfg = args.uc_irc_args

        table = u.load_data_from_tar(uc_cfg.edges_file,
                                     tar_archive,
                                     starting_line=2,
                                     sep=' ').long()

        # Node ids in the file start at 1 and are already contiguous, so the
        # largest raw id equals the node count. Take it before shifting the
        # ids down so that the first id becomes 0.
        self.num_nodes = int(table[:, endpoints].max())
        table[:, endpoints] -= 1

        # Append every edge in the opposite direction (symmetric graph).
        mirrored = table[:, [cols.target, cols.source, cols.weight, cols.time]]
        table = torch.cat([table, mirrored], dim=0)

        table[:, cols.time] = u.aggregate_by_time(table[:, cols.time],
                                                  uc_cfg.aggr_time)

        # Encode each (source, target) pair as one integer so distinct pairs
        # can be counted regardless of time step.
        pair_ids = table[:, cols.source] * self.num_nodes + table[:, cols.target]
        distinct_pairs = pair_ids.unique().size(0)
        self.num_non_existing = float(self.num_nodes**2 - distinct_pairs)

        time_values = table[:, cols.time]
        self.max_time = time_values.max()
        self.min_time = time_values.min()

        idx = table[:, [cols.source, cols.target, cols.time]]
        return {'idx': idx, 'vals': torch.ones(idx.size(0))}
Esempio n. 3
0
    def load_edges(self, args, tar_archive):
        """Load one edge list per archive member and merge them into one tensor.

        Every member of ``tar_archive`` is read as a two-column edge list, is
        tagged with the time step ``self.times_from_names`` assigns to its
        name, and is symmetrized. Node ids are then relabelled to consecutive
        integers, late time steps are dropped and the time column is
        aggregated. Also sets ``self.num_nodes``, ``self.num_non_existing``,
        ``self.max_time`` and ``self.min_time``.

        Args:
            args: configuration; ``args.aut_sys_args.steps_accounted`` and
                ``args.aut_sys_args.aggr_time`` are read.
            tar_archive: archive object providing ``getnames()`` and accepted
                by ``u.load_data_from_tar``.

        Returns:
            dict: ``'idx'`` holds rows of ``[source, target, time]`` and
            ``'vals'`` is a tensor of ones with one entry per row.
        """
        member_names = tar_archive.getnames()
        name_to_step = self.times_from_names(member_names)

        cols = u.Namespace({'source': 0, 'target': 1, 'time': 2})
        endpoints = [cols.source, cols.target]

        def load_snapshot(member):
            # Edges of a single archive member, with a time column appended
            # and the reversed copy of every edge stacked underneath.
            pairs = u.load_data_from_tar(member,
                                         tar_archive,
                                         starting_line=4,
                                         sep='\t',
                                         type_fn=int,
                                         tensor_const=torch.LongTensor)
            step_col = torch.zeros(pairs.size(0), 1,
                                   dtype=torch.long) + name_to_step[member]
            timed = torch.cat([pairs, step_col], dim=1)
            flipped = timed[:, [cols.target, cols.source, cols.time]]
            return torch.cat([timed, flipped])

        edges = torch.cat([load_snapshot(member) for member in member_names])

        # Relabel node ids with their position among the sorted unique ids.
        _, relabelled = edges[:, endpoints].unique(return_inverse=True)
        edges[:, endpoints] = relabelled

        # Use only the first `steps_accounted` time steps.
        keep = edges[:, cols.time] < args.aut_sys_args.steps_accounted
        edges = edges[keep, :]

        # Time aggregation.
        edges[:, cols.time] = u.aggregate_by_time(edges[:, cols.time],
                                                  args.aut_sys_args.aggr_time)

        self.num_nodes = int(edges[:, endpoints].max() + 1)

        # Encode each (source, target) pair as one integer so distinct pairs
        # can be counted regardless of time step.
        pair_ids = edges[:, cols.source] * self.num_nodes + edges[:, cols.target]
        self.num_non_existing = float(self.num_nodes**2 -
                                      pair_ids.unique().size(0))

        time_values = edges[:, cols.time]
        self.max_time = time_values.max()
        self.min_time = time_values.min()

        return {'idx': edges, 'vals': torch.ones(edges.size(0))}
Esempio n. 4
0
    def __init__(self, args):
        """Build the bitcoin dataset: undirected, time-stamped, binary-labelled edges.

        Sets ``self.ecols``, ``self.max_time``, ``self.min_time``,
        ``self.edges``, ``self.num_nodes`` and ``self.num_classes``.
        ``self.edges['idx']`` has rows ``[source, target, time, label]`` with
        label 1 when positive edges outnumber negative ones in that cell and
        0 otherwise; ``self.edges['vals']`` is the total number of edges in
        the cell.

        Args:
            args: configuration object. ``args.task`` must be ``'link_pred'``
                or ``'edge_cls'``. ``args.bitcoin_args`` is replaced in place
                by a ``u.Namespace`` wrapping it and must provide
                ``aggr_time`` (plus whatever ``self.load_edges`` reads).

        Raises:
            AssertionError: if ``args.task`` is not one of the supported tasks.
        """
        assert args.task in ['link_pred', 'edge_cls'], 'bitcoin only implements link_pred or edge_cls'

        # Meaning of each column of the edge tensor.
        self.ecols = u.Namespace({'FromNodeId': 0,
                                  'ToNodeId': 1,
                                  'Weight': 2,
                                  'TimeStep': 3})
        args.bitcoin_args = u.Namespace(args.bitcoin_args)
        btc_cfg = args.bitcoin_args

        src_col = self.ecols.FromNodeId
        dst_col = self.ecols.ToNodeId
        weight_col = self.ecols.Weight
        time_col = self.ecols.TimeStep

        # Build the edge data structure.
        edge_table = self.load_edges(btc_cfg)
        edge_table = self.make_contigous_node_ids(edge_table)
        num_nodes = edge_table[:, [src_col, dst_col]].unique().size(0)

        aggregated = u.aggregate_by_time(edge_table[:, time_col],
                                         btc_cfg.aggr_time)
        self.max_time = aggregated.max()
        self.min_time = aggregated.min()
        edge_table[:, time_col] = aggregated

        # The code below expects the weight column to hold -1 / 1 after this.
        edge_table[:, weight_col] = self.cluster_negs_and_positives(
            edge_table[:, weight_col])

        # Add the reversed link to make the graph undirected.
        flipped = edge_table[:, [dst_col, src_col, weight_col, time_col]]
        edge_table = torch.cat([edge_table, flipped])

        # Separate classes.
        coords = edge_table[:, [src_col, dst_col, time_col]].t()
        signs = edge_table[:, weight_col]
        dense_shape = torch.Size([num_nodes, num_nodes, self.max_time + 1])

        def summed_edges_with_sign(sign):
            # Sparse (source, target, time) tensor over the edges whose weight
            # equals `sign`; coalescing sums the weights of duplicate cells.
            mask = signs == sign
            return torch.sparse.LongTensor(coords[:, mask],
                                           signs[mask],
                                           dense_shape).coalesce()

        neg_sp_edges = summed_edges_with_sign(-1)
        pos_sp_edges = summed_edges_with_sign(1)

        # Scale the positive class so both counts can be separated again
        # after they have been combined into one value.
        pos_sp_edges *= 1000

        # Negative sums are <= 0, so subtracting them keeps every value
        # positive: value = 1000 * n_pos + n_neg.
        sp_edges = (pos_sp_edges - neg_sp_edges).coalesce()

        # Unpack the two counts per (source, target, time) cell. This only
        # works while a cell has fewer than 1000 negative edges.
        packed = sp_edges._values()
        neg_counts = packed % 1000
        pos_counts = packed // 1000

        # Majority vote: label 1 if positives outnumber negatives, else 0.
        # Entries not selected by the mask keep the 0 they were created with.
        majority = pos_counts - neg_counts
        edge_labels = torch.zeros(majority.size(0), dtype=torch.long)
        edge_labels[majority > 0] = 1
        indices_labels = torch.cat([sp_edges._indices().t(),
                                    edge_labels.view(-1, 1)], dim=1)

        # The edge weight is simply the number of edges between two entities
        # at each time step, regardless of sign.
        edge_counts = pos_counts + neg_counts

        self.edges = {'idx': indices_labels, 'vals': edge_counts}
        self.num_nodes = num_nodes
        self.num_classes = 2
Esempio n. 5
0
    def __init__(self, args):
        """Load the Reddit node features and binary-labelled, time-stamped edges.

        Sets ``self.edges``, ``self.num_classes``, ``self.feats_per_node``,
        ``self.num_nodes``, ``self.nodes_feats``, ``self.max_time`` and
        ``self.min_time``. ``self.edges['idx']`` has rows
        ``[source, target, time, label]`` with label 1 when positive edges
        outnumber negative ones in that cell and 0 otherwise;
        ``self.edges['vals']`` is the total number of edges in the cell.

        Args:
            args: configuration object. ``args.reddit_args`` is replaced in
                place by a ``u.Namespace`` wrapping it and must provide
                ``folder``, ``nodes_file``, ``title_edges_file``,
                ``body_edges_file`` and ``aggr_time``.

        Raises:
            Exception: with message ``'duplicate_id'`` if a node id occurs
                more than once in the nodes file.
        """
        args.reddit_args = u.Namespace(args.reddit_args)
        reddit_cfg = args.reddit_args
        folder = reddit_cfg.folder

        # Load nodes: one comma-separated line per node, the first field is
        # the string id and the remaining fields are its float features.
        nodes_path = os.path.join(folder, reddit_cfg.nodes_file)
        with open(nodes_path) as nodes_fh:
            node_lines = nodes_fh.read().splitlines()

        ids_str_to_int = {}
        feats = []
        for line in node_lines:
            fields = line.split(',')
            nd_id = fields[0]
            if nd_id in ids_str_to_int:
                print('duplicate id', nd_id)
                raise Exception('duplicate_id')
            # Integer ids are handed out in order of first appearance.
            ids_str_to_int[nd_id] = len(ids_str_to_int)
            feats.append([float(r) for r in fields[1:]])

        feats = torch.tensor(feats, dtype=torch.float)
        num_nodes = feats.size(0)

        # Load the edges found in titles, then the ones found in bodies. The
        # second value returned by load_edges_from_file is not needed here.
        edges = []
        for edges_file in (reddit_cfg.title_edges_file,
                           reddit_cfg.body_edges_file):
            file_edges, _ = self.load_edges_from_file(edges_file,
                                                      folder,
                                                      ids_str_to_int)
            edges.extend(file_edges)

        # Time aggregation (original note: min time should be 0 afterwards).
        # Column 2 is the time step, column 3 the -1 / 1 edge label and
        # columns 0-1 index the two node dimensions of the sparse tensors.
        edges = torch.LongTensor(edges)
        edges[:, 2] = u.aggregate_by_time(edges[:, 2], reddit_cfg.aggr_time)
        max_time = edges[:, 2].max()

        # Separate classes.
        sp_indices = edges[:, :3].t()
        sp_values = edges[:, 3]
        dense_shape = torch.Size([num_nodes, num_nodes, max_time + 1])

        def summed_edges_with_label(label):
            # Sparse (node, node, time) tensor over the edges carrying
            # `label`; coalescing sums the labels of duplicate cells.
            mask = sp_values == label
            return torch.sparse.LongTensor(sp_indices[:, mask],
                                           sp_values[mask],
                                           dense_shape).coalesce()

        neg_sp_edges = summed_edges_with_label(-1)
        pos_sp_edges = summed_edges_with_label(1)

        # Scale the positive class so both counts can be separated again
        # after they have been combined into one value.
        pos_sp_edges *= 1000

        # Negative sums are <= 0, so subtracting them keeps every value
        # positive: value = 1000 * n_pos + n_neg.
        sp_edges = (pos_sp_edges - neg_sp_edges).coalesce()

        # Unpack the two counts per cell. This only works while a cell has
        # fewer than 1000 negative edges.
        packed = sp_edges._values()
        neg_vals = packed % 1000
        pos_vals = packed // 1000

        # Majority vote: label 1 if positives outnumber negatives, else 0.
        # Entries not selected by the mask keep the 0 they were created with.
        majority = pos_vals - neg_vals
        new_vals = torch.zeros(majority.size(0), dtype=torch.long)
        new_vals[majority > 0] = 1
        indices_labels = torch.cat([sp_edges._indices().t(),
                                    new_vals.view(-1, 1)], dim=1)

        # vals is simply the number of edges between two nodes at the same
        # time step, regardless of the edge label.
        vals = pos_vals + neg_vals

        self.edges = {'idx': indices_labels, 'vals': vals}
        self.num_classes = 2
        self.feats_per_node = feats.size(1)
        self.num_nodes = num_nodes
        self.nodes_feats = feats
        self.max_time = max_time
        self.min_time = 0