Example #1
    def __init__(self,
                 edge_index: torch.Tensor,
                 sizes: List[int],
                 node_idx: Optional[torch.Tensor] = None,
                 num_nodes: Optional[int] = None,
                 flow: str = "source_to_target",
                 **kwargs):

        N = int(edge_index.max() + 1) if num_nodes is None else num_nodes
        edge_attr = torch.arange(edge_index.size(1))
        adj = SparseTensor(row=edge_index[0],
                           col=edge_index[1],
                           value=edge_attr,
                           sparse_sizes=(N, N),
                           is_sorted=False)
        adj = adj.t() if flow == 'source_to_target' else adj
        self.adj = adj.to('cpu')

        if node_idx is None:
            node_idx = torch.arange(N)
        elif node_idx.dtype == torch.bool:
            node_idx = node_idx.nonzero().view(-1)

        self.sizes = sizes
        self.flow = flow
        assert self.flow in ['source_to_target', 'target_to_source']

        super(NeighborSampler, self).__init__(node_idx.tolist(),
                                              collate_fn=self.sample,
                                              **kwargs)
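
One detail worth noting above: storing torch.arange(edge_index.size(1)) as the SparseTensor value tags every edge with its position in edge_index, so any sub-adjacency sampled later can be mapped back to original edges. A minimal sketch on a toy 3-node graph (illustrative, not part of the original example):

import torch
from torch_sparse import SparseTensor

edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])
e_id = torch.arange(edge_index.size(1))            # one id per edge: 0, 1, 2
adj = SparseTensor(row=edge_index[0], col=edge_index[1], value=e_id,
                   sparse_sizes=(3, 3))
row, col, e_id_of_entry = adj.coo()                # value column recovers edge ids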
Example #2
    def init_adj(self, edge_index):
        """ cache normalized adjacency and normalized strict two-hop adjacency,
        neither has self loops
        """
        n = self.num_nodes
        
        if isinstance(edge_index, SparseTensor):
            adj_t = edge_index
            dev = adj_t.device
            adj_t = scipy.sparse.csr_matrix(adj_t.to_scipy())
            adj_t[adj_t > 0] = 1
            adj_t[adj_t < 0] = 0
            adj_t = SparseTensor.from_scipy(adj_t).to(dev)
        elif isinstance(edge_index, torch.Tensor):
            row, col = edge_index
            adj_t = SparseTensor(row=col, col=row, value=None, sparse_sizes=(n, n))

        adj_t = adj_t.remove_diag(0)
        adj_t2 = matmul(adj_t, adj_t)
        adj_t2 = adj_t2.remove_diag(0)
        adj_t = scipy.sparse.csr_matrix(adj_t.to_scipy())
        adj_t2 = scipy.sparse.csr_matrix(adj_t2.to_scipy())
        adj_t2 = adj_t2 - adj_t
        adj_t2[adj_t2 > 0] = 1
        adj_t2[adj_t2 < 0] = 0

        adj_t = SparseTensor.from_scipy(adj_t)
        adj_t2 = SparseTensor.from_scipy(adj_t2)
        
        adj_t = gcn_norm(adj_t, None, n, add_self_loops=False)
        adj_t2 = gcn_norm(adj_t2, None, n, add_self_loops=False)

        self.adj_t = adj_t.to(edge_index.device)
        self.adj_t2 = adj_t2.to(edge_index.device)
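
For intuition, the sparse pipeline above amounts to the following dense computation of a "strict" two-hop adjacency (a toy sketch, not part of the original code): reach in two hops, drop self loops, and exclude direct one-hop neighbours before normalizing.

import torch

A = torch.tensor([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])
A2 = (A @ A > 0).float()       # reachable in exactly two steps
A2.fill_diagonal_(0)           # remove self loops from back-and-forth walks
A2 = ((A2 - A) > 0).float()    # strict: subtract direct one-hop edges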
Example #3
    def __init__(self, edge_index_dict, embedding_dim, metapath, walk_length,
                 context_size, walks_per_node=1, num_negative_samples=1,
                 num_nodes_dict=None, sparse=False):
        super(MetaPath2Vec, self).__init__()

        if num_nodes_dict is None:
            num_nodes_dict = {}
            for keys, edge_index in edge_index_dict.items():
                key = keys[0]
                N = int(edge_index[0].max() + 1)
                num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

                key = keys[-1]
                N = int(edge_index[1].max() + 1)
                num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

        adj_dict = {}
        for keys, edge_index in edge_index_dict.items():
            sizes = (num_nodes_dict[keys[0]], num_nodes_dict[keys[-1]])
            row, col = edge_index
            adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
            adj = adj.to('cpu')
            adj_dict[keys] = adj

        assert metapath[0][0] == metapath[-1][-1]
        assert walk_length >= context_size

        self.adj_dict = adj_dict
        self.embedding_dim = embedding_dim
        self.metapath = metapath
        self.walk_length = walk_length
        self.context_size = context_size
        self.walks_per_node = walks_per_node
        self.num_negative_samples = num_negative_samples
        self.num_nodes_dict = num_nodes_dict

        types = set([x[0] for x in metapath]) | set([x[-1] for x in metapath])
        types = sorted(list(types))

        count = 0
        self.start, self.end = {}, {}
        for key in types:
            self.start[key] = count
            count += num_nodes_dict[key]
            self.end[key] = count

        offset = [self.start[metapath[0][0]]]
        offset += [self.start[keys[-1]] for keys in metapath
                   ] * int((walk_length / len(metapath)) + 1)
        offset = offset[:walk_length + 1]
        assert len(offset) == walk_length + 1
        self.offset = torch.tensor(offset)

        self.embedding = Embedding(count, embedding_dim, sparse=sparse)

        self.reset_parameters()
Example #4
    def __init__(self,
                 edge_index_dict,
                 embedding_dim,
                 metapath,
                 walk_length,
                 context_size,
                 num_nodes_dict,
                 types,
                 type_accs,
                 walks_per_node=1,
                 num_negative_samples=1,
                 sparse=False):
        super(MetaPath2Vec, self).__init__()

        adj_dict = {}
        for keys, edge_index in edge_index_dict.items():
            sizes = (num_nodes_dict[keys[0]], num_nodes_dict[keys[-1]])
            row, col = edge_index
            row, col = row - type_accs[keys[0]], col - type_accs[keys[-1]]
            adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
            adj = adj.to('cpu')
            adj_dict[keys] = adj

        assert metapath[0][0] == metapath[-1][-1]
        assert walk_length >= context_size

        self.adj_dict = adj_dict
        self.embedding_dim = embedding_dim
        self.metapath = metapath
        self.walk_length = walk_length
        self.context_size = context_size
        self.walks_per_node = walks_per_node
        self.num_negative_samples = num_negative_samples
        self.num_nodes_dict = num_nodes_dict

        count = 0
        self.start, self.end = {}, {}
        for key in types:
            self.start[key] = count
            count += num_nodes_dict[key]
            self.end[key] = count

        offset = [self.start[metapath[0][0]]]
        offset += [self.start[keys[-1]] for keys in metapath
                   ] * int((walk_length / len(metapath)) + 1)
        offset = offset[:walk_length + 1]
        assert len(offset) == walk_length + 1
        self.offset = torch.tensor(offset)

        self.embedding = Embedding(count, embedding_dim, sparse=sparse)

        self.reset_parameters()
Example #5
def to_sparse_gpu(data):
    (row, col), N = data.edge_index, data.num_nodes
    perm = (col * N + row).argsort()
    row, col = row[perm], col[perm]
    value = torch.ones(data.edge_index.shape[1])
    adj_t = SparseTensor(
        row=col, col=row, value=value, sparse_sizes=(N, N), is_sorted=True
    )

    # Pre-process some important attributes.
    adj_t.storage.rowptr()
    adj_t.storage.csr2csc()
    return adj_t.to(torch.float32).to("cuda")
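
A minimal usage sketch for the helper above, assuming a small torch_geometric.data.Data object and an available CUDA device (toy values, not from the source):

import torch
from torch_geometric.data import Data

edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]])
data = Data(edge_index=edge_index, num_nodes=3)
adj_t = to_sparse_gpu(data)    # transposed adjacency with cached CSR/CSC, on the GPU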
Example #6
class RandomWalk():
    def __init__(self,
                 edge_index,
                 walk_length,
                 context_size,
                 walks_per_node=1,
                 p=1,
                 q=1,
                 num_negative_samples=1,
                 num_nodes=None,
                 sparse=False):

        if random_walk is None:
            raise ImportError('`Node2Vec` requires `torch-cluster`.')

        N = maybe_num_nodes(edge_index, num_nodes)
        row, col = edge_index
        self.adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
        self.adj = self.adj.to('cpu')

        assert walk_length >= context_size

        self.walk_length = walk_length - 1
        self.context_size = context_size
        self.walks_per_node = walks_per_node
        self.p = p
        self.q = q
        self.num_negative_samples = num_negative_samples

    def loader(self, **kwargs):
        return DataLoader(range(self.adj.sparse_size(0)),
                          collate_fn=self.sample,
                          **kwargs)

    def sample(self, batch):
        if not isinstance(batch, torch.Tensor):
            batch = torch.tensor(batch)
        batch = batch.repeat(self.walks_per_node)
        rowptr, col, _ = self.adj.csr()
        rw = random_walk(rowptr, col, batch, self.walk_length, self.p, self.q)
        if not isinstance(rw, torch.Tensor):
            rw = rw[0]
        walks = []
        num_walks_per_rw = 1 + self.walk_length + 1 - self.context_size
        for j in range(num_walks_per_rw):
            for i in range(1, self.context_size):
                walks.append(rw[:, [j, j + i]])
        return torch.cat(walks, dim=0)
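
The nested loop in sample() turns each random walk into skip-gram style (anchor, context) pairs. A minimal sketch on a single toy walk with context_size = 3 (illustrative values only):

import torch

rw = torch.tensor([[0, 1, 2, 3, 4]])   # one walk of five nodes
context_size = 3
pairs = []
for j in range(rw.size(1) - context_size + 1):
    for i in range(1, context_size):
        pairs.append(rw[:, [j, j + i]])
torch.cat(pairs, dim=0)                # pairs: (0,1) (0,2) (1,2) (1,3) (2,3) (2,4)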
Example #7
    def __init__(self, edge_index: torch.Tensor, sizes: List[int],
                 split_idx=None,
                 node_idx: Optional[torch.Tensor] = None,
                 num_nodes: Optional[int] = None,
                 flow: str = "source_to_target", 
                 prune=False, prune_set='train',
                 prune_type='adaptive',
                 **kwargs):

        self.N = N = int(edge_index.max() + 1) if num_nodes is None else num_nodes
        edge_attr = torch.arange(edge_index.size(1))
        adj = SparseTensor(row=edge_index[0], col=edge_index[1],
                           value=edge_attr, sparse_sizes=(N, N),
                           is_sorted=False)
        adj = adj.t() if flow == 'source_to_target' else adj
        self.adj = adj.to('cpu')

        if node_idx is None:
            node_idx = torch.arange(N)
        elif node_idx.dtype == torch.bool:
            node_idx = node_idx.nonzero(as_tuple=False).view(-1)

        self.sizes = sizes
        self.flow = flow
        assert self.flow in ['source_to_target', 'target_to_source']

        super(NeighborSampler, self).__init__(node_idx.tolist(),
                                              collate_fn=self.sample, **kwargs)
        
        if prune:
            self.split_idx = split_idx
            self.E = edge_index.size(1)
            self.edge_index = edge_index
            if prune_set == 'train':
                self.train_idx = self.split_idx['train']
            else:
                self.train_idx = torch.cat([self.split_idx['train'], self.split_idx['valid']])
            subadj, _ = self.adj.saint_subgraph(self.train_idx)
            # subadj = self.adj.to_dense()[self.train_idx][:,self.train_idx].view(-1)
            _,_,e_idx = subadj.coo()
            self.train_e_idx = e_idx.squeeze().long()
            self.train_edge_index = self.edge_index[:, self.train_e_idx] 
            self.rest_idx = torch.cat([self.split_idx['valid'], self.split_idx['test']])
            subadj2, _ = self.adj.saint_subgraph(self.rest_idx)
            _,_,rest_e_idx = subadj2.coo()
            self.rest_e_idx = rest_e_idx.squeeze().long()
Example #8
    print('Reading adjacency matrix...', end=' ', flush=True)
    path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
    if osp.exists(path):
        adj_t = torch.load(path)
    else:
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        adj_t = adj_t.to_symmetric()
        torch.save(adj_t, path)
    adj_t = gcn_norm(adj_t, add_self_loops=False)
    if args.low_memory:
        adj_t = adj_t.to(torch.half)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    train_idx = dataset.get_idx_split('train')
    valid_idx = dataset.get_idx_split('valid')
    test_idx = dataset.get_idx_split('test')

    y_train = torch.from_numpy(dataset.paper_label[train_idx]).to(torch.long)
    y_valid = torch.from_numpy(dataset.paper_label[valid_idx]).to(torch.long)

    model = LabelPropagation(args.num_layers, args.alpha)

    N, C = dataset.num_papers, dataset.num_classes

    t = time.perf_counter()
    print('Propagating labels...', end=' ', flush=True)
Example #9
class Node2Vec(torch.nn.Module):
    r"""The Node2Vec model from the
    `"node2vec: Scalable Feature Learning for Networks"
    <https://arxiv.org/abs/1607.00653>`_ paper where random walks of
    length :obj:`walk_length` are sampled in a given graph, and node embeddings
    are learned via negative sampling optimization.

    .. note::

        For an example of using Node2Vec, see `examples/node2vec.py
        <https://github.com/pyg-team/pytorch_geometric/blob/master/examples/
        node2vec.py>`_.

    Args:
        edge_index (LongTensor): The edge indices.
        embedding_dim (int): The size of each embedding vector.
        walk_length (int): The walk length.
        context_size (int): The actual context size which is considered for
            positive samples. This parameter increases the effective sampling
            rate by reusing samples across different source nodes.
        walks_per_node (int, optional): The number of walks to sample for each
            node. (default: :obj:`1`)
        p (float, optional): Likelihood of immediately revisiting a node in the
            walk. (default: :obj:`1`)
        q (float, optional): Control parameter to interpolate between
            breadth-first strategy and depth-first strategy (default: :obj:`1`)
        num_negative_samples (int, optional): The number of negative samples to
            use for each positive sample. (default: :obj:`1`)
        num_nodes (int, optional): The number of nodes. (default: :obj:`None`)
        sparse (bool, optional): If set to :obj:`True`, gradients w.r.t. to the
            weight matrix will be sparse. (default: :obj:`False`)
    """
    def __init__(self, edge_index, embedding_dim, walk_length, context_size,
                 walks_per_node=1, p=1, q=1, num_negative_samples=1,
                 num_nodes=None, sparse=False):
        super().__init__()

        if random_walk is None:
            raise ImportError('`Node2Vec` requires `torch-cluster`.')

        N = maybe_num_nodes(edge_index, num_nodes)
        row, col = edge_index
        self.adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
        self.adj = self.adj.to('cpu')

        assert walk_length >= context_size

        self.embedding_dim = embedding_dim
        self.walk_length = walk_length - 1
        self.context_size = context_size
        self.walks_per_node = walks_per_node
        self.p = p
        self.q = q
        self.num_negative_samples = num_negative_samples

        self.embedding = Embedding(N, embedding_dim, sparse=sparse)

        self.reset_parameters()

    def reset_parameters(self):
        self.embedding.reset_parameters()

    def forward(self, batch=None):
        """Returns the embeddings for the nodes in :obj:`batch`."""
        emb = self.embedding.weight
        return emb if batch is None else emb.index_select(0, batch)

    def loader(self, **kwargs):
        return DataLoader(range(self.adj.sparse_size(0)),
                          collate_fn=self.sample, **kwargs)

    def pos_sample(self, batch):
        batch = batch.repeat(self.walks_per_node)
        rowptr, col, _ = self.adj.csr()
        rw = random_walk(rowptr, col, batch, self.walk_length, self.p, self.q)
        if not isinstance(rw, torch.Tensor):
            rw = rw[0]

        walks = []
        num_walks_per_rw = 1 + self.walk_length + 1 - self.context_size
        for j in range(num_walks_per_rw):
            walks.append(rw[:, j:j + self.context_size])
        return torch.cat(walks, dim=0)

    def neg_sample(self, batch):
        batch = batch.repeat(self.walks_per_node * self.num_negative_samples)

        rw = torch.randint(self.adj.sparse_size(0),
                           (batch.size(0), self.walk_length))
        rw = torch.cat([batch.view(-1, 1), rw], dim=-1)

        walks = []
        num_walks_per_rw = 1 + self.walk_length + 1 - self.context_size
        for j in range(num_walks_per_rw):
            walks.append(rw[:, j:j + self.context_size])
        return torch.cat(walks, dim=0)

    def sample(self, batch):
        if not isinstance(batch, torch.Tensor):
            batch = torch.tensor(batch)
        return self.pos_sample(batch), self.neg_sample(batch)

    def loss(self, pos_rw, neg_rw):
        r"""Computes the loss given positive and negative random walks."""

        # Positive loss.
        start, rest = pos_rw[:, 0], pos_rw[:, 1:].contiguous()

        h_start = self.embedding(start).view(pos_rw.size(0), 1,
                                             self.embedding_dim)
        h_rest = self.embedding(rest.view(-1)).view(pos_rw.size(0), -1,
                                                    self.embedding_dim)

        out = (h_start * h_rest).sum(dim=-1).view(-1)
        pos_loss = -torch.log(torch.sigmoid(out) + EPS).mean()

        # Negative loss.
        start, rest = neg_rw[:, 0], neg_rw[:, 1:].contiguous()

        h_start = self.embedding(start).view(neg_rw.size(0), 1,
                                             self.embedding_dim)
        h_rest = self.embedding(rest.view(-1)).view(neg_rw.size(0), -1,
                                                    self.embedding_dim)

        out = (h_start * h_rest).sum(dim=-1).view(-1)
        neg_loss = -torch.log(1 - torch.sigmoid(out) + EPS).mean()

        return pos_loss + neg_loss

    def test(self, train_z, train_y, test_z, test_y, solver='lbfgs',
             multi_class='auto', *args, **kwargs):
        r"""Evaluates latent space quality via a logistic regression downstream
        task."""
        from sklearn.linear_model import LogisticRegression

        clf = LogisticRegression(solver=solver, multi_class=multi_class, *args,
                                 **kwargs).fit(train_z.detach().cpu().numpy(),
                                               train_y.detach().cpu().numpy())
        return clf.score(test_z.detach().cpu().numpy(),
                         test_y.detach().cpu().numpy())

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}({self.embedding.weight.size(0)}, '
                f'{self.embedding.weight.size(1)})')
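
A minimal training sketch for the class above, assuming a toy graph and illustrative hyperparameters (torch-cluster must be installed for random_walk):

import torch

edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]])
model = Node2Vec(edge_index, embedding_dim=16, walk_length=5,
                 context_size=3, walks_per_node=2, num_nodes=3)
loader = model.loader(batch_size=2, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for pos_rw, neg_rw in loader:          # one pass over all start nodes
    optimizer.zero_grad()
    loss = model.loss(pos_rw, neg_rw)
    loss.backward()
    optimizer.step()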
Example #10
    def __init__(
        self,
        edge_index_dict: Dict[EdgeType, Tensor],
        embedding_dim: int,
        metapath: List[EdgeType],
        walk_length: int,
        context_size: int,
        walks_per_node: int = 1,
        num_negative_samples: int = 1,
        num_nodes_dict: Optional[Dict[NodeType, int]] = None,
        sparse: bool = False,
    ):
        super().__init__()

        if num_nodes_dict is None:
            num_nodes_dict = {}
            for keys, edge_index in edge_index_dict.items():
                key = keys[0]
                N = int(edge_index[0].max() + 1)
                num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

                key = keys[-1]
                N = int(edge_index[1].max() + 1)
                num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

        adj_dict = {}
        for keys, edge_index in edge_index_dict.items():
            sizes = (num_nodes_dict[keys[0]], num_nodes_dict[keys[-1]])
            row, col = edge_index
            adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
            adj = adj.to('cpu')
            adj_dict[keys] = adj

        assert walk_length + 1 >= context_size
        if walk_length > len(metapath) and metapath[0][0] != metapath[-1][-1]:
            raise AttributeError(
                "The 'walk_length' is longer than the given 'metapath', but "
                "the 'metapath' does not denote a cycle")

        self.adj_dict = adj_dict
        self.embedding_dim = embedding_dim
        self.metapath = metapath
        self.walk_length = walk_length
        self.context_size = context_size
        self.walks_per_node = walks_per_node
        self.num_negative_samples = num_negative_samples
        self.num_nodes_dict = num_nodes_dict

        types = set([x[0] for x in metapath]) | set([x[-1] for x in metapath])
        types = sorted(list(types))

        count = 0
        self.start, self.end = {}, {}
        for key in types:
            self.start[key] = count
            count += num_nodes_dict[key]
            self.end[key] = count

        offset = [self.start[metapath[0][0]]]
        offset += [self.start[keys[-1]] for keys in metapath
                   ] * int((walk_length / len(metapath)) + 1)
        offset = offset[:walk_length + 1]
        assert len(offset) == walk_length + 1
        self.offset = torch.tensor(offset)

        # + 1 denotes a dummy node used to link to for isolated nodes.
        self.embedding = Embedding(count + 1, embedding_dim, sparse=sparse)
        self.dummy_idx = count

        self.reset_parameters()
Example #11
class JOINTSRMFSPARSE(GeneralRecommender):

    input_type = InputType.POINTWISE

    def __init__(self, config, dataset):
        super(JOINTSRMFSPARSE, self).__init__(config, dataset)
        # load dataset info
        self.LABEL = config['LABEL_FIELD']

        self.embedding_dim = config['embedding_dimension']
        self.alpha = config["alpha_item"]
        item_description_fields = config['item_description_fields']
        max_number_of_reviews = config['number_of_reviews_to_use_item']
        self.variant = config["debug_variant"]

        self.logger.info(f"embedding_dimension = {self.embedding_dim}")
        self.logger.info(f"alpha = {self.alpha}")
        self.logger.info(
            f"item_description_fields = {item_description_fields}")

        self.user_embedding = nn.Embedding(self.n_users, self.embedding_dim)
        self.item_embedding = nn.Embedding(self.n_items, self.embedding_dim)
        self.user_bias = nn.Parameter(torch.zeros(self.n_users))
        self.item_bias = nn.Parameter(torch.zeros(self.n_items))
        self.bias = nn.Parameter(torch.zeros(1))
        self.apply(self._init_weights)

        gensim_cache = open('gensim_cache_path', 'r').read().strip()
        os.environ['GENSIM_DATA_DIR'] = str(gensim_cache)
        import gensim
        import gensim.downloader as api
        # pretrained_embedding_name = "conceptnet-numberbatch-17-06-300"
        pretrained_embedding_name = "glove-wiki-gigaword-50"  # because the size must be 50 the same as the embedding
        model_path = api.load(pretrained_embedding_name, return_path=True)
        model = gensim.models.KeyedVectors.load_word2vec_format(model_path)
        weights = torch.FloatTensor(
            model.vectors)  # formerly syn0, which is soon deprecated
        self.logger.info(f"pretrained_embedding shape: {weights.shape}")
        self.word_embedding = nn.Embedding.from_pretrained(weights,
                                                           freeze=True)
        self.vocab_size = len(model.key_to_index)

        s = time.time()
        item_lms = {}
        item_lm_len = {}

        item_desc_fields = []
        if "item_description" in item_description_fields:
            item_desc_fields.append(3)
        if "item_genres" in item_description_fields:
            item_desc_fields.append(4)
        if "tags" in item_description_fields:
            item_desc_fields.append(4)
        if len(item_desc_fields) > 0:
            item_LM_file = os.path.join(
                dataset.dataset.dataset_path,
                f"{dataset.dataset.dataset_name}.item")
            with open(item_LM_file, 'r') as infile:
                next(infile)
                for line in infile:
                    split = line.split("\t")
                    item_id = dataset.token2id_exists("item_id", split[0])
                    if item_id == -1:
                        continue
                    if item_id == 0:
                        print("Isnt that padding?")
                    if item_id not in item_lms:
                        item_lms[item_id] = {}
                        item_lm_len[item_id] = 0
                    for fi in item_desc_fields:
                        if fi >= len(split):
                            print(split)
                            continue
                        desc = split[fi]
                        for term in desc.split():
                            if term in model.key_to_index:
                                wv_term_index = model.key_to_index[term]
                                if wv_term_index not in item_lms[item_id]:
                                    item_lms[item_id][wv_term_index] = 1
                                else:
                                    item_lms[item_id][wv_term_index] += 1
                                item_lm_len[item_id] += 1
        # Do reviews as well
        # inter: user_id:token   item_id:token   rating:float    review:token_seq
        num_of_used_revs = {}
        if "review" in item_description_fields:
            # first we want to only load reviews that are in the training set so we specify those:
            training_set = {}
            for i in range(len(dataset.dataset.inter_feat["user_id"])):
                uid = int(dataset.dataset.inter_feat["user_id"][i])
                iid = int(dataset.dataset.inter_feat["item_id"][i])
                if uid not in training_set:
                    training_set[uid] = set()
                training_set[uid].add(iid)
            item_desc_fields = [3]
            item_LM_file = os.path.join(
                dataset.dataset.dataset_path,
                f"{dataset.dataset.dataset_name}.inter")
            with open(item_LM_file, 'r') as infile:
                next(infile)
                for line in infile:
                    split = line.split("\t")
                    user_id = dataset.token2id_exists("user_id", split[0])
                    item_id = dataset.token2id_exists("item_id", split[1])
                    if item_id == -1 or user_id == -1:
                        continue
                    if item_id == 0 or user_id == 0:
                        print("Isnt that padding?")
                    if user_id not in training_set:
                        continue
                    if item_id not in training_set[user_id]:
                        continue
                    if item_id not in num_of_used_revs:
                        num_of_used_revs[item_id] = 0
                    if max_number_of_reviews is not None and num_of_used_revs[
                            item_id] >= max_number_of_reviews:
                        continue
                    if item_id not in item_lms:
                        item_lms[item_id] = {}
                        item_lm_len[item_id] = 0
                    for fi in item_desc_fields:
                        desc = split[fi]
                        if len(desc) > 1:
                            num_of_used_revs[item_id] += 1
                        for term in desc.split():
                            if term in model.key_to_index:
                                wv_term_index = model.key_to_index[term]
                                if wv_term_index not in item_lms[item_id]:
                                    item_lms[item_id][wv_term_index] = 1
                                else:
                                    item_lms[item_id][wv_term_index] += 1
                                item_lm_len[item_id] += 1
        indices = [[0], [0]]
        values = [0]
        for item_id in item_lms.keys():
            for k, v in item_lms[item_id].items():
                indices[0].append(item_id)
                indices[1].append(k)
                values.append(v / item_lm_len[item_id])
        self.lm_gt = SparseTensor(row=torch.tensor(indices[0],
                                                   dtype=torch.long),
                                  col=torch.tensor(indices[1],
                                                   dtype=torch.long),
                                  value=torch.tensor(values),
                                  sparse_sizes=(self.n_items,
                                                len(model.key_to_index)))
        if self.variant == 1:
            self.lm_gt = self.lm_gt.to(self.device)
        e = time.time()
        self.logger.info(f"{e - s}s")
        self.logger.info(f"Done with lm_gt construction!")

        self.sigmoid = nn.Sigmoid()
        self.loss_rec = nn.BCELoss()
        self.loss_lm = SoftCrossEntropyLoss()

    def _init_weights(self, module):
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.01)

    @staticmethod
    def get_entries(array, keys):
        ret = []
        for k in keys:
            ret.append(array[k])
        return ret

    def forward_rec(self, user, item):
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        pred = torch.sum(torch.mul(user_emb, item_emb), dim=1)
        pred = pred + self.item_bias[item] + self.user_bias[user]
        pred = pred + self.bias
        pred = self.sigmoid(pred)
        return pred

    def forward_lm(self, item):
        item_emb = self.item_embedding(item)
        pred = torch.matmul(item_emb, self.word_embedding.weight.T)
        return pred

    def calculate_loss(self, interaction):
        user = interaction[self.USER_ID]
        item = interaction[self.ITEM_ID]
        label = interaction[self.LABEL]

        with profiler.record_function("REC output and loss"):
            output_rec = self.forward_rec(user, item)
            loss_rec = self.loss_rec(output_rec, label)

        with profiler.record_function("LM output"):
            output_lm = self.forward_lm(item)

        if self.variant == 3:
            label_lm = self.lm_gt[item].to_dense().to(self.device)

        if self.variant == 2:
            label_lm = self.lm_gt[item].to(self.device).to_dense()

        if self.variant == 1:
            with profiler.record_function("LM making label on GPU"):
                label_lm = self.lm_gt[item].to_dense()
                # label_lm = torch.zeros(len(item), self.vocab_size, device=self.device)
                # for i in range(len(item)):
                #     item_id = item[i]
                #     label_lm[i] = self.lm_gt[item_id].to_dense()

        with profiler.record_function("LM loss"):
            loss_lm = self.loss_lm(output_lm, label_lm)

        return loss_rec, self.alpha * loss_lm

    def predict(self, interaction):
        user = interaction[self.USER_ID]
        item = interaction[self.ITEM_ID]
        output = self.forward_rec(user, item)
        return output
Example #12
class BM25vec(GeneralRecommender):

    input_type = InputType.POINTWISE
    type = ModelType.TRADITIONAL

    def __init__(self, config, dataset):
        super(BM25vec, self).__init__(config, dataset)

        rec_model = config["saved_rec_model"]
        topk = config["inferred_lm_topk_w"]
        item_description_fields = config['item_description_fields']
        max_number_of_reviews_item_lm = config['number_of_reviews_to_use_item']
        user_profile_fields = config['user_profile_fields']
        max_number_of_reviews_user_lm = config['number_of_reviews_to_use_user']
        self.k1 = config["k1"]
        self.b = config["b"]
        self.c = config["c"]
        self.use_sparse = config["use_sparse"]
        step = config["rec_model_load_step"]
        if step is None:
            step = 200000

        if rec_model is None and item_description_fields is None:
            print("Should specify rec_model or item_description_fields")
            exit(-1)
        elif rec_model is not None and item_description_fields is not None:
            print("Give either rec_model or item_description_fields, not both")
            exit(-1)
        if user_profile_fields is None:
            print("user_profile_fields should be given!")
            exit(-1)
        self.n_items = dataset.item_num
        self.n_users = dataset.user_num

        # load background idf
        print("Loading background corpus")
        s = time.time()
        background_idf_temp = {}
        jnius_config.set_classpath(get_fat_jar())
        indexcorpus = open('background_corpus_path', 'r').read().strip()
        from jnius import autoclass
        JFile = autoclass("java.io.File")
        JFSDirectory = autoclass("org.apache.lucene.store.FSDirectory")
        fsdir = JFSDirectory.open(JFile(indexcorpus).toPath())
        reader = autoclass("org.apache.lucene.index.DirectoryReader").open(
            fsdir)
        numdocs = reader.numDocs()
        JTerm = autoclass("org.apache.lucene.index.Term")
        # numterms = self.reader.getSumTotalTermFreq("contents")
        print(f"done {time.time()-s}")

        # create query/user LM:
        print("Creating user lm")
        s = time.time()
        uid_term_frequencies = {}
        self.uid_len = {}
        # self.uid_termprobs = {}
        num_of_used_revs = {}
        if "review" in user_profile_fields:
            # first we want to only load reviews that are in the training set so we specify those:
            training_set = {}
            for i in range(len(dataset.dataset.inter_feat["user_id"])):
                uid = int(dataset.dataset.inter_feat["user_id"][i])
                iid = int(dataset.dataset.inter_feat["item_id"][i])
                if uid not in training_set:
                    training_set[uid] = set()
                training_set[uid].add(iid)
            user_fields = [3]
            inter_file = os.path.join(dataset.dataset.dataset_path,
                                      f"{dataset.dataset.dataset_name}.inter")
            with open(inter_file, 'r') as infile:
                next(infile)
                for line in infile:
                    split = line.split("\t")
                    user_id = dataset.token2id_exists("user_id", split[0])
                    item_id = dataset.token2id_exists("item_id", split[1])
                    if item_id == -1 or user_id == -1:
                        continue
                    if item_id == 0 or user_id == 0:
                        print("Isnt that padding?")
                    if user_id not in training_set:
                        continue
                    if item_id not in training_set[user_id]:
                        continue
                    if user_id not in num_of_used_revs:
                        num_of_used_revs[user_id] = 0
                    if max_number_of_reviews_user_lm is not None and num_of_used_revs[
                            user_id] >= max_number_of_reviews_user_lm:
                        continue
                    if user_id not in uid_term_frequencies:
                        uid_term_frequencies[user_id] = {}
                        self.uid_len[user_id] = 0
                    for fi in user_fields:
                        desc = split[fi]
                        if len(desc) > 1:
                            num_of_used_revs[user_id] += 1
                        for term in desc.split():
                            if term not in uid_term_frequencies[user_id]:
                                uid_term_frequencies[user_id][term] = 1
                            else:
                                uid_term_frequencies[user_id][term] += 1
                            self.uid_len[user_id] += 1
                            # bg idf:
                            if term not in background_idf_temp:
                                jterm = JTerm("contents", term)
                                df = reader.docFreq(jterm)
                                background_idf_temp[term] = np.log10(
                                    (numdocs - df + 0.5) / (df + 0.5))
            # for user_id in self.uid_term_frequencies.keys():
            #     self.uid_termprobs[user_id] = {k: (v/self.uid_len[user_id]) for k, v in self.uid_term_frequencies[user_id]}
        # TODO extend this for KITT users... from ..user files

        self.term_idx = {}
        self.background_idf = torch.zeros(len(background_idf_temp.keys()))
        idx = 0
        for t, idf in background_idf_temp.items():
            self.term_idx[t] = idx
            self.background_idf[idx] = idf
            idx += 1
        self.background_idf = self.background_idf.to(self.device)

        if self.use_sparse:
            indices = [[0], [0]]
            values = [0]
            for user_id in uid_term_frequencies.keys():
                for t, v in uid_term_frequencies[user_id].items():
                    indices[0].append(user_id)
                    indices[1].append(self.term_idx[t])
                    values.append(v)
            self.uid_term_frequencies = SparseTensor(
                row=torch.tensor(indices[0], dtype=torch.long),
                col=torch.tensor(indices[1], dtype=torch.long),
                value=torch.tensor(values),
                sparse_sizes=(self.n_users, len(self.background_idf)))
            self.uid_term_frequencies = self.uid_term_frequencies.to(self.device)
        else:
            self.uid_term_frequencies = torch.zeros(
                (self.n_users, len(self.background_idf)), device=self.device)
            for user in uid_term_frequencies:
                for t, v in uid_term_frequencies[user].items():
                    self.uid_term_frequencies[user][self.term_idx[t]] = v
        print(f"done {time.time()-s}")

        # item lm:
        doc_tf = {}
        self.doc_len = torch.zeros(self.n_items, device=self.device)
        # create item LM (inferred):
        if rec_model is not None:
            print("Creating inferred item lm")
            s = time.time()
            checkpoint_file = REC_MODELS[rec_model]["checkpoint_file"]
            model_name = REC_MODELS[rec_model]["model_name"]
            dataset_name = REC_MODELS[rec_model]["dataset_name"]
            config_dict = REC_MODELS[rec_model]["config_dict"]
            rec_model = ItemLM(checkpoint_file,
                               model_name,
                               dataset_name,
                               k=topk,
                               step=step,
                               config_dict=config_dict)
            inferred_lm = rec_model.get_lm()
            for i in range(1, len(dataset.dataset.item_feat)):
                item_id = dataset.dataset.item_feat["item_id"][i]
                item_url_rid = dataset.dataset.item_feat["item_url"][i]
                item_url = dataset.id2token("item_url", item_url_rid)
                if item_url in inferred_lm:
                    doc_tf[item_id] = inferred_lm[item_url]
                    # doc_tf[item_id] = {inferred_lm[item_url][0][j]: inferred_lm[item_url][1][j] for j in range(len(inferred_lm[item_url][0]))}
                    self.doc_len[item_id] = sum(inferred_lm[item_url][1])
                else:
                    doc_tf[item_id] = ([], [])
                    self.doc_len[item_id] = 0
            print(f"{time.time() - s}")
            if self.use_sparse:
                indices = [[0], [0]]
                values = [0]
                for item_id in doc_tf.keys():
                    for i in range(len(doc_tf[item_id][0])):
                        t = doc_tf[item_id][0][i]
                        v = doc_tf[item_id][1][i]
                        if t in self.term_idx:
                            indices[0].append(item_id)
                            indices[1].append(self.term_idx[t])
                            values.append(v)
                self.doc_tf = SparseTensor(
                    row=torch.tensor(indices[0], dtype=torch.long),
                    col=torch.tensor(indices[1], dtype=torch.long),
                    value=torch.tensor(values),
                    sparse_sizes=(self.n_items, len(self.background_idf)))
                self.doc_tf = self.doc_tf.to(self.device)
            else:
                self.doc_tf = torch.zeros(
                    (self.n_items, len(self.background_idf)),
                    device=self.device)
                for item in doc_tf:
                    for i in range(len(doc_tf[item][0])):
                        t = doc_tf[item][0][i]
                        v = doc_tf[item][1][i]
                        if t in self.term_idx:
                            self.doc_tf[item][self.term_idx[t]] = v
        # OR create item LM statistical:
        elif item_description_fields is not None:
            print("Creating item lm")
            item_desc_fields = []
            if "item_description" in item_description_fields:
                item_desc_fields.append(3)
            if "item_genres" in item_description_fields:
                item_desc_fields.append(4)
            if "tags" in item_description_fields:
                item_desc_fields.append(4)
            if len(item_desc_fields) > 0:
                item_LM_file = os.path.join(
                    dataset.dataset.dataset_path,
                    f"{dataset.dataset.dataset_name}.item")
                with open(item_LM_file, 'r') as infile:
                    next(infile)
                    for line in infile:
                        split = line.split("\t")
                        item_id = dataset.token2id_exists("item_id", split[0])
                        if item_id == -1:
                            print(item_id)
                            continue
                        if item_id == 0:
                            print("Isnt that padding?")
                        if item_id not in doc_tf:
                            doc_tf[item_id] = {}
                            self.doc_len[item_id] = 0
                        for fi in item_desc_fields:
                            if fi >= len(split):
                                print(split)
                                continue
                            desc = split[fi]
                            for term in desc.split():
                                if term not in doc_tf[item_id]:
                                    doc_tf[item_id][term] = 1
                                else:
                                    doc_tf[item_id][term] += 1
                                self.doc_len[item_id] += 1
            num_of_used_revs = {}
            if "review" in item_description_fields:
                # first we want to only load reviews that are in the training set so we specify those:
                training_set = {}
                for i in range(len(dataset.dataset.inter_feat["user_id"])):
                    uid = int(dataset.dataset.inter_feat["user_id"][i])
                    iid = int(dataset.dataset.inter_feat["item_id"][i])
                    if uid not in training_set:
                        training_set[uid] = set()
                    training_set[uid].add(iid)
                item_desc_fields = [3]
                item_LM_file = os.path.join(
                    dataset.dataset.dataset_path,
                    f"{dataset.dataset.dataset_name}.inter")
                with open(item_LM_file, 'r') as infile:
                    next(infile)
                    for line in infile:
                        split = line.split("\t")
                        user_id = dataset.token2id_exists("user_id", split[0])
                        item_id = dataset.token2id_exists("item_id", split[1])
                        if item_id == -1 or user_id == -1:
                            continue
                        if item_id == 0 or user_id == 0:
                            print("Isnt that padding?")
                        if user_id not in training_set:
                            continue
                        if item_id not in training_set[user_id]:
                            continue
                        if item_id not in num_of_used_revs:
                            num_of_used_revs[item_id] = 0
                        if max_number_of_reviews_item_lm is not None and num_of_used_revs[
                                item_id] >= max_number_of_reviews_item_lm:
                            continue
                        if item_id not in doc_tf:
                            doc_tf[item_id] = {}
                            self.doc_len[item_id] = 0
                        for fi in item_desc_fields:
                            desc = split[fi]
                            if len(desc) > 1:
                                num_of_used_revs[item_id] += 1
                            for term in desc.split():
                                if term not in doc_tf[item_id]:
                                    doc_tf[item_id][term] = 1
                                else:
                                    doc_tf[item_id][term] += 1
                                self.doc_len[item_id] += 1
            if self.use_sparse:
                indices = [[0], [0]]
                values = [0]
                for item_id in doc_tf.keys():
                    for t, v in doc_tf[item_id].items():
                        if t in self.term_idx:
                            indices[0].append(item_id)
                            indices[1].append(self.term_idx[t])
                            values.append(v)
                self.doc_tf = SparseTensor(
                    row=torch.tensor(indices[0], dtype=torch.long),
                    col=torch.tensor(indices[1], dtype=torch.long),
                    value=torch.tensor(values),
                    sparse_sizes=(self.n_items, len(self.background_idf)))
                self.doc_tf = self.doc_tf.to(self.device)
            else:
                self.doc_tf = torch.zeros(
                    (self.n_items, len(self.background_idf)),
                    device=self.device)
                for item in doc_tf:
                    for t, v in doc_tf[item].items():
                        if t in self.term_idx:
                            self.doc_tf[item][self.term_idx[t]] = v

        self.average_doc_len = self.doc_len.sum() / self.doc_len.shape[0]
        print(self.average_doc_len)
        print(f"done {time.time()-s}")

        self.fake_loss = torch.nn.Parameter(
            torch.zeros(1))  # just define a dummy placeholder parameter (why?)

    def calculate_loss(self, interaction):
        return torch.nn.Parameter(torch.zeros(1))

    def predict(self, interaction):
        users = interaction[self.USER_ID]
        items = interaction[self.ITEM_ID]
        # return a list of scores wrt the user item pairs
        if self.use_sparse:
            try:
                doctf = self.doc_tf[items].to_dense()
            except:
                print(items)
                print(self.n_items)
                print(self.doc_tf[items])
                exit(-1)
            qtf = self.uid_term_frequencies[users].to_dense()
        else:
            doctf = self.doc_tf[items]
            qtf = self.uid_term_frequencies[users]
        numerator = doctf * (self.k1 + 1)
        t = self.k1 * (1 - self.b + self.b *
                       (self.doc_len[items] / self.average_doc_len))
        t = t.unsqueeze(1)
        denominator = doctf + t
        doctf = numerator / denominator
        if self.c is not None:
            qtf = (qtf * (self.c + 1)) / (qtf + self.c)
        ret = self.background_idf * doctf * qtf
        return ret.sum(1)
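
predict() is a vectorized BM25-style score: the item-side term frequencies are saturated and length-normalized with k1 and b, the user-side frequencies are optionally saturated with c, and both are weighted by the background IDF. A per-term scalar sketch of the same computation (function and parameter names are illustrative):

def bm25_term_score(idf, tf, qtf, dl, avgdl, k1=1.2, b=0.75, c=None):
    # item-side term-frequency saturation with document-length normalization
    tf_part = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / avgdl))
    # optional saturation of the user-profile (query-side) term frequency
    if c is not None:
        qtf = (qtf * (c + 1)) / (qtf + c)
    return idf * tf_part * qtf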
Example #13
        # full graph testing
        val_adj_t = SparseTensor(
            row=edge_index[0, edge_dates < args.test_year],
            col=edge_index[1, edge_dates < args.test_year],
            value=torch.ones((edge_dates < args.test_year).sum(),
                             dtype=torch.float),
            sparse_sizes=(len(node_classes), len(node_classes)),
        )
        val_edge_types = get_edge_types(val_adj_t.storage.row(),
                                        val_adj_t.storage.col(), node_classes)
        model.eval()
        with torch.no_grad():
            z = model.encode(
                full_graph.feats.to(args.device),
                val_adj_t.to(args.device),
                val_edge_types,
            )
            auc, ap = gae.test(
                z,
                model.decoder,
                1,
                pos_test[1].to(args.device),
                neg_test[1].to(args.device),
            )
            mlflow.log_metric("Chosen model test AUC GD", auc)
            mlflow.log_metric("Chosen model test AP GD", ap)
            z = model.encode(
                full_graph.feats.to(args.device),
                full_graph.train_adj_t.to(args.device),
                full_graph.train_edge_types,