def __init__(self, edge_index: torch.Tensor, sizes: List[int],
             node_idx: Optional[torch.Tensor] = None,
             num_nodes: Optional[int] = None,
             flow: str = "source_to_target", **kwargs):

    N = int(edge_index.max() + 1) if num_nodes is None else num_nodes
    edge_attr = torch.arange(edge_index.size(1))
    adj = SparseTensor(row=edge_index[0], col=edge_index[1],
                       value=edge_attr, sparse_sizes=(N, N),
                       is_sorted=False)
    adj = adj.t() if flow == 'source_to_target' else adj
    self.adj = adj.to('cpu')

    if node_idx is None:
        node_idx = torch.arange(N)
    elif node_idx.dtype == torch.bool:
        node_idx = node_idx.nonzero().view(-1)

    self.sizes = sizes
    self.flow = flow
    assert self.flow in ['source_to_target', 'target_to_source']

    super(NeighborSampler, self).__init__(node_idx.tolist(),
                                          collate_fn=self.sample, **kwargs)
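# Usage sketch (illustrative, not part of the original source): the sampler
# above is built on torch.utils.data.DataLoader with `collate_fn=self.sample`,
# so it can be iterated like any DataLoader. `data` and `train_idx` are
# placeholders, and the loop assumes `sample()` follows the usual PyG
# convention of returning (batch_size, n_id, adjs).
train_loader = NeighborSampler(data.edge_index, sizes=[15, 10],
                               node_idx=train_idx, num_nodes=data.num_nodes,
                               batch_size=1024, shuffle=True)
for batch_size, n_id, adjs in train_loader:
    # `n_id` holds the global ids of all sampled nodes; `adjs` holds one
    # bipartite adjacency per layer, from the outermost to the innermost hop.
    pass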
def init_adj(self, edge_index):
    """Cache the normalized adjacency and the normalized strict two-hop
    adjacency; neither has self loops."""
    n = self.num_nodes

    if isinstance(edge_index, SparseTensor):
        # Binarize the given sparse adjacency via a scipy round-trip.
        adj_t = edge_index
        dev = adj_t.device
        adj_t = scipy.sparse.csr_matrix(adj_t.to_scipy())
        adj_t[adj_t > 0] = 1
        adj_t[adj_t < 0] = 0
        adj_t = SparseTensor.from_scipy(adj_t).to(dev)
    elif isinstance(edge_index, torch.Tensor):
        row, col = edge_index
        adj_t = SparseTensor(row=col, col=row, value=None, sparse_sizes=(n, n))

    # `remove_diag` returns a new SparseTensor, so the result must be
    # re-assigned.
    adj_t = adj_t.remove_diag(0)
    adj_t2 = matmul(adj_t, adj_t)
    adj_t2 = adj_t2.remove_diag(0)

    # Strict two-hop neighborhood: two-hop pairs minus direct neighbors.
    adj_t = scipy.sparse.csr_matrix(adj_t.to_scipy())
    adj_t2 = scipy.sparse.csr_matrix(adj_t2.to_scipy())
    adj_t2 = adj_t2 - adj_t
    adj_t2[adj_t2 > 0] = 1
    adj_t2[adj_t2 < 0] = 0
    adj_t = SparseTensor.from_scipy(adj_t)
    adj_t2 = SparseTensor.from_scipy(adj_t2)

    adj_t = gcn_norm(adj_t, None, n, add_self_loops=False)
    adj_t2 = gcn_norm(adj_t2, None, n, add_self_loops=False)

    self.adj_t = adj_t.to(edge_index.device)
    self.adj_t2 = adj_t2.to(edge_index.device)
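# Illustrative check of the strict two-hop construction above, using scipy
# only (the toy matrix is made up for this example). For a path graph
# 0 - 1 - 2 - 3, node 0's strict two-hop neighbor is node 2 but not node 1.
import numpy as np
import scipy.sparse

A = scipy.sparse.csr_matrix(np.array([
    [0, 1, 0, 0],
    [1, 0, 1, 0],
    [0, 1, 0, 1],
    [0, 0, 1, 0],
], dtype=np.float32))
A2 = A @ A            # walks of length two (includes walks back to the start)
A2.setdiag(0)         # drop self loops, as remove_diag does above
A2 = A2 - A           # strict: remove pairs that are already direct neighbors
A2 = (A2 > 0).astype(np.float32)
print(A2.toarray()[0])  # -> [0. 0. 1. 0.]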
def __init__(self, edge_index_dict, embedding_dim, metapath, walk_length,
             context_size, walks_per_node=1, num_negative_samples=1,
             num_nodes_dict=None, sparse=False):
    super(MetaPath2Vec, self).__init__()

    if num_nodes_dict is None:
        num_nodes_dict = {}
        for keys, edge_index in edge_index_dict.items():
            key = keys[0]
            N = int(edge_index[0].max() + 1)
            num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

            key = keys[-1]
            N = int(edge_index[1].max() + 1)
            num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

    adj_dict = {}
    for keys, edge_index in edge_index_dict.items():
        sizes = (num_nodes_dict[keys[0]], num_nodes_dict[keys[-1]])
        row, col = edge_index
        adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
        adj = adj.to('cpu')
        adj_dict[keys] = adj

    assert metapath[0][0] == metapath[-1][-1]
    assert walk_length >= context_size

    self.adj_dict = adj_dict
    self.embedding_dim = embedding_dim
    self.metapath = metapath
    self.walk_length = walk_length
    self.context_size = context_size
    self.walks_per_node = walks_per_node
    self.num_negative_samples = num_negative_samples
    self.num_nodes_dict = num_nodes_dict

    types = set([x[0] for x in metapath]) | set([x[-1] for x in metapath])
    types = sorted(list(types))

    count = 0
    self.start, self.end = {}, {}
    for key in types:
        self.start[key] = count
        count += num_nodes_dict[key]
        self.end[key] = count

    offset = [self.start[metapath[0][0]]]
    offset += [self.start[keys[-1]] for keys in metapath
               ] * int((walk_length / len(metapath)) + 1)
    offset = offset[:walk_length + 1]
    assert len(offset) == walk_length + 1
    self.offset = torch.tensor(offset)

    self.embedding = Embedding(count, embedding_dim, sparse=sparse)

    self.reset_parameters()
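# Worked example (illustrative values) of the `start`/`offset` bookkeeping
# above: all node types share one embedding matrix, so typed node ids are
# shifted into a single global id range before embedding lookup.
#
#   metapath       = [('author', 'writes', 'paper'),
#                     ('paper', 'written_by', 'author')]
#   num_nodes_dict = {'author': 3, 'paper': 5}
#
# sorted types -> ['author', 'paper'], so start = {'author': 0, 'paper': 3}
# and the embedding table holds 3 + 5 = 8 rows. With walk_length = 4 the
# offset tensor becomes [0, 3, 0, 3, 0]: position i of every walk is shifted
# by the start index of the node type visited at step i.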
def __init__(self, edge_index_dict, embedding_dim, metapath, walk_length,
             context_size, num_nodes_dict, types, type_accs,
             walks_per_node=1, num_negative_samples=1, sparse=False):
    super(MetaPath2Vec, self).__init__()

    adj_dict = {}
    for keys, edge_index in edge_index_dict.items():
        sizes = (num_nodes_dict[keys[0]], num_nodes_dict[keys[-1]])
        row, col = edge_index
        # Node ids are given globally; shift them into per-type local ids.
        row, col = row - type_accs[keys[0]], col - type_accs[keys[-1]]
        adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
        adj = adj.to('cpu')
        adj_dict[keys] = adj

    assert metapath[0][0] == metapath[-1][-1]
    assert walk_length >= context_size

    self.adj_dict = adj_dict
    self.embedding_dim = embedding_dim
    self.metapath = metapath
    self.walk_length = walk_length
    self.context_size = context_size
    self.walks_per_node = walks_per_node
    self.num_negative_samples = num_negative_samples
    self.num_nodes_dict = num_nodes_dict

    count = 0
    self.start, self.end = {}, {}
    for key in types:
        self.start[key] = count
        count += num_nodes_dict[key]
        self.end[key] = count

    offset = [self.start[metapath[0][0]]]
    offset += [self.start[keys[-1]] for keys in metapath
               ] * int((walk_length / len(metapath)) + 1)
    offset = offset[:walk_length + 1]
    assert len(offset) == walk_length + 1
    self.offset = torch.tensor(offset)

    self.embedding = Embedding(count, embedding_dim, sparse=sparse)

    self.reset_parameters()
def to_sparse_gpu(data):
    (row, col), N = data.edge_index, data.num_nodes
    # Sort edges by (col, row) so the transposed adjacency is CSR-sorted.
    perm = (col * N + row).argsort()
    row, col = row[perm], col[perm]
    value = torch.ones(data.edge_index.shape[1])
    adj_t = SparseTensor(row=col, col=row, value=value,
                         sparse_sizes=(N, N), is_sorted=True)

    # Pre-process some important attributes.
    adj_t.storage.rowptr()
    adj_t.storage.csr2csc()

    return adj_t.to(torch.float32).to("cuda")
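# Illustrative use of `to_sparse_gpu` (dataset choice is an assumption and a
# CUDA device is required): convert a PyG `Data` object once, then reuse the
# cached CSR/CSC layouts for repeated sparse-dense products on the GPU.
import torch
from torch_geometric.datasets import Planetoid

data = Planetoid(root='/tmp/Cora', name='Cora')[0]
adj_t = to_sparse_gpu(data)                       # (N, N) SparseTensor on CUDA
x = torch.randn(data.num_nodes, 16, device='cuda')
out = adj_t @ x                                   # sparse-dense matmul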
class RandomWalk():
    def __init__(self, edge_index, walk_length, context_size,
                 walks_per_node=1, p=1, q=1, num_negative_samples=1,
                 num_nodes=None, sparse=False):

        if random_walk is None:
            raise ImportError('`RandomWalk` requires `torch-cluster`.')

        N = maybe_num_nodes(edge_index, num_nodes)
        row, col = edge_index
        self.adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
        self.adj = self.adj.to('cpu')

        assert walk_length >= context_size

        self.walk_length = walk_length - 1
        self.context_size = context_size
        self.walks_per_node = walks_per_node
        self.p = p
        self.q = q
        self.num_negative_samples = num_negative_samples

    def loader(self, **kwargs):
        return DataLoader(range(self.adj.sparse_size(0)),
                          collate_fn=self.sample, **kwargs)

    def sample(self, batch):
        if not isinstance(batch, torch.Tensor):
            batch = torch.tensor(batch)
        batch = batch.repeat(self.walks_per_node)
        rowptr, col, _ = self.adj.csr()
        rw = random_walk(rowptr, col, batch, self.walk_length, self.p, self.q)
        if not isinstance(rw, torch.Tensor):
            rw = rw[0]

        walks = []
        num_walks_per_rw = 1 + self.walk_length + 1 - self.context_size
        for j in range(num_walks_per_rw):
            for i in range(1, self.context_size):
                # Emit (source, context) index pairs instead of full windows.
                walks.append(rw[:, [j, j + i]])
        return torch.cat(walks, dim=0)
def __init__(self, edge_index: torch.Tensor, sizes: List[int], split_idx=None,
             node_idx: Optional[torch.Tensor] = None,
             num_nodes: Optional[int] = None,
             flow: str = "source_to_target", prune=False, prune_set='train',
             prune_type='adaptive', **kwargs):

    self.N = N = int(edge_index.max() + 1) if num_nodes is None else num_nodes
    edge_attr = torch.arange(edge_index.size(1))
    adj = SparseTensor(row=edge_index[0], col=edge_index[1],
                       value=edge_attr, sparse_sizes=(N, N),
                       is_sorted=False)
    adj = adj.t() if flow == 'source_to_target' else adj
    self.adj = adj.to('cpu')

    if node_idx is None:
        node_idx = torch.arange(N)
    elif node_idx.dtype == torch.bool:
        node_idx = node_idx.nonzero(as_tuple=False).view(-1)

    self.sizes = sizes
    self.flow = flow
    assert self.flow in ['source_to_target', 'target_to_source']

    super(NeighborSampler, self).__init__(node_idx.tolist(),
                                          collate_fn=self.sample, **kwargs)

    if prune:
        self.split_idx = split_idx
        self.E = edge_index.size(1)
        self.edge_index = edge_index

        if prune_set == 'train':
            self.train_idx = self.split_idx['train']
        else:
            self.train_idx = torch.cat(
                [self.split_idx['train'], self.split_idx['valid']])

        # Edge ids of the subgraph induced by the training nodes.
        subadj, _ = self.adj.saint_subgraph(self.train_idx)
        # subadj = self.adj.to_dense()[self.train_idx][:, self.train_idx].view(-1)
        _, _, e_idx = subadj.coo()
        self.train_e_idx = e_idx.squeeze().long()
        self.train_edge_index = self.edge_index[:, self.train_e_idx]

        self.rest_idx = torch.cat(
            [self.split_idx['valid'], self.split_idx['test']])
        subadj2, _ = self.adj.saint_subgraph(self.rest_idx)
        _, _, rest_e_idx = subadj2.coo()
        self.rest_e_idx = rest_e_idx.squeeze().long()
print('Reading adjacency matrix...', end=' ', flush=True)
path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
if osp.exists(path):
    adj_t = torch.load(path)
else:
    edge_index = dataset.edge_index('paper', 'cites', 'paper')
    edge_index = torch.from_numpy(edge_index)
    adj_t = SparseTensor(
        row=edge_index[0], col=edge_index[1],
        sparse_sizes=(dataset.num_papers, dataset.num_papers),
        is_sorted=True)
    adj_t = adj_t.to_symmetric()
    torch.save(adj_t, path)
adj_t = gcn_norm(adj_t, add_self_loops=False)
if args.low_memory:
    adj_t = adj_t.to(torch.half)
print(f'Done! [{time.perf_counter() - t:.2f}s]')

train_idx = dataset.get_idx_split('train')
valid_idx = dataset.get_idx_split('valid')
test_idx = dataset.get_idx_split('test')
y_train = torch.from_numpy(dataset.paper_label[train_idx]).to(torch.long)
y_valid = torch.from_numpy(dataset.paper_label[valid_idx]).to(torch.long)

model = LabelPropagation(args.num_layers, args.alpha)

N, C = dataset.num_papers, dataset.num_classes

t = time.perf_counter()
print('Propagating labels...', end=' ', flush=True)
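# Minimal sketch (not the library implementation) of what the propagation set
# up above computes: starting from one-hot training labels, iterate
# y <- (1 - alpha) * y0 + alpha * (A_norm @ y) for `num_layers` steps.
import torch

def propagate_labels(adj_t, train_idx, y_train, num_layers, alpha, N, C):
    y0 = torch.zeros(N, C)
    y0[train_idx] = torch.nn.functional.one_hot(y_train, C).to(torch.float)
    y = y0.clone()
    for _ in range(num_layers):
        y = (1 - alpha) * y0 + alpha * (adj_t @ y)
    return y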
class Node2Vec(torch.nn.Module):
    r"""The Node2Vec model from the
    `"node2vec: Scalable Feature Learning for Networks"
    <https://arxiv.org/abs/1607.00653>`_ paper where random walks of
    length :obj:`walk_length` are sampled in a given graph, and node
    embeddings are learned via negative sampling optimization.

    .. note::

        For an example of using Node2Vec, see `examples/node2vec.py
        <https://github.com/pyg-team/pytorch_geometric/blob/master/examples/
        node2vec.py>`_.

    Args:
        edge_index (LongTensor): The edge indices.
        embedding_dim (int): The size of each embedding vector.
        walk_length (int): The walk length.
        context_size (int): The actual context size which is considered for
            positive samples. This parameter increases the effective sampling
            rate by reusing samples across different source nodes.
        walks_per_node (int, optional): The number of walks to sample for
            each node. (default: :obj:`1`)
        p (float, optional): Likelihood of immediately revisiting a node in
            the walk. (default: :obj:`1`)
        q (float, optional): Control parameter to interpolate between
            breadth-first strategy and depth-first strategy (default: :obj:`1`)
        num_negative_samples (int, optional): The number of negative samples
            to use for each positive sample. (default: :obj:`1`)
        num_nodes (int, optional): The number of nodes. (default: :obj:`None`)
        sparse (bool, optional): If set to :obj:`True`, gradients w.r.t. to
            the weight matrix will be sparse. (default: :obj:`False`)
    """
    def __init__(self, edge_index, embedding_dim, walk_length, context_size,
                 walks_per_node=1, p=1, q=1, num_negative_samples=1,
                 num_nodes=None, sparse=False):
        super().__init__()

        if random_walk is None:
            raise ImportError('`Node2Vec` requires `torch-cluster`.')

        N = maybe_num_nodes(edge_index, num_nodes)
        row, col = edge_index
        self.adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
        self.adj = self.adj.to('cpu')

        assert walk_length >= context_size

        self.embedding_dim = embedding_dim
        self.walk_length = walk_length - 1
        self.context_size = context_size
        self.walks_per_node = walks_per_node
        self.p = p
        self.q = q
        self.num_negative_samples = num_negative_samples

        self.embedding = Embedding(N, embedding_dim, sparse=sparse)

        self.reset_parameters()

    def reset_parameters(self):
        self.embedding.reset_parameters()

    def forward(self, batch=None):
        """Returns the embeddings for the nodes in :obj:`batch`."""
        emb = self.embedding.weight
        return emb if batch is None else emb.index_select(0, batch)

    def loader(self, **kwargs):
        return DataLoader(range(self.adj.sparse_size(0)),
                          collate_fn=self.sample, **kwargs)

    def pos_sample(self, batch):
        batch = batch.repeat(self.walks_per_node)
        rowptr, col, _ = self.adj.csr()
        rw = random_walk(rowptr, col, batch, self.walk_length, self.p, self.q)
        if not isinstance(rw, torch.Tensor):
            rw = rw[0]

        walks = []
        num_walks_per_rw = 1 + self.walk_length + 1 - self.context_size
        for j in range(num_walks_per_rw):
            walks.append(rw[:, j:j + self.context_size])
        return torch.cat(walks, dim=0)

    def neg_sample(self, batch):
        batch = batch.repeat(self.walks_per_node * self.num_negative_samples)

        rw = torch.randint(self.adj.sparse_size(0),
                           (batch.size(0), self.walk_length))
        rw = torch.cat([batch.view(-1, 1), rw], dim=-1)

        walks = []
        num_walks_per_rw = 1 + self.walk_length + 1 - self.context_size
        for j in range(num_walks_per_rw):
            walks.append(rw[:, j:j + self.context_size])
        return torch.cat(walks, dim=0)

    def sample(self, batch):
        if not isinstance(batch, torch.Tensor):
            batch = torch.tensor(batch)
        return self.pos_sample(batch), self.neg_sample(batch)

    def loss(self, pos_rw, neg_rw):
        r"""Computes the loss given positive and negative random walks."""

        # Positive loss.
        start, rest = pos_rw[:, 0], pos_rw[:, 1:].contiguous()

        h_start = self.embedding(start).view(pos_rw.size(0), 1,
                                             self.embedding_dim)
        h_rest = self.embedding(rest.view(-1)).view(pos_rw.size(0), -1,
                                                    self.embedding_dim)

        out = (h_start * h_rest).sum(dim=-1).view(-1)
        pos_loss = -torch.log(torch.sigmoid(out) + EPS).mean()

        # Negative loss.
        start, rest = neg_rw[:, 0], neg_rw[:, 1:].contiguous()

        h_start = self.embedding(start).view(neg_rw.size(0), 1,
                                             self.embedding_dim)
        h_rest = self.embedding(rest.view(-1)).view(neg_rw.size(0), -1,
                                                    self.embedding_dim)

        out = (h_start * h_rest).sum(dim=-1).view(-1)
        neg_loss = -torch.log(1 - torch.sigmoid(out) + EPS).mean()

        return pos_loss + neg_loss

    def test(self, train_z, train_y, test_z, test_y, solver='lbfgs',
             multi_class='auto', *args, **kwargs):
        r"""Evaluates latent space quality via a logistic regression
        downstream task."""
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression(solver=solver, multi_class=multi_class, *args,
                                 **kwargs).fit(train_z.detach().cpu().numpy(),
                                               train_y.detach().cpu().numpy())
        return clf.score(test_z.detach().cpu().numpy(),
                         test_y.detach().cpu().numpy())

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}({self.embedding.weight.size(0)}, '
                f'{self.embedding.weight.size(1)})')
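# Typical training loop for the Node2Vec module above (a sketch; the graph
# `data` and the hyper-parameters are placeholders). `loader()` yields
# (pos_rw, neg_rw) pairs produced by `sample()`, and `loss()` consumes them.
model = Node2Vec(data.edge_index, embedding_dim=128, walk_length=20,
                 context_size=10, walks_per_node=10,
                 num_negative_samples=1, sparse=True)
loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

model.train()
for pos_rw, neg_rw in loader:
    optimizer.zero_grad()
    loss = model.loss(pos_rw, neg_rw)
    loss.backward()
    optimizer.step()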
def __init__(
    self,
    edge_index_dict: Dict[EdgeType, Tensor],
    embedding_dim: int,
    metapath: List[EdgeType],
    walk_length: int,
    context_size: int,
    walks_per_node: int = 1,
    num_negative_samples: int = 1,
    num_nodes_dict: Optional[Dict[NodeType, int]] = None,
    sparse: bool = False,
):
    super().__init__()

    if num_nodes_dict is None:
        num_nodes_dict = {}
        for keys, edge_index in edge_index_dict.items():
            key = keys[0]
            N = int(edge_index[0].max() + 1)
            num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

            key = keys[-1]
            N = int(edge_index[1].max() + 1)
            num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

    adj_dict = {}
    for keys, edge_index in edge_index_dict.items():
        sizes = (num_nodes_dict[keys[0]], num_nodes_dict[keys[-1]])
        row, col = edge_index
        adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
        adj = adj.to('cpu')
        adj_dict[keys] = adj

    assert walk_length + 1 >= context_size
    if walk_length > len(metapath) and metapath[0][0] != metapath[-1][-1]:
        raise AttributeError(
            "The 'walk_length' is longer than the given 'metapath', but "
            "the 'metapath' does not denote a cycle")

    self.adj_dict = adj_dict
    self.embedding_dim = embedding_dim
    self.metapath = metapath
    self.walk_length = walk_length
    self.context_size = context_size
    self.walks_per_node = walks_per_node
    self.num_negative_samples = num_negative_samples
    self.num_nodes_dict = num_nodes_dict

    types = set([x[0] for x in metapath]) | set([x[-1] for x in metapath])
    types = sorted(list(types))

    count = 0
    self.start, self.end = {}, {}
    for key in types:
        self.start[key] = count
        count += num_nodes_dict[key]
        self.end[key] = count

    offset = [self.start[metapath[0][0]]]
    offset += [self.start[keys[-1]] for keys in metapath
               ] * int((walk_length / len(metapath)) + 1)
    offset = offset[:walk_length + 1]
    assert len(offset) == walk_length + 1
    self.offset = torch.tensor(offset)

    # + 1 denotes a dummy node used to link to for isolated nodes.
    self.embedding = Embedding(count + 1, embedding_dim, sparse=sparse)
    self.dummy_idx = count

    self.reset_parameters()
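# Usage sketch for the MetaPath2Vec constructor above (the metapath and the
# heterogeneous graph `data` are placeholders, loosely following the AMiner
# example); it assumes the module exposes the same loader()/loss()/forward()
# interface as the Node2Vec module shown earlier.
metapath = [
    ('author', 'writes', 'paper'),
    ('paper', 'published_in', 'venue'),
    ('venue', 'publishes', 'paper'),
    ('paper', 'written_by', 'author'),
]
model = MetaPath2Vec(data.edge_index_dict, embedding_dim=128,
                     metapath=metapath, walk_length=50, context_size=7,
                     walks_per_node=5, num_negative_samples=5, sparse=True)
loader = model.loader(batch_size=128, shuffle=True)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

for pos_rw, neg_rw in loader:
    optimizer.zero_grad()
    loss = model.loss(pos_rw, neg_rw)
    loss.backward()
    optimizer.step()

z = model('author')  # embeddings for every node of a given type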
class JOINTSRMFSPARSE(GeneralRecommender):
    input_type = InputType.POINTWISE

    def __init__(self, config, dataset):
        super(JOINTSRMFSPARSE, self).__init__(config, dataset)

        # Load dataset info.
        self.LABEL = config['LABEL_FIELD']
        self.embedding_dim = config['embedding_dimension']
        self.alpha = config["alpha_item"]
        item_description_fields = config['item_description_fields']
        max_number_of_reviews = config['number_of_reviews_to_use_item']
        self.variant = config["debug_variant"]
        self.logger.info(f"embedding_dimension = {self.embedding_dim}")
        self.logger.info(f"alpha = {self.alpha}")
        self.logger.info(
            f"item_description_fields = {item_description_fields}")

        self.user_embedding = nn.Embedding(self.n_users, self.embedding_dim)
        self.item_embedding = nn.Embedding(self.n_items, self.embedding_dim)
        self.user_bias = nn.Parameter(torch.zeros(self.n_users))
        self.item_bias = nn.Parameter(torch.zeros(self.n_items))
        self.bias = nn.Parameter(torch.zeros(1))
        self.apply(self._init_weights)

        gensim_cache = open('gensim_cache_path', 'r').read().strip()
        os.environ['GENSIM_DATA_DIR'] = str(gensim_cache)
        import gensim
        import gensim.downloader as api

        # pretrained_embedding_name = "conceptnet-numberbatch-17-06-300"
        # The vector size must be 50 to match the embedding dimension.
        pretrained_embedding_name = "glove-wiki-gigaword-50"
        model_path = api.load(pretrained_embedding_name, return_path=True)
        model = gensim.models.KeyedVectors.load_word2vec_format(model_path)
        # `vectors` was formerly `syn0`, which is deprecated.
        weights = torch.FloatTensor(model.vectors)
        self.logger.info(f"pretrained_embedding shape: {weights.shape}")
        self.word_embedding = nn.Embedding.from_pretrained(weights,
                                                           freeze=True)
        self.vocab_size = len(model.key_to_index)

        s = time.time()
        item_lms = {}
        item_lm_len = {}
        item_desc_fields = []
        if "item_description" in item_description_fields:
            item_desc_fields.append(3)
        if "item_genres" in item_description_fields:
            item_desc_fields.append(4)
        if "tags" in item_description_fields:
            item_desc_fields.append(4)
        if len(item_desc_fields) > 0:
            item_LM_file = os.path.join(
                dataset.dataset.dataset_path,
                f"{dataset.dataset.dataset_name}.item")
            with open(item_LM_file, 'r') as infile:
                next(infile)
                for line in infile:
                    split = line.split("\t")
                    item_id = dataset.token2id_exists("item_id", split[0])
                    if item_id == -1:
                        continue
                    if item_id == 0:
                        print("Isn't that padding?")
                    if item_id not in item_lms:
                        item_lms[item_id] = {}
                        item_lm_len[item_id] = 0
                    for fi in item_desc_fields:
                        if fi >= len(split):
                            print(split)
                            continue
                        desc = split[fi]
                        for term in desc.split():
                            if term in model.key_to_index:
                                wv_term_index = model.key_to_index[term]
                                if wv_term_index not in item_lms[item_id]:
                                    item_lms[item_id][wv_term_index] = 1
                                else:
                                    item_lms[item_id][wv_term_index] += 1
                                item_lm_len[item_id] += 1

        # Do reviews as well.
        # inter: user_id:token item_id:token rating:float review:token_seq
        num_of_used_revs = {}
        if "review" in item_description_fields:
            # Only use reviews that belong to the training set, so collect
            # the training interactions first:
            training_set = {}
            for i in range(len(dataset.dataset.inter_feat["user_id"])):
                uid = int(dataset.dataset.inter_feat["user_id"][i])
                iid = int(dataset.dataset.inter_feat["item_id"][i])
                if uid not in training_set:
                    training_set[uid] = set()
                training_set[uid].add(iid)
            item_desc_fields = [3]
            item_LM_file = os.path.join(
                dataset.dataset.dataset_path,
                f"{dataset.dataset.dataset_name}.inter")
            with open(item_LM_file, 'r') as infile:
                next(infile)
                for line in infile:
                    split = line.split("\t")
                    user_id = dataset.token2id_exists("user_id", split[0])
                    item_id = dataset.token2id_exists("item_id", split[1])
                    if item_id == -1 or user_id == -1:
                        continue
                    if item_id == 0 or user_id == 0:
                        print("Isn't that padding?")
                    if user_id not in training_set:
                        continue
                    if item_id not in training_set[user_id]:
                        continue
                    if item_id not in num_of_used_revs:
                        num_of_used_revs[item_id] = 0
                    if (max_number_of_reviews is not None and
                            num_of_used_revs[item_id] >= max_number_of_reviews):
                        continue
                    if item_id not in item_lms:
                        item_lms[item_id] = {}
                        item_lm_len[item_id] = 0
                    for fi in item_desc_fields:
                        desc = split[fi]
                        if len(desc) > 1:
                            num_of_used_revs[item_id] += 1
                        for term in desc.split():
                            if term in model.key_to_index:
                                wv_term_index = model.key_to_index[term]
                                if wv_term_index not in item_lms[item_id]:
                                    item_lms[item_id][wv_term_index] = 1
                                else:
                                    item_lms[item_id][wv_term_index] += 1
                                item_lm_len[item_id] += 1

        # Build the sparse item language-model ground truth.
        indices = [[0], [0]]
        values = [0]
        for item_id in item_lms.keys():
            for k, v in item_lms[item_id].items():
                indices[0].append(item_id)
                indices[1].append(k)
                values.append(v / item_lm_len[item_id])
        self.lm_gt = SparseTensor(
            row=torch.tensor(indices[0], dtype=torch.long),
            col=torch.tensor(indices[1], dtype=torch.long),
            value=torch.tensor(values),
            sparse_sizes=(self.n_items, len(model.key_to_index)))
        if self.variant == 1:
            self.lm_gt = self.lm_gt.to(self.device)
        e = time.time()
        self.logger.info(f"{e - s}s")
        self.logger.info("Done with lm_gt construction!")

        self.sigmoid = nn.Sigmoid()
        self.loss_rec = nn.BCELoss()
        self.loss_lm = SoftCrossEntropyLoss()

    def _init_weights(self, module):
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.01)

    @staticmethod
    def get_entries(array, keys):
        ret = []
        for k in keys:
            ret.append(array[k])
        return ret

    def forward_rec(self, user, item):
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        pred = torch.sum(torch.mul(user_emb, item_emb), dim=1)
        pred = pred + self.item_bias[item] + self.user_bias[user]
        pred = pred + self.bias
        pred = self.sigmoid(pred)
        return pred

    def forward_lm(self, item):
        item_emb = self.item_embedding(item)
        pred = torch.matmul(item_emb, self.word_embedding.weight.T)
        return pred

    def calculate_loss(self, interaction):
        user = interaction[self.USER_ID]
        item = interaction[self.ITEM_ID]
        label = interaction[self.LABEL]

        with profiler.record_function("REC output and loss"):
            output_rec = self.forward_rec(user, item)
            loss_rec = self.loss_rec(output_rec, label)
        with profiler.record_function("LM output"):
            output_lm = self.forward_lm(item)
        if self.variant == 3:
            label_lm = self.lm_gt[item].to_dense().to(self.device)
        if self.variant == 2:
            label_lm = self.lm_gt[item].to(self.device).to_dense()
        if self.variant == 1:
            with profiler.record_function("LM making label on GPU"):
                label_lm = self.lm_gt[item].to_dense()
                # label_lm = torch.zeros(len(item), self.vocab_size,
                #                        device=self.device)
                # for i in range(len(item)):
                #     item_id = item[i]
                #     label_lm[i] = self.lm_gt[item_id].to_dense()
        with profiler.record_function("LM loss"):
            loss_lm = self.loss_lm(output_lm, label_lm)
        return loss_rec, self.alpha * loss_lm

    def predict(self, interaction):
        user = interaction[self.USER_ID]
        item = interaction[self.ITEM_ID]
        output = self.forward_rec(user, item)
        return output
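# `SoftCrossEntropyLoss` is used above but not defined in this excerpt. Below
# is a minimal sketch of a cross entropy that accepts soft (distribution-
# valued) targets, which is the shape the term-frequency labels `label_lm`
# take; the project's actual implementation may differ.
import torch
from torch import nn

class SoftCrossEntropyLoss(nn.Module):
    def forward(self, logits, target):
        # `target` rows hold non-negative weights (e.g. normalized term counts).
        log_probs = torch.log_softmax(logits, dim=-1)
        return -(target * log_probs).sum(dim=-1).mean()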
class BM25vec(GeneralRecommender):
    input_type = InputType.POINTWISE
    type = ModelType.TRADITIONAL

    def __init__(self, config, dataset):
        super(BM25vec, self).__init__(config, dataset)

        rec_model = config["saved_rec_model"]
        topk = config["inferred_lm_topk_w"]
        item_description_fields = config['item_description_fields']
        max_number_of_reviews_item_lm = config['number_of_reviews_to_use_item']
        user_profile_fields = config['user_profile_fields']
        max_number_of_reviews_user_lm = config['number_of_reviews_to_use_user']
        self.k1 = config["k1"]
        self.b = config["b"]
        self.c = config["c"]
        self.use_sparse = config["use_sparse"]
        step = config["rec_model_load_step"]
        if step is None:
            step = 200000

        if rec_model is None and item_description_fields is None:
            print("Should specify rec_model or item_description_fields")
            exit(-1)
        elif rec_model is not None and item_description_fields is not None:
            print("Give either rec_model or item_description_fields, not both")
            exit(-1)
        if user_profile_fields is None:
            print("user_profile_fields should be given!")
            exit(-1)

        self.n_items = dataset.item_num
        self.n_users = dataset.user_num

        # Load background idf.
        print("Loading background corpus")
        s = time.time()
        background_idf_temp = {}
        jnius_config.set_classpath(get_fat_jar())
        indexcorpus = open('background_corpus_path', 'r').read().strip()
        from jnius import autoclass
        JFile = autoclass("java.io.File")
        JFSDirectory = autoclass("org.apache.lucene.store.FSDirectory")
        fsdir = JFSDirectory.open(JFile(indexcorpus).toPath())
        reader = autoclass("org.apache.lucene.index.DirectoryReader").open(
            fsdir)
        numdocs = reader.numDocs()
        JTerm = autoclass("org.apache.lucene.index.Term")
        # numterms = self.reader.getSumTotalTermFreq("contents")
        print(f"done {time.time() - s}")

        # Create query/user LM.
        print("Creating user lm")
        s = time.time()
        uid_term_frequencies = {}
        self.uid_len = {}
        # self.uid_termprobs = {}
        num_of_used_revs = {}
        if "review" in user_profile_fields:
            # Only use reviews that belong to the training set, so collect
            # the training interactions first:
            training_set = {}
            for i in range(len(dataset.dataset.inter_feat["user_id"])):
                uid = int(dataset.dataset.inter_feat["user_id"][i])
                iid = int(dataset.dataset.inter_feat["item_id"][i])
                if uid not in training_set:
                    training_set[uid] = set()
                training_set[uid].add(iid)
            user_fields = [3]
            inter_file = os.path.join(dataset.dataset.dataset_path,
                                      f"{dataset.dataset.dataset_name}.inter")
            with open(inter_file, 'r') as infile:
                next(infile)
                for line in infile:
                    split = line.split("\t")
                    user_id = dataset.token2id_exists("user_id", split[0])
                    item_id = dataset.token2id_exists("item_id", split[1])
                    if item_id == -1 or user_id == -1:
                        continue
                    if item_id == 0 or user_id == 0:
                        print("Isn't that padding?")
                    if user_id not in training_set:
                        continue
                    if item_id not in training_set[user_id]:
                        continue
                    if user_id not in num_of_used_revs:
                        num_of_used_revs[user_id] = 0
                    if (max_number_of_reviews_user_lm is not None and
                            num_of_used_revs[user_id] >= max_number_of_reviews_user_lm):
                        continue
                    if user_id not in uid_term_frequencies:
                        uid_term_frequencies[user_id] = {}
                        self.uid_len[user_id] = 0
                    for fi in user_fields:
                        desc = split[fi]
                        if len(desc) > 1:
                            num_of_used_revs[user_id] += 1
                        for term in desc.split():
                            if term not in uid_term_frequencies[user_id]:
                                uid_term_frequencies[user_id][term] = 1
                            else:
                                uid_term_frequencies[user_id][term] += 1
                            self.uid_len[user_id] += 1
                            # Background idf:
                            if term not in background_idf_temp:
                                jterm = JTerm("contents", term)
                                df = reader.docFreq(jterm)
                                background_idf_temp[term] = np.log10(
                                    (numdocs - df + 0.5) / (df + 0.5))

        # for user_id in self.uid_term_frequencies.keys():
        #     self.uid_termprobs[user_id] = {
        #         k: (v / self.uid_len[user_id])
        #         for k, v in self.uid_term_frequencies[user_id]}
        # TODO extend this for KITT users... from ..user files

        self.term_idx = {}
        self.background_idf = torch.zeros(len(background_idf_temp.keys()))
        idx = 0
        for t, idf in background_idf_temp.items():
            self.term_idx[t] = idx
            self.background_idf[idx] = idf
            idx += 1
        # `.to()` is not in-place, so the result has to be re-assigned.
        self.background_idf = self.background_idf.to(device=self.device)

        if self.use_sparse:
            indices = [[0], [0]]
            values = [0]
            for user_id in uid_term_frequencies.keys():
                for t, v in uid_term_frequencies[user_id].items():
                    indices[0].append(user_id)
                    indices[1].append(self.term_idx[t])
                    values.append(v)
            self.uid_term_frequencies = SparseTensor(
                row=torch.tensor(indices[0], dtype=torch.long),
                col=torch.tensor(indices[1], dtype=torch.long),
                value=torch.tensor(values),
                sparse_sizes=(self.n_users, len(self.background_idf)))
            self.uid_term_frequencies = self.uid_term_frequencies.to(
                self.device)
        else:
            self.uid_term_frequencies = torch.zeros(
                (self.n_users, len(self.background_idf)), device=self.device)
            for user in uid_term_frequencies:
                for t, v in uid_term_frequencies[user].items():
                    self.uid_term_frequencies[user][self.term_idx[t]] = v
        print(f"done {time.time() - s}")

        # Item LM.
        doc_tf = {}
        self.doc_len = torch.zeros(self.n_items, device=self.device)

        # Create item LM (inferred):
        if rec_model is not None:
            print("Creating inferred item lm")
            s = time.time()
            checkpoint_file = REC_MODELS[rec_model]["checkpoint_file"]
            model_name = REC_MODELS[rec_model]["model_name"]
            dataset_name = REC_MODELS[rec_model]["dataset_name"]
            config_dict = REC_MODELS[rec_model]["config_dict"]
            rec_model = ItemLM(checkpoint_file, model_name, dataset_name,
                               k=topk, step=step, config_dict=config_dict)
            inferred_lm = rec_model.get_lm()
            for i in range(1, len(dataset.dataset.item_feat)):
                item_id = dataset.dataset.item_feat["item_id"][i]
                item_url_rid = dataset.dataset.item_feat["item_url"][i]
                item_url = dataset.id2token("item_url", item_url_rid)
                if item_url in inferred_lm:
                    doc_tf[item_id] = inferred_lm[item_url]
                    # doc_tf[item_id] = {
                    #     inferred_lm[item_url][0][j]: inferred_lm[item_url][1][j]
                    #     for j in range(len(inferred_lm[item_url][0]))}
                    self.doc_len[item_id] = sum(inferred_lm[item_url][1])
                else:
                    doc_tf[item_id] = ([], [])
                    self.doc_len[item_id] = 0
            print(f"{time.time() - s}")

            if self.use_sparse:
                indices = [[0], [0]]
                values = [0]
                for item_id in doc_tf.keys():
                    for i in range(len(doc_tf[item_id][0])):
                        t = doc_tf[item_id][0][i]
                        v = doc_tf[item_id][1][i]
                        if t in self.term_idx:
                            indices[0].append(item_id)
                            indices[1].append(self.term_idx[t])
                            values.append(v)
                self.doc_tf = SparseTensor(
                    row=torch.tensor(indices[0], dtype=torch.long),
                    col=torch.tensor(indices[1], dtype=torch.long),
                    value=torch.tensor(values),
                    sparse_sizes=(self.n_items, len(self.background_idf)))
                self.doc_tf = self.doc_tf.to(self.device)
            else:
                self.doc_tf = torch.zeros(
                    (self.n_items, len(self.background_idf)),
                    device=self.device)
                for item in doc_tf:
                    for i in range(len(doc_tf[item][0])):
                        t = doc_tf[item][0][i]
                        v = doc_tf[item][1][i]
                        if t in self.term_idx:
                            self.doc_tf[item][self.term_idx[t]] = v

        # OR create item LM statistical:
        elif item_description_fields is not None:
            print("Creating item lm")
            item_desc_fields = []
            if "item_description" in item_description_fields:
                item_desc_fields.append(3)
            if "item_genres" in item_description_fields:
                item_desc_fields.append(4)
            if "tags" in item_description_fields:
                item_desc_fields.append(4)
            if len(item_desc_fields) > 0:
                item_LM_file = os.path.join(
                    dataset.dataset.dataset_path,
                    f"{dataset.dataset.dataset_name}.item")
                with open(item_LM_file, 'r') as infile:
                    next(infile)
                    for line in infile:
                        split = line.split("\t")
                        item_id = dataset.token2id_exists("item_id", split[0])
                        if item_id == -1:
                            print(item_id)
                            continue
                        if item_id == 0:
                            print("Isn't that padding?")
                        if item_id not in doc_tf:
                            doc_tf[item_id] = {}
                            self.doc_len[item_id] = 0
                        for fi in item_desc_fields:
                            if fi >= len(split):
                                print(split)
                                continue
                            desc = split[fi]
                            for term in desc.split():
                                if term not in doc_tf[item_id]:
                                    doc_tf[item_id][term] = 1
                                else:
                                    doc_tf[item_id][term] += 1
                                self.doc_len[item_id] += 1

            num_of_used_revs = {}
            if "review" in item_description_fields:
                # Only use reviews that belong to the training set:
                training_set = {}
                for i in range(len(dataset.dataset.inter_feat["user_id"])):
                    uid = int(dataset.dataset.inter_feat["user_id"][i])
                    iid = int(dataset.dataset.inter_feat["item_id"][i])
                    if uid not in training_set:
                        training_set[uid] = set()
                    training_set[uid].add(iid)
                item_desc_fields = [3]
                item_LM_file = os.path.join(
                    dataset.dataset.dataset_path,
                    f"{dataset.dataset.dataset_name}.inter")
                with open(item_LM_file, 'r') as infile:
                    next(infile)
                    for line in infile:
                        split = line.split("\t")
                        user_id = dataset.token2id_exists("user_id", split[0])
                        item_id = dataset.token2id_exists("item_id", split[1])
                        if item_id == -1 or user_id == -1:
                            continue
                        if item_id == 0 or user_id == 0:
                            print("Isn't that padding?")
                        if user_id not in training_set:
                            continue
                        if item_id not in training_set[user_id]:
                            continue
                        if item_id not in num_of_used_revs:
                            num_of_used_revs[item_id] = 0
                        if (max_number_of_reviews_item_lm is not None and
                                num_of_used_revs[item_id] >= max_number_of_reviews_item_lm):
                            continue
                        if item_id not in doc_tf:
                            doc_tf[item_id] = {}
                            self.doc_len[item_id] = 0
                        for fi in item_desc_fields:
                            desc = split[fi]
                            if len(desc) > 1:
                                num_of_used_revs[item_id] += 1
                            for term in desc.split():
                                if term not in doc_tf[item_id]:
                                    doc_tf[item_id][term] = 1
                                else:
                                    doc_tf[item_id][term] += 1
                                self.doc_len[item_id] += 1

            if self.use_sparse:
                indices = [[0], [0]]
                values = [0]
                for item_id in doc_tf.keys():
                    for t, v in doc_tf[item_id].items():
                        if t in self.term_idx:
                            indices[0].append(item_id)
                            indices[1].append(self.term_idx[t])
                            values.append(v)
                self.doc_tf = SparseTensor(
                    row=torch.tensor(indices[0], dtype=torch.long),
                    col=torch.tensor(indices[1], dtype=torch.long),
                    value=torch.tensor(values),
                    sparse_sizes=(self.n_items, len(self.background_idf)))
                self.doc_tf = self.doc_tf.to(self.device)
            else:
                self.doc_tf = torch.zeros(
                    (self.n_items, len(self.background_idf)),
                    device=self.device)
                for item in doc_tf:
                    for t, v in doc_tf[item].items():
                        if t in self.term_idx:
                            self.doc_tf[item][self.term_idx[t]] = v

        self.average_doc_len = self.doc_len.sum() / self.doc_len.shape[0]
        print(self.average_doc_len)
        print(f"done {time.time() - s}")

        # Define a dummy parameter, just as a placeholder (why?).
        self.fake_loss = torch.nn.Parameter(torch.zeros(1))
    def calculate_loss(self, interaction):
        # Traditional model: nothing to optimize, so return a dummy zero loss.
        return torch.nn.Parameter(torch.zeros(1))

    def predict(self, interaction):
        users = interaction[self.USER_ID]
        items = interaction[self.ITEM_ID]
        # Return a list of scores w.r.t. the user-item pairs.
        if self.use_sparse:
            try:
                doctf = self.doc_tf[items].to_dense()
            except Exception:
                print(items)
                print(self.n_items)
                print(self.doc_tf[items])
                exit(-1)
            qtf = self.uid_term_frequencies[users].to_dense()
        else:
            doctf = self.doc_tf[items]
            qtf = self.uid_term_frequencies[users]

        # BM25 term saturation on the document side.
        numerator = doctf * (self.k1 + 1)
        t = self.k1 * (1 - self.b + self.b *
                       (self.doc_len[items] / self.average_doc_len))
        t = t.unsqueeze(1)
        denominator = doctf + t
        doctf = numerator / denominator

        # Optional query-side saturation with parameter c.
        if self.c is not None:
            qtf = (qtf * (self.c + 1)) / (qtf + self.c)

        ret = self.background_idf * doctf * qtf
        return ret.sum(1)
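# Quick numeric check (illustrative) of the per-term BM25 score computed in
# `predict` above, for a single term with tf = 3 in a document of average
# length, k1 = 1.2, b = 0.75 and idf = 2.0:
#
#   tf_part = 3 * (1.2 + 1) / (3 + 1.2 * (1 - 0.75 + 0.75 * 1.0))
#           = 6.6 / 4.2 = 1.5714...
#   score   = idf * tf_part * qtf = 2.0 * 1.5714... * qtf
#
# so a term's contribution grows with tf but saturates, which is what the
# `numerator / denominator` expression implements.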
# Full graph testing.
val_adj_t = SparseTensor(
    row=edge_index[0, edge_dates < args.test_year],
    col=edge_index[1, edge_dates < args.test_year],
    value=torch.ones((edge_dates < args.test_year).sum(), dtype=torch.float),
    sparse_sizes=(len(node_classes), len(node_classes)),
)
val_edge_types = get_edge_types(val_adj_t.storage.row(),
                                val_adj_t.storage.col(), node_classes)

model.eval()
with torch.no_grad():
    z = model.encode(
        full_graph.feats.to(args.device),
        val_adj_t.to(args.device),
        val_edge_types,
    )
    auc, ap = gae.test(
        z,
        model.decoder,
        1,
        pos_test[1].to(args.device),
        neg_test[1].to(args.device),
    )
    mlflow.log_metric("Chosen model test AUC GD", auc)
    mlflow.log_metric("Chosen model test AP GD", ap)

    z = model.encode(
        full_graph.feats.to(args.device),
        full_graph.train_adj_t.to(args.device),
        full_graph.train_edge_types,