import pickle

import torch
import torch.distributed as dist
import torch.nn as nn
import torchtext
import tqdm
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# PinSAGEModel, the sampler helpers (referenced here as sampler_module) and the
# evaluation module are assumed to be defined elsewhere in the package, as in
# DGL's PinSAGE example.


def prepare_dataloader(data_dict, args):
    g = data_dict['graph']
    user_ntype = data_dict['user_ntype']
    item_ntype = data_dict['item_ntype']
    textset = data_dict['textset']

    # Sampler
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks,
        args.num_neighbors, args.num_layers)
    collator = sampler_module.PinSAGECollator(
        neighbor_sampler, g, item_ntype, textset)
    dataloader = DataLoader(
        batch_sampler,
        collate_fn=collator.collate_train,
        num_workers=args.num_workers)
    dataloader_test = DataLoader(
        torch.arange(g.number_of_nodes(item_ntype)),
        batch_size=args.batch_size,
        collate_fn=collator.collate_test,
        num_workers=args.num_workers)
    dataloader_it = iter(dataloader)

    return dataloader_it, dataloader_test, neighbor_sampler
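# A minimal usage sketch for prepare_dataloader(). The data_dict keys follow the
# function above; example_args is a hypothetical namespace whose fields mirror
# the attributes the samplers read, and the hyperparameter values are
# illustrative only, not recommendations.
from types import SimpleNamespace

example_args = SimpleNamespace(
    batch_size=32,
    random_walk_length=2,          # hops per random walk
    random_walk_restart_prob=0.5,  # restart probability of each walk
    num_random_walks=10,           # walks launched per seed node
    num_neighbors=3,               # top-scoring neighbors kept per layer
    num_layers=2,
    num_workers=0,
)

# data_dict = {'graph': g, 'user_ntype': 'user',
#              'item_ntype': 'movie', 'textset': textset}
# dataloader_it, dataloader_test, neighbor_sampler = \
#     prepare_dataloader(data_dict, example_args)
# pos_graph, neg_graph, blocks = next(dataloader_it)  # one training mini-batch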
def train(dataset, args):
    g = dataset['train-graph']
    val_matrix = dataset['val-matrix'].tocsr()
    test_matrix = dataset['test-matrix'].tocsr()
    item_texts = dataset['item-texts']
    user_ntype = dataset['user-type']
    item_ntype = dataset['item-type']
    user_to_item_etype = dataset['user-to-item-type']
    timestamp = dataset['timestamp-edge-column']

    device = torch.device(args.device)

    # Prepare torchtext dataset and vocabulary. Field/Example/Dataset is the
    # legacy torchtext API (moved to torchtext.legacy in 0.9.0 and removed
    # in 0.12.0).
    fields = {}
    examples = []
    for key, texts in item_texts.items():
        fields[key] = torchtext.data.Field(
            include_lengths=True, lower=True, batch_first=True)
    for i in range(g.number_of_nodes(item_ntype)):
        example = torchtext.data.Example.fromlist(
            [item_texts[key][i] for key in item_texts.keys()],
            [(key, fields[key]) for key in item_texts.keys()])
        examples.append(example)
    textset = torchtext.data.Dataset(examples, fields)
    for key, field in fields.items():
        field.build_vocab(getattr(textset, key))
        # field.build_vocab(getattr(textset, key), vectors='fasttext.simple.300d')

    # Sampler
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks,
        args.num_neighbors, args.num_layers)
    collator = sampler_module.PinSAGECollator(
        neighbor_sampler, g, item_ntype, textset)
    dataloader = DataLoader(
        batch_sampler,
        collate_fn=collator.collate_train,
        num_workers=args.num_workers)
    dataloader_test = DataLoader(
        torch.arange(g.number_of_nodes(item_ntype)),
        batch_size=args.batch_size,
        collate_fn=collator.collate_test,
        num_workers=args.num_workers)
    dataloader_it = iter(dataloader)

    # Model plus a trainable per-item embedding table. The table is sparse so
    # each step only updates the rows seen in the batch; it is moved to the
    # same device as the model so lookups with on-device IDs do not fail.
    model = PinSAGEModel(
        g, item_ntype, textset, args.hidden_dims, args.num_layers).to(device)
    item_emb = nn.Embedding(
        g.number_of_nodes(item_ntype), args.hidden_dims, sparse=True).to(device)

    # Optimizers: Adam for the dense model parameters, SparseAdam for the
    # embedding table, since plain Adam does not accept sparse gradients.
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    opt_emb = torch.optim.SparseAdam(item_emb.parameters(), lr=args.lr)

    # For each batch of head-tail-negative triplets...
    for epoch_id in range(args.num_epochs):
        model.train()
        for batch_id in tqdm.trange(args.batches_per_epoch):
            pos_graph, neg_graph, blocks = next(dataloader_it)

            # Copy to GPU
            for i in range(len(blocks)):
                blocks[i] = blocks[i].to(device)
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)

            loss = model(pos_graph, neg_graph, blocks, item_emb).mean()
            opt.zero_grad()
            opt_emb.zero_grad()
            loss.backward()
            opt.step()
            opt_emb.step()

        # Evaluate
        model.eval()
        with torch.no_grad():
            h_item_batches = []
            for blocks in tqdm.tqdm(dataloader_test):
                for i in range(len(blocks)):
                    blocks[i] = blocks[i].to(device)
                h_item_batches.append(model.get_repr(blocks, item_emb))
            h_item = torch.cat(h_item_batches, 0)

            print(evaluation.evaluate_nn(dataset, h_item, args.k, args.batch_size))
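# Why two optimizers: a sparse nn.Embedding produces sparse gradients, which
# torch.optim.Adam rejects; SparseAdam applies updates only to the embedding
# rows that actually appear in the batch. A self-contained demo of that
# behavior (shapes and values are illustrative):
emb = nn.Embedding(1000, 16, sparse=True)
opt_demo = torch.optim.SparseAdam(list(emb.parameters()), lr=1e-3)

ids = torch.tensor([3, 7, 7, 42])   # only these rows receive gradient entries
loss_demo = emb(ids).pow(2).sum()
opt_demo.zero_grad()
loss_demo.backward()
print(emb.weight.grad.is_sparse)    # True
opt_demo.step()                     # updates rows 3, 7 and 42 only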
def train(gpu, args):
    # Distributed variant: one process per GPU; the global rank is derived from
    # the node rank (args.nr), GPUs per node (args.gpus) and the local GPU index.

    # Load dataset
    with open(args.dataset_path, 'rb') as f:
        dataset = pickle.load(f)

    rank = args.nr * args.gpus + gpu
    g = dataset['train-graph']
    val_matrix = dataset['val-matrix'].tocsr()
    test_matrix = dataset['test-matrix'].tocsr()
    item_texts = dataset['item-texts']
    user_ntype = dataset['user-type']
    item_ntype = dataset['item-type']
    user_to_item_etype = dataset['user-to-item-type']
    timestamp = dataset['timestamp-edge-column']

    dist.init_process_group(
        backend='nccl', init_method='env://',
        world_size=args.world_size, rank=rank)

    device = (torch.device(f'cuda:{gpu}') if torch.cuda.is_available()
              else torch.device('cpu'))

    # Assign user and movie IDs and use them as features (to learn an
    # individual trainable embedding for each entity)
    g.nodes[user_ntype].data['id'] = torch.arange(g.number_of_nodes(user_ntype))
    g.nodes[item_ntype].data['id'] = torch.arange(g.number_of_nodes(item_ntype))

    # Prepare torchtext dataset and vocabulary (legacy torchtext API)
    fields = {}
    examples = []
    for key, texts in item_texts.items():
        fields[key] = torchtext.data.Field(
            include_lengths=True, lower=True, batch_first=True)
    for i in range(g.number_of_nodes(item_ntype)):
        example = torchtext.data.Example.fromlist(
            [item_texts[key][i] for key in item_texts.keys()],
            [(key, fields[key]) for key in item_texts.keys()])
        examples.append(example)
    textset = torchtext.data.Dataset(examples, fields)
    for key, field in fields.items():
        field.build_vocab(getattr(textset, key))
        # field.build_vocab(getattr(textset, key), vectors='fasttext.simple.300d')

    # Sampler
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks,
        args.num_neighbors, args.num_layers)
    collator = sampler_module.PinSAGECollator(
        neighbor_sampler, g, item_ntype, textset)
    dataloader = DataLoader(
        batch_sampler,
        collate_fn=collator.collate_train,
        num_workers=args.num_workers)
    dataloader_test = DataLoader(
        torch.arange(g.number_of_nodes(item_ntype)),
        batch_size=args.batch_size,
        collate_fn=collator.collate_test,
        num_workers=args.num_workers)
    dataloader_it = iter(dataloader)

    # Model, wrapped for synchronous gradient averaging across processes
    model = PinSAGEModel(
        g, item_ntype, textset, args.hidden_dims, args.num_layers).to(device)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])

    # Optimizer
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    # For each batch of head-tail-negative triplets...
    for epoch_id in range(args.num_epochs):
        model.train()
        for batch_id in tqdm.trange(args.batches_per_epoch):
            # The batch sampler yields an endless stream, so draw exactly one
            # batch per step with next() instead of looping over the whole
            # DataLoader (which would never terminate).
            pos_graph, neg_graph, blocks = next(dataloader_it)
            for i in range(len(blocks)):
                blocks[i] = blocks[i].to(device)
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)

            loss = model(pos_graph, neg_graph, blocks).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Evaluate
        model.eval()
        with torch.no_grad():
            h_item_batches = []
            for blocks in dataloader_test:
                for i in range(len(blocks)):
                    blocks[i] = blocks[i].to(device)
                # get_repr is defined on the wrapped model, so go through .module
                h_item_batches.append(model.module.get_repr(blocks))
            h_item = torch.cat(h_item_batches, 0)

            print(evaluation.evaluate_nn(dataset, h_item, args.k, args.batch_size))
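# A hedged launcher sketch for the distributed train() above: one process per
# local GPU via torch.multiprocessing.spawn. The fields read here (gpus, nr,
# world_size) match what train() expects; args.nodes, the rendezvous address
# and the port are assumptions shown as placeholders.
import os
import torch.multiprocessing as mp


def launch(args):
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')  # placeholder rendezvous host
    os.environ.setdefault('MASTER_PORT', '29500')      # placeholder port
    args.world_size = args.gpus * args.nodes           # total processes across nodes
    # spawn passes the local GPU index as the first argument to train()
    mp.spawn(train, nprocs=args.gpus, args=(args,))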
def train(dataset, args):
    # Variant of train() ported to the post-0.12 torchtext API
    # (get_tokenizer / build_vocab_from_iterator instead of the legacy fields).
    g = dataset['train-graph']
    val_matrix = dataset['val-matrix'].tocsr()
    test_matrix = dataset['test-matrix'].tocsr()
    item_texts = dataset['item-texts']
    user_ntype = dataset['user-type']
    item_ntype = dataset['item-type']
    user_to_item_etype = dataset['user-to-item-type']
    timestamp = dataset['timestamp-edge-column']

    device = torch.device(args.device)

    # Assign user and movie IDs and use them as features (to learn an
    # individual trainable embedding for each entity)
    g.nodes[user_ntype].data['id'] = torch.arange(g.num_nodes(user_ntype))
    g.nodes[item_ntype].data['id'] = torch.arange(g.num_nodes(item_ntype))

    # Prepare tokenized texts and vocabulary
    textset = {}
    tokenizer = get_tokenizer(None)

    textlist = []
    batch_first = True

    for i in range(g.num_nodes(item_ntype)):
        for key in item_texts.keys():
            tokens = tokenizer(item_texts[key][i].lower())
            textlist.append(tokens)
    for key, field in item_texts.items():
        vocab2 = build_vocab_from_iterator(
            textlist, specials=["<unk>", "<pad>"])
        # Each entry: (tokenized texts, vocabulary, pad index, batch_first flag)
        textset[key] = (textlist, vocab2,
                        vocab2.get_stoi()['<pad>'], batch_first)

    # Sampler
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks,
        args.num_neighbors, args.num_layers)
    collator = sampler_module.PinSAGECollator(
        neighbor_sampler, g, item_ntype, textset)
    dataloader = DataLoader(
        batch_sampler,
        collate_fn=collator.collate_train,
        num_workers=args.num_workers)
    dataloader_test = DataLoader(
        torch.arange(g.num_nodes(item_ntype)),
        batch_size=args.batch_size,
        collate_fn=collator.collate_test,
        num_workers=args.num_workers)
    dataloader_it = iter(dataloader)

    # Model
    model = PinSAGEModel(
        g, item_ntype, textset, args.hidden_dims, args.num_layers).to(device)

    # Optimizer
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    # For each batch of head-tail-negative triplets...
    for epoch_id in range(args.num_epochs):
        model.train()
        for batch_id in tqdm.trange(args.batches_per_epoch):
            pos_graph, neg_graph, blocks = next(dataloader_it)

            # Copy to GPU
            for i in range(len(blocks)):
                blocks[i] = blocks[i].to(device)
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)

            loss = model(pos_graph, neg_graph, blocks).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Evaluate
        model.eval()
        with torch.no_grad():
            h_item_batches = []
            for blocks in dataloader_test:
                for i in range(len(blocks)):
                    blocks[i] = blocks[i].to(device)
                h_item_batches.append(model.get_repr(blocks))
            h_item = torch.cat(h_item_batches, 0)

            print(evaluation.evaluate_nn(dataset, h_item, args.k, args.batch_size))
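# How a (textlist, vocab, pad_index, batch_first) entry from textset might be
# consumed by a collator: look tokens up in the vocabulary and pad each batch
# to a rectangle. A minimal sketch, assuming out-of-vocabulary tokens should
# map to '<unk>'; texts_to_tensor is a hypothetical helper, not part of the
# code above.
from torch.nn.utils.rnn import pad_sequence


def texts_to_tensor(indices, textlist, vocab, pad_idx, batch_first=True):
    vocab.set_default_index(vocab['<unk>'])  # map unseen tokens to <unk>
    seqs = [torch.tensor(vocab(textlist[i])) for i in indices]
    lengths = torch.tensor([s.numel() for s in seqs])
    padded = pad_sequence(seqs, batch_first=batch_first, padding_value=pad_idx)
    return padded, lengths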