class Recommender(nn.Module):
    def __init__(self, train_vocab, n_movies, params):
        super(Recommender, self).__init__()
        self.params = params
        self.train_vocab = train_vocab
        self.n_movies = n_movies
        self.cuda_available = torch.cuda.is_available()

        # instantiate the GenSen module used in the HRNN encoder and by the recommender module
        self.gensen = GenSenSingle(
            model_folder=os.path.join(config.MODELS_PATH, 'GenSen'),
            filename_prefix='nli_large',
            pretrained_emb=os.path.join(config.MODELS_PATH, 'embeddings/glove.840B.300d.h5'),
            cuda=self.cuda_available)
        self.gensen.vocab_expansion(list(train_vocab))

        # HRNN encoder (the conversation encoder is not bidirectional)
        self.encoder = HRNN(params=params['hrnn_params'],
                            train_vocabulary=train_vocab,
                            gensen=self.gensen,
                            train_gensen=False,
                            conv_bidirectional=False)

        self.recommender_module = RecommendFromDialogue(
            params=params['recommend_from_dialogue_params'],
            train_vocab=train_vocab,
            n_movies=n_movies,
            gensen=self.gensen,
        )

        if params['language_aware_recommender']:
            self.language_to_user = nn.Linear(
                in_features=params['hrnn_params']['conversation_encoder_hidden_size'],
                out_features=self.recommender_module.autorec.user_representation_size)

        # latent variable distribution parameters
        latent_layer_sizes = params['latent_layer_sizes']
        if latent_layer_sizes is not None:
            latent_variable_size = latent_layer_sizes[-1]
            self.prior_hidden_layers = nn.ModuleList([
                nn.Linear(in_features=params['hrnn_params']['conversation_encoder_hidden_size'],
                          out_features=latent_layer_sizes[0])
                if i == 0 else
                nn.Linear(in_features=latent_layer_sizes[i - 1],
                          out_features=latent_layer_sizes[i])
                for i in range(len(latent_layer_sizes) - 1)
            ])
            penultimate_size = params['hrnn_params']['conversation_encoder_hidden_size'] \
                if len(latent_layer_sizes) == 1 else latent_layer_sizes[-2]
            self.mu_prior = nn.Linear(penultimate_size, latent_variable_size)
            self.sigma_prior = nn.Linear(penultimate_size, latent_variable_size)

            # context size + size of sentence representations
            posterior_input_size = params['hrnn_params']['conversation_encoder_hidden_size'] + \
                2 * params['hrnn_params']['sentence_encoder_hidden_size'] + 1
            self.posterior_hidden_layers = nn.ModuleList([
                nn.Linear(in_features=posterior_input_size,
                          out_features=latent_layer_sizes[0])
                if i == 0 else
                nn.Linear(in_features=latent_layer_sizes[i - 1],
                          out_features=latent_layer_sizes[i])
                for i in range(len(latent_layer_sizes) - 1)
            ])
            penultimate_size = posterior_input_size \
                if len(latent_layer_sizes) == 1 else latent_layer_sizes[-2]
            self.mu_posterior = nn.Linear(penultimate_size, latent_variable_size)
            self.sigma_posterior = nn.Linear(penultimate_size, latent_variable_size)

        context_size = params['hrnn_params']['conversation_encoder_hidden_size']
        if latent_layer_sizes is not None:
            context_size += latent_layer_sizes[-1]
        self.decoder = SwitchingDecoder(context_size=context_size,
                                        vocab_size=len(train_vocab),
                                        **params['decoder_params'])

        if self.cuda_available:
            self.cuda()
        self.decoder.set_pretrained_embeddings(
            self.encoder.gensen.encoder.src_embedding.weight.data)

    def reparametrize(self, mu, logvariance):
        """
        Sample the latent variable with the reparametrization trick.
        :param mu: mean of the Gaussian
        :param logvariance: log-variance of the Gaussian
        :return: a sample mu + eps * sigma, with eps ~ N(0, I)
        """
        std = torch.exp(0.5 * logvariance)
        tt = torch.cuda.FloatTensor if self.cuda_available else torch.FloatTensor
        eps = Variable(torch.randn(std.data.shape, out=tt()))
        return mu + eps * std

    def forward(self, input_dict, return_latent=False):
        # encoder result: (batch_size, max_conv_length, conversation_encoder_hidden_size)
        conversation_representations, sentence_representations = self.encoder(
            input_dict, return_all=True, return_sentence_representations=True)
        batch_size, max_conversation_length, max_utterance_length = input_dict["dialogue"].data.shape

        # get movie_recommendations (batch, max_conv_length, n_movies)
        if self.params['language_aware_recommender']:
            user_rep_from_language = self.language_to_user(conversation_representations)
        movie_recommendations = self.recommender_module(
            dialogue=input_dict["dialogue"],
            senders=input_dict["senders"],
            lengths=input_dict["lengths"],
            conversation_lengths=input_dict["conversation_lengths"],
            movie_occurrences=input_dict["movie_occurrences"],
            recommend_new_movies=False,
            user_representation=user_rep_from_language
            if self.params['language_aware_recommender'] else None)

        # TODO: only decode recommender's utterances
        # Decoder:
        utterances = input_dict["dialogue"].view(batch_size * max_conversation_length, -1)
        lengths = input_dict["lengths"]
        # order by descending utterance length
        lengths = lengths.reshape((-1))
        sorted_lengths, sorted_idx, rev = sort_for_packed_sequence(
            lengths, cuda=self.cuda_available)
        sorted_utterances = utterances.index_select(0, sorted_idx)

        # shift the context vectors one step in time
        tt = torch.cuda.FloatTensor if self.cuda_available else torch.FloatTensor
        pad_tensor = Variable(torch.zeros(
            batch_size, 1,
            self.params['hrnn_params']['conversation_encoder_hidden_size'],
            out=tt()))
        conversation_representations = torch.cat(
            (pad_tensor, conversation_representations), 1).narrow(1, 0, max_conversation_length)
        # and reshape+reorder the same way as utterances
        conversation_representations = conversation_representations.contiguous().view(
            batch_size * max_conversation_length,
            self.params['hrnn_params']['conversation_encoder_hidden_size']) \
            .index_select(0, sorted_idx)

        # shift the movie recommendations one step in time
        pad_tensor = Variable(torch.zeros(batch_size, 1, self.n_movies, out=tt()))
        movie_recommendations = torch.cat(
            (pad_tensor, movie_recommendations), 1).narrow(1, 0, max_conversation_length)
        # and reshape+reorder movie_recommendations the same way as utterances
        movie_recommendations = movie_recommendations.contiguous().view(
            batch_size * max_conversation_length, -1).index_select(0, sorted_idx)

        # consider only lengths > 0
        num_positive_lengths = np.sum(lengths > 0)
        sorted_utterances = sorted_utterances[:num_positive_lengths]
        sorted_lengths = sorted_lengths[:num_positive_lengths]
        conversation_representations = conversation_representations[:num_positive_lengths]
        movie_recommendations = movie_recommendations[:num_positive_lengths]

        # Latent variable
        if self.params['latent_layer_sizes'] is not None:
            # remember that conversation_representations have been shifted one step in time
            h_prior = conversation_representations
            for layer in self.prior_hidden_layers:
                h_prior = F.relu(layer(h_prior))
            mu_prior = self.mu_prior(h_prior)
            logvar_prior = self.sigma_prior(h_prior)

            # posterior conditioned on the current context and on the representation
            # of the next utterance (the utterance about to be decoded)
            # reshape sentence representations the same way as utterances
            sentence_representations = sentence_representations.view(
                batch_size * max_conversation_length,
                2 * self.params['hrnn_params']['sentence_encoder_hidden_size'] + 1
            ).index_select(0, sorted_idx)
            sentence_representations = sentence_representations[:num_positive_lengths]
            h_posterior = torch.cat(
                (conversation_representations, sentence_representations), 1)
            for layer in self.posterior_hidden_layers:
                h_posterior = F.relu(layer(h_posterior))
            mu_posterior = self.mu_posterior(h_posterior)
            logvar_posterior = self.sigma_posterior(h_posterior)

            # In training, sample from the posterior distribution. At test time, sample from the prior.
            mu, logvar = (mu_posterior, logvar_posterior) if self.training \
                else (mu_prior, logvar_prior)
            z = self.reparametrize(mu, logvar)
            context = torch.cat((conversation_representations, z), 1)
        else:
            context = conversation_representations

        # Run decoder
        outputs = self.decoder(sorted_utterances,
                               sorted_lengths,
                               context,
                               movie_recommendations,
                               log_probabilities=True,
                               sample_movies=False)

        # Complete the missing sequences (of length 0)
        if num_positive_lengths < batch_size * max_conversation_length:
            tt = torch.cuda.FloatTensor if self.cuda_available else torch.FloatTensor
            pad_tensor = Variable(torch.zeros(
                batch_size * max_conversation_length - num_positive_lengths,
                max_utterance_length,
                len(self.train_vocab) + self.n_movies,
                out=tt()))
            outputs = torch.cat((outputs, pad_tensor), 0)
        # outputs shape: (batch * max_conv_len, max_sentence_len, vocab + n_movies)

        # retrieve original order: (batch, max_conv_len, max_sentence_len, vocab + n_movies)
        outputs = outputs.index_select(0, rev). \
            view(batch_size, max_conversation_length, max_utterance_length, -1)

        if return_latent:
            if self.params['latent_layer_sizes'] is None:
                raise ValueError(
                    "Model has no latent variable, cannot return latent parameters.")
            return outputs, mu_prior, logvar_prior, mu_posterior, logvar_posterior
        return outputs

    def train_iter(self, batch, criterion, kl_coefficient=1):
        self.train()
        if self.params['latent_layer_sizes'] is not None:
            outputs, mu_prior, logvar_prior, mu_posterior, logvar_posterior = self.forward(
                batch, return_latent=True)
        else:
            outputs = self.forward(batch, return_latent=False)
        batch_size, max_conv_length, max_seq_length, vocab_size = outputs.data.shape

        # indices of the recommender's utterances (< batch * max_conv_len)
        idx = Variable(torch.nonzero((batch["senders"].view(-1) == -1).data).squeeze())
        # select the recommender's utterances for the loss
        outputs = outputs.view(-1, max_seq_length, vocab_size).index_select(0, idx)
        target = batch["target"].view(-1, max_seq_length).index_select(0, idx)

        loss = criterion(outputs.view(-1, vocab_size), target.view(-1))

        # variational loss = KL(posterior || prior)
        if self.params['latent_layer_sizes'] is not None:
            # for two univariate normal distributions:
            # KL(p1 || p2) = log(sig2 / sig1) + (sig1^2 + (mu1 - mu2)^2) / (2 sig2^2) - 1/2
            # multivariate (sig1 and sig2 the covariance matrices):
            # .5 * (tr(sig2^-1 sig1) + (mu2 - mu1)^T sig2^-1 (mu2 - mu1) - k + ln(det(sig2) / det(sig1)))
            # in the case where sig1 and sig2 are diagonal:
            # .5 * sum(sig1^2 / sig2^2 + (mu2 - mu1)^2 / sig2^2 - 1 + ln(sig2^2) - ln(sig1^2))
            kld = .5 * (
                -1 + logvar_prior - logvar_posterior +
                (torch.exp(logvar_posterior) + (mu_posterior - mu_prior).pow(2)) /
                torch.exp(logvar_prior))
            kld = torch.mean(torch.sum(kld, -1))
            # print("NLL loss {} KLD {}".format(loss.data, kld.data))
            loss += kl_coefficient * kld

        # backward pass
        loss.backward()
        return loss.data[0]

    def evaluate(self, batch_loader, criterion, subset="valid"):
        """
        Evaluate the model on one subset of the data.
        :param subset: in {"valid", "train"}. Subset on which to evaluate
        :return: the mean loss.
        """
        self.eval()
        batch_loader.batch_index[subset] = 0
        n_batches = batch_loader.n_batches[subset]
        losses = []
        for _ in tqdm(range(n_batches)):
            # load batch
            batch = batch_loader.load_batch(subset=subset)
            if self.cuda_available:
                batch["dialogue"] = batch["dialogue"].cuda()
                batch["target"] = batch["target"].cuda()
                batch["senders"] = batch["senders"].cuda()
            # compute output and loss
            outputs = self.forward(batch)
            batch_size, max_conv_length, max_seq_length, vocab_size = outputs.data.shape
            # indices of the recommender's utterances (< batch * max_conv_len)
            idx = Variable(torch.nonzero((batch["senders"].view(-1) == -1).data).squeeze())
            # select the recommender's utterances for the loss
            outputs = outputs.view(-1, max_seq_length, vocab_size).index_select(0, idx)
            target = batch["target"].view(-1, max_seq_length).index_select(0, idx)
            loss = criterion(outputs.view(-1, vocab_size), target.view(-1))
            losses.append(loss.data[0])
        print("{} loss : {}".format(subset, np.mean(losses)))
        self.train()
        return np.mean(losses)
class EncodingIteratorBase(DataIterator):
    """ Base generator class of sentence encodings. """

    def __init__(self,
                 max_sent_length,
                 max_sent_src,
                 max_sent_trg,
                 data_folder,
                 model_folder,
                 pretrain_path,
                 prefix,
                 source_file,
                 target_file,
                 use_gensen_w2i,
                 device_ids=[0],
                 data_parallelize=False,
                 test=False):
        """
        :param max_sent_length: max words per sentence; gensen_h has shape batch_size x max_len x rep_size
        :param max_sent_src: max number of sentences in a source doc
        :param max_sent_trg: max number of sentences in a target doc
        :param data_folder: data location
        :param model_folder: location of the pretrained GenSen model
        :param pretrain_path: location of pretrained embeddings (e.g. GloVe)
        :param prefix: identifies the type of GenSen model ("nli_large", "bothskip", "arxiv")
        :param source_file: name of the source file in data_folder
        :param target_file: name of the target file in data_folder
        :param use_gensen_w2i: use the word-to-id mapping of the pretrained GenSen
        :param device_ids: devices to use when data_parallelize is True
        :param data_parallelize: whether to parallelize GenSen across devices
        :param test: if True, only load a small number of documents
        """
        self.max_len = max_sent_length      # max words per sentence
        self.max_sent_src = max_sent_src    # max sentences per source doc
        self.max_sent_trg = max_sent_trg    # max sentences per target doc
        self.data_folder = data_folder
        self.source_file = source_file
        self.target_file = target_file
        self.src_data = []
        self.atrg_data = []
        self.data_parallelize = data_parallelize
        self.device_ids = device_ids
        self.test = test

        logging.debug("""
            max_len: {}, max_sent_src: {}, max_sent_trg: {},
            data_folder: {}, source_file: {}, target_file: {}
            """.format(self.max_len, self.max_sent_src, self.max_sent_trg,
                       self.data_folder, self.source_file, self.target_file))

        self.gensen = GenSenSingle(model_folder=model_folder,
                                   filename_prefix=prefix,
                                   pretrained_emb=pretrain_path,
                                   cuda=True,
                                   max_sentence_length=max_sent_length,
                                   data_parallelize=data_parallelize,
                                   device_ids=device_ids[::-1])
        self.sen_rep_dim = self.gensen.sen_rep_dim
        self.vocab_size = self.gensen.vocab_size
        self.emb_dim = self.gensen.embedding_dim
        self.vocab_expansion(use_gensen_w2i)

    def vocab_expansion(self, use_gensen_w2i):
        """ Read the data from files and expand the GenSen vocabulary. """
        if self.test:
            logging.debug(" Testing with 100 documents")
        files = [self.source_file, self.target_file]
        data = [self.src_data, self.atrg_data]
        maxes_sen = [self.max_sent_src, self.max_sent_trg]
        for file, dt, max_sen in zip(files, data, maxes_sen):
            with open('%s/%s' % (self.data_folder, file), 'r', encoding="utf-8") as source:
                doc = []
                for sentence in source:
                    # a blank line separates documents
                    if doc and sentence.startswith("\n"):
                        if len(doc) > max_sen:
                            doc = doc[0:max_sen]
                        dt.append(doc)
                        doc = []
                    elif sentence.strip():
                        doc.append(sentence.strip())
                    if self.test and len(dt) > test_num_docs:
                        break
        self.num_docs = len(self.src_data)
        assert self.num_docs == len(self.atrg_data)

        logging.info(" Constructing vocabulary...")
        if use_gensen_w2i:
            # if True, reuse the pretrained GenSen vocabulary instead of constructing a new one
            self.word2id = self.gensen.word2id
            self.id2word = self.gensen.id2word
        else:
            self.word2id, self.id2word = self.construct_vocab(
                list(chain.from_iterable(self.src_data)) +
                list(chain.from_iterable(self.atrg_data)), self.vocab_size)
            self.gensen.vocab_expansion(self.word2id.keys())
            self.vocab_size = self.gensen.vocab_size
        logging.info(" Data has been read")
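
# Illustrative instantiation sketch (hypothetical paths and filenames, not
# part of the original module): an iterator over GenSen encodings of a
# parallel source/target corpus, reusing the pretrained GenSen vocabulary.
#
#   iterator = EncodingIteratorBase(
#       max_sent_length=100,    # gensen_h: batch_size x 100 x rep_size
#       max_sent_src=50,
#       max_sent_trg=20,
#       data_folder='data/summarization',
#       model_folder='models/GenSen',
#       pretrain_path='models/embeddings/glove.840B.300d.h5',
#       prefix='nli_large',
#       source_file='train.src',
#       target_file='train.trg',
#       use_gensen_w2i=True)
#   print(iterator.num_docs, iterator.sen_rep_dim)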
batch_size = 20000
hidden_size = 2048
max_length = 100
data_file = args.train_filename

iterator = SentenceIterator(data_file, vocab_size=80000, max_length=max_length)
model = GenSenSingle(model_folder=args.folder_path,
                     filename_prefix=args.prefix,
                     pretrained_emb=args.pretrain,
                     cuda=True)
iterator.word2id = model.word2id
iterator.id2word = model.id2word
model.vocab_expansion(model.id2word.values())

sentences = iterator.lines if batch_size == 'all' else iterator.lines[0:batch_size]
sentences = [' '.join(s[:max_length]) for s in sentences]

# encode the sentences in minibatches of 200 and stack the last hidden states
repr_last_h = np.empty((0, hidden_size))
for mbatch in range(0, len(sentences), 200):
    less_sentences = sentences[mbatch:mbatch + 200]
    _, last_h = model.get_representation(less_sentences,
                                         pool='last',
                                         return_numpy=True,
                                         tokenize=False)
    repr_last_h = np.append(repr_last_h, last_h, axis=0)
print(repr_last_h.shape)

iterator.build_kde(repr_last_h=repr_last_h, num_dim_pca=40, grid_search_num=7)
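
# Note on the accumulation pattern above: np.append copies the whole
# accumulated array on every 200-sentence minibatch, which is quadratic in the
# number of sentences. An equivalent sketch (same assumed get_representation
# signature) that collects the chunks in a list and concatenates once:
#
#   chunks = []
#   for mbatch in range(0, len(sentences), 200):
#       _, last_h = model.get_representation(sentences[mbatch:mbatch + 200],
#                                            pool='last', return_numpy=True,
#                                            tokenize=False)
#       chunks.append(last_h)
#   repr_last_h = np.concatenate(chunks, axis=0)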