def __init__(self, word_embeddings, word_dists, review_count,
             dropout=0.0, pretrain_emb_path=None, fix_emb=False):
    super(ParagraphVector, self).__init__()
    self.word_embeddings = word_embeddings
    self.fix_emb = fix_emb
    self.dropout_ = dropout
    self.word_dists = word_dists
    self._embedding_size = self.word_embeddings.weight.size()[-1]
    self.review_count = review_count
    self.review_pad_idx = review_count - 1  # the last review id is reserved for padding
    self.pretrain_emb_path = pretrain_emb_path
    if pretrain_emb_path is not None:
        _, pretrained_weights = load_pretrain_embeddings(pretrain_emb_path)
        # append an all-zero vector for the review padding index
        pretrained_weights.append([0. for _ in range(self._embedding_size)])
        pretrained_weights = torch.FloatTensor(pretrained_weights)
        self.review_embeddings = nn.Embedding.from_pretrained(pretrained_weights)
    else:
        self.review_embeddings = nn.Embedding(
            self.review_count, self._embedding_size,
            padding_idx=self.review_pad_idx)
    if self.fix_emb:
        # frozen review embeddings: no gradient updates and no dropout
        self.review_embeddings.weight.requires_grad = False
        self.dropout_ = 0
    self.drop_layer = nn.Dropout(p=self.dropout_)
    self.bce_logits_loss = torch.nn.BCEWithLogitsLoss(
        reduction='none')  # default reduction would be 'mean'
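# A minimal construction sketch for ParagraphVector, assuming the class is
# importable from this module; the vocabulary size, embedding size, review
# count, and dropout below are illustrative values, not the repo's defaults.
import torch
import torch.nn as nn

vocab_size, embedding_size, review_count = 1000, 128, 500
word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=vocab_size - 1)
word_dists = torch.ones(vocab_size) / vocab_size  # uniform placeholder for the corpus word distribution
pv_encoder = ParagraphVector(word_embeddings, word_dists, review_count, dropout=0.1)
# Without pretrain_emb_path, review embeddings are a fresh table whose last
# id (review_count - 1) is the padding index.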
def __init__(self, word_embeddings, word_dists, corrupt_rate,
             dropout=0.0, pretrain_emb_path=None, vocab_words=None, fix_emb=False):
    super(ParagraphVectorCorruption, self).__init__()
    self.word_embeddings = word_embeddings
    self.word_dists = word_dists
    self._embedding_size = self.word_embeddings.weight.size()[-1]
    vocab_size = self.word_embeddings.weight.size()[0]
    self.word_pad_idx = vocab_size - 1  # the last word id is reserved for padding
    self.corrupt_rate = corrupt_rate
    self.train_corrupt_rate = corrupt_rate
    # set dropout before the fix_emb check so that fixing embeddings also disables dropout
    self.dropout_ = dropout
    if pretrain_emb_path is not None and vocab_words is not None:
        word_index_dic, pretrained_weights = load_pretrain_embeddings(
            pretrain_emb_path)
        # select pretrained rows in this model's vocabulary order
        word_indices = torch.tensor(
            [0] + [word_index_dic[x] for x in vocab_words[1:]] + [self.word_pad_idx])
        pretrained_weights = torch.FloatTensor(pretrained_weights)
        self.context_embeddings = nn.Embedding.from_pretrained(
            pretrained_weights[word_indices], padding_idx=self.word_pad_idx)
    else:
        # share the model's word embedding table as the context embeddings
        self.context_embeddings = self.word_embeddings
    if fix_emb:
        # frozen context embeddings: no gradient updates and no dropout
        self.context_embeddings.weight.requires_grad = False
        self.dropout_ = 0
    self.bce_logits_loss = torch.nn.BCEWithLogitsLoss(
        reduction='none')  # default reduction would be 'mean'
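# A minimal construction sketch for ParagraphVectorCorruption, assuming the
# class is importable from this module; the sizes and the corruption rate are
# illustrative, not the repo's defaults.
import torch
import torch.nn as nn

vocab_size, embedding_size = 1000, 128
word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=vocab_size - 1)
word_dists = torch.ones(vocab_size) / vocab_size  # uniform placeholder for the corpus word distribution
pvc_encoder = ParagraphVectorCorruption(
    word_embeddings, word_dists, corrupt_rate=0.9, dropout=0.1)
# With no pretrain_emb_path, the encoder reuses word_embeddings as its context embeddings.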
def __init__(self, args, device, vocab_size, review_count,
             product_size, user_size, review_words, vocab_words, word_dists=None):
    super(ProductRanker, self).__init__()
    self.args = args
    self.device = device
    self.train_review_only = args.train_review_only
    self.embedding_size = args.embedding_size
    self.vocab_words = vocab_words
    self.word_dists = None
    if word_dists is not None:
        self.word_dists = torch.tensor(word_dists, device=device)
    self.prod_pad_idx = product_size
    self.user_pad_idx = user_size
    self.word_pad_idx = vocab_size - 1
    self.seg_pad_idx = 3
    self.review_pad_idx = review_count - 1
    self.emb_dropout = args.dropout
    self.review_encoder_name = args.review_encoder_name
    self.fix_emb = args.fix_emb

    padded_review_words = review_words
    if not self.args.do_subsample_mask:
        # otherwise review_words should already be padded
        padded_review_words = pad(review_words, pad_id=self.word_pad_idx,
                                  width=args.review_word_limit)
    self.review_words = torch.tensor(padded_review_words, device=device)

    self.pretrain_emb_dir = None
    if os.path.exists(args.pretrain_emb_dir):
        self.pretrain_emb_dir = args.pretrain_emb_dir
    self.pretrain_up_emb_dir = None
    if os.path.exists(args.pretrain_up_emb_dir):
        self.pretrain_up_emb_dir = args.pretrain_up_emb_dir
    self.dropout_layer = nn.Dropout(p=args.dropout)

    if self.args.use_user_emb:
        if self.pretrain_up_emb_dir is None:
            self.user_emb = nn.Embedding(
                user_size + 1, self.embedding_size, padding_idx=self.user_pad_idx)
        else:
            pretrain_user_emb_path = os.path.join(self.pretrain_up_emb_dir, "user_emb.txt")
            pretrained_weights = load_user_item_embeddings(pretrain_user_emb_path)
            # append an all-zero vector for the user padding index
            pretrained_weights.append([0.] * len(pretrained_weights[0]))
            assert len(pretrained_weights[0]) == self.embedding_size
            self.user_emb = nn.Embedding.from_pretrained(
                torch.FloatTensor(pretrained_weights), padding_idx=self.user_pad_idx)

    if self.args.use_item_emb:
        if self.pretrain_up_emb_dir is None:
            self.product_emb = nn.Embedding(
                product_size + 1, self.embedding_size, padding_idx=self.prod_pad_idx)
        else:
            pretrain_product_emb_path = os.path.join(
                self.pretrain_up_emb_dir, "product_emb.txt")
            pretrained_weights = load_user_item_embeddings(pretrain_product_emb_path)
            # append an all-zero vector for the product padding index
            pretrained_weights.append([0.] * len(pretrained_weights[0]))
            self.product_emb = nn.Embedding.from_pretrained(
                torch.FloatTensor(pretrained_weights), padding_idx=self.prod_pad_idx)

    if self.pretrain_emb_dir is not None:
        # query and target word embeddings; pvc stores them as context embeddings
        word_emb_fname = "context_emb.txt.gz" if args.review_encoder_name == "pvc" else "word_emb.txt.gz"
        pretrain_word_emb_path = os.path.join(self.pretrain_emb_dir, word_emb_fname)
        word_index_dic, pretrained_weights = load_pretrain_embeddings(pretrain_word_emb_path)
        # select pretrained rows in this model's vocabulary order
        word_indices = torch.tensor(
            [0] + [word_index_dic[x] for x in self.vocab_words[1:]] + [self.word_pad_idx])
        pretrained_weights = torch.FloatTensor(pretrained_weights)
        self.word_embeddings = nn.Embedding.from_pretrained(
            pretrained_weights[word_indices], padding_idx=self.word_pad_idx)
        # vectors at the padding index will not be updated
    else:
        self.word_embeddings = nn.Embedding(
            vocab_size, self.embedding_size, padding_idx=self.word_pad_idx)

    if self.fix_emb and args.review_encoder_name == "pvc":
        # If review embeddings are fixed, just load the aggregated embeddings, which
        # include all the words in a review; otherwise reviews are cut off at review_word_limit.
        self.review_encoder_name = "pv"

    self.transformer_encoder = TransformerEncoder(
        self.embedding_size, args.ff_size, args.heads, args.dropout, args.inter_layers)

    if self.review_encoder_name == "pv":
        pretrain_emb_path = None
        if self.pretrain_emb_dir is not None:
            pretrain_emb_path = os.path.join(self.pretrain_emb_dir, "doc_emb.txt.gz")
        self.review_encoder = ParagraphVector(
            self.word_embeddings, self.word_dists, review_count,
            self.emb_dropout, pretrain_emb_path, fix_emb=self.fix_emb)
    elif self.review_encoder_name == "pvc":
        pretrain_emb_path = None
        self.review_encoder = ParagraphVectorCorruption(
            self.word_embeddings, self.word_dists, args.corrupt_rate,
            self.emb_dropout, pretrain_emb_path, self.vocab_words, fix_emb=self.fix_emb)
    elif self.review_encoder_name == "fs":
        self.review_encoder = FSEncoder(self.embedding_size, self.emb_dropout)
    else:
        self.review_encoder = AVGEncoder(self.embedding_size, self.emb_dropout)

    if args.query_encoder_name == "fs":
        self.query_encoder = FSEncoder(self.embedding_size, self.emb_dropout)
    else:
        self.query_encoder = AVGEncoder(self.embedding_size, self.emb_dropout)

    # segment ids for the query, the user's previous purchases,
    # the item's currently available reviews, and padding
    self.seg_embeddings = nn.Embedding(4, self.embedding_size, padding_idx=self.seg_pad_idx)

    self.review_embeddings = None
    if self.fix_emb:
        # word embeddings are not frozen here because query-word embeddings still
        # need updates; only the review embeddings are precomputed
        self.get_review_embeddings()  # populates self.review_embeddings
    self.initialize_parameters(logger)
    self.to(device)  # move the model to the target device in place
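# A hedged construction sketch for ProductRanker. The SimpleNamespace below lists
# only the fields this constructor reads from `args`; every value is illustrative,
# not a repo default, and the empty pretrain dirs make os.path.exists() fail so the
# randomly initialized embeddings are used.
from types import SimpleNamespace

ranker_args = SimpleNamespace(
    train_review_only=True, embedding_size=128, dropout=0.1,
    review_encoder_name="pv", query_encoder_name="fs", fix_emb=False,
    do_subsample_mask=False, review_word_limit=100,
    pretrain_emb_dir="", pretrain_up_emb_dir="",
    use_user_emb=True, use_item_emb=True,
    ff_size=512, heads=8, inter_layers=2, corrupt_rate=0.9)
# The model is then built with the dataset-dependent arguments, e.g.:
# model = ProductRanker(ranker_args, device, vocab_size, review_count,
#                       product_size, user_size, review_words, vocab_words, word_dists)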
def __init__(self, args, device, vocab_size, product_size, vocab_words, word_dists=None):
    super(ItemTransformerRanker, self).__init__()
    self.args = args
    self.device = device
    self.train_review_only = args.train_review_only
    self.embedding_size = args.embedding_size
    self.vocab_words = vocab_words
    self.word_dists = None
    if word_dists is not None:
        self.word_dists = torch.tensor(word_dists, device=device)
    self.prod_dists = torch.ones(product_size, device=device)
    self.prod_pad_idx = product_size
    self.word_pad_idx = vocab_size - 1
    self.seg_pad_idx = 3
    self.emb_dropout = args.dropout
    self.pretrain_emb_dir = None
    if os.path.exists(args.pretrain_emb_dir):
        self.pretrain_emb_dir = args.pretrain_emb_dir
    self.pretrain_up_emb_dir = None
    if os.path.exists(args.pretrain_up_emb_dir):
        self.pretrain_up_emb_dir = args.pretrain_up_emb_dir
    self.dropout_layer = nn.Dropout(p=args.dropout)

    self.product_emb = nn.Embedding(
        product_size + 1, self.embedding_size, padding_idx=self.prod_pad_idx)
    if args.sep_prod_emb:
        # separate embedding table for products in the purchase history
        self.hist_product_emb = nn.Embedding(
            product_size + 1, self.embedding_size, padding_idx=self.prod_pad_idx)
    '''
    else:
        pretrain_product_emb_path = os.path.join(self.pretrain_up_emb_dir, "product_emb.txt")
        pretrained_weights = load_user_item_embeddings(pretrain_product_emb_path)
        pretrained_weights.append([0.] * len(pretrained_weights[0]))
        self.product_emb = nn.Embedding.from_pretrained(
            torch.FloatTensor(pretrained_weights), padding_idx=self.prod_pad_idx)
    '''
    self.product_bias = nn.Parameter(torch.zeros(product_size + 1), requires_grad=True)
    self.word_bias = nn.Parameter(torch.zeros(vocab_size), requires_grad=True)

    if self.pretrain_emb_dir is not None:
        # query and target word embeddings
        word_emb_fname = "word_emb.txt.gz"
        pretrain_word_emb_path = os.path.join(self.pretrain_emb_dir, word_emb_fname)
        word_index_dic, pretrained_weights = load_pretrain_embeddings(pretrain_word_emb_path)
        # select pretrained rows in this model's vocabulary order
        word_indices = torch.tensor(
            [0] + [word_index_dic[x] for x in self.vocab_words[1:]] + [self.word_pad_idx])
        pretrained_weights = torch.FloatTensor(pretrained_weights)
        self.word_embeddings = nn.Embedding.from_pretrained(
            pretrained_weights[word_indices], padding_idx=self.word_pad_idx)
        # vectors at the padding index will not be updated
    else:
        self.word_embeddings = nn.Embedding(
            vocab_size, self.embedding_size, padding_idx=self.word_pad_idx)

    if self.args.model_name == "item_transformer":
        self.transformer_encoder = TransformerEncoder(
            self.embedding_size, args.ff_size, args.heads, args.dropout, args.inter_layers)
    else:  # model_name is "ZAM" or "AEM"
        self.attention_encoder = MultiHeadedAttention(
            args.heads, self.embedding_size, args.dropout)

    if args.query_encoder_name == "fs":
        self.query_encoder = FSEncoder(self.embedding_size, self.emb_dropout)
    else:
        self.query_encoder = AVGEncoder(self.embedding_size, self.emb_dropout)

    # segment ids for the query, the user's previous purchases,
    # the item's currently available reviews, and padding
    self.seg_embeddings = nn.Embedding(4, self.embedding_size, padding_idx=self.seg_pad_idx)
    self.bce_logits_loss = torch.nn.BCEWithLogitsLoss(
        reduction='none')  # default reduction would be 'mean'

    self.initialize_parameters(logger)
    self.to(device)  # move the model to the target device in place
    self.item_loss = 0
    self.ps_loss = 0
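# A hedged construction sketch for ItemTransformerRanker. As above, the namespace
# lists only the fields this constructor reads from `args`, with illustrative values;
# empty pretrain dirs select the randomly initialized word embeddings.
from types import SimpleNamespace

item_args = SimpleNamespace(
    train_review_only=True, embedding_size=128, dropout=0.1,
    pretrain_emb_dir="", pretrain_up_emb_dir="",
    sep_prod_emb=False, model_name="item_transformer",
    query_encoder_name="fs", ff_size=512, heads=8, inter_layers=2)
# Construction additionally needs the dataset vocabulary and the module's logger
# and parameter-initialization settings, e.g.:
# model = ItemTransformerRanker(item_args, device, vocab_size, product_size,
#                               vocab_words, word_dists)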