Example #1
 def __init__(self,
              word_embeddings,
              word_dists,
              review_count,
              dropout=0.0,
              pretrain_emb_path=None,
              fix_emb=False):
     super(ParagraphVector, self).__init__()
     self.word_embeddings = word_embeddings
     self.fix_emb = fix_emb
     self.dropout_ = dropout
     self.word_dists = word_dists
     self._embedding_size = self.word_embeddings.weight.size()[-1]
     self.review_count = review_count
     self.review_pad_idx = review_count - 1
     self.pretrain_emb_path = pretrain_emb_path
     if pretrain_emb_path is not None:
         _, pretrained_weights = load_pretrain_embeddings(pretrain_emb_path)
         pretrained_weights.append(
             [0. for _ in range(self._embedding_size)])
         pretrained_weights = torch.FloatTensor(pretrained_weights)
         self.review_embeddings = nn.Embedding.from_pretrained(
             pretrained_weights)
         #, scale_grad_by_freq = scale_grad, sparse=self.is_emb_sparse
     else:
         self.review_embeddings = nn.Embedding(
             self.review_count,
             self._embedding_size,
             padding_idx=self.review_pad_idx)
     if self.fix_emb:
         self.review_embeddings.weight.requires_grad = False
         self.dropout_ = 0
     self.drop_layer = nn.Dropout(p=self.dropout_)
     self.bce_logits_loss = torch.nn.BCEWithLogitsLoss(
         reduction='none')  # keep per-element losses (the default reduction is 'mean')
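Both branches above reserve the last row of the embedding table as the padding vector for review_pad_idx. A minimal standalone sketch of the pretrained branch, with made-up dimensions standing in for the output of load_pretrain_embeddings (which is not part of this excerpt):

import torch
import torch.nn as nn

# Hypothetical pretrained matrix: 4 "reviews" x 8 dimensions, standing in for
# the weights returned by load_pretrain_embeddings (not shown in this excerpt).
pretrained_weights = [[0.1 * i] * 8 for i in range(4)]

# Append an all-zero row to act as the padding vector, mirroring
# review_pad_idx = review_count - 1 in the constructor above.
pretrained_weights.append([0.0] * 8)
weights = torch.FloatTensor(pretrained_weights)
pad_idx = weights.size(0) - 1

# from_pretrained freezes the table by default (freeze=True); pass
# freeze=False if the review vectors should remain trainable.
review_embeddings = nn.Embedding.from_pretrained(weights, padding_idx=pad_idx)

print(review_embeddings(torch.tensor([0, pad_idx])))  # second row is all zeros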
Example #2
 def __init__(self,
              word_embeddings,
              word_dists,
              corrupt_rate,
              dropout=0.0,
              pretrain_emb_path=None,
              vocab_words=None,
              fix_emb=False):
     super(ParagraphVectorCorruption, self).__init__()
     self.word_embeddings = word_embeddings
     self.word_dists = word_dists
     self._embedding_size = self.word_embeddings.weight.size()[-1]
     vocab_size = self.word_embeddings.weight.size()[0]
     self.word_pad_idx = vocab_size - 1
     if pretrain_emb_path is not None and vocab_words is not None:
         word_index_dic, pretrained_weights = load_pretrain_embeddings(
             pretrain_emb_path)
         word_indices = torch.tensor(
             [0] + [word_index_dic[x]
                    for x in vocab_words[1:]] + [self.word_pad_idx])
         pretrained_weights = torch.FloatTensor(pretrained_weights)
         self.context_embeddings = nn.Embedding.from_pretrained(
             pretrained_weights[word_indices],
             padding_idx=self.word_pad_idx)
     else:
         self.context_embeddings = self.word_embeddings
         #self.context_embeddings = nn.Embedding(
         #    vocab_size, self._embedding_size, padding_idx=self.word_pad_idx)
     self.dropout_ = dropout
     if fix_emb:
         self.context_embeddings.weight.requires_grad = False
         # dropout is assigned above, so this zero is not overwritten later
         self.dropout_ = 0
     self.corrupt_rate = corrupt_rate
     self.train_corrupt_rate = corrupt_rate
     self.bce_logits_loss = torch.nn.BCEWithLogitsLoss(
         reduction='none')  # keep per-element losses (the default reduction is 'mean')
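The word_indices construction above (the same pattern appears again below) permutes the rows of the pretrained matrix so that row i of the new table corresponds to word i of the model's own vocabulary rather than to the row order of the embedding file. A simplified, self-contained sketch with a toy vocabulary; the names and slicing details are illustrative only:

import torch
import torch.nn as nn

# Toy stand-ins: the model's vocabulary, with index 0 reserved and the last
# index used for padding, plus the row order of a hypothetical embedding file.
vocab_words = ["<rsv>", "coffee", "mug", "<pad>"]
word_pad_idx = len(vocab_words) - 1  # 3

pretrained_weights = torch.FloatTensor([
    [0.5, 0.5, 0.5, 0.5],   # row 0: reserved slot
    [2.0, 2.0, 2.0, 2.0],   # row 1: "mug"
    [1.0, 1.0, 1.0, 1.0],   # row 2: "coffee"
    [0.0, 0.0, 0.0, 0.0],   # row 3: zero row used for padding
])
word_index_dic = {"mug": 1, "coffee": 2}

# Map each vocabulary position to its row in the pretrained file, keeping
# position 0 and the padding row fixed (the real code slices vocab_words[1:]).
word_indices = torch.tensor(
    [0]
    + [word_index_dic[w] for w in vocab_words[1:-1]]
    + [pretrained_weights.size(0) - 1])        # tensor([0, 2, 1, 3])

context_embeddings = nn.Embedding.from_pretrained(
    pretrained_weights[word_indices], padding_idx=word_pad_idx)
print(context_embeddings(torch.tensor([1])))   # the "coffee" vector: all 1.0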
Example #3
    def __init__(self,
                 args,
                 device,
                 vocab_size,
                 review_count,
                 product_size,
                 user_size,
                 review_words,
                 vocab_words,
                 word_dists=None):
        super(ProductRanker, self).__init__()
        self.args = args
        self.device = device
        self.train_review_only = args.train_review_only
        self.embedding_size = args.embedding_size
        self.vocab_words = vocab_words
        self.word_dists = None
        if word_dists is not None:
            self.word_dists = torch.tensor(word_dists, device=device)
        self.prod_pad_idx = product_size
        self.user_pad_idx = user_size
        self.word_pad_idx = vocab_size - 1
        self.seg_pad_idx = 3
        self.review_pad_idx = review_count - 1
        self.emb_dropout = args.dropout
        self.review_encoder_name = args.review_encoder_name
        self.fix_emb = args.fix_emb

        padded_review_words = review_words
        if not self.args.do_subsample_mask:
            # otherwise, review_words should already be padded
            padded_review_words = pad(review_words,
                                      pad_id=self.word_pad_idx,
                                      width=args.review_word_limit)
        self.review_words = torch.tensor(padded_review_words, device=device)

        self.pretrain_emb_dir = None
        if os.path.exists(args.pretrain_emb_dir):
            self.pretrain_emb_dir = args.pretrain_emb_dir
        self.pretrain_up_emb_dir = None
        if os.path.exists(args.pretrain_up_emb_dir):
            self.pretrain_up_emb_dir = args.pretrain_up_emb_dir
        self.dropout_layer = nn.Dropout(p=args.dropout)

        if self.args.use_user_emb:
            if self.pretrain_up_emb_dir is None:
                self.user_emb = nn.Embedding(user_size + 1,
                                             self.embedding_size,
                                             padding_idx=self.user_pad_idx)
            else:
                pretrain_user_emb_path = os.path.join(self.pretrain_up_emb_dir,
                                                      "user_emb.txt")
                pretrained_weights = load_user_item_embeddings(
                    pretrain_user_emb_path)
                pretrained_weights.append([0.] * len(pretrained_weights[0]))
                assert len(pretrained_weights[0]) == self.embedding_size
                self.user_emb = nn.Embedding.from_pretrained(
                    torch.FloatTensor(pretrained_weights),
                    padding_idx=self.user_pad_idx)

        if self.args.use_item_emb:
            if self.pretrain_up_emb_dir is None:
                self.product_emb = nn.Embedding(product_size + 1,
                                                self.embedding_size,
                                                padding_idx=self.prod_pad_idx)
            else:
                pretrain_product_emb_path = os.path.join(
                    self.pretrain_up_emb_dir, "product_emb.txt")
                pretrained_weights = load_user_item_embeddings(
                    pretrain_product_emb_path)
                pretrained_weights.append([0.] * len(pretrained_weights[0]))
                self.product_emb = nn.Embedding.from_pretrained(
                    torch.FloatTensor(pretrained_weights),
                    padding_idx=self.prod_pad_idx)

        if self.pretrain_emb_dir is not None:
            #word_emb_fname = "word_emb.txt.gz" #for query and target words in pv and pvc
            word_emb_fname = "context_emb.txt.gz" if args.review_encoder_name == "pvc" else "word_emb.txt.gz"  #for query and target words in pv and pvc
            pretrain_word_emb_path = os.path.join(self.pretrain_emb_dir,
                                                  word_emb_fname)
            word_index_dic, pretrained_weights = load_pretrain_embeddings(
                pretrain_word_emb_path)
            word_indices = torch.tensor(
                [0] + [word_index_dic[x]
                       for x in self.vocab_words[1:]] + [self.word_pad_idx])
            #print(len(word_indices))
            #print(word_indices.cpu().tolist())
            pretrained_weights = torch.FloatTensor(pretrained_weights)
            self.word_embeddings = nn.Embedding.from_pretrained(
                pretrained_weights[word_indices],
                padding_idx=self.word_pad_idx)
            #vectors of padding idx will not be updated
        else:
            self.word_embeddings = nn.Embedding(vocab_size,
                                                self.embedding_size,
                                                padding_idx=self.word_pad_idx)

        if self.fix_emb and args.review_encoder_name == "pvc":
            #if review embeddings are fixed, just load the aggregated embeddings which include all the words in the review
            #otherwise the reviews are cut off at review_word_limit
            self.review_encoder_name = "pv"

        self.transformer_encoder = TransformerEncoder(self.embedding_size,
                                                      args.ff_size, args.heads,
                                                      args.dropout,
                                                      args.inter_layers)

        if self.review_encoder_name == "pv":
            pretrain_emb_path = None
            if self.pretrain_emb_dir is not None:
                pretrain_emb_path = os.path.join(self.pretrain_emb_dir,
                                                 "doc_emb.txt.gz")
            self.review_encoder = ParagraphVector(self.word_embeddings,
                                                  self.word_dists,
                                                  review_count,
                                                  self.emb_dropout,
                                                  pretrain_emb_path,
                                                  fix_emb=self.fix_emb)
        elif self.review_encoder_name == "pvc":
            pretrain_emb_path = None
            #if self.pretrain_emb_dir is not None:
            #    pretrain_emb_path = os.path.join(self.pretrain_emb_dir, "context_emb.txt.gz")
            self.review_encoder = ParagraphVectorCorruption(
                self.word_embeddings,
                self.word_dists,
                args.corrupt_rate,
                self.emb_dropout,
                pretrain_emb_path,
                self.vocab_words,
                fix_emb=self.fix_emb)
        elif self.review_encoder_name == "fs":
            self.review_encoder = FSEncoder(self.embedding_size,
                                            self.emb_dropout)
        else:
            self.review_encoder = AVGEncoder(self.embedding_size,
                                             self.emb_dropout)

        if args.query_encoder_name == "fs":
            self.query_encoder = FSEncoder(self.embedding_size,
                                           self.emb_dropout)
        else:
            self.query_encoder = AVGEncoder(self.embedding_size,
                                            self.emb_dropout)
        self.seg_embeddings = nn.Embedding(4,
                                           self.embedding_size,
                                           padding_idx=self.seg_pad_idx)
        # four segment types for each (q, u, i) triple:
        # the query Q, previous purchases of u, currently available reviews for i, and padding
        #self.logsoftmax = torch.nn.LogSoftmax(dim = -1)
        #self.bce_logits_loss = torch.nn.BCEWithLogitsLoss(reduction='none')#by default it's mean

        self.review_embeddings = None
        if self.fix_emb:
            #self.word_embeddings.weight.requires_grad = False
            #embeddings of query words need to be updated
            #self.emb_dropout = 0
            self.get_review_embeddings()  #get model.review_embeddings

        self.initialize_parameters(logger)  #logger
        self.to(device)  #change model in place
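The pad helper called above is not included in this excerpt; judging from the call site, it truncates or right-pads each review's word-id list to review_word_limit entries using word_pad_idx. A hypothetical sketch of such a helper:

from typing import List

def pad(data: List[List[int]], pad_id: int, width: int) -> List[List[int]]:
    """Truncate or right-pad each word-id list to exactly `width` entries."""
    # Assumed behavior, reconstructed only from the call site
    # pad(review_words, pad_id=self.word_pad_idx, width=args.review_word_limit).
    return [ids[:width] + [pad_id] * max(0, width - len(ids)) for ids in data]

# Two "reviews" padded/truncated to width 4 with pad_id 99.
print(pad([[5, 6], [1, 2, 3, 4, 7]], pad_id=99, width=4))
# -> [[5, 6, 99, 99], [1, 2, 3, 4]]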
Example #4
    def __init__(self,
                 args,
                 device,
                 vocab_size,
                 product_size,
                 vocab_words,
                 word_dists=None):
        super(ItemTransformerRanker, self).__init__()
        self.args = args
        self.device = device
        self.train_review_only = args.train_review_only
        self.embedding_size = args.embedding_size
        self.vocab_words = vocab_words
        self.word_dists = None
        if word_dists is not None:
            self.word_dists = torch.tensor(word_dists, device=device)
        self.prod_dists = torch.ones(product_size, device=device)
        self.prod_pad_idx = product_size
        self.word_pad_idx = vocab_size - 1
        self.seg_pad_idx = 3
        self.emb_dropout = args.dropout
        self.pretrain_emb_dir = None
        if os.path.exists(args.pretrain_emb_dir):
            self.pretrain_emb_dir = args.pretrain_emb_dir
        self.pretrain_up_emb_dir = None
        if os.path.exists(args.pretrain_up_emb_dir):
            self.pretrain_up_emb_dir = args.pretrain_up_emb_dir
        self.dropout_layer = nn.Dropout(p=args.dropout)

        self.product_emb = nn.Embedding(product_size + 1,
                                        self.embedding_size,
                                        padding_idx=self.prod_pad_idx)
        if args.sep_prod_emb:
            self.hist_product_emb = nn.Embedding(product_size + 1,
                                                 self.embedding_size,
                                                 padding_idx=self.prod_pad_idx)
        '''
        else:
            pretrain_product_emb_path = os.path.join(self.pretrain_up_emb_dir, "product_emb.txt")
            pretrained_weights = load_user_item_embeddings(pretrain_product_emb_path)
            pretrained_weights.append([0.] * len(pretrained_weights[0]))
            self.product_emb = nn.Embedding.from_pretrained(torch.FloatTensor(pretrained_weights), padding_idx=self.prod_pad_idx)
        '''
        self.product_bias = nn.Parameter(torch.zeros(product_size + 1),
                                         requires_grad=True)
        self.word_bias = nn.Parameter(torch.zeros(vocab_size),
                                      requires_grad=True)

        if self.pretrain_emb_dir is not None:
            word_emb_fname = "word_emb.txt.gz"  #for query and target words in pv and pvc
            pretrain_word_emb_path = os.path.join(self.pretrain_emb_dir,
                                                  word_emb_fname)
            word_index_dic, pretrained_weights = load_pretrain_embeddings(
                pretrain_word_emb_path)
            word_indices = torch.tensor(
                [0] + [word_index_dic[x]
                       for x in self.vocab_words[1:]] + [self.word_pad_idx])
            #print(len(word_indices))
            #print(word_indices.cpu().tolist())
            pretrained_weights = torch.FloatTensor(pretrained_weights)
            self.word_embeddings = nn.Embedding.from_pretrained(
                pretrained_weights[word_indices],
                padding_idx=self.word_pad_idx)
            #vectors of padding idx will not be updated
        else:
            self.word_embeddings = nn.Embedding(vocab_size,
                                                self.embedding_size,
                                                padding_idx=self.word_pad_idx)
        if self.args.model_name == "item_transformer":
            self.transformer_encoder = TransformerEncoder(
                self.embedding_size, args.ff_size, args.heads, args.dropout,
                args.inter_layers)
        #if self.args.model_name == "ZAM" or self.args.model_name == "AEM":
        else:
            self.attention_encoder = MultiHeadedAttention(
                args.heads, self.embedding_size, args.dropout)

        if args.query_encoder_name == "fs":
            self.query_encoder = FSEncoder(self.embedding_size,
                                           self.emb_dropout)
        else:
            self.query_encoder = AVGEncoder(self.embedding_size,
                                            self.emb_dropout)
        self.seg_embeddings = nn.Embedding(4,
                                           self.embedding_size,
                                           padding_idx=self.seg_pad_idx)
        # four segment types for each (q, u, i) triple:
        # the query Q, previous purchases of u, currently available reviews for i, and padding
        #self.logsoftmax = torch.nn.LogSoftmax(dim = -1)
        self.bce_logits_loss = torch.nn.BCEWithLogitsLoss(
            reduction='none')  # keep per-element losses (the default reduction is 'mean')

        self.initialize_parameters(logger)  #logger
        self.to(device)  #change model in place
        self.item_loss = 0
        self.ps_loss = 0
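These modules build BCEWithLogitsLoss with reduction='none', so the loss comes back per element and can be masked (for example, to ignore padded positions) before averaging, instead of using the default 'mean' reduction. A small illustration of that masking pattern; the shapes and the mask are invented for the example:

import torch

loss_fn = torch.nn.BCEWithLogitsLoss(reduction='none')  # per-element losses

logits = torch.tensor([[0.2, -1.3, 0.0], [1.5, 0.4, -0.7]])
targets = torch.tensor([[1.0, 0.0, 1.0], [1.0, 1.0, 0.0]])
# Hypothetical mask: 1 for real positions, 0 for padded ones.
mask = torch.tensor([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])

elementwise = loss_fn(logits, targets)                # same shape as logits
masked_mean = (elementwise * mask).sum() / mask.sum() # average over real positions only
print(elementwise.shape, masked_mean.item())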