import random

import torch
import torch.nn as nn


class Char_CNN(nn.Module):
    def __init__(self, opts, vocab, char_vocab, label_vocab):
        super(Char_CNN, self).__init__()

        random.seed(opts.seed)
        torch.manual_seed(opts.seed)
        torch.cuda.manual_seed(opts.seed)

        # embedding parameters
        self.embed_dim = opts.embed_size
        self.char_embed_dim = opts.char_embed_size
        self.vocab_size = vocab.m_size
        self.char_num = char_vocab.m_size
        self.pre_embed_path = opts.pre_embed_path
        self.str2idx = vocab.str2idx
        self.char_str2idx = char_vocab.str2idx
        self.embed_uniform_init = opts.embed_uniform_init

        # network parameters
        self.stride = opts.stride
        self.kernel_size = opts.kernel_size
        self.kernel_num = opts.kernel_num
        self.label_num = label_vocab.m_size
        self.embed_dropout = opts.embed_dropout
        self.fc_dropout = opts.fc_dropout

        # gpu option
        self.use_cuda = opts.use_cuda

        # embeddings: word level and char level
        self.word_embeddings = nn.Embedding(self.vocab_size, self.embed_dim)
        self.char_embeddings = nn.Embedding(self.char_num, self.char_embed_dim)

        if opts.pre_embed_path != '':
            # Embedding.load_predtrained_emb_zero is a project-local helper;
            # judging by its name, it loads pretrained vectors and zero-fills
            # rows missing from the pretrained file.
            embedding = Embedding.load_predtrained_emb_zero(
                self.pre_embed_path, self.str2idx)
            self.word_embeddings.weight.data.copy_(embedding)
        else:
            nn.init.uniform_(self.word_embeddings.weight.data,
                             -self.embed_uniform_init, self.embed_uniform_init)

        nn.init.uniform_(self.char_embeddings.weight.data,
                         -self.embed_uniform_init, self.embed_uniform_init)

        # per-token feature width: the word embedding concatenated with the
        # char-CNN features (one kernel_num-sized vector per kernel size)
        word_char_embed_dim = self.embed_dim + len(
            self.kernel_size) * self.kernel_num

        self.word_char_convs = nn.ModuleList([
            nn.Conv2d(1,
                      self.kernel_num, (K, word_char_embed_dim),
                      stride=self.stride,
                      padding=(K // 2, 0)) for K in self.kernel_size
        ])

        self.char_convs = nn.ModuleList([
            nn.Conv2d(1,
                      self.kernel_num, (K, self.char_embed_dim),
                      stride=self.stride,
                      padding=(K // 2, 0)) for K in self.kernel_size
        ])
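        # Both conv banks slide over a (batch, 1, seq_len, width) input whose
        # width matches each kernel's second dimension, so convolution only
        # moves along the sequence; padding=(K // 2, 0) preserves seq_len for
        # odd K at stride 1. word_char_convs presumably consume word
        # embeddings concatenated with pooled char-CNN features, hence the
        # wider input.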

        infea = len(self.kernel_size) * self.kernel_num
        self.linear1 = nn.Linear(infea, infea // 2)
        self.linear2 = nn.Linear(infea // 2, self.label_num)

        # note: these assignments replace the float dropout rates saved above
        # with the corresponding nn.Dropout modules
        self.embed_dropout = nn.Dropout(self.embed_dropout)
        self.fc_dropout = nn.Dropout(self.fc_dropout)
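
# A minimal smoke test of the constructor, assuming nothing beyond the
# attributes __init__ reads above. The opts namespace and the tiny duck-typed
# vocab class below are hypothetical stand-ins, not part of the original code.
if __name__ == '__main__':
    from types import SimpleNamespace

    class _Vocab:
        # hypothetical stand-in exposing the two attributes __init__ uses
        def __init__(self, items):
            self.str2idx = {w: i for i, w in enumerate(items)}
            self.m_size = len(items)

    opts = SimpleNamespace(
        seed=42, embed_size=50, char_embed_size=20,
        pre_embed_path='',  # empty string -> uniform-init branch
        embed_uniform_init=0.1,
        stride=1, kernel_size=[1, 3, 5], kernel_num=8,
        embed_dropout=0.3, fc_dropout=0.5, use_cuda=False)

    model = Char_CNN(opts,
                     vocab=_Vocab(['<pad>', 'hello', 'world']),
                     char_vocab=_Vocab(list('abcdefgh')),
                     label_vocab=_Vocab(['pos', 'neg']))
    print(model)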
Example #2
import pandas as pd


def gen_product_sentences():
    # order_products_prior and product_sentences_train are defined in code
    # elided from this excerpt. Each order becomes a "sentence": its product
    # ids sorted by the position in which they were added to the cart.
    product_sentences_prior = order_products_prior.sort_values([
        'order_id', 'add_to_cart_order'
    ]).groupby('order_id').apply(lambda order: order['product_id'].tolist())

    # Series.append was removed in pandas 2.0; concatenate instead
    product_sentences = pd.concat(
        [product_sentences_prior, product_sentences_train]).values

    return product_sentences


if __name__ == '__main__':
    print('Generating sentences...')
    product_sentences = gen_product_sentences()

    print('Generating product_vector features...')
    # Embedding here is a project-local wrapper: it trains word2vec-style
    # vectors over the order "sentences", reduces them to two components,
    # and returns them as a DataFrame keyed by product_id.
    embedding = Embedding(product_sentences)
    embedding.word_to_vector(size=100, window=5, min_count=2)
    embedding.reduce_dimension(n_components=2)
    product_vector_feat = embedding.return_dataframe(name='product_id')

    # cast with the builtin types (the np.int / np.float aliases were
    # removed in NumPy 1.24)
    product_vector_feat['product_id'] = product_vector_feat[
        'product_id'].astype(int)
    product_vector_feat['product_id_vector_1'] = product_vector_feat[
        'product_id_vector_1'].astype(float)
    product_vector_feat['product_id_vector_2'] = product_vector_feat[
        'product_id_vector_2'].astype(float)
    product_vector_feat.set_index('product_id', inplace=True)

    pickle_dump(product_vector_feat,
                '{}/product_vector_feat.pkl'.format(config.feat_folder))
    print('Done - product_vector features')
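
    # Optional sanity check, assuming pickle_dump wrote a standard pickle
    # that pandas.read_pickle can load back:
    feat_check = pd.read_pickle(
        '{}/product_vector_feat.pkl'.format(config.feat_folder))
    print(feat_check.head())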