def _build_network(self):
    # Category Embedding layers
    self.cat_embedding_layers = nn.ModuleList(
        [
            nn.Embedding(cardinality, self.hparams.embedding_dim)
            for cardinality in self.hparams.categorical_cardinality
        ]
    )
    if self.hparams.batch_norm_continuous_input:
        self.normalizing_batch_norm = nn.BatchNorm1d(self.hparams.continuous_dim)
    # Continuous Embedding Layer
    self.cont_embedding_layer = nn.Embedding(
        self.hparams.continuous_dim, self.hparams.embedding_dim
    )
    if self.hparams.embedding_dropout != 0 and self.embedding_cat_dim != 0:
        self.embed_dropout = nn.Dropout(self.hparams.embedding_dropout)
    # Deep Layers
    _curr_units = self.hparams.embedding_dim
    if self.hparams.deep_layers:
        activation = getattr(nn, self.hparams.activation)
        # Linear Layers
        layers = []
        for units in self.hparams.layers.split("-"):
            layers.extend(
                _linear_dropout_bn(
                    self.hparams,
                    _curr_units,
                    int(units),
                    activation,
                    self.hparams.dropout,
                )
            )
            _curr_units = int(units)
        self.linear_layers = nn.Sequential(*layers)
    # Projection to Multi-Headed Attention Dims
    self.attn_proj = nn.Linear(_curr_units, self.hparams.attn_embed_dim)
    _initialize_layers(self.hparams, self.attn_proj)
    # Multi-Headed Attention Layers
    self.self_attns = nn.ModuleList(
        [
            nn.MultiheadAttention(
                self.hparams.attn_embed_dim,
                self.hparams.num_heads,
                dropout=self.hparams.attn_dropouts,
            )
            for _ in range(self.hparams.num_attn_blocks)
        ]
    )
    if self.hparams.has_residuals:
        self.V_res_embedding = torch.nn.Linear(
            _curr_units,
            self.hparams.attn_embed_dim * self.hparams.num_attn_blocks
            if self.hparams.attention_pooling
            else self.hparams.attn_embed_dim,
        )
    self.output_dim = (
        self.hparams.continuous_dim + self.hparams.categorical_dim
    ) * self.hparams.attn_embed_dim
    if self.hparams.attention_pooling:
        self.output_dim = self.output_dim * self.hparams.num_attn_blocks
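# Illustrative sketch (assumption, not the library's forward pass): how the stacked
# nn.MultiheadAttention blocks built above are typically consumed. Each categorical and
# continuous feature becomes one token of size attn_embed_dim; nn.MultiheadAttention
# defaults to (seq_len, batch, embed_dim), so the feature axis is moved to the front.
# All shapes and names below are made up for demonstration only.
import torch
import torch.nn as nn

batch, num_features, attn_embed_dim, num_heads, num_blocks = 32, 10, 16, 4, 2
tokens = torch.randn(batch, num_features, attn_embed_dim)

self_attns = nn.ModuleList(
    [nn.MultiheadAttention(attn_embed_dim, num_heads, dropout=0.0) for _ in range(num_blocks)]
)

x = tokens.transpose(0, 1)  # (num_features, batch, attn_embed_dim)
for attn in self_attns:
    x, _ = attn(x, x, x)  # self-attention: query = key = value
out = x.transpose(0, 1).reshape(batch, -1)  # (batch, num_features * attn_embed_dim)
print(out.shape)  # torch.Size([32, 160]) -> matches output_dim = num_features * attn_embed_dim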
def _build_network(self):
    # Linear Layers
    layers = []
    _curr_units = self.embedding_cat_dim + self.hparams.continuous_dim
    if self.hparams.embedding_dropout != 0 and self.embedding_cat_dim != 0:
        layers.append(nn.Dropout(self.hparams.embedding_dropout))
    for units in self.hparams.layers.split("-"):
        layers.extend(
            _linear_dropout_bn(
                self.hparams.activation,
                self.hparams.initialization,
                self.hparams.use_batch_norm,
                _curr_units,
                int(units),
                self.hparams.dropout,
            )
        )
        _curr_units = int(units)
    self.linear_layers = nn.Sequential(*layers)
    self.output_dim = _curr_units
    # Embedding layers
    self.embedding_layers = nn.ModuleList(
        [nn.Embedding(x, y) for x, y in self.hparams.embedding_dims]
    )
    # Continuous Layers
    if self.hparams.batch_norm_continuous_input:
        self.normalizing_batch_norm = nn.BatchNorm1d(self.hparams.continuous_dim)
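# Illustrative sketch (assumption, not the library's forward pass): how the per-column
# embedding layers above are typically consumed. Each categorical column is looked up in
# its own nn.Embedding and the results are concatenated with the continuous features,
# giving an MLP input of size embedding_cat_dim + continuous_dim. Values below are
# hypothetical.
import torch
import torch.nn as nn

embedding_dims = [(10, 4), (7, 3)]          # (cardinality, embedding size) per column
embedding_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in embedding_dims])

categorical = torch.randint(0, 7, (32, 2))  # (batch, n_categorical_cols); indices < each cardinality
continuous = torch.randn(32, 5)             # (batch, continuous_dim)

embedded = [layer(categorical[:, i]) for i, layer in enumerate(embedding_layers)]
mlp_input = torch.cat(embedded + [continuous], dim=1)
print(mlp_input.shape)  # torch.Size([32, 12]) == (4 + 3) + 5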
def _build_network(self):
    activation = getattr(nn, self.hparams.activation)
    # Linear Layers
    layers = []
    _curr_units = self.embedding_cat_dim + self.hparams.continuous_dim
    if self.hparams.embedding_dropout != 0 and self.embedding_cat_dim != 0:
        layers.append(nn.Dropout(self.hparams.embedding_dropout))
    for units in self.hparams.layers.split("-"):
        layers.extend(
            _linear_dropout_bn(
                self.hparams,
                _curr_units,
                int(units),
                activation,
                self.hparams.dropout,
            )
        )
        _curr_units = int(units)
    self.linear_layers = nn.Sequential(*layers)
    self.output_dim = _curr_units
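# Illustrative sketch (assumption, not the actual _linear_dropout_bn helper): a minimal
# stand-in showing the kind of block list the loop above concatenates for a layer string
# such as "64-32": Linear -> activation -> Dropout (-> BatchNorm1d when enabled).
import torch.nn as nn

def linear_dropout_bn_sketch(in_units, out_units, activation_cls=nn.ReLU,
                             dropout=0.1, use_batch_norm=False):
    block = [nn.Linear(in_units, out_units), activation_cls()]
    if dropout != 0:
        block.append(nn.Dropout(dropout))
    if use_batch_norm:
        block.append(nn.BatchNorm1d(out_units))
    return block

layers, curr_units = [], 24  # e.g. embedding_cat_dim + continuous_dim
for units in "64-32".split("-"):
    layers.extend(linear_dropout_bn_sketch(curr_units, int(units)))
    curr_units = int(units)
linear_layers = nn.Sequential(*layers)  # output_dim == curr_units == 32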
def _build_network(self):
    d_sqrt_inv = 1 / math.sqrt(self.hparams.input_embed_dim)
    if self.hparams.categorical_dim > 0:
        # Category Embedding layers
        if self.hparams.share_embedding:
            self.cat_embedding_layers = nn.ModuleList(
                [
                    SharedEmbeddings(
                        cardinality,
                        self.hparams.input_embed_dim,
                        add_shared_embed=self.hparams.share_embedding_strategy == "add",
                        frac_shared_embed=self.hparams.shared_embedding_fraction,
                    )
                    for cardinality in self.hparams.categorical_cardinality
                ]
            )
        else:
            self.cat_embedding_layers = nn.ModuleList(
                [
                    nn.Embedding(cardinality, self.hparams.input_embed_dim)
                    for cardinality in self.hparams.categorical_cardinality
                ]
            )
        if self.hparams.embedding_bias:
            self.cat_embedding_bias = nn.Parameter(
                torch.Tensor(self.hparams.categorical_dim, self.hparams.input_embed_dim)
            )
            _initialize_kaiming(
                self.cat_embedding_bias,
                self.hparams.embedding_initialization,
                d_sqrt_inv,
            )
    # Continuous Embedding Layer
    if self.hparams.continuous_dim > 0:
        self.cont_embedding_layer = nn.Embedding(
            self.hparams.continuous_dim, self.hparams.input_embed_dim
        )
        _initialize_kaiming(
            self.cont_embedding_layer.weight,
            self.hparams.embedding_initialization,
            d_sqrt_inv,
        )
        if self.hparams.embedding_bias:
            self.cont_embedding_bias = nn.Parameter(
                torch.Tensor(self.hparams.continuous_dim, self.hparams.input_embed_dim)
            )
            _initialize_kaiming(
                self.cont_embedding_bias,
                self.hparams.embedding_initialization,
                d_sqrt_inv,
            )
    if self.hparams.embedding_dropout != 0:
        self.embed_dropout = nn.Dropout(self.hparams.embedding_dropout)
    self.add_cls = AppendCLSToken(
        d_token=self.hparams.input_embed_dim,
        initialization=self.hparams.embedding_initialization,
    )
    self.transformer_blocks = OrderedDict()
    for i in range(self.hparams.num_attn_blocks):
        self.transformer_blocks[f"mha_block_{i}"] = TransformerEncoderBlock(
            input_embed_dim=self.hparams.input_embed_dim,
            num_heads=self.hparams.num_heads,
            ff_hidden_multiplier=self.hparams.ff_hidden_multiplier,
            ff_activation=self.hparams.transformer_activation,
            attn_dropout=self.hparams.attn_dropout,
            ff_dropout=self.hparams.ff_dropout,
            add_norm_dropout=self.hparams.add_norm_dropout,
            keep_attn=self.hparams.attn_feature_importance,  # Can use Attn Weights to derive feature importance
        )
    self.transformer_blocks = nn.Sequential(self.transformer_blocks)
    if self.hparams.attn_feature_importance:
        self.attention_weights_ = [None] * self.hparams.num_attn_blocks
    if self.hparams.batch_norm_continuous_input:
        self.normalizing_batch_norm = nn.BatchNorm1d(self.hparams.continuous_dim)
    # Final MLP Layers
    _curr_units = self.hparams.input_embed_dim
    # Linear Layers
    layers = []
    for units in self.hparams.out_ff_layers.split("-"):
        layers.extend(
            _linear_dropout_bn(
                self.hparams.out_ff_activation,
                self.hparams.out_ff_initialization,
                self.hparams.use_batch_norm,
                _curr_units,
                int(units),
                self.hparams.out_ff_dropout,
            )
        )
        _curr_units = int(units)
    self.linear_layers = nn.Sequential(*layers)
    self.output_dim = _curr_units
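# Illustrative sketch (assumption): the idea behind the AppendCLSToken step. A learned
# [CLS]-style token is added to the (batch, num_features, input_embed_dim) token
# sequence; after the transformer blocks, only that token's output feeds the final MLP,
# which is why _curr_units above is just input_embed_dim. This is a made-up minimal
# module, not the library's AppendCLSToken implementation.
import torch
import torch.nn as nn

class AppendCLSTokenSketch(nn.Module):
    def __init__(self, d_token: int):
        super().__init__()
        # small random init; the real layer uses the configured initialization scheme
        self.cls_token = nn.Parameter(torch.randn(d_token) * d_token ** -0.5)

    def forward(self, x):  # x: (batch, num_features, d_token)
        cls = self.cls_token.expand(x.shape[0], 1, -1)
        return torch.cat([x, cls], dim=1)  # (batch, num_features + 1, d_token)

tokens = torch.randn(8, 5, 32)
with_cls = AppendCLSTokenSketch(32)(tokens)
cls_output = with_cls[:, -1]  # (batch, d_token) -> input to the final MLP head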
def _build_network(self):
    if self.hparams.categorical_dim > 0:
        # Category Embedding layers
        if self.hparams.share_embedding:
            self.cat_embedding_layers = nn.ModuleList(
                [
                    SharedEmbeddings(
                        cardinality,
                        self.hparams.input_embed_dim,
                        add_shared_embed=self.hparams.share_embedding_strategy == "add",
                        frac_shared_embed=self.hparams.shared_embedding_fraction,
                    )
                    for cardinality in self.hparams.categorical_cardinality
                ]
            )
        else:
            self.cat_embedding_layers = nn.ModuleList(
                [
                    nn.Embedding(cardinality, self.hparams.input_embed_dim)
                    for cardinality in self.hparams.categorical_cardinality
                ]
            )
        if self.hparams.embedding_dropout != 0:
            self.embed_dropout = nn.Dropout(self.hparams.embedding_dropout)
    self.transformer_blocks = OrderedDict()
    for i in range(self.hparams.num_attn_blocks):
        self.transformer_blocks[f"mha_block_{i}"] = TransformerEncoderBlock(
            input_embed_dim=self.hparams.input_embed_dim,
            num_heads=self.hparams.num_heads,
            ff_hidden_multiplier=self.hparams.ff_hidden_multiplier,
            ff_activation=self.hparams.transformer_activation,
            attn_dropout=self.hparams.attn_dropout,
            ff_dropout=self.hparams.ff_dropout,
            add_norm_dropout=self.hparams.add_norm_dropout,
            keep_attn=False,  # No easy way to convert TabTransformer Attn Weights to Feature Importance
        )
    self.transformer_blocks = nn.Sequential(self.transformer_blocks)
    self.attention_weights = [None] * self.hparams.num_attn_blocks
    if self.hparams.batch_norm_continuous_input:
        self.normalizing_batch_norm = nn.BatchNorm1d(self.hparams.continuous_dim)
    # Final MLP Layers
    _curr_units = (
        self.hparams.input_embed_dim * self.hparams.categorical_dim
        + self.hparams.continuous_dim
    )
    # Linear Layers
    layers = []
    for units in self.hparams.out_ff_layers.split("-"):
        layers.extend(
            _linear_dropout_bn(
                self.hparams.out_ff_activation,
                self.hparams.out_ff_initialization,
                self.hparams.use_batch_norm,
                _curr_units,
                int(units),
                self.hparams.out_ff_dropout,
            )
        )
        _curr_units = int(units)
    self.linear_layers = nn.Sequential(*layers)
    self.output_dim = _curr_units
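# Illustrative sketch (assumption, not the library's forward pass): why the final MLP
# input size above is input_embed_dim * categorical_dim + continuous_dim. The
# contextualised categorical tokens coming out of the transformer blocks are flattened
# and concatenated with the raw (optionally batch-normalised) continuous features.
# Shapes below are hypothetical.
import torch

batch, categorical_dim, input_embed_dim, continuous_dim = 16, 4, 32, 6
cat_tokens = torch.randn(batch, categorical_dim, input_embed_dim)  # transformer output
continuous = torch.randn(batch, continuous_dim)

mlp_input = torch.cat([cat_tokens.flatten(start_dim=1), continuous], dim=1)
print(mlp_input.shape)  # torch.Size([16, 134]) == 32 * 4 + 6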