from typing import Tuple

import numpy as np
from fastai.vision.all import Learner, PILImage

# `AnimalType` is an enum (cat / dog) defined elsewhere in this project.


def classify(image: PILImage, learner: Learner) -> Tuple[AnimalType, float]:
    # Run inference without fastai's progress bar.
    with learner.no_bar():
        results = learner.predict(image)
    # `predict` returns (decoded class, class index, per-class probabilities).
    _, category, probabilities = results
    is_a_cat = category == 1
    animal_type = AnimalType.cat if is_a_cat else AnimalType.dog
    # Express the winning class's probability as a rounded percentage.
    percent = np.round(100 * probabilities)
    return animal_type, float(percent[category])
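# Minimal usage sketch for `classify` (an assumption, not from the original):
# "model.pkl" and "photo.jpg" are illustrative paths, and `AnimalType` must be
# importable from the project.
from fastai.vision.all import load_learner

learner = load_learner("model.pkl")
image = PILImage.create("photo.jpg")
animal, confidence = classify(image, learner)
print(f"Predicted {animal.name} with {confidence:.0f}% confidence")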
# %%
# Imports used by these cells (in the full notebook they appear earlier);
# `mlp_model`, `mnist_dls`, and `device` are also defined in earlier cells.
import torch
from torch import nn
from fastai.vision.all import Learner, Precision, Recall, accuracy
from pytorch_model_summary import summary

# %%
# Layer-by-layer summary of the MLP: once with input shapes, once with output shapes.
print(summary(mlp_model, torch.zeros((1, 1, 28, 28)).to(device), show_input=True))
print(summary(mlp_model, torch.zeros((1, 1, 28, 28)).to(device), show_input=False))

# %%
mlp_learner = Learner(
    mnist_dls,
    mlp_model,
    metrics=[accuracy, Precision(average="macro"), Recall(average="macro")],
)

# %%
mlp_learner.lr_find()

# %%
with mlp_learner.no_bar():
    mlp_learner.fit(n_epoch=4)

# %%
# A small CNN for 28x28 MNIST images: two strided convolutions
# (28 -> 12 -> 4 spatially), flattened to 20 * 4 * 4 = 320 features.
conv_model = nn.Sequential(
    nn.Conv2d(1, 10, kernel_size=5, stride=(2, 2)),
    nn.ReLU(),
    nn.Conv2d(10, 20, kernel_size=5, stride=(2, 2)),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(320, 60),
    nn.ReLU(),
    nn.Linear(60, 10),
    nn.LogSoftmax(dim=1),
).to(device)
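# %%
# Sketch, not from the original: training the CNN the same way as the MLP so
# the two can be compared on identical metrics. Because `conv_model` ends in
# LogSoftmax, NLLLoss is the matching criterion (an assumption; left to itself,
# fastai would infer a loss from `mnist_dls`, typically CrossEntropyLossFlat,
# which expects raw logits).
conv_learner = Learner(
    mnist_dls,
    conv_model,
    loss_func=nn.NLLLoss(),
    metrics=[accuracy, Precision(average="macro"), Recall(average="macro")],
)

# %%
with conv_learner.no_bar():
    conv_learner.fit(n_epoch=4)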
from typing import Dict, List, Tuple, Type

import pandas as pd
import torch
import torch.nn.functional as F
from fastai.tabular.all import (Learner, TabDataLoader, TabularDataLoaders,
                                get_emb_sz)

# `CustomCategoryEncoder`, `CategoryEncoderPreprocessor`,
# `AutoEmbedderPreprocessor`, `AutoEmbedder`, `EmbeddingLoss`, and `expanded`
# come from this project's own modules.


class AutoEmbedderCategoryEncoder(CustomCategoryEncoder):
    """Uses an `AutoEmbedder` model to perform encoding of categorical features."""

    _preprocessor_cls: Type[
        CategoryEncoderPreprocessor] = AutoEmbedderPreprocessor
    learn: Learner = None
    # Maps each categorical column to (number of categories, embedding size).
    emb_szs: Dict[str, Tuple[int, int]] = None

    def encode(self, X: TabDataLoader):
        """Encodes all elements in `X`."""
        data = X if isinstance(X, TabDataLoader) else X.train
        preds = self.learn.get_preds(dl=data, reorder=False)[0].cpu().numpy()
        return pd.DataFrame(preds, columns=self.get_feature_names())

    def fit(self, X: TabularDataLoaders):
        """Creates the learner and trains it."""
        # fastai suggests an embedding size for each categorical feature.
        emb_szs = get_emb_sz(X.train_ds, {})
        self.emb_szs = {col: sz for col, sz in zip(self.cat_names, emb_szs)}
        n_conts = len(X.cont_names)
        n_cats = sum(sz for _, sz in emb_szs)
        in_sz = n_conts + n_cats
        out_sz = n_conts + len(X.cat_names)
        # Create the embedding model
        model = AutoEmbedder(in_sz, out_sz, emb_szs, [2000, 1000])
        self.learn = Learner(X, model, loss_func=EmbeddingLoss(model), wd=1.0)
        # TODO hide training progress?
        with self.learn.no_bar():
            self.learn.fit_one_cycle(20, lr_max=3e-3)

    def decode(self, X: pd.DataFrame) -> pd.DataFrame:
        """Decodes multiple items for one feature embedding."""
        df = pd.DataFrame()
        data = torch.tensor(X[self.get_feature_names()].values)
        embeddings = self.learn.model.embeddings.embeddings
        # Split data into chunks depending on embedding sizes
        data = torch.split(data, [sz for _, sz in self.emb_szs.values()],
                           dim=-1)
        # Iterate over features, decoding each one for all rows
        for (embedding_vectors, embedding_layer,
             (colname, (n_unique_values, _))) in zip(data, embeddings,
                                                     self.emb_szs.items()):
            # Calculate the embedding output for each category value
            cat_embeddings = embedding_layer(
                torch.arange(n_unique_values).to(
                    device=embedding_layer.weight.device))
            # Compute cosine similarity over embeddings
            most_similar = expanded(
                embedding_vectors, cat_embeddings,
                lambda a, b: F.cosine_similarity(a, b, dim=-1))
            # Map values to their most similar category
            most_similar = most_similar.argmax(dim=-1)
            # Save data into decoded column
            df[colname] = most_similar.cpu().numpy()
        return df

    def get_feature_names(self) -> List[str]:
        """
        Returns a list of encoded feature names.

        For embeddings, this is a list of original categorical names followed
        by the embedding index, e.g.
        [feature_a_0, feature_a_1, feature_b_0, feature_b_1].
        """
        return [
            f"{column}_{feature_num}"
            for column in self.cat_names
            for feature_num in range(self.emb_szs[column][1])
        ]

    def get_emb_szs(self):
        """Returns a dict of embedding sizes for each categorical feature."""
        return self.emb_szs
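# Hedged usage sketch (an assumption, not from the original): the constructor
# arguments of `CustomCategoryEncoder` are not shown above, so a no-argument
# construction and a prebuilt `TabularDataLoaders` named `dls` are assumed.
encoder = AutoEmbedderCategoryEncoder()
encoder.fit(dls)                      # train the AutoEmbedder on the data
encoded = encoder.encode(dls.train)   # DataFrame of embedding columns
decoded = encoder.decode(encoded)     # nearest-category reconstruction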