from typing import Tuple

import numpy as np
from fastai.vision.all import Learner, PILImage
# `AnimalType` is an app-specific Enum defined elsewhere in this project.


def classify(image: PILImage, learner: Learner) -> Tuple[AnimalType, float]:
    with learner.no_bar():
        # `predict` returns (decoded label, class index, class probabilities)
        _, category, probabilities = learner.predict(image)
    is_a_cat = category == 1  # index 1 is "cat" in the learner's vocab
    animal_type = AnimalType.cat if is_a_cat else AnimalType.dog
    # Round the winning probability to a whole percentage (a plain float,
    # matching the annotated return type)
    percent = float(np.round(100 * probabilities[category]))
    return animal_type, percent
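
# A minimal usage sketch (the file names are hypothetical; `load_learner` and
# `PILImage.create` are standard fastai helpers):
# learner = load_learner("cats_vs_dogs.pkl")  # hypothetical export path
# image = PILImage.create("photo.jpg")        # hypothetical image file
# animal, confidence = classify(image, learner)
# print(f"{animal.name}: {confidence:.0f}%")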
# %%
# Print a layer-by-layer summary twice: once showing input shapes, once
# showing output shapes (`show_input` matches the pytorch-model-summary API).
print(summary(mlp_model, torch.zeros((1, 1, 28, 28)).to(device), show_input=True))
print(summary(mlp_model, torch.zeros((1, 1, 28, 28)).to(device), show_input=False))

# %%
mlp_learner = Learner(
    mnist_dls,
    mlp_model,
    # Macro-averaged precision and recall weight all ten classes equally
    metrics=[accuracy, Precision(average="macro"), Recall(average="macro")],
)

# %%
mlp_learner.lr_find()
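# `lr_find` sweeps learning rates and plots loss against them; it also returns
# a suggestion object. A hedged sketch of feeding that suggestion into training
# (`valley` is the attribute for fastai's default suggestion):
# suggestion = mlp_learner.lr_find()
# mlp_learner.fit(n_epoch=4, lr=suggestion.valley)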

# %%
with mlp_learner.no_bar():
    mlp_learner.fit(n_epoch=4)
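
# %%
# After training, `validate` reports the validation loss followed by the
# metrics configured above (accuracy, macro precision, macro recall).
with mlp_learner.no_bar():
    print(mlp_learner.validate())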

# %%
conv_model = nn.Sequential(
    # 1x28x28 -> 10x12x12 (kernel 5, stride 2)
    nn.Conv2d(1, 10, kernel_size=5, stride=(2, 2)),
    nn.ReLU(),
    # 10x12x12 -> 20x4x4
    nn.Conv2d(10, 20, kernel_size=5, stride=(2, 2)),
    nn.ReLU(),
    # 20x4x4 -> 320
    nn.Flatten(),
    nn.Linear(320, 60),
    nn.ReLU(),
    nn.Linear(60, 10),
    # Log-probabilities over the ten digit classes
    nn.LogSoftmax(dim=1),
).to(device)
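
# %%
# A hedged sketch mirroring the MLP workflow above for the conv net. Because
# the model already ends in LogSoftmax, `F.nll_loss` is the matching loss
# (fastai's wildcard import exposes `F` as torch.nn.functional).
conv_learner = Learner(
    mnist_dls,
    conv_model,
    loss_func=F.nll_loss,
    metrics=[accuracy, Precision(average="macro"), Recall(average="macro")],
)

# %%
with conv_learner.no_bar():
    conv_learner.fit(n_epoch=4)
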
class AutoEmbedderCategoryEncoder(CustomCategoryEncoder):
    """Uses an `AutoEmbedder` model to encode categorical features."""

    _preprocessor_cls: Type[CategoryEncoderPreprocessor] = AutoEmbedderPreprocessor
    learn: Optional[Learner] = None
    # Maps each categorical column to its (cardinality, embedding size) pair
    emb_szs: Optional[Dict[str, Tuple[int, int]]] = None

    def encode(self, X: Union[TabDataLoader, TabularDataLoaders]):
        """Encodes all elements in `X`, one column per embedding dimension."""
        data = X if isinstance(X, TabDataLoader) else X.train
        preds = self.learn.get_preds(dl=data, reorder=False)[0].cpu().numpy()
        return pd.DataFrame(preds, columns=self.get_feature_names())

    def fit(self, X: TabularDataLoaders):
        """Creates the learner and trains it."""
        # `get_emb_sz` returns one (cardinality, embedding size) pair per
        # categorical column, in `cat_names` order
        emb_szs = get_emb_sz(X.train_ds, {})
        self.emb_szs = dict(zip(self.cat_names, emb_szs))
        n_conts = len(X.cont_names)
        n_cats = sum(sz for _, sz in emb_szs)
        in_sz = n_conts + n_cats
        out_sz = n_conts + len(X.cat_names)
        # Create the embedding model and train it with the progress bar hidden
        model = AutoEmbedder(in_sz, out_sz, emb_szs, [2000, 1000])
        self.learn = Learner(X, model, loss_func=EmbeddingLoss(model), wd=1.0)
        with self.learn.no_bar():
            self.learn.fit_one_cycle(20, lr_max=3e-3)

    def decode(self, X: pd.DataFrame) -> pd.DataFrame:
        """Decodes embedded columns back into categorical values."""
        df = pd.DataFrame()
        data = torch.tensor(X[self.get_feature_names()].values)
        embeddings = self.learn.model.embeddings.embeddings
        # Split the columns into one chunk per feature, sized by its embedding
        data = torch.split(data, [sz for _, sz in self.emb_szs.values()], dim=-1)
        # Iterate over features, decoding each one for all rows
        for embedding_vectors, embedding_layer, (colname, (n_unique_values, _)) in zip(
                data, embeddings, self.emb_szs.items()):
            # Calculate the embedding output for each possible category value
            cat_embeddings = embedding_layer(
                torch.arange(n_unique_values, device=embedding_layer.weight.device))
            # Compute cosine similarity between each row and every category embedding
            most_similar = expanded(
                embedding_vectors, cat_embeddings,
                lambda a, b: F.cosine_similarity(a, b, dim=-1))
            # Map each row to the index of its most similar category
            df[colname] = most_similar.argmax(dim=-1).cpu().numpy()
        return df

    def get_feature_names(self) -> List[str]:
        """
        Returns a list of encoded feature names.
        For embeddings, this is a list of original categorical names followed by embedding index,
        e.g. [feature_a_0, feature_a_1, feature_b_0, feature_b_1].
        """
        return [
            f"{column}_{feature_num}" for column in self.cat_names
            for feature_num in range(self.emb_szs[column][1])
        ]

    def get_emb_szs(self):
        """Returns a dict of embedding sizes for each categorical feature."""
        return self.emb_szs
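
# A hedged usage sketch. The constructor signature belongs to the (unseen)
# `CustomCategoryEncoder` base class, so treat the arguments below as
# assumptions; `TabularDataLoaders.from_df` is the standard fastai entry point.
# dls = TabularDataLoaders.from_df(
#     df, cat_names=cat_names, cont_names=cont_names, y_names="target")
# encoder = AutoEmbedderCategoryEncoder(cat_names=cat_names)
# encoder.fit(dls)                     # trains the AutoEmbedder
# encoded = encoder.encode(dls.train)  # one DataFrame column per embedding dim
# decoded = encoder.decode(encoded)    # nearest-category reconstruction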