import pandas as pd

from pytorch_widedeep.preprocessing import TextPreprocessor


def test_inverse_transform():
    df = pd.DataFrame(
        {
            "text_column": [
                "life is like a box of chocolates",
                "You never know what you're going to get",
            ]
        }
    )
    text_preprocessor = TextPreprocessor(
        text_col="text_column", max_vocab=25, min_freq=1, maxlen=10, verbose=False
    )
    padded_seq = text_preprocessor.fit_transform(df)
    org_df = text_preprocessor.inverse_transform(padded_seq)
    texts = org_df.text_column.values
    # the round trip is lossy: the tokenizer lowercases text and splits
    # contractions ("you're" -> "you re"), and tokens that do not make it
    # into the capped vocabulary (max_vocab=25) are dropped
    assert ("life is like box of chocolates" in texts[0]) and (
        "you never know what you re going to get" in texts[1]
    )
target = "yield" target = df[target].values wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols) X_wide = wide_preprocessor.fit_transform(df) tab_preprocessor = TabPreprocessor( embed_cols=cat_embed_cols, # type: ignore[arg-type] continuous_cols=continuous_cols, already_standard=already_standard, ) X_tab = tab_preprocessor.fit_transform(df) text_processor = TextPreprocessor(word_vectors_path=word_vectors_path, text_col=text_col) X_text = text_processor.fit_transform(df) image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path) X_images = image_processor.fit_transform(df) wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1) deepdense = TabMlp( mlp_hidden_dims=[64, 32], mlp_dropout=[0.2, 0.2], column_idx=tab_preprocessor.column_idx, embed_input=tab_preprocessor.embeddings_input, continuous_cols=continuous_cols, ) # # To use TabResnet as the deepdense component simply: # deepdense = TabResnet(
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

from pytorch_widedeep.utils import text_utils
from pytorch_widedeep.preprocessing import TextPreprocessor

texts = np.random.choice(fetch_20newsgroups().data, 10)
df = pd.DataFrame({"texts": texts})
processor = TextPreprocessor(min_freq=0, text_col="texts")
X_text = processor.fit_transform(df)


###############################################################################
# There is not much to test here. I will simply test that the tokenization
# and padding processes went well
###############################################################################
def test_text_processor():
    idx = int(np.random.choice(np.arange(10), 1))
    original_tokens = processor.tokens[idx]
    # sequences longer than maxlen are truncated keeping the tail
    if len(original_tokens) > processor.maxlen:
        original_tokens = original_tokens[-processor.maxlen :]
    padded_sequence = X_text[idx]
    # undo the padding: drop the "xxpad" tokens and map ids back to strings
    recovered_tokens = []
    for t in padded_sequence:
        if processor.vocab.itos[t] != "xxpad":
            recovered_tokens.append(processor.vocab.itos[t])
    assert original_tokens == recovered_tokens
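# For reference, the padding that test_text_processor unwinds comes from
# text_utils.pad_sequences. A minimal sketch of calling it directly; the
# pad_first and pad_idx arguments are assumptions, chosen to be consistent
# with the pre-padding behaviour the test relies on (original tokens sit at
# the tail of the padded sequence, hence the [-processor.maxlen:] slice):
#
# padded = text_utils.pad_sequences([5, 6, 7], maxlen=10, pad_first=True, pad_idx=1)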
# calling transform before fit must raise NotFittedError (df and the
# imports come from the snippet above, plus `import pytest`)
def test_notfittederror():
    processor = TextPreprocessor(min_freq=0, text_col="texts")
    with pytest.raises(NotFittedError):
        processor.transform(df)
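# Complementary sketch (an addition, not part of the original suite): after
# fit, transform should succeed. The shape check is deliberately weak so it
# holds regardless of the maxlen and vocabulary settings used above.
def test_transform_after_fit():
    processor = TextPreprocessor(min_freq=0, text_col="texts")
    processor.fit(df)
    X = processor.transform(df)
    assert X.shape[0] == df.shape[0]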