def show(self, ctx=None, **kwargs):
    # Only show when every element is a Tensor and all tensors share the same shape
    all_tensors = all(isinstance(t, Tensor) for t in self)
    same_shape = all(self[0].shape == t.shape for t in self[1:])
    if not all_tensors or not same_shape:
        return ctx
    # Interleave a 10-pixel-wide blank separator between the images, then concatenate along the width
    line = self[0].new_zeros(self[0].shape[0], self[0].shape[1], 10)
    imgs = sum(L(zip(self, [line] * len(self))).map(list), [])[:-1]
    return show_image(torch.cat(imgs, dim=2), ctx=ctx, **kwargs)
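# Standalone sketch of the concatenation logic used in `show` above (illustrative only,
# assuming two equally shaped CHW image tensors): a 10-pixel-wide blank strip is placed
# between the images before they are concatenated along the width dimension.
import torch

a, b = torch.rand(3, 8, 8), torch.rand(3, 8, 8)
line = a.new_zeros(a.shape[0], a.shape[1], 10)   # blank vertical separator
combined = torch.cat([a, line, b], dim=2)        # images laid out side by side
assert combined.shape == (3, 8, 26)              # 8 + 10 + 8 pixels wide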
def test_get_sheet_df(df: pd.DataFrame):
    assert isinstance(df, pd.DataFrame)
    assert all(
        col in df.columns.values
        for col in [
            "Wahlperiode",
            "Sitzungnr",
            "Abstimmnr",
            "Fraktion/Gruppe",
            "Name",
            "Vorname",
            "Titel",
            "ja",
            "nein",
            "Enthaltung",
            "ungültig",
            "nichtabgegeben",
            "Bezeichnung",
            "sheet_name",
            "date",
            "title",
        ]
    )
def all_equal(a, b):
    "Compares whether `a` and `b` are the same length and have the same contents"
    if not is_iter(b):
        return False
    return all(equals(a_, b_) for a_, b_ in itertools.zip_longest(a, b))
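# Hypothetical usage of `all_equal`, assuming fastcore-style `is_iter`/`equals` and
# `itertools` are in scope; the values below are illustrative only.
assert all_equal([1, 2, 3], (1, 2, 3))       # same length, same contents
assert not all_equal([1, 2], [1, 2, 3])      # zip_longest pads with None, so lengths must match
assert not all_equal([1, 2], 3)              # `b` is not iterable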
def __init__(
    self,
    # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
    # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
    hf_arch: Optional[str] = None,
    # A Hugging Face configuration object (not required if passing in an
    # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
    hf_config: Optional[PretrainedConfig] = None,
    # A Hugging Face tokenizer (not required if passing in an
    # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
    hf_tokenizer: Optional[PreTrainedTokenizerBase] = None,
    # A Hugging Face model (not required if passing in an
    # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
    hf_model: Optional[PreTrainedModel] = None,
    # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
    # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function
    include_labels: bool = True,
    # The token ID that should be ignored when calculating the loss
    ignore_token_id=CrossEntropyLossFlat().ignore_index,
    # The before_batch_tfm you want to use to tokenize your raw data on the fly
    # (defaults to an instance of `BatchTokenizeTransform`)
    batch_tokenize_tfm: Optional[BatchTokenizeTransform] = None,
    # The batch_tfm you want to use to decode your inputs into a type that can be used in the fastai show methods
    # (defaults to `BatchDecodeTransform`)
    batch_decode_tfm: Optional[BatchDecodeTransform] = None,
    # To control the length of the padding/truncation. It can be an integer or None,
    # in which case it will default to the maximum length the model can accept. If the model has no
    # specific maximum input length, truncation/padding to max_length is deactivated.
    # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
    max_length: Optional[int] = None,
    # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
    # `False` or `'do_not_pad'`.
    # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
    padding: Union[bool, str] = True,
    # To control the `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
    # `False` or `'do_not_truncate'`.
    # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
    truncation: Union[bool, str] = True,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization.
    # Set this to `True` if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
    input_return_type: Type = TextInput,
    # The type of `DataLoader` you want created (defaults to `SortedDL`)
    dl_type: Optional[DataLoader] = None,
    # Any keyword arguments you want applied to your `batch_tokenize_tfm`
    batch_tokenize_kwargs: dict = {},
    # Any keyword arguments you want applied to your `batch_decode_tfm` (will be set as a fastai `batch_tfms`)
    batch_decode_kwargs: dict = {},
    # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
    tok_kwargs: dict = {},
    # Any keyword arguments you want applied when generating text
    text_gen_kwargs: dict = {},
    # Any keyword arguments you want applied to `TextBlock`
    **kwargs
):
    if (not all([hf_arch, hf_config, hf_tokenizer, hf_model])) and batch_tokenize_tfm is None:
        raise ValueError("You must supply an hf_arch, hf_config, hf_tokenizer, hf_model -or- a BatchTokenizeTransform")

    if batch_tokenize_tfm is None:
        batch_tokenize_tfm = BatchTokenizeTransform(
            hf_arch,
            hf_config,
            hf_tokenizer,
            hf_model,
            include_labels=include_labels,
            ignore_token_id=ignore_token_id,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            is_split_into_words=is_split_into_words,
            tok_kwargs=tok_kwargs.copy(),
            **batch_tokenize_kwargs.copy(),
        )

    if batch_decode_tfm is None:
        batch_decode_tfm = BatchDecodeTransform(input_return_type=input_return_type, **batch_decode_kwargs.copy())

    if dl_type is None:
        dl_sort_func = partial(
            blurr_sort_func,
            hf_tokenizer=batch_tokenize_tfm.hf_tokenizer,
            is_split_into_words=batch_tokenize_tfm.is_split_into_words,
            tok_kwargs=batch_tokenize_tfm.tok_kwargs.copy(),
        )
        dl_type = partial(SortedDL, sort_func=dl_sort_func)

    return super().__init__(dl_type=dl_type, dls_kwargs={"before_batch": batch_tokenize_tfm}, batch_tfms=batch_decode_tfm)
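# A minimal usage sketch, assuming the __init__ above belongs to blurr's `TextBlock`
# (the class name, checkpoint, and task below are illustrative assumptions, not taken from the source).
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

pretrained = "distilbert-base-uncased"
hf_config = AutoConfig.from_pretrained(pretrained)
hf_tokenizer = AutoTokenizer.from_pretrained(pretrained)
hf_model = AutoModelForSequenceClassification.from_pretrained(pretrained)

# Either pass the four Hugging Face objects directly ...
text_block = TextBlock(
    hf_arch="distilbert",
    hf_config=hf_config,
    hf_tokenizer=hf_tokenizer,
    hf_model=hf_model,
    max_length=128,
    padding=True,
    truncation=True,
)

# ... or build a `BatchTokenizeTransform` yourself and hand it in instead.
batch_tok_tfm = BatchTokenizeTransform("distilbert", hf_config, hf_tokenizer, hf_model, max_length=128)
text_block = TextBlock(batch_tokenize_tfm=batch_tok_tfm)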
def test_embeddings(emb: dict):
    assert isinstance(emb, dict)
    assert all(isinstance(m, pd.DataFrame) for m in emb.values())
def explode_lens(o):
    if is_listy(o):
        if all(is_listy(o_) for o_ in o):
            return [explode_lens(o_) for o_ in o]
        else:
            return len(o)
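# Hypothetical calls showing how `explode_lens` recurses (assumes fastcore's `is_listy` is in scope):
explode_lens([1, 2, 3])                 # -> 3 (flat list: return its length)
explode_lens([[1, 2], [3, 4, 5]])       # -> [2, 3] (all elements listy: recurse into each)
explode_lens([[[1], [2, 3]], [[4]]])    # -> [[1, 2], [1]]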
def shape(self):
    # A mixed or ragged tuple has no single shape, so signal that via AttributeError
    all_tensors = all(isinstance(t, Tensor) for t in self)
    same_shape = all(self[0].shape == t.shape for t in self[1:])
    if not all_tensors or not same_shape:
        raise AttributeError
    return self[0].shape