Example no. 1
def show(self, ctx=None, **kwargs):
    # Only render when every element is a Tensor and all shapes match
    all_tensors = all([isinstance(t, Tensor) for t in self])
    same_shape = all([self[0].shape == t.shape for t in self[1:]])
    if not all_tensors or not same_shape: return ctx
    # 10-pixel-wide blank strip used as a separator between images
    line = self[0].new_zeros(self[0].shape[0], self[0].shape[1], 10)
    # Interleave image, line, image, line, ... and drop the trailing separator
    imgs = sum(L(zip(self, [line] * len(self))).map(list), [])[:-1]
    return show_image(torch.cat(imgs, dim=2), ctx=ctx, **kwargs)
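A minimal sketch of the concatenation idiom above, using plain torch in place of fastai's `L` and `show_image`: each image is interleaved with a 10-pixel-wide zero separator, the trailing separator is dropped, and the strip is joined along the width axis (dim=2).

import torch

ts = [torch.rand(3, 8, 8) for _ in range(3)]  # three same-shaped "images"
line = ts[0].new_zeros(ts[0].shape[0], ts[0].shape[1], 10)  # blank separator
imgs = sum([[t, line] for t in ts], [])[:-1]  # img, line, img, line, img
strip = torch.cat(imgs, dim=2)
print(strip.shape)  # torch.Size([3, 8, 46]): 3*8 image columns + 2*10 separator columns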
Example no. 2
def test_get_sheet_df(df: pd.DataFrame):
    assert isinstance(df, pd.DataFrame)
    assert all([
        col in df.columns.values for col in [
            "Wahlperiode",
            "Sitzungnr",
            "Abstimmnr",
            "Fraktion/Gruppe",
            "Name",
            "Vorname",
            "Titel",
            "ja",
            "nein",
            "Enthaltung",
            "ungültig",
            "nichtabgegeben",
            "Bezeichnung",
            "sheet_name",
            "date",
            "title",
        ]
    ])
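A hedged sketch of input that satisfies this test; the real `get_sheet_df` and its source data (apparently German roll-call vote sheets, judging by the column names) are not shown, so the empty frame below is only a stand-in for its output.

import pandas as pd

cols = ["Wahlperiode", "Sitzungnr", "Abstimmnr", "Fraktion/Gruppe", "Name",
        "Vorname", "Titel", "ja", "nein", "Enthaltung", "ungültig",
        "nichtabgegeben", "Bezeichnung", "sheet_name", "date", "title"]
df = pd.DataFrame(columns=cols)  # hypothetical stand-in for get_sheet_df(...)
test_get_sheet_df(df)  # passes: a DataFrame with every expected column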
Example no. 3
def all_equal(a, b):
    "Compares whether `a` and `b` are the same length and have the same contents"
    if not is_iter(b): return False
    return all(equals(a_, b_) for a_, b_ in itertools.zip_longest(a, b))
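A self-contained sketch of the same check with simplified stand-ins for fastcore's `is_iter` and `equals` (assumptions; the real helpers handle more cases). The key point is that `itertools.zip_longest` pads the shorter input with `None`, so a length mismatch surfaces as an unequal pair.

import itertools

def is_iter(o): return hasattr(o, "__iter__")  # simplified stand-in
def equals(a, b): return a == b                # simplified stand-in

def all_equal(a, b):
    if not is_iter(b): return False
    return all(equals(a_, b_) for a_, b_ in itertools.zip_longest(a, b))

print(all_equal([1, 2], [1, 2]))     # True
print(all_equal([1, 2], [1, 2, 3]))  # False: 3 is paired with the None pad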
Example no. 4
    def __init__(
            self,
            # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
            # instance of `BatchTokenizeTransform` to `batch_tokenize_tfm`)
            hf_arch: Optional[str] = None,
            # A Hugging Face configuration object (not required if passing in an
            # instance of `BatchTokenizeTransform` to `batch_tokenize_tfm`)
            hf_config: Optional[PretrainedConfig] = None,
            # A Hugging Face tokenizer (not required if passing in an
            # instance of `BatchTokenizeTransform` to `batch_tokenize_tfm`)
            hf_tokenizer: Optional[PreTrainedTokenizerBase] = None,
            # A Hugging Face model (not required if passing in an
            # instance of `BatchTokenizeTransform` to `batch_tokenize_tfm`)
            hf_model: Optional[PreTrainedModel] = None,
            # To control whether the "labels" are included in your inputs. If they are, the loss is calculated in
            # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function
            include_labels: bool = True,
            # The token ID that should be ignored when calculating the loss
            ignore_token_id=CrossEntropyLossFlat().ignore_index,
            # The transform applied as fastai's `before_batch` to tokenize your raw data on the fly
            # (defaults to an instance of `BatchTokenizeTransform`)
            batch_tokenize_tfm: Optional[BatchTokenizeTransform] = None,
            # The batch_tfm used to decode your inputs into a type that can be used in the fastai show methods
            # (defaults to an instance of `BatchDecodeTransform`)
            batch_decode_tfm: Optional[BatchDecodeTransform] = None,
            # To control the length of the padding/truncation. It can be an integer or None,
            # in which case it will default to the maximum length the model can accept. If the model has no
            # specific maximum input length, truncation/padding to max_length is deactivated.
            # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
            max_length: Optional[int] = None,
            # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
            # `False` or `'do_not_pad'`.
            # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
            padding: Union[bool, str] = True,
            # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
            # `False` or `'do_not_truncate'`.
            # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
            truncation: Union[bool, str] = True,
            # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
            # if your inputs are pre-tokenized (not numericalized)
            is_split_into_words: bool = False,
            # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
            input_return_type: Type = TextInput,
            # The type of `DataLoader` you want created (defaults to `SortedDL`)
            dl_type: Optional[DataLoader] = None,
            # Any keyword arguments you want applied to your `batch_tokenize_tfm`
            batch_tokenize_kwargs: dict = {},
            # Any keyword arguments you want applied to your `batch_decode_tfm` (will be set as a fastai `batch_tfms`)
            batch_decode_kwargs: dict = {},
            # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
            tok_kwargs: dict = {},
            # Any keyword arguments you want applied when generating text
            text_gen_kwargs: dict = {},
            # Any keyword arguments you want applied to `TextBlock`
            **kwargs):
        if not all([hf_arch, hf_config, hf_tokenizer, hf_model]) and batch_tokenize_tfm is None:
            raise ValueError(
                "You must supply an hf_arch, hf_config, hf_tokenizer, hf_model -or- a BatchTokenizeTransform"
            )

        if batch_tokenize_tfm is None:
            batch_tokenize_tfm = BatchTokenizeTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                include_labels=include_labels,
                ignore_token_id=ignore_token_id,
                max_length=max_length,
                padding=padding,
                truncation=truncation,
                is_split_into_words=is_split_into_words,
                tok_kwargs=tok_kwargs.copy(),
                **batch_tokenize_kwargs.copy())

        if batch_decode_tfm is None:
            batch_decode_tfm = BatchDecodeTransform(
                input_return_type=input_return_type,
                **batch_decode_kwargs.copy())

        if dl_type is None:
            dl_sort_func = partial(
                blurr_sort_func,
                hf_tokenizer=batch_tokenize_tfm.hf_tokenizer,
                is_split_into_words=batch_tokenize_tfm.is_split_into_words,
                tok_kwargs=batch_tokenize_tfm.tok_kwargs.copy(),
            )

            dl_type = partial(SortedDL, sort_func=dl_sort_func)

        super().__init__(
            dl_type=dl_type,
            dls_kwargs={"before_batch": batch_tokenize_tfm},
            batch_tfms=batch_decode_tfm)
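A hedged usage sketch, assuming this `__init__` belongs to blurr's `TextBlock` (the final `super().__init__` call matches fastai's `TransformBlock`): build the four Hugging Face objects with transformers' `Auto*` classes and pass them in. Omitting all four without supplying `batch_tokenize_tfm` trips the `ValueError` guard above. The checkpoint and class names are illustrative assumptions.

from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer)

name = "distilbert-base-uncased"  # example checkpoint
hf_config = AutoConfig.from_pretrained(name)
hf_tokenizer = AutoTokenizer.from_pretrained(name)
hf_model = AutoModelForSequenceClassification.from_pretrained(name)

# Assumed class name: pass either all four HF objects, or a prebuilt
# BatchTokenizeTransform via batch_tokenize_tfm
block = TextBlock(hf_arch="distilbert", hf_config=hf_config,
                  hf_tokenizer=hf_tokenizer, hf_model=hf_model,
                  max_length=128)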
Example no. 5
def test_embeddings(emb: dict):
    assert isinstance(emb, dict)
    assert all([isinstance(m, pd.DataFrame) for m in emb.values()])
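A minimal sketch of data this test accepts; the keys are hypothetical.

import pandas as pd

emb = {"group_a": pd.DataFrame([[0.1, 0.2]]),  # hypothetical embedding tables
       "group_b": pd.DataFrame([[0.3, 0.4]])}
test_embeddings(emb)  # passes: a dict whose values are all DataFrames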
Example no. 6
def explode_lens(o):
    if is_listy(o):
        if all(is_listy(o_) for o_ in o):
            return [explode_lens(o_) for o_ in o]
        else:
            return len(o)
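A self-contained sketch, assuming a fastcore-style `is_listy` that means "list or tuple": the function mirrors the nesting of `o` and replaces each innermost list with its length, returning `None` (implicitly) for non-listy input.

def is_listy(o): return isinstance(o, (list, tuple))  # simplified stand-in

def explode_lens(o):
    if is_listy(o):
        if all(is_listy(o_) for o_ in o):
            return [explode_lens(o_) for o_ in o]
        else:
            return len(o)

print(explode_lens([[1, 2], [3, 4, 5]]))     # [2, 3]
print(explode_lens([[[1], [2, 3]], [[4]]]))  # [[1, 2], [1]]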
Example no. 7
def shape(self):
    # Defined only for a homogeneous collection of same-shaped Tensors
    all_tensors = all([isinstance(t, Tensor) for t in self])
    same_shape = all([self[0].shape == t.shape for t in self[1:]])
    if not all_tensors or not same_shape: raise AttributeError
    return self[0].shape
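If `shape` is defined as a property on a tuple subclass (as the `self` indexing suggests), raising `AttributeError` lets `hasattr(x, 'shape')` and `getattr(x, 'shape', None)` degrade gracefully when the contents are heterogeneous. A minimal free-function sketch of the same guard, assuming plain torch tensors:

import torch
from torch import Tensor

def common_shape(ts):
    # Same guard as above: every element a Tensor, every shape identical
    if not all(isinstance(t, Tensor) for t in ts): raise AttributeError
    if not all(ts[0].shape == t.shape for t in ts[1:]): raise AttributeError
    return ts[0].shape

print(common_shape((torch.zeros(2, 3), torch.ones(2, 3))))  # torch.Size([2, 3])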