Example #1
 def __init__(
     self,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     target_token_indexers: Dict[str, TokenIndexer] = None,
     source_add_start_token: bool = False,
     delimiter: str = "\t",
     source_max_tokens: Optional[int] = None,
     target_max_tokens: Optional[int] = None,
     source_to_target_len_max_ratio: Optional[float] = None,
     **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._source_tokenizer = source_tokenizer or WhitespaceTokenizer()
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._target_token_indexers = target_token_indexers or self._source_token_indexers
     self._source_add_start_token = source_add_start_token
     self._delimiter = delimiter
     self._source_max_tokens = source_max_tokens
     self._target_max_tokens = target_max_tokens
     self._source_to_target_len_max_ratio = source_to_target_len_max_ratio
      # counters for inputs ignored because of the max-length and length-ratio limits above
      self._source_ignored = 0
      self._target_ignored = 0
      self._source_target_ratio_ignored = 0
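With every argument left at its default, this reader splits each line on the tab delimiter and whitespace-tokenizes both halves. A minimal sketch of just that behaviour, not taken from the reader itself (the sample line is invented for illustration):

from allennlp.data.tokenizers import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
line = "source text goes here\ttarget text goes here"
source_text, target_text = line.split("\t")
print([t.text for t in tokenizer.tokenize(source_text)])  # ['source', 'text', 'goes', 'here']
print([t.text for t in tokenizer.tokenize(target_text)])  # ['target', 'text', 'goes', 'here']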
Example #2
 def __init__(self,
              tokenizer: Optional[Tokenizer] = None,
              token_indexers: Optional[Dict[str, TokenIndexer]] = None,
              lazy: bool = False):
     super().__init__(lazy=lazy)
     self.tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
Example #3
 def test_load_word_pairs(self):
     ids1, ids2 = load_word_pairs(self.pairs_fname, WhitespaceTokenizer(),
                                  self.pairs_vocab, "tokens")
     # first two token IDs reserved for [CLS] and [SEP]
     assert torch.equal(torch.tensor([i.item() for i in ids1]),
                        torch.arange(2, self.num_pairs + 2, step=2))
     assert torch.equal(torch.tensor([i.item() for i in ids2]),
                        torch.arange(3, self.num_pairs + 3, step=2))
Example #4
 def __init__(self, model_dir_path, cuda_device=-1):
     self._model_path = os.path.join(model_dir_path, 'segmenter_neural', 'model.tar.gz')
     self._cuda_device = cuda_device
     self.predictor = Predictor.from_path(self._model_path, cuda_device=self._cuda_device)
     self.predictor._tokenizer = WhitespaceTokenizer()
     self._separator = 'U-S'
     self._threshold = 0.5
     self._use_logits = False
     self._symbol_map = SYMBOL_MAP
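A hedged sketch of how a predictor set up this way might be called. Predictor.from_path and predict_json are standard AllenNLP APIs, but the archive path and the "sentence" input key are assumptions for illustration, not taken from this project:

from allennlp.data.tokenizers import WhitespaceTokenizer
from allennlp.predictors import Predictor

predictor = Predictor.from_path("segmenter_neural/model.tar.gz", cuda_device=-1)
# Swap in whitespace tokenization so the input is treated as pre-tokenized text.
predictor._tokenizer = WhitespaceTokenizer()
output = predictor.predict_json({"sentence": "Tokens already separated by spaces ."})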
Example #5
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              sample: int = None,
              token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
     self._sample = sample
Example #6
 def __init__(
     self,
     tokenizer: Tokenizer = None,
     token_indexers: Dict[str, TokenIndexer] = None,
     **kwargs
 ) -> None:
     super().__init__(**kwargs)
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
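The constructor above is the smallest common shape shared by most of these readers. A self-contained sketch of how such a reader typically turns a text file into instances; the class name, the one-text-per-line format, and the _read/text_to_instance bodies are assumptions, not code from any example on this page:

from typing import Dict, Iterable

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WhitespaceTokenizer


class PlainLinesReader(DatasetReader):
    """Hypothetical reader: one text per line, tokenized and indexed with the defaults above."""

    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Same fallback idiom as the examples: whitespace tokenization and a
        # single-id indexer unless the caller supplies something else.
        self._tokenizer = tokenizer or WhitespaceTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as data_file:
            for line in data_file:
                line = line.strip()
                if line:
                    yield self.text_to_instance(line)

    def text_to_instance(self, text: str) -> Instance:
        tokens = self._tokenizer.tokenize(text)
        return Instance({"tokens": TextField(tokens, self._token_indexers)})

Calling PlainLinesReader().read("data.txt") would then yield one Instance per non-empty line.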
Example #7
 def test_load_words(self):
     ids = load_words(self.singles_fname,
                      WhitespaceTokenizer(),
                      self.singles_vocab,
                      "tokens",
                      all_cases=False)
     # first two token IDs reserved for [CLS] and [SEP]
     assert torch.equal(torch.tensor([i.item() for i in ids]),
                        torch.arange(2, self.num_singles + 2))
Example #8
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer],
     human_prob: float = 1.0,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy=lazy)
     self._tokenizer = WhitespaceTokenizer()
     self._token_indexers = token_indexers
     self._human_prob = human_prob
Example #9
 def __init__(
     self, tokenizer: Tokenizer = None, token_indexers: Dict[str, TokenIndexer] = None, **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._targets_tokenizer: Tokenizer
     if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
         self._targets_tokenizer = copy.copy(self._tokenizer)
         self._targets_tokenizer._add_special_tokens = False
     else:
         self._targets_tokenizer = self._tokenizer
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
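The copy.copy trick above (also used in a later example on this page) gives a second tokenizer that produces the same word pieces but without the wrapping special tokens, which is what the target side of a transformer seq2seq reader usually needs. A hedged illustration; the model name and the printed output are assumptions:

import copy

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
targets_tokenizer = copy.copy(tokenizer)
targets_tokenizer._add_special_tokens = False  # same flag flip as in the reader above

print([t.text for t in tokenizer.tokenize("hello world")])
# expected to include the special tokens, e.g. ['[CLS]', 'hello', 'world', '[SEP]']
print([t.text for t in targets_tokenizer.tokenize("hello world")])
# expected without them, e.g. ['hello', 'world']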
Example #10
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        max_sequence_length: int = None,
        human_prob: float = 1.0,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = WhitespaceTokenizer()
        self._max_sequence_length = max_sequence_length
        self._token_indexers = token_indexers
        self._human_prob = human_prob

        self._bert = "bert" in token_indexers
Example #11
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        add_rationale: bool = False,
        keep_prob: float = 1.0,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = WhitespaceTokenizer()
        self._token_indexers = token_indexers
        self._add_rationale = add_rationale
        self._keep_prob = keep_prob

        self._bert = "bert" in token_indexers
Example #12
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        combine_input_fields: Optional[bool] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or WhitespaceTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

        if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
            assert not self._tokenizer._add_special_tokens

        if combine_input_fields is not None:
            self._combine_input_fields = combine_input_fields
        else:
            self._combine_input_fields = isinstance(self._tokenizer, PretrainedTransformerTokenizer)
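Given the assert above, a transformer tokenizer handed to this reader has to be constructed with special-token insertion turned off, presumably so the reader can add [CLS]/[SEP] itself when it combines the two input fields. A minimal sketch; the model name is an assumption:

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

# Built with add_special_tokens=False so the assert in the constructor passes.
tokenizer = PretrainedTransformerTokenizer("bert-base-uncased", add_special_tokens=False)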
Example #13
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WhitespaceTokenizer()
        # temporary hack: do not add special tokens when tokenizing targets
        self._targets_tokenizer: Tokenizer
        if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
            self._targets_tokenizer = copy.copy(self._tokenizer)
            self._targets_tokenizer._add_special_tokens = False
        else:
            self._targets_tokenizer = self._tokenizer

        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
Example #14
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_sequence_length: int = None,
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None) -> None:
        super().__init__()
        self._tokenizer = tokenizer or WhitespaceTokenizer()

        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if max_sequence_length is not None:
            self._max_sequence_length: Union[
                float, Optional[int]] = max_sequence_length
        else:
            self._max_sequence_length = math.inf

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]

        logger.info("Creating SimpleLanguageModelingDatasetReader")
        logger.info("max_sequence_length=%s", max_sequence_length)