Example 1
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer],
     human_prob: float = 1.0,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy=lazy)
     self._tokenizer = WhitespaceTokenizer()
     self._token_indexers = token_indexers
     self._human_prob = human_prob
Example 2
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        max_sequence_length: int = None,
        human_prob: float = 1.0,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = WhitespaceTokenizer()
        self._max_sequence_length = max_sequence_length
        self._token_indexers = token_indexers
        self._human_prob = human_prob

        self._bert = "bert" in token_indexers
Example 3
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        add_rationale: bool = False,
        keep_prob: float = 1.0,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = WhitespaceTokenizer()
        self._token_indexers = token_indexers
        self._add_rationale = add_rationale
        self._keep_prob = keep_prob

        self._bert = "bert" in token_indexers
Example 4
 def __init__(
     self,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     target_token_indexers: Dict[str, TokenIndexer] = None,
     source_add_start_token: bool = False,
     delimiter: str = "\t",
     source_max_tokens: Optional[int] = None,
     target_max_tokens: Optional[int] = None,
     source_to_target_len_max_ratio: Optional[float] = None,
     **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._source_tokenizer = source_tokenizer or WhitespaceTokenizer()
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._target_token_indexers = target_token_indexers or self._source_token_indexers
     self._source_add_start_token = source_add_start_token
     self._delimiter = delimiter
     self._source_max_tokens = source_max_tokens
     self._target_max_tokens = target_max_tokens
     self._source_to_target_len_max_ratio = source_to_target_len_max_ratio
     self._source_ignored = 0
     self._target_ignored = 0
     self._source_target_ratio_ignored = 0
Example 5
 def __init__(self,
              tokenizer: Optional[Tokenizer] = None,
              token_indexers: Optional[Dict[str, TokenIndexer]] = None,
              lazy: bool = False):
     super().__init__(lazy=lazy)
     self.tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
Example 6
 def test_load_word_pairs(self):
     ids1, ids2 = load_word_pairs(self.pairs_fname, WhitespaceTokenizer(),
                                  self.pairs_vocab, "tokens")
     # first two token IDs reserved for [CLS] and [SEP]
     assert torch.equal(torch.tensor([i.item() for i in ids1]),
                        torch.arange(2, self.num_pairs + 2, step=2))
     assert torch.equal(torch.tensor([i.item() for i in ids2]),
                        torch.arange(3, self.num_pairs + 3, step=2))
Example 7
 def __init__(self, model_dir_path, cuda_device=-1):
     self._model_path = os.path.join(model_dir_path, 'segmenter_neural', 'model.tar.gz')
     self._cuda_device = cuda_device
     self.predictor = Predictor.from_path(self._model_path, cuda_device=self._cuda_device)
     self.predictor._tokenizer = WhitespaceTokenizer()
     self._separator = 'U-S'
     self._threshold = 0.5
     self._use_logits = False
     self._symbol_map = SYMBOL_MAP
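
One hypothetical way to drive the wrapper in Example 7; the archive path, the input key, and the sentence are illustrative and depend on which predictor is registered in the archive (assumes AllenNLP 1.x):

from allennlp.data.tokenizers import WhitespaceTokenizer
from allennlp.predictors import Predictor

# Load the archived model on CPU and force whitespace tokenization, as Example 7 does.
predictor = Predictor.from_path("segmenter_neural/model.tar.gz", cuda_device=-1)
predictor._tokenizer = WhitespaceTokenizer()

# The expected input key depends on the registered predictor; "sentence" is a common one.
result = predictor.predict_json({"sentence": "Pre-tokenized text goes here ."})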
Example 8
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              sample: int = None,
              token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
     self._sample = sample
Example 9
 def __init__(
     self,
     tokenizer: Tokenizer = None,
     token_indexers: Dict[str, TokenIndexer] = None,
     **kwargs
 ) -> None:
     super().__init__(**kwargs)
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example 10
 def test_load_words(self):
     ids = load_words(self.singles_fname,
                      WhitespaceTokenizer(),
                      self.singles_vocab,
                      "tokens",
                      all_cases=False)
     # first two token IDs reserved for [CLS] and [SEP]
     assert torch.equal(torch.tensor([i.item() for i in ids]),
                        torch.arange(2, self.num_singles + 2))
Example 11
 def __init__(
     self, tokenizer: Tokenizer = None, token_indexers: Dict[str, TokenIndexer] = None, **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._tokenizer = tokenizer or WhitespaceTokenizer()
     self._targets_tokenizer: Tokenizer
     if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
         self._targets_tokenizer = copy.copy(self._tokenizer)
         self._targets_tokenizer._add_special_tokens = False
     else:
         self._targets_tokenizer = self._tokenizer
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
Example 12
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        combine_input_fields: Optional[bool] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or WhitespaceTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

        if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
            assert not self._tokenizer._add_special_tokens

        if combine_input_fields is not None:
            self._combine_input_fields = combine_input_fields
        else:
            self._combine_input_fields = isinstance(self._tokenizer, PretrainedTransformerTokenizer)
Example 13
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WhitespaceTokenizer()
        # temporary hack to not add special tokens
        self._targets_tokenizer: Tokenizer
        if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
            self._targets_tokenizer = copy.copy(self._tokenizer)
            self._targets_tokenizer._add_special_tokens = False
        else:
            self._targets_tokenizer = self._tokenizer

        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
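
Examples 11 and 13 use the same trick: shallow-copy the pretrained tokenizer and switch off special-token insertion, so targets can be tokenized without [CLS]/[SEP] being added. A standalone sketch, assuming AllenNLP 1.x and an illustrative model name:

import copy

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

source_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")

# copy.copy shares the underlying HuggingFace tokenizer but flips the flag on the
# copy only, so source sequences still get their special tokens.
target_tokenizer = copy.copy(source_tokenizer)
target_tokenizer._add_special_tokens = False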
Example 14
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_sequence_length: int = None,
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None) -> None:
        super().__init__()
        self._tokenizer = tokenizer or WhitespaceTokenizer()

        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if max_sequence_length is not None:
            self._max_sequence_length: Union[
                float, Optional[int]] = max_sequence_length
        else:
            self._max_sequence_length = math.inf

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]

        logger.info("Creating SimpleLanguageModelingDatasetReader")
        logger.info("max_sequence_length=%s", max_sequence_length)
Example 15
class BaseReader(DatasetReader):
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        human_prob: float = 1.0,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = WhitespaceTokenizer()
        self._token_indexers = token_indexers
        self._human_prob = human_prob

    @overrides
    def _read(self, file_path):
        rs = RandomState(seed=1000)
        with open(cached_path(file_path), "r") as data_file:
            for _, line in enumerate(data_file.readlines()):
                items = json.loads(line)
                document = items["document"]
                annotation_id = items["annotation_id"]
                query = items.get("query", None)
                label = items.get("label", None)
                rationale = items.get(
                    "rationale",
                    []) if rs.random_sample() < self._human_prob else []

                if label is not None:
                    label = str(label).replace(" ", "_")

                instance = self.text_to_instance(
                    annotation_id=annotation_id,
                    document=document,
                    query=query,
                    label=label,
                    rationale=rationale,
                )
                yield instance

    @overrides
    def text_to_instance(
        self,
        annotation_id: str,
        document: str,
        query: str = None,
        label: str = None,
        rationale: List[tuple] = None,
    ) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields = {}

        document_tokens = [
            to_token(t.text) for t in self._tokenizer.tokenize(document)
        ]
        human_rationale_labels = [0] * len(document_tokens)
        for s, e in rationale:
            for i in range(s, e):
                human_rationale_labels[i] = 1

        if query is not None:
            query_tokens = [
                to_token(t.text) for t in self._tokenizer.tokenize(query)
            ]
        else:
            query_tokens = []

        for index_name, indexer in self._token_indexers.items():
            if hasattr(indexer, "add_token_info"):
                indexer.add_token_info(document_tokens, index_name)
                indexer.add_token_info(query_tokens, index_name)

        fields["document"] = MetadataField({
            "tokens": document_tokens,
            "reader_object": self
        })
        fields["query"] = MetadataField({"tokens": query_tokens})
        fields["rationale"] = ArrayField(np.array(human_rationale_labels))

        metadata = {
            "annotation_id": annotation_id,
            "human_rationale": rationale,
            "document": document,
            "label": label,
        }

        if query is not None:
            metadata["query"] = query

        fields["metadata"] = MetadataField(metadata)

        if label is not None:
            fields["label"] = LabelField(label, label_namespace="labels")

        return Instance(fields)

    def convert_tokens_to_instance(self, tokens: List[Token]):
        fields = {}
        tokens = tokens[0] + (
            ([to_token("[DQSEP]")] + tokens[1]) if len(tokens[1]) > 0 else [])
        fields["document"] = TextField(tokens, self._token_indexers)

        return Instance(fields)

    def convert_documents_to_batch(self, documents: List[Tuple[List[Token],
                                                               List[Token]]],
                                   vocabulary) -> Dict[str, Any]:
        batch = Batch(
            [self.convert_tokens_to_instance(tokens) for tokens in documents])
        batch.index_instances(vocabulary)
        batch = batch.as_tensor_dict()
        return batch["document"]

    def combine_document_query(self, document: List[MetadataField],
                               query: List[MetadataField], vocabulary):
        document_tokens = [(x["tokens"], y["tokens"])
                           for x, y in zip(document, query)]
        return self.convert_documents_to_batch(document_tokens, vocabulary)
class RationaleReader(DatasetReader):
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        max_sequence_length: int = None,
        human_prob: float = 1.0,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = WhitespaceTokenizer()
        self._max_sequence_length = max_sequence_length
        self._token_indexers = token_indexers
        self._human_prob = human_prob

        self._bert = "bert" in token_indexers

    @overrides
    def _read(self, file_path):
        rs = RandomState(seed=1000)
        with open(cached_path(file_path), "r") as data_file:
            for _, line in enumerate(data_file.readlines()):
                items = json.loads(line)
                document = items["document"]
                query = items.get("query", None)
                label = items.get("label", None)
                rationale = items.get("rationale", [])
                annotation_id = items["annotation_id"]

                if label is not None:
                    label = str(label).replace(' ', '_')

                if rs.random_sample() > self._human_prob:
                    rationale = -1

                instance = self.text_to_instance(annotation_id=annotation_id,
                                                 document=document,
                                                 query=query,
                                                 label=label,
                                                 rationale=rationale)
                if instance is not None:
                    yield instance

    @overrides
    def text_to_instance(
            self,
            annotation_id: str,
            document: str,
            query: str = None,
            label: str = None,
            rationale: List[tuple] = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields = {}

        tokens = [Token("<S>")]
        keep_tokens = [1]

        word_tokens = self._tokenizer.tokenize(document)
        rationale_tokens = [0] * len(word_tokens)
        if rationale != -1:
            for s, e in rationale:
                for i in range(s, e):
                    rationale_tokens[i] = 1

        tokens.extend(word_tokens)
        keep_tokens.extend([0 for _ in range(len(word_tokens))])

        rationale_tokens = [0] + rationale_tokens

        if query is not None:
            if self._bert:
                query_tokens = self._tokenizer.tokenize(query)
                tokens += [Token('[SEP]')] + query_tokens
                keep_tokens += [1 for _ in range(len(query_tokens) + 1)]
                rationale_tokens += [1] * (len(query_tokens) + 1)
            else:
                fields["query"] = TextField(self._tokenizer.tokenize(query),
                                            self._token_indexers)

        fields["document"] = TextField(tokens, self._token_indexers)

        # if the rationale mask and the token list ever diverge, drop into the debugger
        assert len(rationale_tokens) == len(tokens), breakpoint()
        fields['rationale'] = SequenceLabelField(rationale_tokens,
                                                 fields['document'],
                                                 'rationale_labels')

        metadata = {
            "annotation_id": annotation_id,
            "tokens": tokens,
            "keep_tokens": keep_tokens,
            "document": document,
            "query": query,
            "convert_tokens_to_instance": self.convert_tokens_to_instance,
            "label": label
        }

        fields["metadata"] = MetadataField(metadata)

        if label is not None:
            fields["label"] = LabelField(label, label_namespace="labels")

        return Instance(fields)

    def convert_tokens_to_instance(self, tokens):
        fields = {}
        fields["document"] = TextField(tokens, self._token_indexers)
        return Instance(fields)
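
A hypothetical call into the RationaleReader above; the indexer dictionary, the document, the label, and the rationale spans are made up for illustration, and the class itself still needs the imports from its original repository:

from allennlp.data.token_indexers import SingleIdTokenIndexer

reader = RationaleReader(token_indexers={"tokens": SingleIdTokenIndexer()})
instance = reader.text_to_instance(
    annotation_id="doc-1",
    document="a very small example document",
    label="positive",
    rationale=[(1, 3)],  # (start, end) token spans marked as human rationale
)
print(instance.fields.keys())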