Example no. 1
    def __init__(
        self,
        lowercase_tokens: bool = False,
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        self.lowercase_tokens = lowercase_tokens

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
Example no. 2
    def tokenize(self, text: str) -> List[Token]:
        konoha_tokens = self._tokenizer.tokenize(text)
        tokens = [
            Token(text=token.surface,
                  lemma_=token.base_form,
                  pos_=token.postag) for token in konoha_tokens
        ]

        for start_token in self._start_tokens:
            tokens.insert(0, Token(start_token, 0))

        for end_token in self._end_tokens:
            tokens.append(Token(end_token, -1))

        return tokens
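
A hedged usage sketch of the underlying library: this method appears to wrap konoha's WordTokenizer (the token attributes surface, base_form, and postag come from konoha), so a direct call might look like the following; the MeCab backend and the sample sentence are assumptions.

from konoha import WordTokenizer  # assumed dependency

konoha_tokenizer = WordTokenizer("MeCab")  # assumed backend; requires MeCab to be installed
for t in konoha_tokenizer.tokenize("吾輩は猫である"):
    # Each konoha token exposes the surface form, lemma (base_form) and POS tag (postag).
    print(t.surface, t.base_form, t.postag)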
Example no. 3
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    # Parameters

    batch : `List[List[str]]`, required
        A list of tokenized sentences.

    # Returns

        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()["elmo"]["character_ids"]["elmo_tokens"]
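
A minimal usage sketch, assuming batch_to_ids is the function exposed by allennlp.modules.elmo (the import path and output shape are expectations, not guarantees).

from allennlp.modules.elmo import batch_to_ids  # assumed import path

# Two tokenized sentences of different lengths; the result is padded to the
# longest sentence, and each word becomes a fixed-size vector of character ids.
character_ids = batch_to_ids([["First", "sentence", "."], ["Another", "."]])
print(character_ids.shape)  # expected: torch.Size([2, 3, 50])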
Example no. 4
 def _create_dummy_input(self):
     sentence = "<S> placeholder </S>"
     tokens = [Token(word) for word in sentence.split()]
     character_indices = self.indexer.tokens_to_indices(
         tokens, self.vocab)["elmo_tokens"]
     indices_tensor = torch.LongTensor([character_indices])
     return indices_tensor
Example no. 5
def generate_sentence_embeddings(sentence="", max_len=200):
    sen_list = []
    if isinstance(sentence, str):
        tokens = [
            Token(word) for word in sentence.split() if isinstance(word, str)
        ]
        embed = google_word_2_vec()  # assumed to return a word -> vector lookup (e.g. gensim KeyedVectors)

        for idx, t in enumerate(tokens):
            if idx >= max_len:
                break

            try:
                sen_list.append(embed[t.text])
            except Exception:
                # Word not found in the embedding lookup: fall back to a zero vector.
                sen_list.append([0.0] * embedding_dim)

        # Pad with zero vectors up to max_len.
        for _ in range(len(tokens), max_len):
            sen_list.append([0.0] * embedding_dim)

    else:
        # Non-string input: return an all-zero matrix.
        print("no vector for a sentence")

        for _ in range(max_len):
            sen_list.append([0.0] * embedding_dim)

    return np.asarray(sen_list, dtype=float)
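
A hedged usage sketch; google_word_2_vec() and embedding_dim are not defined in this snippet, so the call below assumes they resolve to a gensim-style word-to-vector lookup and its vector size (e.g. 300 for GoogleNews vectors).

# Hypothetical call: the result has shape (max_len, embedding_dim), with zero
# vectors for out-of-vocabulary words and for padding positions.
matrix = generate_sentence_embeddings("the quick brown fox", max_len=10)
print(matrix.shape)  # e.g. (10, 300)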
Example no. 6
 def _encode_concepts(self, concepts):
     concept_tensors = []
     for concept in concepts:
         concept = [Token(word) for word in concept.split()]
         concept_indices = self.indexer.tokens_to_indices(
             concept, self.vocab)["elmo_tokens"]
         concept_tensors.append(torch.LongTensor(concept_indices))
     return concept_tensors
Example no. 7
    def tokenize(self, text: str) -> List[Token]:
        """
        This method only handles a single sentence (or sequence) of text.
        """
        max_length = self._max_length
        if max_length is not None and not self._add_special_tokens:
            max_length += self.num_special_tokens_for_sequence()

        encoded_tokens = self.tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=max_length,
            stride=self._stride,
            return_tensors=None,
            return_offsets_mapping=self.tokenizer.is_fast,
            return_attention_mask=False,
            return_token_type_ids=True,
            return_special_tokens_mask=True,
        )
        # token_ids contains a final list with ids for both regular and special tokens
        token_ids, token_type_ids, special_tokens_mask, token_offsets = (
            encoded_tokens["input_ids"],
            encoded_tokens["token_type_ids"],
            encoded_tokens["special_tokens_mask"],
            encoded_tokens.get("offset_mapping"),
        )

        # If we don't have token offsets, try to calculate them ourselves.
        if token_offsets is None:
            token_offsets = self._estimate_character_indices(text, token_ids)

        tokens = []
        for token_id, token_type_id, special_token_mask, offsets in zip(
                token_ids, token_type_ids, special_tokens_mask, token_offsets):
            # In `special_tokens_mask`, 1s indicate special tokens and 0s indicate regular tokens.
            # NOTE: in transformers v3.4.0 (and probably older versions) the docstring
            # for `encode_plus` was incorrect as it had the 0s and 1s reversed.
            # https://github.com/huggingface/transformers/pull/7949 fixed this.
            if not self._add_special_tokens and special_token_mask == 1:
                continue

            if offsets is None or offsets[0] >= offsets[1]:
                start = None
                end = None
            else:
                start, end = offsets

            tokens.append(
                Token(
                    text=self.tokenizer.convert_ids_to_tokens(
                        token_id, skip_special_tokens=False),
                    text_id=token_id,
                    type_id=token_type_id,
                    idx=start,
                    idx_end=end,
                ))

        return tokens
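
This looks like allennlp's PretrainedTransformerTokenizer.tokenize; a minimal usage sketch under that assumption (the model name is chosen arbitrarily):

from allennlp.data.tokenizers import PretrainedTransformerTokenizer  # assumed import path

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
for token in tokenizer.tokenize("AllenNLP is great"):
    # Special tokens such as [CLS]/[SEP] carry no character offsets (idx is None).
    print(token.text, token.text_id, token.type_id, token.idx, token.idx_end)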
Example no. 8
 def _sentence_to_srl_instances(self, json_dict: JsonDict) -> List[Instance]:
     sentence = json_dict["sentence"]
     if "verbs" in json_dict.keys():
         text = sentence.split()
         pos = ["VERB" if i == json_dict["verbs"] else "NOUN" for i, _ in enumerate(text)]
         tokens = [Token(t, i, i + len(text), pos_=p) for i, (t, p) in enumerate(zip(text, pos))]
     else:
         tokens = self._tokenizer.tokenize(sentence)
     return self.tokens_to_instances(tokens)
Example no. 9
    def text_to_instance(
        self,
        sentences: List[str],
        labels: List[str] = None,
        confidences: List[float] = None,
        additional_features: List[float] = None,
    ) -> Instance:
        if not self.predict:
            assert len(sentences) == len(labels)
        if confidences is not None:
            assert len(sentences) == len(confidences)
        if additional_features is not None:
            assert len(sentences) == len(additional_features)

        if self.use_sep:
            tokenized_sentences = [
                self._tokenizer.tokenize(s)[:self.sent_max_len] +
                [Token("[SEP]")] for s in sentences
            ]
            sentences = [
                list(itertools.chain.from_iterable(tokenized_sentences))[:-1]
            ]
        else:
            # Tokenize the sentences
            sentences = [
                self._tokenizer.tokenize(sentence_text)[:self.sent_max_len]
                for sentence_text in sentences
            ]

        fields: Dict[str, Field] = {}
        fields["sentences"] = ListField(
            [TextField(sentence) for sentence in sentences])

        if labels is not None:
            if isinstance(labels[0], list):
                fields["labels"] = ListField(
                    [MultiLabelField(label) for label in labels])
            else:
                # make the labels strings for easier identification of the neutral label
                # probably not strictly necessary
                if self.sci_sum:
                    fields["labels"] = ArrayField(np.array(labels))
                else:
                    fields["labels"] = ListField([
                        LabelField(str(label) + "_label") for label in labels
                    ])

        if confidences is not None:
            fields['confidences'] = ArrayField(np.array(confidences))
        if additional_features is not None:
            fields["additional_features"] = ArrayField(
                np.array(additional_features))

        return Instance(fields)
Example no. 10
 def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[Token]:
     """
     Converts spaCy tokens to allennlp tokens. Is a no-op if
     keep_spacy_tokens is True
     """
     if not self._keep_spacy_tokens:
         tokens = [
             Token(
                 token.text,
                 token.idx,
                 token.idx + len(token.text),
                 token.lemma_,
                 token.pos_,
                 token.tag_,
                 token.dep_,
                 token.ent_type_,
             ) for token in tokens
         ]
     for start_token in self._start_tokens:
         tokens.insert(0, Token(start_token, 0))
     for end_token in self._end_tokens:
         tokens.append(Token(end_token, -1))
     return tokens
Example no. 11
    def _intra_word_tokenize(
        self, string_tokens: List[str]
    ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
        tokens: List[Token] = []
        offsets: List[Optional[Tuple[int, int]]] = []
        for token_string in string_tokens:
            wordpieces = self.tokenizer.encode_plus(
                token_string,
                add_special_tokens=False,
                return_tensors=None,
                return_offsets_mapping=False,
                return_attention_mask=False,
            )
            wp_ids = wordpieces["input_ids"]

            if len(wp_ids) > 0:
                offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1))
                tokens.extend(
                    Token(text=wp_text, text_id=wp_id)
                    for wp_id, wp_text in zip(
                        wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids)))
            else:
                offsets.append(None)
        return tokens, offsets
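
A hedged sketch of how _intra_word_tokenize behaves, assuming it belongs to allennlp's PretrainedTransformerTokenizer (model name and example words are arbitrary):

from allennlp.data.tokenizers import PretrainedTransformerTokenizer  # assumed import path

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
wordpieces, offsets = tokenizer._intra_word_tokenize(["unaffable", "cat"])
# offsets[i] is the inclusive (first, last) wordpiece index for input word i;
# a word split into three wordpieces starting at index 0 yields (0, 2), and a
# word that produces no wordpieces yields None.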
Example no. 12
 def tokenize(self, text: str) -> List[Token]:
     if self._lowercase_characters:
         text = text.lower()
     if self._byte_encoding is not None:
         # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
         # of this.
         tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)]
     else:
         tokens = [Token(t) for t in list(text)]
     for start_token in self._start_tokens:
         if isinstance(start_token, int):
             token = Token(text_id=start_token, idx=0)
         else:
             token = Token(text=start_token, idx=0)
         tokens.insert(0, token)
     for end_token in self._end_tokens:
         if isinstance(end_token, int):
             token = Token(text_id=end_token, idx=0)
         else:
             token = Token(text=end_token, idx=0)
         tokens.append(token)
     return tokens
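
A short usage sketch, assuming this is allennlp's CharacterTokenizer (the import path and constructor arguments are assumptions based on the attributes used above):

from allennlp.data.tokenizers import CharacterTokenizer  # assumed import path

char_tokenizer = CharacterTokenizer()
byte_tokenizer = CharacterTokenizer(byte_encoding="utf-8")

print([t.text for t in char_tokenizer.tokenize("Hi!")])     # ['H', 'i', '!']
# With byte encoding, each id is the byte value + 1 so that 0 stays free for masking.
print([t.text_id for t in byte_tokenizer.tokenize("Hi!")])  # [73, 106, 34]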
Example no. 13
 def tokenize(self, text: str) -> List[Token]:
     # We use the [^\W\d_] pattern as a trick to match unicode letters
     tokens = [Token(m.group(), idx=m.start()) for m in re.finditer(r"[^\W\d_]+|\d+|\S", text)]
     return tokens
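
The regular expression can be checked on its own; the self-contained snippet below shows how letters, digit runs, and single symbols become separate tokens with character offsets.

import re

pattern = r"[^\W\d_]+|\d+|\S"
print([(m.group(), m.start()) for m in re.finditer(pattern, "date: 2024-01!")])
# [('date', 0), (':', 4), ('2024', 6), ('-', 10), ('01', 11), ('!', 13)]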
Example no. 14
 def tokenize(self, text: str) -> List[Token]:
     tokens = self.tokenizer.parse(text).split(' ')
     return [Token(t) for t in tokens]
Example no. 15
    def _reverse_engineer_special_tokens(
        self,
        token_a: str,
        token_b: str,
        model_name: str,
        tokenizer_kwargs: Optional[Dict[str, Any]],
    ):
        # storing the special tokens
        self.sequence_pair_start_tokens = []
        self.sequence_pair_mid_tokens = []
        self.sequence_pair_end_tokens = []
        # storing token type ids for the sequences
        self.sequence_pair_first_token_type_id = None
        self.sequence_pair_second_token_type_id = None

        # storing the special tokens
        self.single_sequence_start_tokens = []
        self.single_sequence_end_tokens = []
        # storing token type id for the sequence
        self.single_sequence_token_type_id = None

        # Reverse-engineer the tokenizer for two sequences
        from allennlp.common import cached_transformers

        tokenizer_with_special_tokens = cached_transformers.get_tokenizer(
            model_name, add_special_tokens=True, **(tokenizer_kwargs or {}))
        dummy_output = tokenizer_with_special_tokens.encode_plus(
            token_a,
            token_b,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_mask=False,
        )
        if len(dummy_output["token_type_ids"]) != len(
                dummy_output["input_ids"]):
            logger.warning(
                "Tokenizer library did not return valid token type ids. We will assume they are all zero."
            )
            dummy_output["token_type_ids"] = [0] * len(
                dummy_output["input_ids"])

        dummy_a = self.tokenizer.encode(token_a, add_special_tokens=False)[0]
        assert dummy_a in dummy_output["input_ids"]
        dummy_b = self.tokenizer.encode(token_b, add_special_tokens=False)[0]
        assert dummy_b in dummy_output["input_ids"]
        assert dummy_a != dummy_b

        seen_dummy_a = False
        seen_dummy_b = False
        for token_id, token_type_id in zip(dummy_output["input_ids"],
                                           dummy_output["token_type_ids"]):
            if token_id == dummy_a:
                if seen_dummy_a or seen_dummy_b:  # seeing a twice or b before a
                    raise ValueError(
                        "Cannot auto-determine the number of special tokens added."
                    )
                seen_dummy_a = True
                assert (
                    self.sequence_pair_first_token_type_id is None
                    or self.sequence_pair_first_token_type_id == token_type_id
                ), "multiple different token type ids found for the first sequence"
                self.sequence_pair_first_token_type_id = token_type_id
                continue

            if token_id == dummy_b:
                if seen_dummy_b:  # seeing b twice
                    raise ValueError(
                        "Cannot auto-determine the number of special tokens added."
                    )
                seen_dummy_b = True
                assert (
                    self.sequence_pair_second_token_type_id is None
                    or self.sequence_pair_second_token_type_id == token_type_id
                ), "multiple different token type ids found for the second sequence"
                self.sequence_pair_second_token_type_id = token_type_id
                continue

            token = Token(
                tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
                text_id=token_id,
                type_id=token_type_id,
            )
            if not seen_dummy_a:
                self.sequence_pair_start_tokens.append(token)
            elif not seen_dummy_b:
                self.sequence_pair_mid_tokens.append(token)
            else:
                self.sequence_pair_end_tokens.append(token)

        assert (len(self.sequence_pair_start_tokens) +
                len(self.sequence_pair_mid_tokens) +
                len(self.sequence_pair_end_tokens)
                ) == self.tokenizer.num_special_tokens_to_add(pair=True)

        # Reverse-engineer the tokenizer for one sequence
        dummy_output = tokenizer_with_special_tokens.encode_plus(
            token_a,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_mask=False,
        )
        if len(dummy_output["token_type_ids"]) != len(
                dummy_output["input_ids"]):
            logger.warning(
                "Tokenizer library did not return valid token type ids. We will assume they are all zero."
            )
            dummy_output["token_type_ids"] = [0] * len(
                dummy_output["input_ids"])

        seen_dummy_a = False
        for token_id, token_type_id in zip(dummy_output["input_ids"],
                                           dummy_output["token_type_ids"]):
            if token_id == dummy_a:
                if seen_dummy_a:
                    raise ValueError(
                        "Cannot auto-determine the number of special tokens added."
                    )
                seen_dummy_a = True
                assert (
                    self.single_sequence_token_type_id is None
                    or self.single_sequence_token_type_id == token_type_id
                ), "multiple different token type ids found for the sequence"
                self.single_sequence_token_type_id = token_type_id
                continue

            token = Token(
                tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
                text_id=token_id,
                type_id=token_type_id,
            )
            if not seen_dummy_a:
                self.single_sequence_start_tokens.append(token)
            else:
                self.single_sequence_end_tokens.append(token)

        assert (len(self.single_sequence_start_tokens) +
                len(self.single_sequence_end_tokens)
                ) == self.tokenizer.num_special_tokens_to_add(pair=False)
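
As an illustration only: for a BERT-style tokenizer the attributes populated above would typically come out as sketched below; the exact values depend on the model and are not guaranteed.

# Hypothetical result for a BERT-like tokenizer, where
#   single sequence:  [CLS] <tokens> [SEP]
#   sequence pair:    [CLS] <tokens A> [SEP] <tokens B> [SEP]
# so the reverse-engineering would record roughly:
#   sequence_pair_start_tokens         -> [Token("[CLS]")]
#   sequence_pair_mid_tokens           -> [Token("[SEP]")]
#   sequence_pair_end_tokens           -> [Token("[SEP]")]
#   single_sequence_start_tokens       -> [Token("[CLS]")]
#   single_sequence_end_tokens         -> [Token("[SEP]")]
#   sequence_pair_first_token_type_id  -> 0
#   sequence_pair_second_token_type_id -> 1
#   single_sequence_token_type_id      -> 0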
Example no. 16
 def tokenize(self, text: str) -> List[Token]:
     return [Token(t) for t in text.split()]
Example no. 17
 def tokenize(self, text: str) -> List[Token]:
     return [Token(t) for t in jieba.lcut(text)]
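
A self-contained usage sketch of jieba's list-mode segmentation (the sample sentence is the one used in jieba's documentation; the output shown is the expected default-mode result):

import jieba

print(jieba.lcut("我来到北京清华大学"))
# expected: ['我', '来到', '北京', '清华大学']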