Example #1
 def split_words(self, doc: str) -> Tuple[List[Token], List[Token], List[str]]:
     # tokens = requests.get("http://127.0.0.1:8000/thulac?text=%s" % doc).json()
     try:
         tokens = [t.split('_') for t in requests.post(
             "http://127.0.0.1:8000/thulac", data={'text': doc}).json()]
     except Exception:
         return [Token(',', pos='ws')], [Token(',', pos='ws')], []
     # A tag is "marked" when a POS pattern is configured and the tag is in it.
     marked = [self._pos_pattern and t[1] in self._pattern_pos for t in tokens]
     return ([Token(t[0], pos=t[1]) for t in tokens],
             [Token('@@' + t[1] + '@@', pos=t[1]) if m else Token(t[0], pos=t[1])
              for t, m in zip(tokens, marked)],
             ['c_' + t[0] if m else t[0] for t, m in zip(tokens, marked)])
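The method above expects the THULAC service to return strings of the form "word_pos". A minimal sketch of just that parsing step, with the HTTP response replaced by a hard-coded payload (an assumption for illustration):

response = ["我_r", "爱_v", "北京_ns"]            # pretend JSON payload from the service
pairs = [item.split('_') for item in response]   # [['我', 'r'], ['爱', 'v'], ['北京', 'ns']]
words = [word for word, _ in pairs]
pos_tags = [pos for _, pos in pairs]
print(words, pos_tags)                           # ['我', '爱', '北京'] ['r', 'v', 'ns']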
    def __init__(
            self,
            namespace: str = 'bme_token_characters',
            character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
            start_tokens: List[str] = None,
            end_tokens: List[str] = None,
            token_min_padding_length: int = 0,
            begin_size: int = 3,
            end_size: int = 3) -> None:
        super().__init__(token_min_padding_length)

        major, minor, patch = map(int, torch.__version__.split('.'))
        torch_version = major + 0.1 * minor

        if torch_version < 1.1:
            raise Exception(
                "BMETokenIndexer requires pytorch version >= 1.1 because it "
                "provides torch.nn.functional.one_hot. Your version is {}".format(
                    torch.__version__))

        self._namespace = namespace
        self._character_tokenizer = character_tokenizer

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]

        self._begin_size = begin_size
        self._end_size = end_size
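The major + 0.1 * minor encoding above is good enough for the 1.1 threshold, but the int() parsing will fail on builds such as "1.13.0+cu117". A sturdier guard, assuming the packaging library (pulled in by pip/setuptools) is available:

import torch
from packaging import version

if version.parse(torch.__version__) < version.parse("1.1"):
    raise RuntimeError(
        "BMETokenIndexer requires torch >= 1.1 for torch.nn.functional.one_hot; "
        "found {}".format(torch.__version__))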
Example #3
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[List[int]]]:
        chunk_tags = self.get_chunk_tags(tokens, vocabulary)

        # Add BOS, EOS characters
        tokens_with_bos_eos = [Token(self.bos_token)] + tokens + [Token(self.bos_token)]
        character_indices_with_eos_bos = self.elmo_indexer.tokens_to_indices(
            tokens_with_bos_eos, vocabulary, "elmo")

        # Get string chunk tags.
        chunk_tags_str, instance_fields = self.get_input_data_structures_for_segmental_lm(
            chunk_tags)
        # Convert these into tags for the language model.
        chunk_tags_seglm_ids = self.get_tags_in_lm_vocab(chunk_tags_str)

        return_dict = {
            'character_ids': character_indices_with_eos_bos["elmo"],
            'mask': [1] * len(tokens),
            "mask_with_bos_eos": [1] * len(tokens_with_bos_eos),
            'tags': chunk_tags_seglm_ids
        }
        return_dict.update(instance_fields)

        return return_dict
    def __init__(
        self,
        namespace: str = "token_characters",
        character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        min_padding_length: int = 0,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        if min_padding_length == 0:
            url = "https://github.com/allenai/allennlp/issues/1954"
            warnings.warn(
                "You are using the default value (0) of `min_padding_length`, "
                f"which can cause some subtle bugs (more info see {url}). "
                "Strongly recommend to set a value, usually the maximum size "
                "of the convolutional layer size when using CnnEncoder.",
                UserWarning,
            )
        self._min_padding_length = min_padding_length
        self._namespace = namespace
        self._character_tokenizer = character_tokenizer

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
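As the warning above suggests, min_padding_length should normally match the widest convolutional filter. A usage sketch, assuming AllenNLP is installed and a largest CNN ngram filter size of 5:

from allennlp.data.token_indexers import TokenCharactersIndexer

# Pad every token to at least 5 characters so a CnnEncoder with filters up to
# width 5 never receives a token shorter than its largest kernel.
char_indexer = TokenCharactersIndexer(min_padding_length=5)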
        def add_special_token_to_utterance(column_tokens):
            # new_token = []
            special_list = ["year", "name"]
            special_already = [False, False]
            for col in column_tokens:
                for i, tok in enumerate(special_list):
                    if tok == col.lemma_:
                        special_already[i] = True

            name_start_token = [
                "what", "give", "tell", "show", "which", "find"
            ]

            for i, col in enumerate(column_tokens):
                if col.text.isdigit():
                    digit = int(col.text)
                    if digit > 1700 and digit < 2100 and not special_already[0]:
                        column_tokens[i] = Token(text="year",
                                                 lemma="year",
                                                 tag="NN")
                        # new_token.append(Token(text="year",lemma="year",tag = "NN"))
                        special_already[0] = True
                elif (col.text == "each"
                      or (i == 0 and col.lemma_ in name_start_token)
                      ) and not special_already[1]:
                    column_tokens[i] = Token(text="name",
                                             lemma="name",
                                             tag="NN")
                    # new_token.append(Token(text="name",lemma="name",tag = "NN"))
                    special_already[1] = True
            # column_tokens.extend(new_token)
            return column_tokens
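The heuristic above rewrites four-digit numbers in a plausible year range into a "year" token. A toy, self-contained illustration of that branch, using a stand-in token class (the real code operates on spaCy-style tokens, so the dataclass here is an assumption):

from dataclasses import dataclass

@dataclass
class Tok:
    text: str
    lemma_: str

cols = [Tok("opened", "open"), Tok("1984", "1984")]
for i, col in enumerate(cols):
    if col.text.isdigit() and 1700 < int(col.text) < 2100:
        cols[i] = Tok("year", "year")   # mirrors the column_tokens[i] replacement
print([c.text for c in cols])           # ['opened', 'year']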
Example #6
 def _filter_and_stem(self, words: List[Token]) -> List[Token]:
     filtered_words = self._word_filter.filter_words(words)
     stemmed_words = [self._word_stemmer.stem_word(word) for word in filtered_words]
     for start_token in self._start_tokens:
         stemmed_words.insert(0, Token(start_token, 0))
     for end_token in self._end_tokens:
         stemmed_words.append(Token(end_token, -1))
     return stemmed_words
    def split_words(self, sentence: str) -> List[Token]:
        """
        Splits a sentence into word tokens.  We handle four kinds of things: words with punctuation
        that should be ignored as a special case (Mr. Mrs., etc.), contractions/genitives (isn't,
        don't, Matt's), and beginning and ending punctuation ("antennagate", (parentheticals), and
        such.).

        The basic outline is to split on whitespace, then check each of these cases.  First, we
        strip off beginning punctuation, then strip off ending punctuation, then strip off
        contractions.  When we strip something off the beginning of a word, we can add it to the
        list of tokens immediately.  When we strip it off the end, we have to save it to be added
        to after the word itself has been added.  Before stripping off any part of a token, we
        first check to be sure the token isn't in our list of special cases.
        """
        if self.pos_tags:
            cut_res = pseg.lcut(sentence=sentence, HMM=self.hmm)
            fields = [text for text, _ in cut_res]
            tags = [tag for _, tag in cut_res]
        else:
            if self.cut_for_search:
                fields = jieba.cut_for_search(sentence=sentence, HMM=self.hmm)
            else:
                fields = jieba.cut(sentence,
                                   cut_all=self.cut_all,
                                   HMM=self.hmm)
        tokens: List[Token] = []
        for idx, field in enumerate(fields):
            add_at_end: List[Token] = []
            while self._can_split(
                    field) and field[0] in self.beginning_punctuation:
                tokens.append(Token(field[0]))
                field = field[1:]
            while self._can_split(
                    field) and field[-1] in self.ending_punctuation:
                add_at_end.insert(0, Token(field[-1]))
                field = field[:-1]

            # There could (rarely) be several contractions in a word, but we check contractions
            # sequentially, in a random order.  If we've removed one, we need to check again to be
            # sure there aren't others.
            remove_contractions = True
            while remove_contractions:
                remove_contractions = False
                for contraction in self.contractions:
                    if self._can_split(field) and field.lower().endswith(
                            contraction):
                        add_at_end.insert(0, Token(field[-len(contraction):]))
                        field = field[:-len(contraction)]
                        remove_contractions = True
            if field:
                if self.pos_tags:
                    tokens.append(Token(field, pos=tags[idx], tag=tags[idx]))
                else:
                    tokens.append(Token(field))
            tokens.extend(add_at_end)
        return tokens
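The splitter above delegates the actual segmentation to jieba. A minimal sketch of the two underlying calls it relies on (assumes jieba is installed; the exact segmentation depends on the dictionary in use):

import jieba
import jieba.posseg as pseg

print(list(jieba.cut("我来到北京清华大学", cut_all=False, HMM=True)))   # plain word list
print([(word, tag) for word, tag in pseg.lcut("我来到北京清华大学")])    # (word, POS) pairs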
Example #8
    def __init__(self,
                 meta_ids: Dict[str, str] = None,
                 start_token="<start>",
                 end_token="<end>"):
        if not meta_ids:
            meta_ids = {"text": "lex"}
        self._meta_ids = meta_ids

        self._start_token = Token(**{i: start_token for i in self._meta_ids})
        self._end_token = Token(**{i: end_token for i in self._meta_ids})
Example #9
    def __init__(self,
                 namespace: str = 'tokens',
                 lowercase_tokens: bool = False,
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None) -> None:
        self.namespace = namespace
        self.lowercase_tokens = lowercase_tokens

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
    def __init__(
            self,
            namespace: str = 'token_characters',
            character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
            start_tokens: List[str] = None,
            end_tokens: List[str] = None) -> None:
        self._namespace = namespace
        self._character_tokenizer = character_tokenizer

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
 def test_tokenize_handles_unicode_letters(self):
     sentence = "HAL9000   and    Ångström"
     expected_tokens = [
         Token("HAL", 0),
         Token("9000", 3),
         Token("and", 10),
         Token("Ångström", 17)
     ]
     tokens = self.word_splitter.split_words(sentence)
     assert [t.text for t in tokens] == [t.text for t in expected_tokens]
     assert [t.idx for t in tokens] == [t.idx for t in expected_tokens]
Example #12
    def __init__(
        self,
        lowercase_tokens: bool = False,
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        self.lowercase_tokens = lowercase_tokens

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
 def tokenize(self, text: str) -> List[Token]:
     """
     Splits sentences into a set of all possible ngrams up to self._max_ngram_degree using nltk
     """
     ngrams_iterator = everygrams(text.split(),
                                  max_len=self._max_ngram_degree)
     tokens = [Token(" ".join(ngram)) for ngram in ngrams_iterator]
     for start_token in self._start_tokens:
         tokens.insert(0, Token(start_token, 0))
     for end_token in self._end_tokens:
         tokens.append(Token(end_token, -1))
     return tokens
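For reference, this is what nltk's everygrams yields for a short sentence; the ordering of the n-grams differs between nltk versions, so only the set of grams is meaningful here:

from nltk.util import everygrams

grams = list(everygrams("the quick fox".split(), max_len=2))
print(grams)   # every 1-gram and 2-gram, e.g. ('the',), ('quick',), ('the', 'quick'), ...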
Example #14
 def _filter_and_stem(self, words):
     # filtered_words = self._word_filter.filter_words(words)
     # Not to filter stop words to avoid the mis-alignment.
     filtered_words = words
     stemmed_words = [
         self._word_stemmer.stem_word(word) for word in filtered_words
     ]
     for start_token in self._start_tokens:
         stemmed_words.insert(0, Token(start_token, 0))
     for end_token in self._end_tokens:
         stemmed_words.append(Token(end_token, -1))
     return stemmed_words
Example #15
    def __init__(
        self,
        model_path: str,
    ) -> None:

        self.bpe = fastBPE(Args(model_path + "/bpe.codes"))
        self.vocab = Dictionary()
        self.vocab.add_from_file(f"{model_path}/dict.txt")
        self._tokenizer_lowercases = False
        self.sequence_pair_start_tokens = [Token(text="<s>", text_id=0, type_id=0)]
        self.sequence_pair_mid_tokens = [Token(text="</s>", text_id=2, type_id=0), Token(text="</s>", text_id=2, type_id=0)]
        self.sequence_pair_end_tokens = [Token(text="</s>", text_id=2, type_id=0)]
Example #16
def tokens_to_lm_instance(tokens: List[Token],
                          token_indexers: Dict[str, TokenIndexer]):
    tokens = list(tokens)  # shallow copy
    tokens.insert(0, Token(START_SYMBOL))
    tokens.append(Token(END_SYMBOL))

    input_field = TextField(tokens[:-1], token_indexers)
    output_field = TextField(tokens[1:], token_indexers)
    return Instance({
        'input_tokens': input_field,
        'output_tokens': output_field
    })
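A usage sketch for the helper above (assumes AllenNLP is installed and tokens_to_lm_instance is importable): the input field drops the final token and the output field drops the first, giving next-token prediction targets.

from allennlp.data import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokens = [Token(w) for w in "the cat sat".split()]
instance = tokens_to_lm_instance(tokens, {"tokens": SingleIdTokenIndexer()})
print(instance)   # two TextFields, shifted against each other by one token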
Example #17
    def __init__(self,
                 namespace: str = 'tokens',
                 lowercase_tokens: bool = False,
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None,
                 token_min_padding_length: int = 0) -> None:
        super().__init__(token_min_padding_length)
        self.namespace = namespace
        self.lowercase_tokens = lowercase_tokens

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
    def text_to_instance(
        self,
        source_string: str,
        target_string: str = None,
        v_i=None,
    ) -> Instance:  # type: ignore
        tokenized_source = self._source_tokenizer.tokenize(source_string)

        if self._source_max_tokens and len(
                tokenized_source) > self._source_max_tokens:
            self._source_max_exceeded += 1
            tokenized_source = tokenized_source[:self._source_max_tokens]

        if self.pseudo:
            # tokenized_source = [Token(self.tags[v_i])] + tokenized_source
            tokenized_source.insert(0, Token(copy.deepcopy(self.tags[v_i])))

        if self._source_add_start_token:
            tokenized_source.insert(0, Token(copy.deepcopy(self._start_token)))
        if self._source_add_end_token:
            tokenized_source.append(Token(copy.deepcopy(self._end_token)))

        self._70 += len(tokenized_source) >= 70

        l_s = len(tokenized_source) // 20 * 20
        self.s_dic[l_s] = self.s_dic.get(l_s, 0) + 1

        source_field = TextField(tokenized_source, self._source_token_indexers)
        if target_string is not None:

            tokenized_target = self._target_tokenizer.tokenize(target_string)

            if self._target_max_tokens and len(
                    tokenized_target) > self._target_max_tokens:
                self._target_max_exceeded += 1
                tokenized_target = tokenized_target[:self._target_max_tokens]
            if self._target_add_start_token:
                tokenized_target.insert(
                    0, Token(copy.deepcopy(self._start_token)))
            if self._target_add_end_token:
                tokenized_target.append(Token(copy.deepcopy(self._end_token)))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)
            l_t = len(tokenized_target) // 20 * 20
            self.t_dic[l_t] = self.t_dic.get(l_t, 0) + 1

            return Instance({
                "source_tokens": source_field,
                "target_tokens": target_field
            })
        else:
            return Instance({"source_tokens": source_field})
Example #19
    def tokenize(self, text: str) -> List[Token]:
        konoha_tokens = self._tokenizer.tokenize(text)
        tokens = [
            Token(text=token.surface, lemma_=token.base_form, pos_=token.postag,)
            for token in konoha_tokens
        ]

        for start_token in self._start_tokens:
            tokens.insert(0, Token(start_token, 0))

        for end_token in self._end_tokens:
            tokens.append(Token(end_token, -1))

        return tokens
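The tokenizer above wraps konoha. A minimal sketch of the underlying call (assumes konoha and a MeCab backend are installed; the attribute names mirror those used above):

from konoha import WordTokenizer

word_tokenizer = WordTokenizer("MeCab")
for token in word_tokenizer.tokenize("私は犬が好きです"):
    print(token.surface, token.base_form, token.postag)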
Example #20
    def _construct_embedding_matrix(self):
        """
        For HotFlip, we need a word embedding matrix to search over. The below is necessary for
        models such as ELMo, character-level models, or for models that use a projection layer
        after their word embeddings.

        We run all of the tokens from the vocabulary through the TextFieldEmbedder, and save the
        final output embedding. We then group all of those output embeddings into an "embedding
        matrix".
        """
        # Gets all tokens in the vocab and their corresponding IDs
        all_tokens = self.vocab._token_to_index["tokens"]
        all_indices = list(self.vocab._index_to_token["tokens"].keys())
        all_inputs = {
            "tokens":
            torch.LongTensor(all_indices).to(self.model_device).unsqueeze(0)
        }
        for token_indexer in self.predictor._dataset_reader._token_indexers.values():
            # handle when a model uses character-level inputs, e.g., a CharCNN
            if isinstance(token_indexer, TokenCharactersIndexer):
                tokens = [Token(x) for x in all_tokens]
                max_token_length = max(len(x) for x in all_tokens)
                indexed_tokens = token_indexer.tokens_to_indices(
                    tokens, self.vocab, "token_characters")
                padded_tokens = token_indexer.as_padded_tensor(
                    indexed_tokens, {"token_characters": len(tokens)},
                    {"num_token_characters": max_token_length})
                all_inputs['token_characters'] = torch.LongTensor(
                    padded_tokens['token_characters']).to(
                        self.model_device).unsqueeze(0)
            # for ELMo models
            if isinstance(token_indexer, ELMoTokenCharactersIndexer):
                elmo_tokens = []
                for token in all_tokens:
                    elmo_indexed_token = token_indexer.tokens_to_indices(
                        [Token(text=token)], self.vocab,
                        "sentence")["sentence"]
                    elmo_tokens.append(elmo_indexed_token[0])
                all_inputs["elmo"] = torch.LongTensor(elmo_tokens).to(
                    self.model_device).unsqueeze(0)

        # find the TextFieldEmbedder
        for module in self.predictor._model.modules():
            if isinstance(module, TextFieldEmbedder):
                embedder = module
        # pass all tokens through the fake matrix and create an embedding out of it.
        embedding_matrix = embedder(all_inputs).squeeze()
        return embedding_matrix
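A stand-alone sketch of the same idea, with a plain nn.Embedding standing in for the model's TextFieldEmbedder: feed every vocabulary id through the embedder once and keep the stacked outputs as a (vocab_size, dim) matrix to search over.

import torch
import torch.nn as nn

vocab_size, dim = 100, 16
embedder = nn.Embedding(vocab_size, dim)                  # stand-in for the TextFieldEmbedder
all_indices = torch.arange(vocab_size).unsqueeze(0)       # shape (1, vocab_size), like all_inputs
with torch.no_grad():
    embedding_matrix = embedder(all_indices).squeeze(0)   # shape (vocab_size, dim)
print(embedding_matrix.shape)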
Example #21
 def tokenize(self, text: str) -> List[Token]:
     tokens = [Token(t) for t in self._phonemizer(text)]
     for start_token in self._start_tokens:
         if isinstance(start_token, int):
             token = Token(text_id=start_token, idx=0)
         else:
             token = Token(text=start_token, idx=0)
         tokens.insert(0, token)
     for end_token in self._end_tokens:
         if isinstance(end_token, int):
             token = Token(text_id=end_token, idx=0)
         else:
             token = Token(text=end_token, idx=0)
         tokens.append(token)
     return tokens
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            entity_1: Tuple[int],
            entity_2: Tuple[int],
            label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        tokens = [OpenAISplitter._standardize(token) for token in tokens]
        tokens = (['__start__'] + tokens[entity_1[0]:entity_1[1] + 1] + ['__del1__']
                  + tokens[entity_2[0]:entity_2[1] + 1] + ['__del2__']
                  + tokens + ['__clf__'])

        sentence = TextField([Token(text=t) for t in tokens],
                             self._token_indexers)
        fields['sentence'] = sentence
        #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
        #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)

        if label:
            fields['label'] = LabelField(label)

        return Instance(fields)
 def split_words(self, sentence: str) -> List[Token]:
     # We use the [^\W\d_] pattern as a trick to match unicode letters
     tokens = [
         Token(m.group(), idx=m.start())
         for m in re.finditer(r"[^\W\d_]+|\d+|\S", sentence)
     ]
     return tokens
Example #24
 def tokenize(self, text: str) -> List[Token]:
     # We use the [^\W\d_] pattern as a trick to match unicode letters
     tokens = [
         Token(m.group(), idx=m.start())
         for m in re.finditer(r"[^\W\d_]+|\d+|\S", text)
     ]
     return tokens
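The regex [^\W\d_]+|\d+|\S matches runs of unicode letters, runs of digits, and single non-space symbols. A quick check of what it produces, with character offsets:

import re

sentence = "HAL9000 and Ångström"
print([(m.group(), m.start()) for m in re.finditer(r"[^\W\d_]+|\d+|\S", sentence)])
# [('HAL', 0), ('9000', 3), ('and', 8), ('Ångström', 12)]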
 def tokenize(self, text: str) -> List[Token]:
     if self._nbest_size and self._alpha:
         subwords = self._processor.SampleEncodeAsPieces(text, self._nbest_size, self._alpha)
     else:
         subwords = self._processor.EncodeAsPieces(text)
     tokens = [Token(s) for s in subwords]
     return tokens
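A sketch of the underlying sentencepiece calls (assumes sentencepiece is installed; "m.model" is a hypothetical trained model file, and sampling with nbest_size=-1, alpha=0.1 matches the subword-regularization path above):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("m.model")                                        # hypothetical model path
print(sp.EncodeAsPieces("Hello world"))                   # deterministic segmentation
print(sp.SampleEncodeAsPieces("Hello world", -1, 0.1))    # sampled segmentation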
    def _intra_word_tokenize(
        self, string_tokens: List[str]
    ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
        tokens: List[Token] = []
        offsets: List[Optional[Tuple[int, int]]] = []
        for token_string in string_tokens:
            wordpieces = self.tokenizer.encode_plus(
                token_string,
                add_special_tokens=False,
                return_tensors=None,
                return_offsets_mapping=False,
                return_attention_mask=False,
                return_token_type_ids=False,
            )
            wp_ids = wordpieces["input_ids"]

            if len(wp_ids) > 0:
                offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1))
                tokens.extend(
                    Token(text=wp_text, text_id=wp_id)
                    for wp_id, wp_text in zip(
                        wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids)))
            else:
                offsets.append(None)
        return tokens, offsets
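A sketch of the per-word call the method makes, assuming the transformers library is installed and using "bert-base-uncased" purely as an example checkpoint:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
wp_ids = tok.encode_plus("tokenization", add_special_tokens=False)["input_ids"]
print(tok.convert_ids_to_tokens(wp_ids))   # e.g. ['token', '##ization']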
Example #27
 def batch_split_words(self, sentences: List[str]) -> List[List[Token]]:
     with ThreadPoolExecutor() as executor:
         # Each worker returns the raw token strings for one sentence; wrap the
         # strings (not the whole list) in Token so the result is List[List[Token]].
         return [
             [Token(t) for t in ret_val] for ret_val in executor.map(
                 lambda s: [t for t in self._make_parser().tokenize(s)],
                 sentences)
         ]
Example #28
    def _tokenize(self, sentence_1: str, sentence_2: str = None):
        """
        This method works on both sentence and sentence pair.
        """
        # TODO(mattg): track character offsets.  Might be too challenging to do it here, given that
        # ``transformers`` is dealing with the whitespace...

        encoded_tokens = self._tokenizer.encode_plus(
            text=sentence_1,
            text_pair=sentence_2,
            add_special_tokens=self._add_special_tokens,
            max_length=self._max_length,
            stride=self._stride,
            truncation_strategy=self._truncation_strategy,
            return_tensors=None,
        )
        # token_ids contains the final list of ids for both regular and special tokens
        token_ids, token_type_ids = encoded_tokens["input_ids"], encoded_tokens["token_type_ids"]

        tokens = []
        for token_id, token_type_id in zip(token_ids, token_type_ids):
            token_str = self._tokenizer.convert_ids_to_tokens(token_id, skip_special_tokens=False)
            tokens.append(Token(text=token_str, text_id=token_id, type_id=token_type_id))

        return tokens
 def aux(tokens):
     if len(tokens) < length:
         tokens = (tokens
                   + [Token(DEFAULT_PADDING_TOKEN)]*(length - len(tokens)))
     else:
         tokens = tokens[:length]
     return tokens
Example #30
 def tokenize(self, text: str) -> List[Token]:
     """
     Does whatever processing is required to convert a string of text into a sequence of tokens.
     At a minimum, this uses a ``WordSplitter`` to split words into text.  It may also do
     stemming or stopword removal, depending on the parameters given to the constructor.
     """
     return [Token(text)]