def split_words(self, doc: str) -> Tuple[List[Token], List[Token], List[str]]:
    # Query a local THULAC segmentation service; each item comes back as "word_pos".
    try:
        tokens = [t.split('_')
                  for t in requests.post("http://127.0.0.1:8000/thulac",
                                         data={'text': doc}).json()]
    except Exception:
        return [Token(',', pos='ws')], [Token(',', pos='ws')], []
    # Three parallel views: plain tokens, tokens with pattern POS tags masked as
    # '@@pos@@' placeholders, and raw strings with a 'c_' prefix on pattern words.
    plain = [Token(t[0], pos=t[1]) for t in tokens]
    masked = [Token(t[0], pos=t[1])
              if not self._pos_pattern or t[1] not in self._pattern_pos
              else Token('@@' + t[1] + '@@', pos=t[1])
              for t in tokens]
    strings = [t[0] if not self._pos_pattern or t[1] not in self._pattern_pos
               else 'c_' + t[0]
               for t in tokens]
    return plain, masked, strings
def __init__(self,
             namespace: str = 'bme_token_characters',
             character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
             start_tokens: List[str] = None,
             end_tokens: List[str] = None,
             token_min_padding_length: int = 0,
             begin_size: int = 3,
             end_size: int = 3) -> None:
    super().__init__(token_min_padding_length)
    # Compare (major, minor) as a tuple: float arithmetic like major + 0.1 * minor
    # misorders double-digit minors, and the patch component may not be an integer.
    major, minor = (int(part) for part in torch.__version__.split('.')[:2])
    if (major, minor) < (1, 1):
        raise Exception(
            "BMETokenIndexer requires pytorch version >= 1.1 because it provides "
            "torch.nn.functional.one_hot. Your version is {}".format(torch.__version__))
    self._namespace = namespace
    self._character_tokenizer = character_tokenizer
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
    self._begin_size = begin_size
    self._end_size = end_size
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[List[int]]]:
    chunk_tags = self.get_chunk_tags(tokens, vocabulary)
    # Add BOS at the start and EOS at the end.
    tokens_with_bos_eos = [Token(self.bos_token)] + tokens + [Token(self.eos_token)]
    character_indices_with_eos_bos = self.elmo_indexer.tokens_to_indices(
        tokens_with_bos_eos, vocabulary, "elmo")
    # Get string chunk tags.
    chunk_tags_str, instance_fields = self.get_input_data_structures_for_segmental_lm(
        chunk_tags)
    # Convert these into tags for the language model.
    chunk_tags_seglm_ids = self.get_tags_in_lm_vocab(chunk_tags_str)
    return_dict = {
        'character_ids': character_indices_with_eos_bos["elmo"],
        'mask': [1] * len(tokens),
        'mask_with_bos_eos': [1] * len(tokens_with_bos_eos),
        'tags': chunk_tags_seglm_ids,
    }
    return_dict.update(instance_fields)
    return return_dict
def __init__( self, namespace: str = "token_characters", character_tokenizer: CharacterTokenizer = CharacterTokenizer(), start_tokens: List[str] = None, end_tokens: List[str] = None, min_padding_length: int = 0, token_min_padding_length: int = 0, ) -> None: super().__init__(token_min_padding_length) if min_padding_length == 0: url = "https://github.com/allenai/allennlp/issues/1954" warnings.warn( "You are using the default value (0) of `min_padding_length`, " f"which can cause some subtle bugs (more info see {url}). " "Strongly recommend to set a value, usually the maximum size " "of the convolutional layer size when using CnnEncoder.", UserWarning, ) self._min_padding_length = min_padding_length self._namespace = namespace self._character_tokenizer = character_tokenizer self._start_tokens = [Token(st) for st in (start_tokens or [])] self._end_tokens = [Token(et) for et in (end_tokens or [])]
def add_special_token_to_utterance(column_tokens):
    special_list = ["year", "name"]
    special_already = [False, False]
    for col in column_tokens:
        for i, tok in enumerate(special_list):
            if tok == col.lemma_:
                special_already[i] = True
    name_start_tokens = ["what", "give", "tell", "show", "which", "find"]
    for i, col in enumerate(column_tokens):
        if col.text.isdigit():
            digit = int(col.text)
            # Treat four-digit numbers in a plausible range as years.
            if 1700 < digit < 2100 and not special_already[0]:
                column_tokens[i] = Token(text="year", lemma="year", tag="NN")
                special_already[0] = True
        # Note: the `or` must be parenthesized because `and` binds tighter.
        elif (col.text == "each" or (i == 0 and col.lemma_ in name_start_tokens)) \
                and not special_already[1]:
            column_tokens[i] = Token(text="name", lemma="name", tag="NN")
            special_already[1] = True
    return column_tokens
def _filter_and_stem(self, words: List[Token]) -> List[Token]:
    filtered_words = self._word_filter.filter_words(words)
    stemmed_words = [self._word_stemmer.stem_word(word) for word in filtered_words]
    for start_token in self._start_tokens:
        stemmed_words.insert(0, Token(start_token, 0))
    for end_token in self._end_tokens:
        stemmed_words.append(Token(end_token, -1))
    return stemmed_words
def split_words(self, sentence: str) -> List[Token]:
    """
    Splits a sentence into word tokens.  We handle four kinds of things: words with
    punctuation that should be ignored as a special case (Mr. Mrs., etc.),
    contractions/genitives (isn't, don't, Matt's), and beginning and ending
    punctuation ("antennagate", (parentheticals), and such.).

    The basic outline is to segment with jieba, then check each of these cases.
    First, we strip off beginning punctuation, then strip off ending punctuation,
    then strip off contractions.  When we strip something off the beginning of a
    word, we can add it to the list of tokens immediately.  When we strip it off
    the end, we have to save it to be added to after the word itself has been
    added.  Before stripping off any part of a token, we first check to be sure
    the token isn't in our list of special cases.
    """
    if self.pos_tags:
        cut_res = pseg.lcut(sentence=sentence, HMM=self.hmm)
        fields = [text for text, _ in cut_res]
        tags = [tag for _, tag in cut_res]
    else:
        if self.cut_for_search:
            fields = jieba.cut_for_search(sentence=sentence, HMM=self.hmm)
        else:
            fields = jieba.cut(sentence, cut_all=self.cut_all, HMM=self.hmm)
    tokens: List[Token] = []
    for idx, field in enumerate(fields):
        add_at_end: List[Token] = []
        while self._can_split(field) and field[0] in self.beginning_punctuation:
            tokens.append(Token(field[0]))
            field = field[1:]
        while self._can_split(field) and field[-1] in self.ending_punctuation:
            add_at_end.insert(0, Token(field[-1]))
            field = field[:-1]
        # There could (rarely) be several contractions in a word, but we check
        # contractions sequentially, in a random order.  If we've removed one, we
        # need to check again to be sure there aren't others.
        remove_contractions = True
        while remove_contractions:
            remove_contractions = False
            for contraction in self.contractions:
                if self._can_split(field) and field.lower().endswith(contraction):
                    add_at_end.insert(0, Token(field[-len(contraction):]))
                    field = field[:-len(contraction)]
                    remove_contractions = True
        if field:
            if self.pos_tags:
                tokens.append(Token(field, pos=tags[idx], tag=tags[idx]))
            else:
                tokens.append(Token(field))
        tokens.extend(add_at_end)
    return tokens
def __init__(self, meta_ids: Dict[str, str] = None,
             start_token="<start>", end_token="<end>"):
    if not meta_ids:
        meta_ids = {"text": "lex"}
    self._meta_ids = meta_ids
    # Build boundary tokens whose fields (the keys of meta_ids) all carry the marker.
    self._start_token = Token(**{i: start_token for i in self._meta_ids})
    self._end_token = Token(**{i: end_token for i in self._meta_ids})
def __init__(self,
             namespace: str = 'tokens',
             lowercase_tokens: bool = False,
             start_tokens: List[str] = None,
             end_tokens: List[str] = None) -> None:
    self.namespace = namespace
    self.lowercase_tokens = lowercase_tokens
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
def __init__(self,
             namespace: str = 'token_characters',
             character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
             start_tokens: List[str] = None,
             end_tokens: List[str] = None) -> None:
    self._namespace = namespace
    self._character_tokenizer = character_tokenizer
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
def test_tokenize_handles_unicode_letters(self):
    # The runs of spaces are intentional: the expected idx values depend on them.
    sentence = "HAL9000   and    Ångström"
    expected_tokens = [
        Token("HAL", 0),
        Token("9000", 3),
        Token("and", 10),
        Token("Ångström", 17),
    ]
    tokens = self.word_splitter.split_words(sentence)
    assert [t.text for t in tokens] == [t.text for t in expected_tokens]
    assert [t.idx for t in tokens] == [t.idx for t in expected_tokens]
def __init__(self,
             lowercase_tokens: bool = False,
             start_tokens: Optional[List[str]] = None,
             end_tokens: Optional[List[str]] = None,
             token_min_padding_length: int = 0) -> None:
    super().__init__(token_min_padding_length)
    self.lowercase_tokens = lowercase_tokens
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
def tokenize(self, text: str) -> List[Token]: """ Splits sentences into a set of all possible ngrams up to self._max_ngram_degree using nltk """ ngrams_iterator = everygrams(text.split(), max_len=self._max_ngram_degree) tokens = [Token(" ".join(ngram)) for ngram in ngrams_iterator] for start_token in self._start_tokens: tokens.insert(0, Token(start_token, 0)) for end_token in self._end_tokens: tokens.append(Token(end_token, -1)) return tokens
def _filter_and_stem(self, words):
    # filtered_words = self._word_filter.filter_words(words)
    # Do not filter stop words, to avoid mis-alignment with the original sequence.
    filtered_words = words
    stemmed_words = [self._word_stemmer.stem_word(word) for word in filtered_words]
    for start_token in self._start_tokens:
        stemmed_words.insert(0, Token(start_token, 0))
    for end_token in self._end_tokens:
        stemmed_words.append(Token(end_token, -1))
    return stemmed_words
def __init__(self, model_path: str) -> None:
    self.bpe = fastBPE(Args(model_path + "/bpe.codes"))
    self.vocab = Dictionary()
    self.vocab.add_from_file(f"{model_path}/dict.txt")
    self._tokenizer_lowercases = False
    # RoBERTa-style special tokens: <s> = 0, </s> = 2.
    self.sequence_pair_start_tokens = [Token(text="<s>", text_id=0, type_id=0)]
    self.sequence_pair_mid_tokens = [
        Token(text="</s>", text_id=2, type_id=0),
        Token(text="</s>", text_id=2, type_id=0),
    ]
    self.sequence_pair_end_tokens = [Token(text="</s>", text_id=2, type_id=0)]
def tokens_to_lm_instance(tokens: List[Token],
                          token_indexers: Dict[str, TokenIndexer]):
    tokens = list(tokens)  # shallow copy
    tokens.insert(0, Token(START_SYMBOL))
    tokens.append(Token(END_SYMBOL))
    input_field = TextField(tokens[:-1], token_indexers)
    output_field = TextField(tokens[1:], token_indexers)
    return Instance({
        'input_tokens': input_field,
        'output_tokens': output_field,
    })
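# A hedged usage sketch (not from the source), assuming AllenNLP's
# SingleIdTokenIndexer as the indexer: for ["the", "cat"] the input field becomes
# [@start@, the, cat] and the output field [the, cat, @end@] -- the output is the
# input shifted left by one, the standard next-token language-modeling setup.
from allennlp.data.token_indexers import SingleIdTokenIndexer

instance = tokens_to_lm_instance([Token("the"), Token("cat")],
                                 {"tokens": SingleIdTokenIndexer()})
# instance.fields["input_tokens"].tokens  -> [@start@, the, cat]
# instance.fields["output_tokens"].tokens -> [the, cat, @end@]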
def __init__(self,
             namespace: str = 'tokens',
             lowercase_tokens: bool = False,
             start_tokens: List[str] = None,
             end_tokens: List[str] = None,
             token_min_padding_length: int = 0) -> None:
    super().__init__(token_min_padding_length)
    self.namespace = namespace
    self.lowercase_tokens = lowercase_tokens
    self._start_tokens = [Token(st) for st in (start_tokens or [])]
    self._end_tokens = [Token(et) for et in (end_tokens or [])]
def text_to_instance(self,
                     source_string: str,
                     target_string: str = None,
                     v_i=None) -> Instance:  # type: ignore
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_max_tokens and len(tokenized_source) > self._source_max_tokens:
        self._source_max_exceeded += 1
        tokenized_source = tokenized_source[:self._source_max_tokens]
    if self.pseudo:
        # Prepend a pseudo-label tag token for this instance.
        tokenized_source.insert(0, Token(copy.deepcopy(self.tags[v_i])))
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(copy.deepcopy(self._start_token)))
    if self._source_add_end_token:
        tokenized_source.append(Token(copy.deepcopy(self._end_token)))
    # Bookkeeping: count long sources and histogram lengths in buckets of 20.
    self._70 += len(tokenized_source) >= 70
    l_s = len(tokenized_source) // 20 * 20
    self.s_dic[l_s] = self.s_dic.get(l_s, 0) + 1
    source_field = TextField(tokenized_source, self._source_token_indexers)
    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        if self._target_max_tokens and len(tokenized_target) > self._target_max_tokens:
            self._target_max_exceeded += 1
            tokenized_target = tokenized_target[:self._target_max_tokens]
        if self._target_add_start_token:
            tokenized_target.insert(0, Token(copy.deepcopy(self._start_token)))
        if self._target_add_end_token:
            tokenized_target.append(Token(copy.deepcopy(self._end_token)))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        l_t = len(tokenized_target) // 20 * 20
        self.t_dic[l_t] = self.t_dic.get(l_t, 0) + 1
        return Instance({
            "source_tokens": source_field,
            "target_tokens": target_field,
        })
    return Instance({"source_tokens": source_field})
def tokenize(self, text: str) -> List[Token]:
    konoha_tokens = self._tokenizer.tokenize(text)
    tokens = [
        Token(text=token.surface, lemma_=token.base_form, pos_=token.postag)
        for token in konoha_tokens
    ]
    for start_token in self._start_tokens:
        tokens.insert(0, Token(start_token, 0))
    for end_token in self._end_tokens:
        tokens.append(Token(end_token, -1))
    return tokens
def _construct_embedding_matrix(self):
    """
    For HotFlip, we need a word embedding matrix to search over. The below is
    necessary for models such as ELMo, character-level models, or for models
    that use a projection layer after their word embeddings.

    We run all of the tokens from the vocabulary through the TextFieldEmbedder,
    and save the final output embedding. We then group all of those output
    embeddings into an "embedding matrix".
    """
    # Gets all tokens in the vocab and their corresponding IDs
    all_tokens = self.vocab._token_to_index["tokens"]
    all_indices = list(self.vocab._index_to_token["tokens"].keys())
    all_inputs = {
        "tokens": torch.LongTensor(all_indices).to(self.model_device).unsqueeze(0)
    }
    for token_indexer in self.predictor._dataset_reader._token_indexers.values():
        # handle when a model uses character-level inputs, e.g., a CharCNN
        if isinstance(token_indexer, TokenCharactersIndexer):
            tokens = [Token(x) for x in all_tokens]
            max_token_length = max(len(x) for x in all_tokens)
            indexed_tokens = token_indexer.tokens_to_indices(
                tokens, self.vocab, "token_characters")
            padded_tokens = token_indexer.as_padded_tensor(
                indexed_tokens,
                {"token_characters": len(tokens)},
                {"num_token_characters": max_token_length})
            all_inputs['token_characters'] = torch.LongTensor(
                padded_tokens['token_characters']).to(self.model_device).unsqueeze(0)
        # for ELMo models
        if isinstance(token_indexer, ELMoTokenCharactersIndexer):
            elmo_tokens = []
            for token in all_tokens:
                elmo_indexed_token = token_indexer.tokens_to_indices(
                    [Token(text=token)], self.vocab, "sentence")["sentence"]
                elmo_tokens.append(elmo_indexed_token[0])
            all_inputs["elmo"] = torch.LongTensor(elmo_tokens).to(
                self.model_device).unsqueeze(0)
    # find the TextFieldEmbedder
    for module in self.predictor._model.modules():
        if isinstance(module, TextFieldEmbedder):
            embedder = module
    # Pass every vocabulary token through the embedder and stack the outputs
    # into a single embedding matrix.
    embedding_matrix = embedder(all_inputs).squeeze()
    return embedding_matrix
def tokenize(self, text: str) -> List[Token]:
    tokens = [Token(t) for t in self._phonemizer(text)]
    for start_token in self._start_tokens:
        if isinstance(start_token, int):
            token = Token(text_id=start_token, idx=0)
        else:
            token = Token(text=start_token, idx=0)
        tokens.insert(0, token)
    for end_token in self._end_tokens:
        if isinstance(end_token, int):
            token = Token(text_id=end_token, idx=0)
        else:
            token = Token(text=end_token, idx=0)
        tokens.append(token)
    return tokens
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int, int],
                     entity_2: Tuple[int, int],
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    tokens = [OpenAISplitter._standardize(token) for token in tokens]
    # Mark the two entity spans and the full sentence with special delimiters.
    tokens = (['__start__'] + tokens[entity_1[0]:entity_1[1] + 1] + ['__del1__']
              + tokens[entity_2[0]:entity_2[1] + 1] + ['__del2__']
              + tokens + ['__clf__'])
    sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
    fields['sentence'] = sentence
    # fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    # fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)
    if label:
        fields['label'] = LabelField(label)
    return Instance(fields)
def split_words(self, sentence: str) -> List[Token]:
    # We use the [^\W\d_] pattern as a trick to match unicode letters
    tokens = [
        Token(m.group(), idx=m.start())
        for m in re.finditer(r"[^\W\d_]+|\d+|\S", sentence)
    ]
    return tokens
def tokenize(self, text: str) -> List[Token]:
    # We use the [^\W\d_] pattern as a trick to match unicode letters
    tokens = [
        Token(m.group(), idx=m.start())
        for m in re.finditer(r"[^\W\d_]+|\d+|\S", text)
    ]
    return tokens
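# A self-contained sketch (not from the source) of the regex used by the two
# tokenizers above: [^\W\d_]+ matches runs of unicode letters, \d+ runs of digits,
# and \S any other non-space character, so letters, numbers, and punctuation
# separate cleanly.
import re

pieces = [m.group() for m in re.finditer(r"[^\W\d_]+|\d+|\S", "HAL9000 and Ångström!")]
# pieces == ['HAL', '9000', 'and', 'Ångström', '!']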
def tokenize(self, text: str) -> List[Token]:
    if self._nbest_size and self._alpha:
        # Subword regularization: sample one of the n-best segmentations.
        subwords = self._processor.SampleEncodeAsPieces(text, self._nbest_size,
                                                        self._alpha)
    else:
        subwords = self._processor.EncodeAsPieces(text)
    return [Token(s) for s in subwords]
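# A hedged sketch (not from the source) of the difference the branch above makes;
# "spm.model" is a hypothetical model file. With nbest_size/alpha set, SentencePiece
# samples a segmentation per call (subword regularization); otherwise the
# segmentation is deterministic.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("spm.model")  # hypothetical path
print(sp.SampleEncodeAsPieces("New York", 64, 0.1))  # may differ run to run
print(sp.EncodeAsPieces("New York"))                 # always the same pieces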
def _intra_word_tokenize(
        self, string_tokens: List[str]
) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
    tokens: List[Token] = []
    offsets: List[Optional[Tuple[int, int]]] = []
    for token_string in string_tokens:
        wordpieces = self.tokenizer.encode_plus(
            token_string,
            add_special_tokens=False,
            return_tensors=None,
            return_offsets_mapping=False,
            return_attention_mask=False,
            return_token_type_ids=False,
        )
        wp_ids = wordpieces["input_ids"]
        if len(wp_ids) > 0:
            offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1))
            tokens.extend(
                Token(text=wp_text, text_id=wp_id)
                for wp_id, wp_text in zip(
                    wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids)))
        else:
            offsets.append(None)
    return tokens, offsets
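# A hedged reading sketch (not from the source): offsets[i] gives the inclusive
# (start, end) wordpiece positions for string_tokens[i], or None if the token
# produced no wordpieces. The pieces shown are illustrative; actual splits depend
# on the underlying tokenizer.
#
# tokens, offsets = self._intra_word_tokenize(["unaffable", "day"])
# tokens  -> e.g. [un, ##aff, ##able, day]
# offsets -> [(0, 2), (3, 3)]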
def batch_split_words(self, sentences: List[str]) -> List[List[Token]]:
    # Tokenize sentences in parallel; wrap each produced piece (not the whole
    # sentence) in a Token so the result matches List[List[Token]]. A fresh
    # parser per sentence keeps the threads independent.
    with ThreadPoolExecutor() as executor:
        return list(executor.map(
            lambda s: [Token(t) for t in self._make_parser().tokenize(s)],
            sentences))
def _tokenize(self, sentence_1: str, sentence_2: str = None):
    """
    This method works on both a single sentence and a sentence pair.
    """
    # TODO(mattg): track character offsets. Might be too challenging to do it here,
    # given that ``transformers`` is dealing with the whitespace...
    encoded_tokens = self._tokenizer.encode_plus(
        text=sentence_1,
        text_pair=sentence_2,
        add_special_tokens=self._add_special_tokens,
        max_length=self._max_length,
        stride=self._stride,
        truncation_strategy=self._truncation_strategy,
        return_tensors=None,
    )
    # token_ids contains the final list of ids for both regular and special tokens
    token_ids, token_type_ids = (encoded_tokens["input_ids"],
                                 encoded_tokens["token_type_ids"])
    tokens = []
    for token_id, token_type_id in zip(token_ids, token_type_ids):
        token_str = self._tokenizer.convert_ids_to_tokens(token_id,
                                                          skip_special_tokens=False)
        tokens.append(Token(text=token_str, text_id=token_id, type_id=token_type_id))
    return tokens
def aux(tokens):
    # Pad with the default padding token up to `length`, or truncate down to it.
    if len(tokens) < length:
        tokens = tokens + [Token(DEFAULT_PADDING_TOKEN)] * (length - len(tokens))
    else:
        tokens = tokens[:length]
    return tokens
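# A self-contained sketch (not from the source) of the helper above, with `length`
# passed explicitly instead of captured from an enclosing scope, using AllenNLP's
# "@@PADDING@@" default padding token:
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import DEFAULT_PADDING_TOKEN

def pad_or_truncate(tokens, length):
    if len(tokens) < length:
        return tokens + [Token(DEFAULT_PADDING_TOKEN)] * (length - len(tokens))
    return tokens[:length]

# pad_or_truncate([Token("a")], 3) -> [a, @@PADDING@@, @@PADDING@@]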
def tokenize(self, text: str) -> List[Token]:
    """
    Does whatever processing is required to convert a string of text into a
    sequence of tokens.  This implementation is a pass-through: the entire
    input string becomes a single ``Token``.
    """
    return [Token(text)]