import re


def txt2vec(dic, text, fasttext_type=None):
    # `get_bert_token_mapping`, `ParlAIDictionary`, and `tokenize` are
    # assumed to be defined elsewhere in this module.
    if hasattr(dic, "bert_tokenizer"):
        # Replace every substring listed in the token mapping in a single
        # regex pass before handing the text to the BERT tokenizer. Keys are
        # escaped so they work both as regex alternatives and as lookup keys
        # for the matched text inside the substitution callback.
        orig_mapping = get_bert_token_mapping(fasttext_type)
        mapping = dict((re.escape(k), v) for k, v in orig_mapping.items())
        pattern = re.compile("|".join(mapping.keys()))
        cleaned_text = pattern.sub(lambda m: mapping[re.escape(m.group(0))], text)
        tokenized_text = dic.bert_tokenizer.tokenize(cleaned_text)
        return dic.bert_tokenizer.convert_tokens_to_ids(tokenized_text)
    elif type(dic) is ParlAIDictionary:
        # ParlAIDictionary already knows how to vectorize text.
        return dic.txt2vec(text)
    else:
        # Fall back to tokenizing and looking each token up in the dictionary.
        return [dic.index(token) for token in tokenize(text)]
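# Standalone sketch of the replace-many-substrings idiom above, using a toy
# mapping (hypothetical; the real one comes from get_bert_token_mapping):
#
#     toy = {"&": "and", "n't": " not"}
#     esc = {re.escape(k): v for k, v in toy.items()}
#     pat = re.compile("|".join(esc.keys()))
#     pat.sub(lambda m: esc[re.escape(m.group(0))], "cats & dogs don't bark")
#     # -> "cats and dogs do not bark"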
def txt2vec(self, text):
    # Map each token to its index, falling back to the unk token's index
    # (or None when no unk token is registered) for out-of-vocabulary words.
    return [
        self.tok2ind.get(token, self.tok2ind.get(self.unk_token, None))
        for token in tokenize(text)
    ]
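# Usage sketch (assumes `d` is a dictionary instance, e.g. a ParlAIDictionary,
# with `tok2ind` and `unk_token` populated):
#
#     d.txt2vec("hello totally_unseen_token")
#     # -> [index of "hello", index of the unk token]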
import torch


def get_token_tensor(sentence):
    # `net_dictionary` and `NET_UNK_IDX` are module-level globals;
    # `tokenize` is assumed to be defined elsewhere in this module.
    words = net_dictionary["words"]
    tokenized = tokenize(sentence, split_sep=None)
    # Unknown words map to NET_UNK_IDX.
    return torch.LongTensor([words.get(w, NET_UNK_IDX) for w in tokenized])
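# Toy demonstration (hypothetical vocabulary; the real `net_dictionary` is
# loaded elsewhere, with "words" mapping token -> index):
#
#     net_dictionary = {"words": {"hello": 0, "world": 1}}
#     get_token_tensor("hello world unseen")
#     # -> tensor([0, 1, NET_UNK_IDX])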