def txt2vec(dic, text, fasttext_type=None):
    """Convert ``text`` into token ids using the dictionary ``dic``.

    Three kinds of dictionary are handled, in this order:

    * objects exposing a ``bert_tokenizer`` attribute: every occurrence of a
      key of ``get_bert_token_mapping(fasttext_type)`` in ``text`` is replaced
      by its mapped value, then the result is tokenized and converted to ids
      by ``dic.bert_tokenizer``;
    * objects whose type is exactly ``ParlAIDictionary``: the work is
      delegated to ``dic.txt2vec``;
    * anything else: ``dic.index`` is applied to each token produced by
      ``tokenize(text)``.

    Args:
        dic: dictionary object used for the lookup.
        text: string to convert.
        fasttext_type: forwarded to ``get_bert_token_mapping``; only used in
            the ``bert_tokenizer`` branch.

    Returns:
        The ids produced by the selected branch for ``text``.
    """
    if hasattr(dic, "bert_tokenizer"):
        replacements = get_bert_token_mapping(fasttext_type)
        cleaned_text = text
        # An empty mapping would compile to the empty pattern, which matches
        # at every position and then fails the lookup with KeyError(""), so
        # the substitution is skipped entirely in that case.
        if replacements:
            # Keys are escaped so they match literally. Alternatives are
            # tried in the mapping's iteration order, so when one key is a
            # prefix of another the one listed first wins.
            pattern = re.compile(
                "|".join(re.escape(key) for key in replacements)
            )
            # The matched text is always one of the literal keys, so it can
            # be looked up directly.
            cleaned_text = pattern.sub(
                lambda match: replacements[match.group(0)], text
            )
        tokenized_text = dic.bert_tokenizer.tokenize(cleaned_text)
        return dic.bert_tokenizer.convert_tokens_to_ids(tokenized_text)
    # Exact type check kept on purpose: subclasses fall through to the
    # generic ``dic.index`` path, as before.
    elif type(dic) is ParlAIDictionary:
        return dic.txt2vec(text)
    else:
        return [dic.index(token) for token in tokenize(text)]
Exemple #2
0
 def txt2vec(self, text):
     """Map each token of ``text`` to its entry in ``self.tok2ind``.

     Tokens come from ``tokenize(text)``. A token missing from
     ``self.tok2ind`` gets the entry stored for ``self.unk_token``, or
     ``None`` when that entry is missing as well.
     """
     ids = []
     for tok in tokenize(text):
         vocab = self.tok2ind
         # Fallback is looked up per token, mirroring the lookup order of
         # a nested ``get`` call.
         fallback = vocab.get(self.unk_token)
         ids.append(vocab.get(tok, fallback))
     return ids
Exemple #3
0
def get_token_tensor(sentence):
    """Return a ``torch.LongTensor`` of word indices for ``sentence``.

    The sentence is split with ``tokenize(sentence, split_sep=None)`` and
    each resulting word is looked up in ``net_dictionary["words"]``; words
    that are not present are given ``NET_UNK_IDX``.
    """
    vocab = net_dictionary["words"]
    indices = []
    for word in tokenize(sentence, split_sep=None):
        indices.append(vocab.get(word, NET_UNK_IDX))
    return torch.LongTensor(indices)