def testVocab(): tokenizer=get_tokenizer(name="bert", model_path=AlphaPathLookUp.BertBaseUnCased) ids=[0,1,2,3,4,5,6,7,8,9,10] print(tokenizer.decode(ids)) tokenizer=get_tokenizer(name="xlnet", model_path=AlphaPathLookUp.XLNetBaseCased) print(tokenizer.decode(ids) )
def __init__(self, config_file):
    self.config = AlphaConfig.loadConfig(
        os.path.join(AlphaPathLookUp.ConfigPath, config_file)
    )
    self.tokenizer = get_tokenizer(name=self.config.tokenizer, model_path=self.config.model_path)
    self.textExtractor = InformationAbstrator(maxClip=100, tokenizer=None)
    self.textExtractor.initParagraphFilter(self.textExtractor.lexrankSummary)
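# AlphaConfig.loadConfig is used above only through attribute access
# (config.tokenizer, config.model_path). A minimal sketch, assuming it is a thin
# JSON loader that exposes keys as attributes; this is a guess, not the real helper.
import json
from types import SimpleNamespace

def loadConfig(path):
    # Parse the JSON config, turning every mapping into an attribute-access namespace.
    with open(path) as f:
        return json.load(f, object_hook=lambda d: SimpleNamespace(**d))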
def testBasicFunctions(): tokenizer=get_tokenizer(name="gpt2", model_path=AlphaPathLookUp.GPT2Base) #tokenizer=get_tokenizer(name="roberta", model_path=programmingalpha.RoBertaBase) #tokenizer=get_tokenizer(name="bert", model_path=AlphaPathLookUp.BertBaseUnCased) print(tokenizer.tokenizer.additional_special_tokens) print(tokenizer.tokenizer.added_tokens_encoder) #exit(10) s="I am fantastic [CODE] supreme [MATH] !" print(tokenizer.tokenize(s)) s_ids=tokenizer.tokenizeLine(s) print(s) print(s_ids) for id in s_ids.split(): print(tokenizer.decode([id]))
def init():
    global tokenizer
    name = args.tokenizer
    # Resolve the model path for the requested tokenizer name; keyword arguments
    # keep the call consistent with the other get_tokenizer call sites.
    tokenizer = get_tokenizer(name=name, model_path=path_map_tokenizers[name])
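# path_map_tokenizers is read above as a mapping from tokenizer name to model path.
# A sketch of what such a mapping could contain, reusing the paths seen elsewhere in
# this section; the exact contents are an assumption.
path_map_tokenizers = {
    "bert": AlphaPathLookUp.BertBaseUnCased,
    "gpt2": AlphaPathLookUp.GPT2Base,
    "xlnet": AlphaPathLookUp.XLNetBaseCased,
}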
def __init__(self, config_file):
    AlphaHTTPProxy.__init__(self, config_file)
    args = self.args
    self.tokenizer = get_tokenizer(model_path=programmingalpha.BertBaseUnCased, name=args.tokenizer)