def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
             tables_file: str, dataset_path: str):
    self.dataset_path = dataset_path
    self.tables_file = tables_file
    self.db_id = db_id
    self.utterance = utterance

    # Tokenize the lower-cased utterance and keep only each token's text and lemma.
    tokenized_utterance = tokenizer.tokenize(utterance.lower())
    self.tokenized_utterance = [
        Token(text=t.text, lemma_=t.lemma_) for t in tokenized_utterance
    ]

    # Schemas are cached on the class so the tables file is parsed only once.
    if db_id not in SpiderDBContext.schemas:
        SpiderDBContext.schemas = read_dataset_schema(self.tables_file)
    self.schema = SpiderDBContext.schemas[db_id]

    # Build the knowledge graph for this database and tokenize the
    # human-readable text of every entity (tables and columns).
    self.knowledge_graph = self.get_db_knowledge_graph(db_id)
    entity_texts = [
        self.knowledge_graph.entity_text[entity].lower()
        for entity in self.knowledge_graph.entities
    ]
    entity_tokens = tokenizer.batch_tokenize(entity_texts)
    self.entity_tokens = [
        [Token(text=t.text, lemma_=t.lemma_) for t in et]
        for et in entity_tokens
    ]
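
# Usage sketch (not from the original code): builds a SpiderDBContext with the
# AllenNLP 0.9-era tokenizer API this codebase appears to target; the db_id,
# utterance and file paths below are illustrative assumptions only.
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))  # pos_tags enables spaCy lemmas
context = SpiderDBContext(
    db_id="department_management",                       # a Spider database id
    utterance="How many heads of the departments are older than 56?",
    tokenizer=tokenizer,
    tables_file="data/spider/tables.json",                # assumed path
    dataset_path="data/spider/database",                  # assumed path
)
print([t.lemma_ for t in context.tokenized_utterance])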
def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
             tables_file: str, dataset_path: str):
    self.dataset_path = dataset_path
    self.tables_file = tables_file
    self.db_id = db_id
    self.utterance = utterance

    tokenized_utterance = tokenizer.tokenize(utterance.lower())
    # TODO: the keyword argument `lemma` raised an error, probably because of
    # the AllenNLP version in use; the current API names the field `lemma_`,
    # so `lemma` was replaced with `lemma_`.
    self.tokenized_utterance = [
        Token(text=t.text, lemma_=t.lemma_) for t in tokenized_utterance
    ]

    if db_id not in WikiDBContext.schemas:
        WikiDBContext.schemas = read_wiki_dataset_schema(self.tables_file)
    self.schema = WikiDBContext.schemas[db_id]

    self.knowledge_graph = self.get_db_knowledge_graph(db_id)
    # TODO: the table names in entity_texts are meaningless, opaque ids such as
    # "1-10015132-11", which tokenize into [1, -, 10015132, -, 11];
    # adding such tokens to the vocabulary is a disaster.
    entity_texts = [
        self.knowledge_graph.entity_text[entity].lower()
        for entity in self.knowledge_graph.entities
    ]
    entity_tokens = tokenizer.batch_tokenize(entity_texts)
    # TODO: same error as above, `lemma` -> `lemma_`.
    self.entity_tokens = [
        [Token(text=t.text, lemma_=t.lemma_) for t in et]
        for et in entity_tokens
    ]
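
# Possible mitigation for the TODO above (a sketch, not part of the original
# code; the helper name and placeholder word are assumptions): map opaque
# WikiSQL table ids such as "1-10015132-11" to a generic word before
# tokenization so their numeric fragments never reach the vocabulary.
import re

def normalize_wiki_entity_text(entity_text: str) -> str:
    """Replace WikiSQL-style table ids (e.g. '1-10015132-11') with a single
    placeholder word; leave every other entity text unchanged."""
    if re.fullmatch(r"\d+-\d+-\d+", entity_text):
        return "table"
    return entity_text

# e.g.  entity_texts = [normalize_wiki_entity_text(t) for t in entity_texts]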
def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
             tables_file: str, dataset_path: str):
    self.dataset_path = dataset_path
    self.tables_file = tables_file
    self.db_id = db_id
    self.utterance = utterance

    tokenized_utterance = tokenizer.tokenize(utterance.lower())
    # TODO: the keyword argument `lemma` raised an error, probably because of
    # the AllenNLP version; the official API has changed the field to `lemma_`,
    # so `lemma` was replaced with `lemma_`.
    self.tokenized_utterance = [
        Token(text=t.text, lemma_=t.lemma_) for t in tokenized_utterance
    ]

    if db_id not in SpiderDBContext.schemas:
        SpiderDBContext.schemas = read_spider_dataset_schema(self.tables_file)
    self.schema = SpiderDBContext.schemas[db_id]

    self.knowledge_graph = self.get_db_knowledge_graph(db_id)
    entity_texts = [
        self.knowledge_graph.entity_text[entity].lower()
        for entity in self.knowledge_graph.entities
    ]
    entity_tokens = tokenizer.batch_tokenize(entity_texts)
    # TODO: same error as above, `lemma` -> `lemma_`.
    self.entity_tokens = [
        [Token(text=t.text, lemma_=t.lemma_) for t in et]
        for et in entity_tokens
    ]
def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
             tables_file: str, dataset_path: str):
    self.dataset_path = dataset_path
    self.tables_file = tables_file
    self.db_id = db_id
    self.utterance = utterance

    # A lemma is the base form of a word (e.g. the singular form of a noun or
    # the infinitive form of a verb), as it appears at the start of a
    # dictionary entry.
    tokenized_utterance = tokenizer.tokenize(utterance.lower())
    # For example, if utterance.lower() is "biggest departments",
    # tokenized_utterance holds one token per word, where
    #   token.text = 'biggest',      token.lemma_ = 'big'
    #   token.text = 'departments',  token.lemma_ = 'department'
    # The Token objects built below are simplified copies of the tokenizer's
    # output: only the text and lemma fields are kept.
    # Note: newer AllenNLP versions name the keyword `lemma_` (older releases
    # accepted `lemma`).
    self.tokenized_utterance = [
        Token(text=t.text, lemma_=t.lemma_) for t in tokenized_utterance
    ]

    if db_id not in SpiderDBContext.schemas:
        SpiderDBContext.schemas = read_dataset_schema(self.tables_file)
    self.schema = SpiderDBContext.schemas[db_id]

    self.knowledge_graph = self.get_db_knowledge_graph(db_id)
    entity_texts = [
        self.knowledge_graph.entity_text[entity].lower()
        for entity in self.knowledge_graph.entities
    ]
    entity_tokens = tokenizer.batch_tokenize(entity_texts)
    self.entity_tokens = [
        [Token(text=t.text, lemma_=t.lemma_) for t in et]
        for et in entity_tokens
    ]
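
# Minimal sketch of the lemmatisation behaviour described in the comments
# above, calling spaCy directly (the library behind the default tokenizer);
# the model name is an assumption.
import spacy

nlp = spacy.load("en_core_web_sm")
for tok in nlp("biggest departments"):
    print(tok.text, "->", tok.lemma_)
# expected output, roughly:
#   biggest -> big
#   departments -> department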