Example #1
    def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
                 tables_file: str, dataset_path: str):
        self.dataset_path = dataset_path
        self.tables_file = tables_file
        self.db_id = db_id
        self.utterance = utterance

        # Tokenize the lowercased question and keep only the surface form and
        # lemma of each token.
        tokenized_utterance = tokenizer.tokenize(utterance.lower())
        self.tokenized_utterance = [
            Token(text=t.text, lemma_=t.lemma_) for t in tokenized_utterance
        ]

        # Parse the tables file once and cache the schemas on the class.
        if db_id not in SpiderDBContext.schemas:
            SpiderDBContext.schemas = read_dataset_schema(self.tables_file)
        self.schema = SpiderDBContext.schemas[db_id]

        # Build the knowledge graph of this database's tables and columns.
        self.knowledge_graph = self.get_db_knowledge_graph(db_id)

        # Tokenize the display text of every entity in the knowledge graph,
        # again keeping only text and lemma.
        entity_texts = [
            self.knowledge_graph.entity_text[entity].lower()
            for entity in self.knowledge_graph.entities
        ]
        entity_tokens = tokenizer.batch_tokenize(entity_texts)
        self.entity_tokens = [[
            Token(text=t.text, lemma_=t.lemma_) for t in et
        ] for et in entity_tokens]
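
A minimal usage sketch of the constructor above. The database id, utterance, and
file paths are hypothetical, and the tokenizer setup assumes the spaCy-backed
WordTokenizer from the AllenNLP 0.x releases these snippets appear to target.

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

# pos_tags=True so spaCy's lemmatizer has POS information to work with.
tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))

context = SpiderDBContext(
    db_id="concert_singer",                 # hypothetical Spider database id
    utterance="How many singers do we have?",
    tokenizer=tokenizer,
    tables_file="data/spider/tables.json",  # hypothetical paths
    dataset_path="data/spider/database",
)
print([(t.text, t.lemma_) for t in context.tokenized_utterance])
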
Example #2
    def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
                 tables_file: str, dataset_path: str):
        self.dataset_path = dataset_path
        self.tables_file = tables_file
        self.db_id = db_id
        self.utterance = utterance

        tokenized_utterance = tokenizer.tokenize(utterance.lower())

        # TODO: the keyword argument `lemma` raised an error, probably tied to the
        # allennlp version; changed lemma -> lemma_
        self.tokenized_utterance = [
            Token(text=t.text, lemma_=t.lemma_) for t in tokenized_utterance
        ]

        if db_id not in WikiDBContext.schemas:
            WikiDBContext.schemas = read_wiki_dataset_schema(self.tables_file)
        self.schema = WikiDBContext.schemas[db_id]

        self.knowledge_graph = self.get_db_knowledge_graph(db_id)

        # TODO: the table_name entries in entity_texts are meaningless strings like
        # "1-10015132-11", which tokenize into [1, -, 10015132, -, 11]; letting such
        # tokens into the vocabulary would be a disaster.
        entity_texts = [
            self.knowledge_graph.entity_text[entity].lower()
            for entity in self.knowledge_graph.entities
        ]
        entity_tokens = tokenizer.batch_tokenize(entity_texts)

        # TODO: same fix as above, lemma -> lemma_
        self.entity_tokens = [[
            Token(text=t.text, lemma_=t.lemma_) for t in et
        ] for et in entity_tokens]
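
The TODO above points at a real problem: WikiSQL-style table names such as
"1-10015132-11" carry no lexical content, and tokenizing them floods the
vocabulary with digit fragments. One possible mitigation, sketched below and not
part of the original code, is to collapse such ids into a placeholder before
tokenization; the regex and the placeholder string are assumptions.

import re

# Matches WikiSQL-style table ids like "1-10015132-11".
_TABLE_ID_RE = re.compile(r"^\d+-\d+-\d+$")

def normalize_entity_text(text: str) -> str:
    # Replace a bare table id with a generic placeholder so its digit fragments
    # never reach the vocabulary; leave every other entity text untouched.
    return "table" if _TABLE_ID_RE.match(text) else text

print([normalize_entity_text(t) for t in ["1-10015132-11", "player name"]])
# ['table', 'player name']
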
Example #3
    def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
                 tables_file: str, dataset_path: str):
        self.dataset_path = dataset_path
        self.tables_file = tables_file
        self.db_id = db_id
        self.utterance = utterance

        tokenized_utterance = tokenizer.tokenize(utterance.lower())
        # TODO: the keyword argument `lemma` raised an error, probably tied to the
        # allennlp version; the official API now uses lemma_, so lemma was replaced
        # with lemma_
        self.tokenized_utterance = [
            Token(text=t.text, lemma_=t.lemma_) for t in tokenized_utterance
        ]

        if db_id not in SpiderDBContext.schemas:
            SpiderDBContext.schemas = read_spider_dataset_schema(
                self.tables_file)
        self.schema = SpiderDBContext.schemas[db_id]

        self.knowledge_graph = self.get_db_knowledge_graph(db_id)

        entity_texts = [
            self.knowledge_graph.entity_text[entity].lower()
            for entity in self.knowledge_graph.entities
        ]
        entity_tokens = tokenizer.batch_tokenize(entity_texts)
        # TODO: same error as above, lemma -> lemma_
        self.entity_tokens = [[
            Token(text=t.text, lemma_=t.lemma_) for t in et
        ] for et in entity_tokens]
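
The lemma/lemma_ mismatch noted in the TODOs comes from a keyword rename between
AllenNLP releases. A small compatibility shim, assuming the only difference
between the two versions is the name of that keyword, could look like this:

from allennlp.data.tokenizers import Token

def make_token(text: str, lemma: str) -> Token:
    try:
        return Token(text=text, lemma_=lemma)  # newer allennlp releases
    except TypeError:
        return Token(text=text, lemma=lemma)   # older releases still call it `lemma`

With a helper like this, the list comprehensions above would not need editing
when switching AllenNLP versions.
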
Example #4
    def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
                 tables_file: str, dataset_path: str):
        self.dataset_path = dataset_path
        self.tables_file = tables_file
        self.db_id = db_id
        self.utterance = utterance

        # A lemma is the base form of a word (the singular form of a noun or the
        # infinitive of a verb), as listed at the start of a dictionary entry.
        tokenized_utterance = tokenizer.tokenize(utterance.lower())

        # For example, if utterance.lower() == 'biggest departments', the tokenizer
        # produces two tokens: one with .text == 'biggest' and .lemma_ == 'big',
        # and one with .text == 'departments' and .lemma_ == 'department'.

        # The Token objects built below are not the same as the tokenizer's output:
        # only the text and the lemma are kept, so each Token is a simplified copy
        # of the corresponding token in tokenized_utterance.
        self.tokenized_utterance = [
            Token(text=t.text, lemma_=t.lemma_) for t in tokenized_utterance
        ]

        if db_id not in SpiderDBContext.schemas:
            SpiderDBContext.schemas = read_dataset_schema(self.tables_file)
        self.schema = SpiderDBContext.schemas[db_id]

        self.knowledge_graph = self.get_db_knowledge_graph(db_id)

        entity_texts = [
            self.knowledge_graph.entity_text[entity].lower()
            for entity in self.knowledge_graph.entities
        ]
        entity_tokens = tokenizer.batch_tokenize(entity_texts)
        self.entity_tokens = [[Token(text=t.text, lemma_=t.lemma_) for t in et]
                              for et in entity_tokens]
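
The lemmatization behaviour described in the comments can be checked with spaCy
directly. A short sketch, assuming the en_core_web_sm model (the kind of pipeline
AllenNLP's word tokenizer wraps) is installed:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("biggest departments")
print([(t.text, t.lemma_) for t in doc])
# expected: [('biggest', 'big'), ('departments', 'department')]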