Example 1
    def __init__(self, db_id: str, utterance: str, tokenizer: Tokenizer,
                 tables_file: str, dataset_path: str):
        self.dataset_path = dataset_path
        self.tables_file = tables_file
        self.db_id = db_id
        self.utterance = utterance

        # A lemma is the base form of a word, for example the singular form
        # of a noun or the infinitive of a verb, as shown at the beginning
        # of a dictionary entry.
        tokenized_utterance = tokenizer.tokenize(utterance.lower())

        # For example, if utterance.lower() == 'biggest departments',
        # tokenized_utterance will hold two tokens such that
        # tokens[0].text == 'biggest'     and tokens[0].lemma_ == 'big',
        # tokens[1].text == 'departments' and tokens[1].lemma_ == 'department'.

        # The Token objects built below are not the same objects as those in
        # tokenized_utterance: they keep only the text and the lemma, so each
        # one is a simplified copy of the original token.
        self.tokenized_utterance = [
            Token(text=t.text, lemma_=t.lemma_) for t in tokenized_utterance
        ]

        # The parsed schemas are cached at class level, so the tables file is
        # only read once and shared across all SpiderDBContext instances.
        if db_id not in SpiderDBContext.schemas:
            SpiderDBContext.schemas = read_dataset_schema(self.tables_file)
        self.schema = SpiderDBContext.schemas[db_id]

        # Build the knowledge graph describing this database's schema.
        self.knowledge_graph = self.get_db_knowledge_graph(db_id)

        # Tokenize the textual name of each entity in the knowledge graph and
        # simplify the resulting tokens in the same way as the utterance.
        entity_texts = [
            self.knowledge_graph.entity_text[entity].lower()
            for entity in self.knowledge_graph.entities
        ]
        entity_tokens = tokenizer.batch_tokenize(entity_texts)
        self.entity_tokens = [
            [Token(text=t.text, lemma_=t.lemma_) for t in et]
            for et in entity_tokens
        ]
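
A minimal sketch of what the two tokenization steps above produce, assuming AllenNLP's SpacyTokenizer as the concrete Tokenizer (pos_tags=True is needed so that spaCy fills in the lemmas); the utterance and entity names below are made up for illustration:

from allennlp.data.tokenizers import SpacyTokenizer, Token

tokenizer = SpacyTokenizer(pos_tags=True)

# Utterance step: tokenize, then keep only text and lemma of each token.
tokens = tokenizer.tokenize("Biggest departments".lower())
simplified = [Token(text=t.text, lemma_=t.lemma_) for t in tokens]
print([(t.text, t.lemma_) for t in simplified])
# roughly: [('biggest', 'big'), ('departments', 'department')]

# Entity step: batch_tokenize processes a list of strings at once and
# returns one token list per entity name.
entity_texts = ["department id", "head name"]  # made-up entity names
entity_tokens = [
    [Token(text=t.text, lemma_=t.lemma_) for t in et]
    for et in tokenizer.batch_tokenize(entity_texts)
]
print([[t.lemma_ for t in et] for et in entity_tokens])
# roughly: [['department', 'id'], ['head', 'name']]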