Example #1
File: mixin.py Project: zzozzolev/claf
    def predict(self, output_dict, arguments, helper):
        """
        Inference by raw_feature

        * Args:
            output_dict: model's output dictionary
            arguments: arguments dictionary consisting of user_input
            helper: dictionary for helping get answer

        * Returns:
            query: Generated SQL Query
            execute_result: result of executing the generated query
        """
        output_dict["table_id"] = arguments["table_id"]
        output_dict["tokenized_question"] = helper["tokenized_question"]

        prediction = self.generate_queries(output_dict)[0]
        pred_query = Query.from_dict(prediction["query"], ordered=True)

        dbengine = DBEngine(arguments["db_path"])
        try:
            pred_execute_result = dbengine.execute_query(
                prediction["table_id"], pred_query, lower=True)
        except IndexError as e:
            pred_execute_result = str(e)

        return {
            "query": str(pred_query),
            "execute_result": pred_execute_result
        }
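A rough call-site sketch for predict(): only the shapes of the three argument dictionaries come from the snippet above; the model object, its raw outputs, and all concrete values are hypothetical stand-ins (helper is built the way read_one_example() builds it in Example #6).

    # Hypothetical driver; every concrete value below is invented.
    arguments = {
        "table_id": "1-10015132-11",       # assumed WikiSQL-style table id
        "db_path": "data/wikisql/dev.db",  # assumed path to the SQLite file
    }
    helper = {"tokenized_question": ["what", "position", "does", "he", "play", "?"]}
    output_dict = model(features)          # placeholder for the model's forward pass

    result = model.predict(output_dict, arguments, helper)
    print(result["query"])           # generated SQL string
    print(result["execute_result"])  # rows returned by DBEngine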
Example #2
def evaluate(labels, predictions, db_path, ordered=True):  # pragma: no cover
    """ labels and predictions: dictionary {data_uid: sql_data, ...} """
    engine = DBEngine(db_path)

    exact_match, grades = [], []
    for data_uid in predictions:
        eg = labels[data_uid]
        ep = predictions[data_uid]

        qg = eg["sql_query"]
        gold = eg["execution_result"]

        pred = ep.get("error", None)
        qp = None
        if not ep.get("error", None):
            try:
                qp = Query.from_dict(ep["query"], ordered=ordered)
                pred = engine.execute_query(ep["table_id"], qp, lower=True)
            except Exception as e:
                pred = repr(e)

        correct = pred == gold
        match = qp == qg
        grades.append(correct)
        exact_match.append(match)

    return {
        "ex_accuracy": sum(grades) / len(grades) * 100.0,
        "lf_accuracy": sum(exact_match) / len(exact_match) * 100.0,
    }
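A hedged sketch of calling evaluate(): both dictionaries are keyed by the same data uids, and each label row is shaped like the ones built in _read() (Examples #3 and #6). The uid, db path, and query values here are invented; Query is the same class used inside evaluate().

    # Illustrative inputs only; all concrete values are invented.
    gold_query = Query.from_dict(
        {"sel": 3, "agg": 0, "conds": [[5, 0, "butler cc"]]}, ordered=True)

    labels = {
        "uid-1": {
            "sql_query": gold_query,
            "execution_result": ["guard"],  # assumed DBEngine.execute_query rows
        },
    }
    predictions = {
        "uid-1": {
            "table_id": "1-10015132-11",
            "query": {"sel": 3, "agg": 0, "conds": [[5, 0, "butler cc"]]},
        },
    }

    metrics = evaluate(labels, predictions, "data/wikisql/dev.db", ordered=True)
    print(metrics)  # {'ex_accuracy': ..., 'lf_accuracy': ...}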
Example #3
    def _read(self, file_path, data_type=None):
        file_path = self.data_handler.read(file_path, return_path=True)
        file_path = Path(file_path)

        data_dir = file_path.parent
        file_name = file_path.stem

        db_path = data_dir / f"{file_name}.db"
        table_path = data_dir / f"{file_name}.tables.jsonl"

        self.dbengine = DBEngine(db_path)

        helper = {"file_path": file_path, "db_path": db_path, "examples": {}}
        features, labels = [], []

        sql_datas, table_data = self.load_data(file_path, table_path, data_type=data_type)
        for sql_data in tqdm(sql_datas, desc=data_type):
            question = sql_data["question"]
            table_id = sql_data["table_id"]
            column_headers = table_data[table_id]["header"]

            feature_row = {"column": column_headers, "question": question}

            data_uid = str(uuid.uuid1())
            conditions_value_position = self.get_coditions_value_position(
                sql_data["question"], [x[2] for x in sql_data["sql"]["conds"]]
            )

            sql_query = Query.from_dict(sql_data["sql"], ordered=True)
            execution_result = self.dbengine.execute_query(table_id, sql_query, lower=True)

            label_row = {
                "id": data_uid,
                "table_id": table_id,
                "tokenized_question": self.word_tokenizer.tokenize(question),
                "aggregator_idx": sql_data["sql"]["agg"],
                "select_column_idx": sql_data["sql"]["sel"],
                "conditions_num": len(sql_data["sql"]["conds"]),
                "conditions_column_idx": [x[0] for x in sql_data["sql"]["conds"]],
                "conditions_operator_idx": [x[1] for x in sql_data["sql"]["conds"]],
                "conditions_value_string": [str(x[2]) for x in sql_data["sql"]["conds"]],
                "conditions_value_position": conditions_value_position,
                "sql_query": sql_query,
                "execution_result": execution_result,
            }

            features.append(feature_row)
            labels.append(label_row)

            helper["examples"][data_uid] = {
                "question": question,
                "sql_query": sql_query,
                "execution_result": execution_result,
            }

            if self.is_test and len(labels) == 10:
                break

        return make_batch(features, labels), helper
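To make the label construction concrete: each entry of sql_data["sql"]["conds"] is a [column, operator, value] triple, so for a hypothetical entry with sql = {"agg": 0, "sel": 3, "conds": [[5, 0, "Butler CC"]]} the row above works out to:

    aggregator_idx          -> 0
    select_column_idx       -> 3
    conditions_num          -> 1
    conditions_column_idx   -> [5]
    conditions_operator_idx -> [0]
    conditions_value_string -> ["Butler CC"]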
Example #4
File: mixin.py Project: zzozzolev/claf
    def print_examples(self, index, inputs, predictions):
        """
        Print evaluation examples

        * Args:
            index: data index
            inputs: mini-batch inputs
            predictions: prediction dictionary consisting of
                - key: 'id' (question id)
                - value: dictionary consisting of
                    table_id, query (agg, sel, conds)

        * Returns:
            print(Question, Answers and Predict)
        """

        data_index = inputs["labels"]["data_idx"][index].item()
        data_id = self._dataset.get_id(data_index)

        helper = self._dataset.helper
        question = helper["examples"][data_id]["question"]

        label = self._dataset.get_ground_truth(data_id)

        dbengine = DBEngine(helper["db_path"])

        prediction = predictions[data_id]
        pred_query = Query.from_dict(prediction["query"], ordered=True)
        pred_execute_result = dbengine.execute_query(prediction["table_id"],
                                                     pred_query,
                                                     lower=True)

        print("- Question:", question)
        print("- Answers:")
        print("    SQL Query: ", label["sql_query"])
        print("    Execute Results:", label["execution_result"])
        print("- Predict:")
        print("    SQL Query: ", pred_query)
        print("    Execute Results:", pred_execute_result)
        print("-" * 30)
Example #5

if __name__ == "__main__":  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument("source_file", help="source file for the prediction")
    parser.add_argument("db_file", help="source database for the prediction")
    parser.add_argument("pred_file", help="predictions by the model")
    parser.add_argument(
        "--ordered",
        action="store_true",
        help="whether the exact match should consider the order of conditions",
    )
    args = parser.parse_args()

    engine = DBEngine(args.db_file)
    exact_match = []
    with open(args.source_file) as fs, open(args.pred_file) as fp:
        grades = []
        for ls, lp in tqdm(zip(fs, fp), total=count_lines(args.source_file)):
            eg = json.loads(ls)
            ep = json.loads(lp)
            qg = Query.from_dict(eg["sql"], ordered=args.ordered)
            gold = engine.execute_query(eg["table_id"], qg, lower=True)
            pred = ep.get("error", None)
            qp = None
            if not ep.get("error", None):
                try:
                    qp = Query.from_dict(ep["query"], ordered=args.ordered)
                    pred = engine.execute_query(eg["table_id"], qp, lower=True)
                except Exception as e:
                    pred = repr(e)

            # Grade each example the same way as evaluate() above:
            # execution accuracy and exact logical-form match.
            correct = pred == gold
            match = qp == qg
            grades.append(correct)
            exact_match.append(match)

    print(json.dumps({
        "ex_accuracy": sum(grades) / len(grades) * 100.0,
        "lf_accuracy": sum(exact_match) / len(exact_match) * 100.0,
    }, indent=2))
Example #6
File: wikisql.py Project: zzozzolev/claf
class WikiSQLReader(DataReader):
    """
    WikiSQL DataReader
    (http://arxiv.org/abs/1709.00103)

    * Args:
        file_paths: .json file paths (train and dev)
        tokenizers: defined tokenizers config (char/word)
    """
    def __init__(self,
                 file_paths,
                 tokenizers,
                 context_max_length=None,
                 is_test=None):
        super(WikiSQLReader, self).__init__(file_paths, WikiSQLDataset)
        self.is_test = is_test
        self.text_columns = ["column", "question"]

        if "word" not in tokenizers:
            raise ValueError(
                "WordTokenizer is required. define English WordTokenizer")
        self.word_tokenizer = tokenizers["word"]
        self.dbengine = None

    @overrides
    def _read(self, file_path, data_type=None):
        file_path = self.data_handler.read(file_path, return_path=True)
        file_path = Path(file_path)

        data_dir = file_path.parent
        file_name = file_path.stem

        db_path = data_dir / f"{file_name}.db"
        table_path = data_dir / f"{file_name}.tables.jsonl"

        self.dbengine = DBEngine(db_path)

        helper = Helper(**{
            "file_path": file_path,
            "db_path": db_path,
        })

        features, labels = [], []

        sql_datas, table_data = self.load_data(file_path,
                                               table_path,
                                               data_type=data_type)
        for sql_data in tqdm(sql_datas, desc=data_type):
            question = sql_data["question"]
            table_id = sql_data["table_id"]
            column_headers = table_data[table_id]["header"]

            feature_row = {"column": column_headers, "question": question}

            data_uid = str(uuid.uuid1())
            conditions_value_position = self.get_coditions_value_position(
                sql_data["question"], [x[2] for x in sql_data["sql"]["conds"]])

            sql_query = Query.from_dict(sql_data["sql"], ordered=True)
            execution_result = self.dbengine.execute_query(table_id,
                                                           sql_query,
                                                           lower=True)

            label_row = {
                "id": data_uid,
                "table_id": table_id,
                "tokenized_question": self.word_tokenizer.tokenize(question),
                "aggregator_idx": sql_data["sql"]["agg"],
                "select_column_idx": sql_data["sql"]["sel"],
                "conditions_num": len(sql_data["sql"]["conds"]),
                "conditions_column_idx": [x[0] for x in sql_data["sql"]["conds"]],
                "conditions_operator_idx": [x[1] for x in sql_data["sql"]["conds"]],
                "conditions_value_string": [str(x[2]) for x in sql_data["sql"]["conds"]],
                "conditions_value_position": conditions_value_position,
                "sql_query": sql_query,
                "execution_result": execution_result,
            }

            features.append(feature_row)
            labels.append(label_row)

            helper.set_example(
                data_uid, {
                    "question": question,
                    "sql_query": sql_query,
                    "execution_result": execution_result,
                })

            if self.is_test and len(labels) == 10:
                break

        return utils.make_batch(features, labels), helper.to_dict()

    @overrides
    def read_one_example(self, inputs):
        """ inputs keys: question, column, db_path, table_id """
        question_text = inputs["question"]
        helper = {
            "tokenized_question": self.word_tokenizer.tokenize(question_text)
        }
        return inputs, helper

    def load_data(self, sql_path, table_path, data_type=None):
        sql_data = []
        table_data = {}

        logger.info(f"Loading data from {sql_path}")
        with open(sql_path) as inf:
            for line in tqdm(inf, desc=f"sql_{data_type}"):
                sql = json.loads(line.strip())
                sql_data.append(sql)

        logger.info(f"Loading data from {table_path}")
        with open(table_path) as inf:
            for line in tqdm(inf, desc=f"table_{data_type}"):
                tab = json.loads(line.strip())
                table_data[tab["id"]] = tab

        for sql in sql_data:
            assert sql["table_id"] in table_data
        return sql_data, table_data
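
    # For reference, load_data() expects one standalone JSON object per line
    # in each file. The field names below follow the public WikiSQL release
    # and match the accesses made in _read(); the concrete values are invented.
    #
    #   dev.jsonl (one question per line):
    #     {"question": "What position does the player from Butler CC play?",
    #      "table_id": "1-10015132-11",
    #      "sql": {"sel": 3, "agg": 0, "conds": [[5, 0, "Butler CC"]]}}
    #
    #   dev.tables.jsonl (one table per line):
    #     {"id": "1-10015132-11",
    #      "header": ["Player", "No.", "Nationality", "Position", "Years", "School"],
    #      "rows": [["Antonio Lang", "21", "United States", "Guard", "1999-2000", "Butler CC"]]}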

    def get_coditions_value_position(self, question, values):
        tokenized_question = self.word_tokenizer.tokenize(question.lower())
        tokenized_values = [
            self.word_tokenizer.tokenize(str(value).lower())
            for value in values
        ]

        START_TOKEN, END_TOKEN = "<BEG>", "<END>"

        token_to_index = {START_TOKEN: 0}
        for token in tokenized_question:
            token_to_index[token] = len(token_to_index)
        token_to_index[END_TOKEN] = len(token_to_index)

        position_tokens = []
        for value in tokenized_values:
            position_token = [token_to_index[START_TOKEN]]
            for token in value:
                if token in token_to_index:
                    position_token.append(token_to_index[token])
                else:
                    for i in range(len(tokenized_question)):
                        q_token = tokenized_question[i]
                        if token in q_token:
                            position_token.append(token_to_index[q_token])
            position_token.append(token_to_index[END_TOKEN])

            assert len(position_token) != 2
            position_tokens.append(position_token)

        return position_tokens
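
The position encoding maps each condition value onto indices of the question's tokens, bracketed by <BEG> and <END>. A minimal standalone sketch of the same scheme, substituting a plain whitespace split for the project's WordTokenizer:

# Standalone illustration of the position-encoding scheme above; str.split()
# stands in for the project's WordTokenizer.
def conditions_value_position(question, values):
    tokenized_question = question.lower().split()

    START_TOKEN, END_TOKEN = "<BEG>", "<END>"

    # <BEG> gets index 0, question tokens get 1..n, <END> gets n+1.
    token_to_index = {START_TOKEN: 0}
    for token in tokenized_question:
        token_to_index[token] = len(token_to_index)
    token_to_index[END_TOKEN] = len(token_to_index)

    position_tokens = []
    for value in values:
        position_token = [token_to_index[START_TOKEN]]
        for token in str(value).lower().split():
            if token in token_to_index:
                position_token.append(token_to_index[token])
            else:
                # fall back to any question token containing this one
                for q_token in tokenized_question:
                    if token in q_token:
                        position_token.append(token_to_index[q_token])
        position_token.append(token_to_index[END_TOKEN])
        position_tokens.append(position_token)
    return position_tokens


question = "What position does the player from Butler CC play"
print(conditions_value_position(question, ["Butler CC"]))
# -> [[0, 7, 8, 10]]  i.e. <BEG>, "butler" (7), "cc" (8), <END>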