Beispiel #1
0
def build_nonredundant_query_split(jsons,
                                   save_loc,
                                   max_questions=None,
                                   keep_variables=False):
    """Build a query split that keeps a single question per query entry.

    For every entry in ``jsons`` whose ``"query-split"`` is not
    ``"exclude"``, one sentence is drawn at random from
    ``entry["sentences"]`` and paired with the entry's first SQL query
    (``entry["sql"][0]``). The resulting ``(question, sql)`` pairs are
    grouped by split name, a per-split count is printed, and the grouping
    is handed to ``save_datasets``.

    Args:
        jsons: Iterable of dict-like entries providing the keys
            ``"query-split"``, ``"sql"``, ``"variables"`` and
            ``"sentences"``.
        save_loc: Passed through unchanged to ``save_datasets``.
        max_questions: Not used by this function (only one sentence is
            taken per entry); kept so the signature matches
            ``build_query_split``.
        keep_variables: When true, the query and sentence text are used
            as-is; otherwise ``read_new_as_old.insert_variables`` is used
            to produce the SQL and question.

    Note:
        The sentence is picked with ``random.choice``, so the output is
        only reproducible if the caller seeds ``random`` beforehand.
    """
    import random

    datasets = {}
    for json_dict in jsons:
        split = json_dict["query-split"]
        if split == "exclude":
            continue
        examples = datasets.setdefault(split, [])
        query = json_dict["sql"][0]
        sql_vars = json_dict['variables']
        sentence = random.choice(json_dict["sentences"])
        text, variables, _ = extract_sentence_fields(sentence)
        if keep_variables:
            sql, question = query, text
        else:
            sql, question = read_new_as_old.insert_variables(
                query, sql_vars, text, variables)
        # Tokenise, preprocess and record the pair for BOTH branches, the
        # same way build_query_split does. Previously these steps lived
        # inside the ``else`` branch, so keep_variables=True silently
        # produced empty splits.
        sql = tokenise(sql)
        question = preprocess_text(question)
        examples.append((question, sql))

    # Single-argument print(...) calls behave the same with or without
    # print_function, unlike the bare ``print "..."`` statement form.
    print("Nonredundant query split:")
    for k, v in sorted(datasets.items()):
        print("\t%s: %d" % (k, len(v)))
    save_datasets(datasets, save_loc)
Beispiel #2
0
def build_query_split(jsons,
                      save_loc,
                      max_questions=None,
                      keep_variables=False):
    """Group (question, sql) pairs by query split and save them.

    Entries whose ``"query-split"`` equals ``"exclude"`` are skipped. For
    each remaining entry, the first SQL query (``entry["sql"][0]``) is
    paired with each of its sentences; if ``max_questions`` is truthy and
    smaller than the number of sentences, only the first ``max_questions``
    sentences are used.

    Args:
        jsons: Iterable of dict-like entries providing the keys
            ``"query-split"``, ``"sql"``, ``"variables"`` and
            ``"sentences"``.
        save_loc: Passed through unchanged to ``save_datasets``.
        max_questions: Optional cap on the sentences used per entry.
        keep_variables: When true, the query and sentence text are used
            as-is; otherwise ``read_new_as_old.insert_variables`` is used
            to produce the SQL and question.
    """
    by_split = {}
    for entry in jsons:
        split_name = entry["query-split"]
        if split_name == "exclude":
            continue
        # Register the split even if the entry ends up with no sentences.
        pairs = by_split.setdefault(split_name, [])

        template = entry["sql"][0]
        sql_vars = entry['variables']
        candidates = entry["sentences"]
        if max_questions and len(candidates) > max_questions:
            candidates = candidates[:max_questions]

        for sentence in candidates:
            text, variables, _ = extract_sentence_fields(sentence)
            if keep_variables:
                sql, question = template, text
            else:
                sql, question = read_new_as_old.insert_variables(
                    template, sql_vars, text, variables)
            # SQL is tokenised before the question is preprocessed.
            tokens = tokenise(sql)
            pairs.append((preprocess_text(question), tokens))

    print("Query split:")
    for split_name in sorted(by_split):
        print("\t%s: %d" % (split_name, len(by_split[split_name])))
    save_datasets(by_split, save_loc)
def convert_instance(data):
    """Print one ``text ||| sql`` line for each sentence of a query entry.

    Uses the entry's first SQL query (``data["sql"][0]``) as the template
    for every sentence. Behaviour is driven by the module-level ``args``
    object (``keep_vars``, ``keep_sql_vars``, ``tokenise_sql``,
    ``query_split``, ``to_stdout``) and output goes to one of the
    module-level handles ``out_train``, ``out_dev``, ``out_test`` or to
    ``sys.stdout``.

    Args:
        data: Dict-like entry providing ``"sql"``, ``"sentences"``,
            ``"variables"`` and (when ``args.query_split`` is set)
            ``"query-split"``. Each sentence provides ``"text"``,
            ``"variables"`` and (otherwise) ``"question-split"``.
    """
    template_sql = data["sql"][0]
    for sentence in data["sentences"]:
        text = sentence['text']
        # Restart from the untouched template so that substitutions made
        # for one sentence do not carry over into the next.
        sql = template_sql

        # Substitute variable names with their values.
        if not args.keep_vars:
            bindings = sentence['variables']
            for name in bindings:
                value = bindings[name]
                if len(value) == 0:
                    # The sentence gives no value: fall back to the example
                    # declared at entry level (the last matching
                    # declaration wins).
                    fallbacks = [
                        declared['example']
                        for declared in data['variables']
                        if declared['name'] == name
                    ]
                    if fallbacks:
                        value = fallbacks[-1]
                text = value.join(text.split(name))
                if not args.keep_sql_vars:
                    sql = value.join(sql.split(name))

        if args.tokenise_sql:
            sql = tokenise(sql)

        # Choose the destination: train unless the relevant split label
        # says dev or test; --to_stdout overrides everything.
        output_file = out_train
        if args.query_split:
            split_label = data['query-split']
        else:
            split_label = sentence['question-split']
        if split_label == 'dev':
            output_file = out_dev
        elif split_label == 'test':
            output_file = out_test
        if args.to_stdout:
            output_file = sys.stdout

        print(text, "|||", sql, file=output_file)