Example #1
def build_nonredundant_query_split(jsons,
                                   save_loc,
                                   max_questions=None,
                                   keep_variables=False):
    import random
    datasets = {}
    for json_dict in jsons:
        split = json_dict["query-split"]
        if split == "exclude": continue
        if split not in datasets:
            datasets[split] = []
        query = json_dict["sql"][0]
        sql_vars = json_dict['variables']
        # One randomly chosen sentence per query keeps the split nonredundant.
        sentence = random.choice(json_dict["sentences"])
        text, variables, _ = extract_sentence_fields(sentence)
        if keep_variables:
            # Keep the variable placeholders in both the SQL and the question.
            sql = query
            question = text
        else:
            # Substitute concrete values for the variables in both strings.
            sql, question = read_new_as_old.insert_variables(
                query, sql_vars, text, variables)
        sql = tokenise(sql)
        question = preprocess_text(question)
        datasets[split].append((question, sql))
    print "Nonredundant query split:"
    for k, v in sorted(datasets.items()):
        print "\t%s: %d" % (k, len(v))
    save_datasets(datasets, save_loc)
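A minimal usage sketch, assuming the corpus lives in a single JSON file holding a list of entries with the fields read above, and that the helpers the function relies on (extract_sentence_fields, read_new_as_old.insert_variables, tokenise, preprocess_text, save_datasets) are importable from the surrounding project; the file paths are illustrative placeholders only.

import json

# Load the list of entries and write one (question, sql) pair per SQL query;
# "data/corpus.json" and "splits/nonredundant" are hypothetical paths.
with open("data/corpus.json") as f:
    corpus = json.load(f)

build_nonredundant_query_split(corpus, "splits/nonredundant", keep_variables=False)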
Example #2
def build_query_split(jsons,
                      save_loc,
                      max_questions=None,
                      keep_variables=False):
    datasets = {}
    for json_dict in jsons:
        split = json_dict["query-split"]
        if split == "exclude": continue
        if split not in datasets:
            datasets[split] = []
        # Only the first SQL query of each entry is used.
        query = json_dict["sql"][0]
        sql_vars = json_dict['variables']
        sentences = json_dict["sentences"]
        # Optionally cap the number of paraphrases taken per query.
        if max_questions and max_questions < len(sentences):
            sentences = sentences[:max_questions]
        for sentence in sentences:
            text, variables, _ = extract_sentence_fields(sentence)
            if keep_variables:
                # Keep the variable placeholders in both the SQL and the question.
                sql = query
                question = text
            else:
                # Substitute concrete values for the variables in both strings.
                sql, question = read_new_as_old.insert_variables(
                    query, sql_vars, text, variables)
            sql = tokenise(sql)
            question = preprocess_text(question)
            datasets[split].append((question, sql))
    print("Query split:")
    for k, v in sorted(datasets.items()):
        print("\t%s: %d" % (k, len(v)))
    save_datasets(datasets, save_loc)
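Both functions expect each element of jsons to be a dict exposing the fields read above ("query-split", "sql", "variables", "sentences"). The sketch below is assembled only from those accesses; the concrete values and the nested key layout (which depend on extract_sentence_fields and read_new_as_old.insert_variables, neither shown here) are assumptions rather than a documented schema.

# Hypothetical shape of one entry, inferred from the field accesses above;
# values and nested key names are illustrative only.
example_entry = {
    "query-split": "train",  # entries whose split is "exclude" are skipped
    "sql": ["SELECT name FROM city WHERE population > var0"],
    "variables": [{"name": "var0", "example": "1000000"}],  # assumed layout
    "sentences": [
        {"text": "which cities have more than var0 people ?",
         "variables": {"var0": "1000000"}},  # assumed layout
    ],
}

# save_loc below is a placeholder path.
build_query_split([example_entry], "splits/query", max_questions=5)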