def build_nonredundant_query_split(jsons, save_loc, max_questions=None, keep_variables=False):
    """Build a query-split dataset with one randomly chosen sentence per entry.

    For every entry in ``jsons`` whose ``"query-split"`` is not ``"exclude"``,
    the first SQL query (``entry["sql"][0]``) is paired with a single sentence
    picked by ``random.choice`` from ``entry["sentences"]``. The resulting
    ``(question, sql)`` tuples are grouped by split name, a per-split count is
    printed, and the groups are handed to ``save_datasets``.

    Args:
        jsons: Iterable of dicts with the keys ``"query-split"``, ``"sql"``,
            ``"variables"`` and ``"sentences"``.
        save_loc: Forwarded unchanged to ``save_datasets``.
        max_questions: Accepted but not used by this function (only one
            sentence is ever taken per entry).
        keep_variables: When true, the query and sentence text are used as-is;
            otherwise they are passed through
            ``read_new_as_old.insert_variables`` first.

    Raises:
        IndexError: From ``random.choice`` if an entry has no sentences.
    """
    # Kept as a function-local import, as in the original block.
    import random

    datasets = {}
    for json_dict in jsons:
        split = json_dict["query-split"]
        if split == "exclude":
            continue
        bucket = datasets.setdefault(split, [])

        query = json_dict["sql"][0]
        sql_vars = json_dict['variables']
        sentence = random.choice(json_dict["sentences"])
        text, variables, _ = extract_sentence_fields(sentence)
        if keep_variables:
            sql = query
            question = text
        else:
            sql, question = read_new_as_old.insert_variables(
                query, sql_vars, text, variables)
        sql = tokenise(sql)
        question = preprocess_text(question)
        bucket.append((question, sql))

    # print() calls instead of Python 2 print statements: same output, valid
    # on Python 3, and consistent with the other functions in this file.
    print("Nonredundant query split:")
    for k, v in sorted(datasets.items()):
        print("\t%s: %d" % (k, len(v)))
    save_datasets(datasets, save_loc)
def build_query_split(jsons, save_loc, max_questions=None, keep_variables=False):
    """Group (question, sql) pairs by each entry's ``"query-split"`` and save them.

    Entries whose split is ``"exclude"`` are skipped. For the remaining
    entries, the first SQL query is paired with each of the entry's sentences
    (only the first ``max_questions`` of them when that limit is set and
    smaller than the number of sentences). A per-split count is printed and
    the grouped pairs are passed to ``save_datasets``.

    Args:
        jsons: Iterable of dicts with the keys ``"query-split"``, ``"sql"``,
            ``"variables"`` and ``"sentences"``.
        save_loc: Forwarded unchanged to ``save_datasets``.
        max_questions: Optional cap on sentences used per entry; a falsy
            value means no cap.
        keep_variables: When true, the query and sentence text are used as-is;
            otherwise they go through ``read_new_as_old.insert_variables``.
    """
    datasets = {}
    for entry in jsons:
        split_name = entry["query-split"]
        if split_name == "exclude":
            continue
        # The bucket is created even if the entry ends up with no sentences.
        pairs = datasets.setdefault(split_name, [])

        template_sql = entry["sql"][0]
        sql_vars = entry['variables']
        selected = entry["sentences"]
        if max_questions and max_questions < len(selected):
            selected = selected[:max_questions]

        for sentence in selected:
            text, variables, _ = extract_sentence_fields(sentence)
            if keep_variables:
                sql, question = template_sql, text
            else:
                sql, question = read_new_as_old.insert_variables(
                    template_sql, sql_vars, text, variables)
            sql = tokenise(sql)
            question = preprocess_text(question)
            pairs.append((question, sql))

    print("Query split:")
    for name, items in sorted(datasets.items()):
        print("\t%s: %d" % (name, len(items)))
    save_datasets(datasets, save_loc)
def convert_instance(data):
    """Print one ``text ||| sql`` line per sentence of ``data``.

    Behaviour is driven by the module-level ``args`` object:

    * unless ``args.keep_vars`` is set, every variable name listed for a
      sentence is replaced in the text by its value (and in the SQL too,
      unless ``args.keep_sql_vars`` is set); an empty value falls back to the
      ``'example'`` of the same-named entry in ``data['variables']``;
    * ``args.tokenise_sql`` passes the SQL through ``tokenise``;
    * the destination is ``out_dev`` / ``out_test`` / ``out_train`` depending
      on ``data['query-split']`` (when ``args.query_split``) or on the
      sentence's ``'question-split'``, and is overridden by ``sys.stdout``
      when ``args.to_stdout`` is set.
    """
    template_sql = data["sql"][0]

    for sentence in data["sentences"]:
        text = sentence['text']
        # Restart from the untouched template so that substitutions made for
        # a previous sentence do not leak into this one.
        sql = template_sql

        # Variable replacement
        if not args.keep_vars:
            for name, value in sentence['variables'].items():
                if len(value) == 0:
                    # No value on the sentence: use the example from the
                    # entry-level variable with the same name (the last
                    # match wins, as there is no early exit).
                    for variable in data['variables']:
                        if variable['name'] == name:
                            value = variable['example']
                text = value.join(text.split(name))
                if not args.keep_sql_vars:
                    sql = value.join(sql.split(name))

        # Tokenise
        if args.tokenise_sql:
            sql = tokenise(sql)

        # Select the output file
        if args.query_split:
            split_label = data['query-split']
        else:
            split_label = sentence['question-split']

        output_file = out_train
        if split_label == 'dev':
            output_file = out_dev
        elif split_label == 'test':
            output_file = out_test

        if args.to_stdout:
            output_file = sys.stdout

        print(text, "|||", sql, file=output_file)