def parse_ifttt_dataset():
    WORD_FREQ_CUT_OFF = 2

    annot_file = '/Users/yinpengcheng/Research/SemanticParsing/ifttt/Data/lang.all.txt'
    code_file = '/Users/yinpengcheng/Research/SemanticParsing/ifttt/Data/code.all.txt'

    data = preprocess_ifttt_dataset(annot_file, code_file)

    # build the grammar
    grammar = get_grammar([e['parse_tree'] for e in data])

    annot_tokens = list(chain(*[e['query_tokens'] for e in data]))
    annot_vocab = gen_vocab(annot_tokens, vocab_size=30000, freq_cutoff=WORD_FREQ_CUT_OFF)
    logging.info('annot vocab. size: %d', annot_vocab.size)

    # we have no terminal tokens in ifttt
    all_terminal_tokens = []
    terminal_vocab = gen_vocab(all_terminal_tokens, vocab_size=4000, freq_cutoff=WORD_FREQ_CUT_OFF)

    # now generate the dataset!
    train_data = DataSet(annot_vocab, terminal_vocab, grammar, 'ifttt.train_data')
    dev_data = DataSet(annot_vocab, terminal_vocab, grammar, 'ifttt.dev_data')
    test_data = DataSet(annot_vocab, terminal_vocab, grammar, 'ifttt.test_data')

    all_examples = []

    can_fully_reconstructed_examples_num = 0
    examples_with_empty_actions_num = 0

    for entry in data:
        idx = entry['id']
        query_tokens = entry['query_tokens']
        code = entry['code']
        parse_tree = entry['parse_tree']

        # check if query tokens are valid
        query_token_ids = [annot_vocab[token] for token in query_tokens if token not in string.punctuation]
        valid_query_tokens_ids = [tid for tid in query_token_ids if tid != annot_vocab.unk]

        # remove examples with rare words from train and dev, avoid overfitting
        if len(valid_query_tokens_ids) == 0 and 0 <= idx < 77495 + 5171:
            continue

        rule_list, rule_parents = parse_tree.get_productions(include_value_node=True)

        actions = []
        can_fully_reconstructed = True
        rule_pos_map = dict()

        for rule_count, rule in enumerate(rule_list):
            if not grammar.is_value_node(rule.parent):
                assert rule.value is None
                parent_rule = rule_parents[(rule_count, rule)][0]
                if parent_rule:
                    parent_t = rule_pos_map[parent_rule]
                else:
                    parent_t = 0

                rule_pos_map[rule] = len(actions)

                d = {'rule': rule, 'parent_t': parent_t, 'parent_rule': parent_rule}
                action = Action(APPLY_RULE, d)

                actions.append(action)
            else:
                raise RuntimeError('no terminals should be in ifttt dataset!')

        if len(actions) == 0:
            examples_with_empty_actions_num += 1
            continue

        example = DataEntry(idx, query_tokens, parse_tree, code, actions,
                            {'str_map': None, 'raw_code': entry['raw_code']})

        if can_fully_reconstructed:
            can_fully_reconstructed_examples_num += 1

        # train, valid, test splits
        if 0 <= idx < 77495:
            train_data.add(example)
        elif idx < 77495 + 5171:
            dev_data.add(example)
        else:
            test_data.add(example)

        all_examples.append(example)

    # print statistics
    max_query_len = max(len(e.query) for e in all_examples)
    max_actions_len = max(len(e.actions) for e in all_examples)

    # serialize_to_file([len(e.query) for e in all_examples], 'query.len')
    # serialize_to_file([len(e.actions) for e in all_examples], 'actions.len')

    logging.info('train_data examples: %d', train_data.count)
    logging.info('dev_data examples: %d', dev_data.count)
    logging.info('test_data examples: %d', test_data.count)

    logging.info('examples that can be fully reconstructed: %d/%d=%f',
                 can_fully_reconstructed_examples_num, len(all_examples),
                 can_fully_reconstructed_examples_num / len(all_examples))
    logging.info('empty_actions_count: %d', examples_with_empty_actions_num)
    logging.info('max_query_len: %d', max_query_len)
    logging.info('max_actions_len: %d', max_actions_len)

    train_data.init_data_matrices(max_query_length=40, max_example_action_num=6)
    dev_data.init_data_matrices()
    test_data.init_data_matrices()

    serialize_to_file((train_data, dev_data, test_data),
                      'data/ifttt.freq{WORD_FREQ_CUT_OFF}.bin'.format(WORD_FREQ_CUT_OFF=WORD_FREQ_CUT_OFF))

    return train_data, dev_data, test_data
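# A minimal sketch of reloading the split serialized above. It assumes
# serialize_to_file writes a standard pickle of the (train, dev, test) tuple;
# if the project ships a matching deserialize_from_file helper, prefer that.
# The helper name and default path below are illustrative, not part of the
# original script.
def _load_ifttt_split(path='data/ifttt.freq2.bin'):
    import pickle

    with open(path, 'rb') as f:
        # expected to yield (train_data, dev_data, test_data)
        return pickle.load(f)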
def parse_train_dataset(args):
    MAX_QUERY_LENGTH = 70  # FIXME: figure out the best config!
    WORD_FREQ_CUT_OFF = 0

    # nl_file = './data/mix.nl'
    # sql_file = './data/mix-1.sql'
    # data_file = './data/train.json'
    # ast_file = './data/mix.json'

    train_data = preprocess_sql_dataset(args.train_data, args.train_data_ast)
    dev_data = preprocess_sql_dataset(args.dev_data, args.dev_data_ast)
    test_data = preprocess_sql_dataset(args.test_data, args.test_data_ast)
    data = train_data + dev_data + test_data
    print("data size: {}".format(len(data)))

    parse_trees = [e['parse_tree'] for e in data]

    # apply unary closures
    # unary_closures = get_top_unary_closures(parse_trees, k=20)
    # for parse_tree in parse_trees:
    #     apply_unary_closures(parse_tree, unary_closures)

    # build the grammar
    grammar = get_grammar(parse_trees)

    with open('sql.grammar.unary_closure.txt', 'w') as f:
        for rule in grammar:
            f.write(rule.__repr__() + '\n')

    nl_tokens = list(chain(*[e['query_tokens'] for e in data]))
    nl_vocab = gen_vocab(nl_tokens, vocab_size=5000, freq_cutoff=WORD_FREQ_CUT_OFF)

    # enumerate all terminal tokens to build up the terminal tokens vocabulary
    all_terminal_tokens = []
    for entry in data:
        parse_tree = entry['parse_tree']
        for node in parse_tree.get_leaves():
            if grammar.is_value_node(node):
                terminal_val = node.value
                terminal_str = str(terminal_val)

                terminal_tokens = get_terminal_tokens(terminal_str)

                for terminal_token in terminal_tokens:
                    assert len(terminal_token) > 0
                    all_terminal_tokens.append(terminal_token)

    # print all_terminal_tokens
    table_schema = args.table_schema
    terminal_vocab = gen_vocab(all_terminal_tokens, vocab_size=5000, freq_cutoff=WORD_FREQ_CUT_OFF)
    non_schema_vocab_size = terminal_vocab.size
    db_dict, schema_vocab = load_table_schema_data(table_schema)
    terminal_vocab = gen_schema_vocab(schema_vocab, terminal_vocab)

    db_mask = gen_db_mask(terminal_vocab, non_schema_vocab_size, table_schema)

    # print(terminal_vocab)
    # print(terminal_vocab.token_id_map.keys())

    # now generate the dataset!
    train_data = DataSet(nl_vocab, terminal_vocab, grammar, db_mask, 'sql.train_data')
    dev_data = DataSet(nl_vocab, terminal_vocab, grammar, db_mask, 'sql.dev_data')
    test_data = DataSet(nl_vocab, terminal_vocab, grammar, db_mask, 'sql.test_data')

    all_examples = []

    can_fully_reconstructed_examples_num = 0
    examples_with_empty_actions_num = 0

    # print(list(terminal_vocab.iteritems()))
    for index, entry in enumerate(data):
        idx = entry['id']
        query_tokens = entry['query_tokens']
        code = entry['code']
        parse_tree = entry['parse_tree']

        rule_list, rule_parents = parse_tree.get_productions(include_value_node=True)

        actions = []
        can_fully_reconstructed = True
        rule_pos_map = dict()

        for rule_count, rule in enumerate(rule_list):
            # if rule_count == 116:
            #     continue
            if not grammar.is_value_node(rule.parent):
                assert rule.value is None, rule.value
                parent_rule = rule_parents[(rule_count, rule)][0]
                if parent_rule:
                    parent_t = rule_pos_map[parent_rule]
                else:
                    parent_t = 0

                rule_pos_map[rule] = len(actions)

                d = {'rule': rule, 'parent_t': parent_t, 'parent_rule': parent_rule}
                action = Action(APPLY_RULE, d)

                actions.append(action)
            else:
                assert rule.is_leaf, (rule.type, rule.value, rule.label)

                parent_rule = rule_parents[(rule_count, rule)][0]
                parent_t = rule_pos_map[parent_rule]

                terminal_val = rule.value
                terminal_str = str(terminal_val)
                terminal_tokens = get_terminal_tokens(terminal_str)

                # assert len(terminal_tokens) > 0

                for terminal_token in terminal_tokens:
                    term_tok_id = terminal_vocab[terminal_token]
                    tok_src_idx = -1
                    try:
                        tok_src_idx = query_tokens.index(terminal_token)
                    except ValueError:
                        pass

                    d = {'literal': terminal_token, 'rule': rule, 'parent_rule': parent_rule, 'parent_t': parent_t}

                    # cannot copy, only generation
                    # could be unk!
                    if tok_src_idx < 0 or tok_src_idx >= MAX_QUERY_LENGTH:
                        action = Action(GEN_TOKEN, d)
                        if terminal_token not in terminal_vocab:
                            if terminal_token not in query_tokens:
                                # print terminal_token
                                can_fully_reconstructed = False
                    else:  # copy
                        if term_tok_id != terminal_vocab.unk:
                            d['source_idx'] = tok_src_idx
                            action = Action(GEN_COPY_TOKEN, d)
                        else:
                            d['source_idx'] = tok_src_idx
                            action = Action(COPY_TOKEN, d)

                    actions.append(action)

                d = {'literal': '<eos>', 'rule': rule, 'parent_rule': parent_rule, 'parent_t': parent_t}
                actions.append(Action(GEN_TOKEN, d))

        if len(actions) == 0:
            examples_with_empty_actions_num += 1
            continue

        mask = db_mask[entry['db_id']]
        example = DataEntry(idx, query_tokens, parse_tree, code, actions, mask,
                            {'str_map': None, 'raw_code': entry['raw_code']})

        if can_fully_reconstructed:
            can_fully_reconstructed_examples_num += 1

        # train, valid, test splits
        if 0 <= index < args.train_data_size:
            train_data.add(example)
        elif index < args.train_data_size + args.dev_data_size:
            dev_data.add(example)
        else:
            test_data.add(example)

        all_examples.append(example)

    # print("test data size {}".format(len(test_data)))
    # print statistics
    max_query_len = max(len(e.query) for e in all_examples)
    max_actions_len = max(len(e.actions) for e in all_examples)

    # serialize_to_file([len(e.query) for e in all_examples], 'query.len')
    # serialize_to_file([len(e.actions) for e in all_examples], 'actions.len')

    logging.info('examples that can be fully reconstructed: %d/%d=%f',
                 can_fully_reconstructed_examples_num, len(all_examples),
                 can_fully_reconstructed_examples_num / len(all_examples))
    logging.info('empty_actions_count: %d', examples_with_empty_actions_num)
    logging.info('max_query_len: %d', max_query_len)
    logging.info('max_actions_len: %d', max_actions_len)

    train_data.init_data_matrices(max_query_length=70, max_example_action_num=350)
    dev_data.init_data_matrices(max_query_length=70, max_example_action_num=350)
    test_data.init_data_matrices(max_query_length=70, max_example_action_num=350)

    # serialize_to_file((train_data, dev_data, test_data),
    #                   './data/sql.freq{WORD_FREQ_CUT_OFF}.max_action350.pre_suf.unary_closure.bin'.format(WORD_FREQ_CUT_OFF=WORD_FREQ_CUT_OFF))

    print("train data size:{}".format(train_data.count))
    print("dev data size:{}".format(dev_data.count))
    print("test data size:{}".format(test_data.count))

    serialize_to_file((train_data, dev_data, test_data), args.output_path)

    return train_data, dev_data, test_data
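# A minimal sketch of the command-line interface that parse_train_dataset(args)
# expects. The option names mirror the attributes read above (args.train_data,
# args.train_data_ast, args.table_schema, args.output_path, ...); the helper
# name, help strings, and required/default choices are assumptions for
# illustration, not part of the original script.
def _build_sql_arg_parser():
    import argparse

    parser = argparse.ArgumentParser(description='preprocess the text-to-SQL dataset')
    parser.add_argument('--train_data', required=True, help='training NL/SQL pairs')
    parser.add_argument('--train_data_ast', required=True, help='ASTs of the training SQL queries')
    parser.add_argument('--dev_data', required=True)
    parser.add_argument('--dev_data_ast', required=True)
    parser.add_argument('--test_data', required=True)
    parser.add_argument('--test_data_ast', required=True)
    parser.add_argument('--table_schema', required=True, help='table schema file for the databases')
    parser.add_argument('--train_data_size', type=int, required=True)
    parser.add_argument('--dev_data_size', type=int, required=True)
    parser.add_argument('--output_path', required=True, help='where to serialize (train, dev, test)')
    return parser

# Example usage (hypothetical invocation):
#     args = _build_sql_arg_parser().parse_args()
#     train_data, dev_data, test_data = parse_train_dataset(args)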
def parse_hs_dataset():
    MAX_QUERY_LENGTH = 70  # FIXME: figure out the best config!
    WORD_FREQ_CUT_OFF = 3

    annot_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.mod.in'
    code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'

    data = preprocess_hs_dataset(annot_file, code_file)
    parse_trees = [e['parse_tree'] for e in data]

    # apply unary closures
    unary_closures = get_top_unary_closures(parse_trees, k=20)
    for parse_tree in parse_trees:
        apply_unary_closures(parse_tree, unary_closures)

    # build the grammar
    grammar = get_grammar(parse_trees)

    with open('hs.grammar.unary_closure.txt', 'w') as f:
        for rule in grammar:
            f.write(rule.__repr__() + '\n')

    annot_tokens = list(chain(*[e['query_tokens'] for e in data]))
    annot_vocab = gen_vocab(annot_tokens, vocab_size=5000, freq_cutoff=WORD_FREQ_CUT_OFF)

    def get_terminal_tokens(_terminal_str):
        """
        get terminal tokens
        break words like MinionCards into [Minion, Cards]
        """
        tmp_terminal_tokens = [t for t in _terminal_str.split(' ') if len(t) > 0]
        _terminal_tokens = []
        for token in tmp_terminal_tokens:
            sub_tokens = re.sub(r'([a-z])([A-Z])', r'\1 \2', token).split(' ')
            _terminal_tokens.extend(sub_tokens)

            _terminal_tokens.append(' ')

        return _terminal_tokens[:-1]

    # enumerate all terminal tokens to build up the terminal tokens vocabulary
    all_terminal_tokens = []
    for entry in data:
        parse_tree = entry['parse_tree']
        for node in parse_tree.get_leaves():
            if grammar.is_value_node(node):
                terminal_val = node.value
                terminal_str = str(terminal_val)

                terminal_tokens = get_terminal_tokens(terminal_str)

                for terminal_token in terminal_tokens:
                    assert len(terminal_token) > 0
                    all_terminal_tokens.append(terminal_token)

    terminal_vocab = gen_vocab(all_terminal_tokens, vocab_size=5000, freq_cutoff=WORD_FREQ_CUT_OFF)

    # now generate the dataset!
    train_data = DataSet(annot_vocab, terminal_vocab, grammar, 'hs.train_data')
    dev_data = DataSet(annot_vocab, terminal_vocab, grammar, 'hs.dev_data')
    test_data = DataSet(annot_vocab, terminal_vocab, grammar, 'hs.test_data')

    all_examples = []

    can_fully_reconstructed_examples_num = 0
    examples_with_empty_actions_num = 0

    for entry in data:
        idx = entry['id']
        query_tokens = entry['query_tokens']
        code = entry['code']
        parse_tree = entry['parse_tree']

        rule_list, rule_parents = parse_tree.get_productions(include_value_node=True)

        actions = []
        can_fully_reconstructed = True
        rule_pos_map = dict()

        for rule_count, rule in enumerate(rule_list):
            if not grammar.is_value_node(rule.parent):
                assert rule.value is None
                parent_rule = rule_parents[(rule_count, rule)][0]
                if parent_rule:
                    parent_t = rule_pos_map[parent_rule]
                else:
                    parent_t = 0

                rule_pos_map[rule] = len(actions)

                d = {'rule': rule, 'parent_t': parent_t, 'parent_rule': parent_rule}
                action = Action(APPLY_RULE, d)

                actions.append(action)
            else:
                assert rule.is_leaf

                parent_rule = rule_parents[(rule_count, rule)][0]
                parent_t = rule_pos_map[parent_rule]

                terminal_val = rule.value
                terminal_str = str(terminal_val)
                terminal_tokens = get_terminal_tokens(terminal_str)

                # assert len(terminal_tokens) > 0

                for terminal_token in terminal_tokens:
                    term_tok_id = terminal_vocab[terminal_token]
                    tok_src_idx = -1
                    try:
                        tok_src_idx = query_tokens.index(terminal_token)
                    except ValueError:
                        pass

                    d = {'literal': terminal_token, 'rule': rule, 'parent_rule': parent_rule, 'parent_t': parent_t}

                    # cannot copy, only generation
                    # could be unk!
                    if tok_src_idx < 0 or tok_src_idx >= MAX_QUERY_LENGTH:
                        action = Action(GEN_TOKEN, d)
                        if terminal_token not in terminal_vocab:
                            if terminal_token not in query_tokens:
                                # print terminal_token
                                can_fully_reconstructed = False
                    else:  # copy
                        if term_tok_id != terminal_vocab.unk:
                            d['source_idx'] = tok_src_idx
                            action = Action(GEN_COPY_TOKEN, d)
                        else:
                            d['source_idx'] = tok_src_idx
                            action = Action(COPY_TOKEN, d)

                    actions.append(action)

                d = {'literal': '<eos>', 'rule': rule, 'parent_rule': parent_rule, 'parent_t': parent_t}
                actions.append(Action(GEN_TOKEN, d))

        if len(actions) == 0:
            examples_with_empty_actions_num += 1
            continue

        example = DataEntry(idx, query_tokens, parse_tree, code, actions,
                            {'str_map': None, 'raw_code': entry['raw_code']})

        if can_fully_reconstructed:
            can_fully_reconstructed_examples_num += 1

        # train, valid, test splits
        if 0 <= idx < 533:
            train_data.add(example)
        elif idx < 599:
            dev_data.add(example)
        else:
            test_data.add(example)

        all_examples.append(example)

    # print statistics
    max_query_len = max(len(e.query) for e in all_examples)
    max_actions_len = max(len(e.actions) for e in all_examples)

    # serialize_to_file([len(e.query) for e in all_examples], 'query.len')
    # serialize_to_file([len(e.actions) for e in all_examples], 'actions.len')

    logging.info('examples that can be fully reconstructed: %d/%d=%f',
                 can_fully_reconstructed_examples_num, len(all_examples),
                 can_fully_reconstructed_examples_num / len(all_examples))
    logging.info('empty_actions_count: %d', examples_with_empty_actions_num)
    logging.info('max_query_len: %d', max_query_len)
    logging.info('max_actions_len: %d', max_actions_len)

    train_data.init_data_matrices(max_query_length=70, max_example_action_num=350)
    dev_data.init_data_matrices(max_query_length=70, max_example_action_num=350)
    test_data.init_data_matrices(max_query_length=70, max_example_action_num=350)

    serialize_to_file((train_data, dev_data, test_data),
                      'data/hs.freq{WORD_FREQ_CUT_OFF}.max_action350.pre_suf.unary_closure.bin'.format(WORD_FREQ_CUT_OFF=WORD_FREQ_CUT_OFF))

    return train_data, dev_data, test_data
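# A small illustration of the terminal-token splitting used in parse_hs_dataset:
# the string is first split on spaces, CamelCase words are then broken at
# lower/upper boundaries, and a single ' ' token is kept between the original
# words. This standalone copy of the nested helper is for demonstration only
# and is not called by the preprocessing code above.
def _split_terminal_str(terminal_str):
    import re

    tokens = []
    for word in (t for t in terminal_str.split(' ') if len(t) > 0):
        tokens.extend(re.sub(r'([a-z])([A-Z])', r'\1 \2', word).split(' '))
        tokens.append(' ')
    # drop the trailing separator
    return tokens[:-1]

# Expected behaviour (hypothetical inputs):
#     _split_terminal_str('MinionCards')  -> ['Minion', 'Cards']
#     _split_terminal_str('Mana Wyrm')    -> ['Mana', ' ', 'Wyrm']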