evaluate_parser.add_argument('-is_nbest', default=False, action='store_true')

# misc
parser.add_argument('-ifttt_test_split', default='data/ifff.test_data.gold.id')

# interactive operation
interactive_parser.add_argument('-mode', default='dataset')

if __name__ == '__main__':
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    np.random.seed(args.random_seed)
    init_logging(os.path.join(args.output_dir, 'parser.log'), logging.INFO)
    logging.info('command line: %s', ' '.join(sys.argv))

    logging.info('loading dataset [%s]', args.data)
    train_data, dev_data, test_data = deserialize_from_file(args.data)

    if not args.source_vocab_size:
        args.source_vocab_size = train_data.annot_vocab.size
    if not args.target_vocab_size:
        args.target_vocab_size = train_data.terminal_vocab.size
    if not args.rule_num:
        args.rule_num = len(train_data.grammar.rules)
    if not args.node_num:
        args.node_num = len(train_data.grammar.node_type_to_id)

    logging.info('current config: %s', args)
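# A minimal sketch of the lazy-default pattern above: any size flag left at a
# falsy default is backfilled from the loaded dataset, so the command line only
# needs to pin values that differ from the data. `_demo_lazy_defaults` and its
# fake vocab are hypothetical illustrations, not part of this codebase.
def _demo_lazy_defaults():
    import argparse

    demo_parser = argparse.ArgumentParser()
    demo_parser.add_argument('-source_vocab_size', default=0, type=int)
    demo_args = demo_parser.parse_args([])  # no flags given on the command line

    class _FakeVocab(object):
        size = 5000

    if not demo_args.source_vocab_size:
        demo_args.source_vocab_size = _FakeVocab.size  # derived from the dataset
    assert demo_args.source_vocab_size == 5000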
        if breakCamelStr:
            sub_tokens = re.sub(r'([a-z])([A-Z])', r'\1 #MERGE# \2', tokval).split(' ')
            tokens.extend(sub_tokens)
        else:
            tokens.append(tokval)

        if toknum == tk.STRING:
            tokens.append(quote)

    return tokens


if __name__ == '__main__':
    from nn.utils.generic_utils import init_logging
    init_logging('misc.log')

    # django_code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'
    #
    # grammar, parse_trees = extract_grammar(django_code_file)
    # id = 1888
    # parse_tree = parse_trees[id]
    # print parse_tree
    # from components import Hyp
    # hyp = Hyp(grammar)
    # rules, rule_parents = parse_tree.get_productions()
    #
    # while hyp.frontier_nt():
    #     nt = hyp.frontier_nt()
    #     if grammar.is_value_node(nt):
    #         hyp.append_token('111<eos>')
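# A quick sanity check for the camel-case splitting above (stdlib `re` only):
# the substitution drops a '#MERGE#' marker at every lower-to-upper boundary,
# so a later pass can re-merge the sub-tokens back into the original
# identifier. `_demo_camel_split` is illustrative, not part of the original file.
def _demo_camel_split():
    import re
    sub_tokens = re.sub(r'([a-z])([A-Z])', r'\1 #MERGE# \2', 'drawCard').split(' ')
    assert sub_tokens == ['draw', '#MERGE#', 'Card']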
    f_gold = open('data/ifff.test_data.gold.id', 'w')
    for url in lt_three_agree_with_gold:
        i = url2id[url]
        f_gold.write(str(i) + '\n')
    f_gold.close()

    f_gold = open('data/ifff.test_data.omit_unintelligible.id', 'w')
    for url in omit_unintelligible_examples:
        i = url2id[url]
        f_gold.write(str(i) + '\n')
    f_gold.close()

    f_gold = open('data/ifff.test_data.omit_non_english.id', 'w')
    for url in omit_non_english_examples:
        i = url2id[url]
        f_gold.write(str(i) + '\n')
    f_gold.close()

    omit_non_english_examples = [url2id[url] for url in omit_non_english_examples]
    omit_unintelligible_examples = [url2id[url] for url in omit_unintelligible_examples]
    lt_three_agree_with_gold = [url2id[url] for url in lt_three_agree_with_gold]

    return omit_non_english_examples, omit_unintelligible_examples, lt_three_agree_with_gold


if __name__ == '__main__':
    init_logging('ifttt.log')
    # parse_ifttt_dataset()
    # analyze_ifttt_dataset()
    extract_turk_data()
    # parse_data_for_seq2seq()
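# The three write-out blocks above differ only in filename and source list; a
# small helper (hypothetical, not in the original file) would remove the
# duplication, e.g.:
#   _write_id_file('data/ifff.test_data.gold.id', lt_three_agree_with_gold, url2id)
def _write_id_file(path, urls, url2id):
    f = open(path, 'w')
    for url in urls:
        f.write(str(url2id[url]) + '\n')
    f.close()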
        idx += 1

    f_annot.close()
    # the original closed f_annot twice; the parallel code file also needs
    # closing (assumption: it is the f_code handle opened alongside f_annot)
    f_code.close()

    # serialize_to_file(examples, 'django.cleaned.bin')

    print 'error num: %d' % err_num
    print 'preprocess_dataset: cleaned example num: %d' % len(examples)

    return examples


if __name__ == '__main__':
    from nn.utils.generic_utils import init_logging
    init_logging('parse.log')

    # annot_file = 'all.anno'
    # code_file = 'all.code'
    # preprocess_dataset(annot_file, code_file)
    # parse_django_dataset()
    # check_terminals()
    # print process_query(""" ALLOWED_VARIABLE_CHARS is a string 'abcdefgh"ijklm" nop"%s"qrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.'.""")
    # for i, query in enumerate(open('all.anno')):
    #     print i, process_query(query)
    # clean_dataset()
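# A hedged sketch of how the two parallel corpus files pair up (this assumes
# preprocess_dataset reads one annotation line per code line, matching the
# en-django all.anno / all.code layout; the real loop body also canonicalizes
# examples and counts errors). `_iter_parallel_corpus` is illustrative only.
def _iter_parallel_corpus(annot_file, code_file):
    for idx, (annot, code) in enumerate(zip(open(annot_file), open(code_file))):
        yield idx, annot.strip(), code.strip()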
        tokenized_target = [tk for tk in tokenized_target if tk is not None]

        while tokenized_target[-1] == '#INDENT#':
            tokenized_target = tokenized_target[:-1]

        f_source.write(' '.join(query_tokens) + '\n')
        f_target.write(' '.join(tokenized_target) + '\n')

    f_source.close()
    f_target.close()


if __name__ == '__main__':
    init_logging('sql.log')

    parser = argparse.ArgumentParser()
    parser.add_argument('-table_schema')
    parser.add_argument('-train_data')
    parser.add_argument('-train_data_ast')
    parser.add_argument('-train_data_size', type=int)
    parser.add_argument('-dev_data')
    parser.add_argument('-dev_data_ast')
    parser.add_argument('-dev_data_size', type=int)
    parser.add_argument('-test_data')
    parser.add_argument('-test_data_ast')
    # parser.add_argument('-test_data_size', type=int)
    parser.add_argument('-output_path')

    args = parser.parse_args()

    # parser.add_argument('-random_seed', default=181783, type=int)
    # parser.add_argument('-output_dir', default='.outputs')
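# Example invocation of the argument parser above (the script name and all
# paths are hypothetical placeholders, not files shipped with the repo):
#   python sql_dataset.py \
#       -table_schema data/tables.json \
#       -train_data data/train.txt -train_data_ast data/train.ast -train_data_size 8000 \
#       -dev_data data/dev.txt -dev_data_ast data/dev.ast -dev_data_size 1000 \
#       -test_data data/test.txt -test_data_ast data/test.ast \
#       -output_path data/sql.bin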
        if data_type == 'django':
            target_code = de_canonicalize_code_for_seq2seq(code, e.meta_data['raw_code'])
        else:
            target_code = code

        # tokenize code
        target_code = target_code.strip()
        tokenized_target = tokenize_code_adv(target_code,
                                             breakCamelStr=(data_type != 'django'))
        tokenized_target = [tk.replace('\n', '#NEWLINE#') for tk in tokenized_target]
        tokenized_target = [tk for tk in tokenized_target if tk is not None]

        while tokenized_target[-1] == '#INDENT#':
            tokenized_target = tokenized_target[:-1]

        f_source.write(' '.join(query_tokens) + '\n')
        f_target.write(' '.join(tokenized_target) + '\n')

    f_source.close()
    f_target.close()


if __name__ == '__main__':
    init_logging('py.log')
    # rule_vs_node_stat()
    # process_heart_stone_dataset()
    parse_hs_dataset()
    # dump_data_for_evaluation(data_file='data/django.cleaned.dataset.freq5.par_info.refact.space_only.bin')
    # dump_data_for_evaluation(data_type='hs', data_file='data/hs.freq3.pre_suf.unary_closure.bin')
    # code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'
    # py_grammar, _ = extract_grammar(code_file)
    # serialize_to_file(py_grammar, 'py_grammar.bin')
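# A minimal check of the token post-processing above (pure Python, no project
# imports): embedded newlines become '#NEWLINE#' markers, and trailing
# '#INDENT#' markers left over from tokenization are stripped so the target
# sequence does not end in dangling indentation. `_demo_target_postprocess`
# is illustrative only.
def _demo_target_postprocess():
    tokenized = ['if', 'x', ':', '\n', 'pass', '#INDENT#', '#INDENT#']
    tokenized = [tk.replace('\n', '#NEWLINE#') for tk in tokenized]
    while tokenized[-1] == '#INDENT#':
        tokenized = tokenized[:-1]
    assert tokenized == ['if', 'x', ':', '#NEWLINE#', 'pass']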
    from decoder import decode_ifttt_dataset
    decode_results = decode_ifttt_dataset(model, test_data_subset, verbose=True)
    evaluate_ifttt_results(test_data_subset, decode_results)

    return decode_results


def decode_and_evaluate_ifttt_by_split(model, test_data):
    for split in ['ifff.test_data.omit_non_english.id',
                  'ifff.test_data.omit_unintelligible.id',
                  'ifff.test_data.gold.id']:
        # bug fix: the original passed `split` as the mode argument of open();
        # joining it into the path assumes config.ifttt_test_split names the
        # directory holding the split id files (e.g. 'data/')
        raw_ids = [int(i.strip()) for i in open(os.path.join(config.ifttt_test_split, split))]
        eids = [i for i, e in enumerate(test_data.examples) if e.raw_id in raw_ids]
        test_data_subset = test_data.get_dataset_by_ids(eids, test_data.name + '.' + split)

        from decoder import decode_ifttt_dataset
        decode_results = decode_ifttt_dataset(model, test_data_subset, verbose=True)
        evaluate_ifttt_results(test_data_subset, decode_results)


if __name__ == '__main__':
    from dataset import DataEntry, DataSet, Vocab, Action
    init_logging('parser.log', logging.INFO)

    train_data, dev_data, test_data = deserialize_from_file('data/ifttt.freq3.bin')

    decoding_results = []
    for eid in range(test_data.count):
        example = test_data.examples[eid]
        decoding_results.append([(eid, example.parse_tree)])

    evaluate_ifttt_results(test_data, decoding_results, verbose=True)
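# The __main__ block above is an oracle run: each gold parse_tree is fed back
# in as its own decoding result, so evaluate_ifttt_results should report a
# score at (or near) the metric's ceiling; this is a quick check that the
# evaluator and the serialized dataset agree. A one-expression equivalent
# (same test_data interface assumed):
#   decoding_results = [[(eid, e.parse_tree)] for eid, e in enumerate(test_data.examples)]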