def process_heart_stone_dataset(
        data_file='/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'):
    """Parse every example of a HearthStone code file and dump its grammar.

    Each line of ``data_file`` holds one code example in which the character
    '§' stands for a newline.  Every example is parsed with ``parse``,
    checked by a source round-trip, and collected.  The grammar returned by
    ``get_grammar`` for all collected trees is written to ``hs.grammar.txt``
    and the trees themselves to ``hs.parse_trees.txt`` (relative to the
    current working directory), one ``repr`` per line.  Finally the average
    number of productions per example is printed.

    Args:
        data_file: Path of the input file.  Defaults to the location that
            used to be hard-coded in this function.

    Raises:
        RuntimeError: If the source regenerated from a parse tree differs
            from the source regenerated from ``ast.parse(code)``.
    """
    parse_trees = []
    rule_num = 0.  # float, so the average is a true division on Python 2 too

    # The context manager closes the input file even if an example fails.
    with open(data_file) as dataset:
        for line in dataset:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)

            # sanity check: the parse tree must regenerate the same source
            # as Python's own AST for this example
            pred_code = astor.to_source(parse_tree_to_python_ast(parse_tree))
            ref_code = astor.to_source(ast.parse(code))
            if pred_code != ref_code:
                raise RuntimeError('code mismatch!')

            rules, _ = parse_tree.get_productions(include_value_node=False)
            rule_num += len(rules)
            parse_trees.append(parse_tree)

    example_num = len(parse_trees)
    grammar = get_grammar(parse_trees)

    with open('hs.grammar.txt', 'w') as f:
        for rule in grammar:
            f.write(repr(rule) + '\n')

    with open('hs.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    # An empty input file reports 0 instead of dividing by zero.
    avg_rules = rule_num / example_num if example_num else 0.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('avg. nums of rules: %f' % avg_rules)
def canonicalize_example(query, code):
    """Canonicalize a (query, code) pair and verify the code round-trips.

    ``canonicalize_query(query)`` supplies the canonical query and a string
    map.  Every key of that map occurring in ``code`` is replaced by its
    value wrapped in single quotes, and the result is passed through
    ``lang.py.parse.canonicalize_code``.

    Args:
        query: Natural-language query, forwarded to ``canonicalize_query``.
        code: Source code paired with the query.

    Returns:
        A tuple ``(query_tokens, canonical_code, str_map)`` where
        ``query_tokens`` is the canonical query split on single spaces.

    Raises:
        AssertionError: If the source regenerated from the parse tree differs
            from the source regenerated from the first statement of
            ``ast.parse(canonical_code)``.
    """
    from lang.py.parse import (
        parse_raw,
        parse_tree_to_python_ast,
        canonicalize_code as make_it_compilable,
    )
    import ast
    import astor

    canonical_query, str_map = canonicalize_query(query)

    canonical_code = code
    # items() is available on both Python 2 and Python 3 mappings, whereas
    # iteritems() exists on Python 2 only.
    for str_literal, str_repr in str_map.items():
        canonical_code = canonical_code.replace(
            str_literal, '\'' + str_repr + '\'')
    canonical_code = make_it_compilable(canonical_code)

    # sanity check: only the first top-level statement is compared
    parse_tree = parse_raw(canonical_code)
    gold_ast_tree = ast.parse(canonical_code).body[0]
    gold_source = astor.to_source(gold_ast_tree)
    ast_tree = parse_tree_to_python_ast(parse_tree)
    source = astor.to_source(ast_tree)

    assert gold_source == source, \
        'sanity check fails: gold=[%s], actual=[%s]' % (gold_source, source)

    query_tokens = canonical_query.split(' ')
    return query_tokens, canonical_code, str_map
def extract_grammar(code_file, prefix='py'):
    """Parse each line of a code file and write out the extracted grammar.

    Every stripped line of ``code_file`` is parsed with ``parse`` and checked
    by a source round-trip.  The grammar returned by ``get_grammar`` for all
    parse trees is written to ``<prefix>.grammar.txt`` and the trees to
    ``<prefix>.parse_trees.txt``, one ``repr`` per line.  The number of
    processed lines is printed.

    Args:
        code_file: Path of a file holding one code snippet per line.
        prefix: Prefix of the two output file names.

    Returns:
        A tuple ``(grammar, parse_trees)``.

    Raises:
        AssertionError: If the source regenerated from a parse tree differs
            from the source regenerated from the first statement of
            ``ast.parse(canonicalize_code(code))``.
    """
    parse_trees = []

    # The context manager closes the input file even if a line fails.
    with open(code_file) as code_lines:
        for line in code_lines:
            code = line.strip()
            parse_tree = parse(code)
            parse_trees.append(parse_tree)

            # sanity check
            ast_tree = parse_tree_to_python_ast(parse_tree)
            ref_ast_tree = ast.parse(canonicalize_code(code)).body[0]
            source1 = astor.to_source(ast_tree)
            source2 = astor.to_source(ref_ast_tree)

            # Report both sources so a failing line can be diagnosed.
            assert source1 == source2, \
                'sanity check fails: gold=[%s], actual=[%s]' % (
                    source2, source1)

    # One tree is appended per line read, so this equals the line count.
    line_num = len(parse_trees)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('total line of code: %d' % line_num)

    grammar = get_grammar(parse_trees)

    with open(prefix + '.grammar.txt', 'w') as f:
        for rule in grammar:
            f.write(repr(rule) + '\n')

    with open(prefix + '.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    return grammar, parse_trees
def canonicalize_code(code):
    """Return ``code`` as rewritten by ``lang.py.parse.canonicalize_code``.

    The rewritten code is also parsed with ``parse_raw`` and converted back
    to source; the result must equal the source regenerated from the first
    statement of ``ast.parse`` on the same code.

    Raises:
        AssertionError: If the two regenerated sources differ.
    """
    # The imported helper is aliased because it shares this function's name.
    from lang.py.parse import parse_raw
    from lang.py.parse import parse_tree_to_python_ast
    from lang.py.parse import canonicalize_code as make_it_compilable
    import astor
    import ast

    compilable = make_it_compilable(code)

    # sanity check
    tree = parse_raw(compilable)
    expected = astor.to_source(ast.parse(compilable).body[0])
    actual = astor.to_source(parse_tree_to_python_ast(tree))
    assert expected == actual, (
        'sanity check fails: gold=[%s], actual=[%s]' % (expected, actual))

    return compilable
def canonicalize_hs_example(query, code):
    """Tokenize a query and normalize its code, checking the round-trip.

    Text enclosed in angle brackets is removed from ``query`` before it is
    tokenized with ``nltk.word_tokenize``.  In ``code`` every '§' becomes a
    newline and surrounding whitespace is stripped.

    Returns:
        A tuple ``(query_tokens, code, parse_tree)`` where ``parse_tree`` is
        the result of ``parse_raw`` on the normalized code.

    Raises:
        AssertionError: If the source regenerated from the parse tree differs
            from the source regenerated from the first statement of
            ``ast.parse(code)``.
    """
    # Non-greedy match: each <...> span is removed on its own.
    query_tokens = nltk.word_tokenize(re.sub(r'<.*?>', '', query))

    code = code.replace('§', '\n').strip()

    # sanity check
    parse_tree = parse_raw(code)
    expected = astor.to_source(ast.parse(code).body[0])
    actual = astor.to_source(parse_tree_to_python_ast(parse_tree))
    assert expected == actual, \
        'sanity check fails: gold=[%s], actual=[%s]' % (expected, actual)

    return query_tokens, code, parse_tree
def canonicalize_hs_example(query, code):
    """Prepare one (query, code) example and validate its parse tree.

    Steps:
      1. Strip ``<...>`` spans from ``query`` and tokenize the remainder
         with ``nltk.word_tokenize``.
      2. Replace '§' with a newline in ``code`` and strip the result.
      3. Parse the code with ``parse_raw`` and assert that converting the
         tree back to source matches the source regenerated from the first
         statement returned by ``ast.parse``.

    Returns:
        ``(query_tokens, code, parse_tree)`` for the normalized example.
    """
    markup_free_query = re.sub(r'<.*?>', '', query)
    tokens = nltk.word_tokenize(markup_free_query)

    source_text = code.replace('§', '\n')
    source_text = source_text.strip()

    # sanity check
    tree = parse_raw(source_text)
    first_stmt = ast.parse(source_text).body[0]
    reference = astor.to_source(first_stmt)
    rebuilt_ast = parse_tree_to_python_ast(tree)
    rebuilt = astor.to_source(rebuilt_ast)

    assert reference == rebuilt, (
        'sanity check fails: gold=[%s], actual=[%s]' % (reference, rebuilt))

    return tokens, source_text, tree
def canonicalize_example(query, code):
    """Produce the canonical form of a query and of its paired code.

    The query is canonicalized by ``canonicalize_query``, which also returns
    a string map.  Each key of the map found in ``code`` is substituted by
    the corresponding value enclosed in single quotes; the substituted code
    is then handed to ``lang.py.parse.canonicalize_code``.

    Args:
        query: Query text passed to ``canonicalize_query``.
        code: Code snippet belonging to the query.

    Returns:
        ``(query_tokens, canonical_code, str_map)``; ``query_tokens`` is the
        canonical query split on single space characters.

    Raises:
        AssertionError: If regenerating source from the parse tree of the
            canonical code does not reproduce the source regenerated from
            the first statement of ``ast.parse(canonical_code)``.
    """
    from lang.py.parse import parse_raw, parse_tree_to_python_ast
    from lang.py.parse import canonicalize_code as make_it_compilable
    import ast
    import astor

    canonical_query, str_map = canonicalize_query(query)

    # Use items() rather than the Python 2-only iteritems() so the loop runs
    # unchanged on Python 3.
    canonical_code = code
    for str_literal, str_repr in str_map.items():
        quoted = '\'' + str_repr + '\''
        canonical_code = canonical_code.replace(str_literal, quoted)

    canonical_code = make_it_compilable(canonical_code)

    # sanity check: compares against the first top-level statement only
    parse_tree = parse_raw(canonical_code)
    gold_source = astor.to_source(ast.parse(canonical_code).body[0])
    source = astor.to_source(parse_tree_to_python_ast(parse_tree))

    assert gold_source == source, (
        'sanity check fails: gold=[%s], actual=[%s]' % (gold_source, source))

    return canonical_query.split(' '), canonical_code, str_map
def process_heart_stone_dataset(data_file='/home1/zjq/try3/en-django/all.anno'):
    """Parse every example in a dataset file and dump the induced grammar.

    Each line of ``data_file`` is one code example whose newlines are encoded
    as '§'.  Every example is parsed with ``parse`` and validated by a source
    round-trip.  The grammar returned by ``get_grammar`` is written to
    ``hs.grammar.txt`` and the parse trees to ``hs.parse_trees.txt`` in the
    current working directory, one ``repr`` per line, and the average number
    of productions per example is printed.

    Args:
        data_file: Path of the input file.  Defaults to the path that used
            to be hard-coded in this function.

    Raises:
        RuntimeError: If the source regenerated from a parse tree differs
            from the source regenerated from ``ast.parse(code)``.
    """
    parse_trees = []
    rule_num = 0.  # kept as a float so the average is never floor-divided

    # Open the input in a context manager so it is closed on every exit path.
    with open(data_file) as examples:
        for line in examples:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)

            # sanity check
            pred_ast = parse_tree_to_python_ast(parse_tree)
            pred_code = astor.to_source(pred_ast)
            ref_ast = ast.parse(code)
            ref_code = astor.to_source(ref_ast)

            if pred_code != ref_code:
                raise RuntimeError('code mismatch!')

            rules, _ = parse_tree.get_productions(include_value_node=False)
            rule_num += len(rules)
            parse_trees.append(parse_tree)

    example_num = len(parse_trees)
    grammar = get_grammar(parse_trees)

    with open('hs.grammar.txt', 'w') as f:
        for rule in grammar:
            # repr() avoids binding a local that shadows the builtin `str`.
            f.write(repr(rule) + '\n')

    with open('hs.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    # Guard against an empty input file, which would divide by zero.
    average = rule_num / example_num if example_num else 0.
    print('avg. nums of rules: %f' % average)
def canonicalize_hs_example(query, code):
    """Tokenize a query, normalize its code and check every statement.

    ``<...>`` spans are removed from ``query`` before tokenizing it with
    ``nltk.word_tokenize``.  In ``code`` each '§' is turned into a newline
    and the result is stripped.  The sanity check concatenates the source
    regenerated from every top-level statement of ``ast.parse(code)`` and
    compares it with the concatenated source regenerated from every child of
    the tree returned by ``parse_raw``.

    Returns:
        A tuple ``(query_tokens, code, parse_tree)``.

    Raises:
        AssertionError: If the two concatenated sources differ.
    """
    cleaned_query = re.sub(r'<.*?>', '', query)
    query_tokens = nltk.word_tokenize(cleaned_query)

    code = code.replace('§', '\n').strip()

    # sanity check
    parse_tree = parse_raw(code)

    gold_source = ''.join(
        astor.to_source(statement) for statement in ast.parse(code).body)
    pred_source = ''.join(
        astor.to_source(parse_tree_to_python_ast(child))
        for child in parse_tree.children)

    assert gold_source == pred_source, (
        'sanity check fails: gold=[%s], actual=[%s]'
        % (gold_source, pred_source))

    return query_tokens, code, parse_tree