Exemple #1
0
def process_heart_stone_dataset():
    """Parse every example of the hard-coded data file and dump its grammar.

    Each line of the data file holds one code snippet in which newlines are
    encoded as '§'. Every snippet is parsed into a parse tree, which is then
    converted back to a Python AST and compared (via ``astor.to_source``)
    against the AST produced by ``ast.parse`` as a sanity check.

    Side effects:
        * writes one ``repr`` per grammar rule to ``hs.grammar.txt``;
        * writes one ``repr`` per parse tree to ``hs.parse_trees.txt``;
        * prints the average number of productions per example.

    Raises:
        RuntimeError: if the source regenerated from a parse tree differs
            from the source regenerated from the reference AST.
    """
    data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'
    parse_trees = []
    rule_num = 0.  # float so the final average is a true division on Python 2
    example_num = 0

    # The context manager closes the data file even when a sanity check fails.
    with open(data_file) as data:
        for line in data:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)

            # sanity check: the parse tree must round-trip to the same source
            pred_ast = parse_tree_to_python_ast(parse_tree)
            pred_code = astor.to_source(pred_ast)
            ref_ast = ast.parse(code)
            ref_code = astor.to_source(ref_ast)

            if pred_code != ref_code:
                raise RuntimeError('code mismatch!')

            rules, _ = parse_tree.get_productions(include_value_node=False)
            rule_num += len(rules)
            example_num += 1

            parse_trees.append(parse_tree)

    grammar = get_grammar(parse_trees)

    with open('hs.grammar.txt', 'w') as f:
        for rule in grammar:
            f.write(repr(rule) + '\n')

    with open('hs.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    # NOTE: an empty data file leaves example_num at 0, so this division
    # raises ZeroDivisionError.
    print('avg. nums of rules: %f' % (rule_num / example_num))
Exemple #2
0
 def is_well_formed_python_code(_hyp):
     """Return True if the de-tokenized hypothesis is accepted by ``parse``.

     The placeholder tokens are expanded first: '#NEWLINE#' becomes a line
     break, '#INDENT#' becomes four spaces and ' #MERGE# ' is removed.

     Any ``Exception`` raised while expanding or parsing makes the result
     False. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not
     swallowed, so the caller can still be interrupted.
     """
     try:
         _hyp = _hyp.replace('#NEWLINE#', '\n').replace('#INDENT#', '    ').replace(' #MERGE# ', '')
         parse(_hyp)
     except Exception:
         return False
     return True
Exemple #3
0
def extract_grammar(code_file, prefix='py'):
    """Parse every line of ``code_file`` and extract the grammar it uses.

    Each (stripped) line is treated as one code snippet. It is parsed into a
    parse tree, and as a sanity check the tree is converted back to a Python
    AST whose generated source must equal the source generated from the first
    statement of ``ast.parse(canonicalize_code(code))``.

    Args:
        code_file: path of the text file holding one code snippet per line.
        prefix: file-name prefix for the two output files,
            ``<prefix>.grammar.txt`` and ``<prefix>.parse_trees.txt``.

    Returns:
        A ``(grammar, parse_trees)`` tuple: the result of ``get_grammar`` and
        the list of parse trees in file order.

    Raises:
        AssertionError: if a parse tree does not round-trip to the reference
            source.
    """
    line_num = 0
    parse_trees = []

    # The context manager closes the input file even if the check below fails.
    with open(code_file) as code_lines:
        for line in code_lines:
            code = line.strip()
            parse_tree = parse(code)
            parse_trees.append(parse_tree)

            # sanity check: parse tree -> AST must match the reference AST
            ast_tree = parse_tree_to_python_ast(parse_tree)
            ref_ast_tree = ast.parse(canonicalize_code(code)).body[0]
            source1 = astor.to_source(ast_tree)
            source2 = astor.to_source(ref_ast_tree)

            assert source1 == source2

            line_num += 1

    print('total line of code: %d' % line_num)

    grammar = get_grammar(parse_trees)

    with open(prefix + '.grammar.txt', 'w') as f:
        for rule in grammar:
            f.write(repr(rule) + '\n')

    with open(prefix + '.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    return grammar, parse_trees
Exemple #4
0
def extract_grammar(code_file, prefix='py'):
    """Build the grammar used by the code snippets stored in ``code_file``.

    Every stripped line of the file is parsed into a parse tree. Each tree is
    verified by turning it back into a Python AST and comparing the source
    that ``astor`` generates for it with the source generated for the first
    statement of ``ast.parse(canonicalize_code(code))``.

    Args:
        code_file: path of the text file with one code snippet per line.
        prefix: prefix of the output files ``<prefix>.grammar.txt`` and
            ``<prefix>.parse_trees.txt``.

    Returns:
        ``(grammar, parse_trees)`` where ``grammar`` comes from
        ``get_grammar`` and ``parse_trees`` is in file order.

    Raises:
        AssertionError: if a parse tree fails the round-trip check.
    """
    line_num = 0
    parse_trees = []

    # Use a context manager so the input handle is released deterministically.
    with open(code_file) as code_lines:
        for line in code_lines:
            code = line.strip()
            parse_tree = parse(code)
            parse_trees.append(parse_tree)

            # sanity check: both routes must yield identical source text
            ast_tree = parse_tree_to_python_ast(parse_tree)
            ref_ast_tree = ast.parse(canonicalize_code(code)).body[0]
            source1 = astor.to_source(ast_tree)
            source2 = astor.to_source(ref_ast_tree)

            assert source1 == source2

            line_num += 1

    print('total line of code: %d' % line_num)

    grammar = get_grammar(parse_trees)

    with open(prefix + '.grammar.txt', 'w') as f:
        for rule in grammar:
            f.write(repr(rule) + '\n')

    with open(prefix + '.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    return grammar, parse_trees
Exemple #5
0
def rule_vs_node_stat():
    """Print the average number of nodes and productions per parse tree.

    Reads the hard-coded data file, where each line is one code snippet with
    newlines encoded as '§', parses every snippet and accumulates the number
    of tree nodes and the number of productions.
    """
    # alternative dataset:
    # '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'
    code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'
    line_num = 0
    node_nums = rule_nums = 0.  # floats keep the averages as true divisions

    # Trees are only needed for counting, so they are not kept in memory.
    with open(code_file) as code_lines:
        for line in code_lines:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)
            node_nums += len(list(parse_tree.nodes))
            rules, _ = parse_tree.get_productions()
            rule_nums += len(rules)

            line_num += 1

    # NOTE: an empty data file leaves line_num at 0 and these divisions raise
    # ZeroDivisionError.
    print('avg. nums of nodes: %f' % (node_nums / line_num))
    print('avg. nums of rules: %f' % (rule_nums / line_num))
Exemple #6
0
def rule_vs_node_stat():
    """Print the average node count and production count per parse tree.

    Every line of the hard-coded code file is one snippet in which '§'
    stands for a newline. Each snippet is parsed, and the node and production
    totals are averaged over the number of lines read.
    """
    code_file = '/home1/zjq/try3/en-django/all.code'
    line_num = 0
    node_nums = rule_nums = 0.

    # The trees are discarded after counting; nothing else reads them.
    with open(code_file) as code_lines:
        for line in code_lines:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)
            node_nums += len(list(parse_tree.nodes))
            rules, _ = parse_tree.get_productions()
            rule_nums += len(rules)

            line_num += 1

    # NOTE: with an empty code file line_num stays 0 and the divisions below
    # raise ZeroDivisionError.
    print('avg. nums of nodes: %f' % (node_nums / line_num))
    print('avg. nums of rules: %f' % (rule_nums / line_num))
Exemple #7
0
def rule_vs_node_stat():
    """Report average parse-tree size for the hard-coded dataset.

    Each line of the data file is a code snippet whose newlines are written
    as '§'. For every snippet the parse tree's nodes and productions are
    counted; the two averages over all lines are printed.
    """
    # alternative dataset:
    # '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'
    code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'
    line_num = 0
    node_nums = rule_nums = 0.

    # Only the counts are needed, so parse trees are not accumulated.
    with open(code_file) as code_lines:
        for line in code_lines:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)
            node_nums += len(list(parse_tree.nodes))
            rules, _ = parse_tree.get_productions()
            rule_nums += len(rules)

            line_num += 1

    # NOTE: an empty data file makes both divisions raise ZeroDivisionError.
    print('avg. nums of nodes: %f' % (node_nums / line_num))
    print('avg. nums of rules: %f' % (rule_nums / line_num))
Exemple #8
0
def get_unary_links():
    """Rank unary links in the dataset and report tree sizes after closure.

    Steps:
        1. Parse every snippet of the hard-coded data file ('§' encodes a
           newline) and count the unary links returned by
           ``extract_unary_closure`` for each tree.
        2. Print every link with its count, most frequent first.
        3. Turn the 20 most frequent links into closures and apply all of
           them to every parse tree, then call ``compressed_ast_to_normal``
           and assert that the tree equals its pre-closure copy.
        4. Print the average number of nodes and productions per tree.

    Raises:
        AssertionError: if a tree differs from its original copy after
            ``compressed_ast_to_normal``.
    """
    # data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'
    data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'
    parse_trees = []
    unary_links_counter = Counter()

    with open(data_file) as data:
        for line in data:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)
            parse_trees.append(parse_tree)

            example_unary_links = extract_unary_closure(parse_tree)
            for link in example_unary_links:
                unary_links_counter[link] += 1

    ranked_links = sorted(unary_links_counter,
                          key=unary_links_counter.get,
                          reverse=True)
    for link in ranked_links:
        print(str(link) + ' ||| ' + str(unary_links_counter[link]))

    unary_links = ranked_links[:20]
    closures = [unary_link_to_closure(link) for link in unary_links]

    # Materialize the pairs: on Python 3 ``zip`` is a one-shot iterator, so
    # re-iterating it for every tree would apply closures to the first tree
    # only.
    unary_closures = list(zip(unary_links, closures))

    node_nums = rule_nums = 0.
    for parse_tree in parse_trees:
        original_parse_tree = parse_tree.copy()
        for link, closure in unary_closures:
            apply_unary_closure(parse_tree, closure, link)

        # assert original_parse_tree != parse_tree
        compressed_ast_to_normal(parse_tree)
        assert original_parse_tree == parse_tree

        # NOTE(review): the counts are taken after compressed_ast_to_normal
        # and the equality assert - confirm this is the intended measurement
        # point for the "after applying unary closures" statistics.
        rules, _ = parse_tree.get_productions()
        rule_nums += len(rules)
        node_nums += len(list(parse_tree.nodes))

    print('**** after applying unary closures ****')
    print('avg. nums of nodes: %f' % (node_nums / len(parse_trees)))
    print('avg. nums of rules: %f' % (rule_nums / len(parse_trees)))
Exemple #9
0
def get_unary_links():
    """Count unary links, print them ranked, and measure trees after closure.

    Every line of the hard-coded data file is a code snippet in which '§'
    encodes a newline. The unary links returned by ``extract_unary_closure``
    are counted over all parse trees and printed in descending order of
    frequency. The 20 most frequent links are converted to closures and
    applied to each tree; the tree is then passed through
    ``compressed_ast_to_normal`` and asserted equal to its pre-closure copy.
    Finally the average node and production counts are printed.

    Raises:
        AssertionError: if a tree is not equal to its original copy after
            ``compressed_ast_to_normal``.
    """
    # data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'
    data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'
    parse_trees = []
    unary_links_counter = Counter()

    with open(data_file) as data:
        for line in data:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)
            parse_trees.append(parse_tree)

            example_unary_links = extract_unary_closure(parse_tree)
            for link in example_unary_links:
                unary_links_counter[link] += 1

    ranked_links = sorted(unary_links_counter, key=unary_links_counter.get, reverse=True)
    for link in ranked_links:
        print(str(link) + ' ||| ' + str(unary_links_counter[link]))

    unary_links = ranked_links[:20]
    closures = [unary_link_to_closure(link) for link in unary_links]

    # Keep the pairs in a list: they are iterated once per parse tree, which
    # would silently yield nothing after the first tree if ``zip`` returned a
    # one-shot iterator (as it does on Python 3).
    unary_closures = list(zip(unary_links, closures))

    node_nums = rule_nums = 0.
    for parse_tree in parse_trees:
        original_parse_tree = parse_tree.copy()
        for link, closure in unary_closures:
            apply_unary_closure(parse_tree, closure, link)

        # assert original_parse_tree != parse_tree
        compressed_ast_to_normal(parse_tree)
        assert original_parse_tree == parse_tree

        # NOTE(review): statistics are gathered after the tree has been
        # passed through compressed_ast_to_normal - confirm that is intended.
        rules, _ = parse_tree.get_productions()
        rule_nums += len(rules)
        node_nums += len(list(parse_tree.nodes))

    print('**** after applying unary closures ****')
    print('avg. nums of nodes: %f' % (node_nums / len(parse_trees)))
    print('avg. nums of rules: %f' % (rule_nums / len(parse_trees)))
Exemple #10
0
def process_heart_stone_dataset():
    """Parse every line of the hard-coded data file and dump its grammar.

    Each line is one code snippet with newlines encoded as '§'. The snippet
    is parsed into a parse tree; the tree is converted back to a Python AST
    and the source ``astor`` generates for it must match the source generated
    from ``ast.parse(code)``.

    Side effects:
        * ``hs.grammar.txt`` receives one ``repr`` per grammar rule;
        * ``hs.parse_trees.txt`` receives one ``repr`` per parse tree;
        * the average number of productions per example is printed.

    Raises:
        RuntimeError: if the two generated sources differ for any example.
    """
    data_file = '/home1/zjq/try3/en-django/all.anno'
    parse_trees = []
    rule_num = 0.
    example_num = 0

    # Close the data file deterministically, even on a failed sanity check.
    with open(data_file) as data:
        for line in data:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)

            # sanity check: parse tree and reference AST must agree
            pred_ast = parse_tree_to_python_ast(parse_tree)
            pred_code = astor.to_source(pred_ast)
            ref_ast = ast.parse(code)
            ref_code = astor.to_source(ref_ast)

            if pred_code != ref_code:
                raise RuntimeError('code mismatch!')

            rules, _ = parse_tree.get_productions(include_value_node=False)
            rule_num += len(rules)
            example_num += 1

            parse_trees.append(parse_tree)

    grammar = get_grammar(parse_trees)

    with open('hs.grammar.txt', 'w') as f:
        for rule in grammar:
            f.write(repr(rule) + '\n')

    with open('hs.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    # NOTE: an empty data file leaves example_num at 0, so this division
    # raises ZeroDivisionError.
    print('avg. nums of rules: %f' % (rule_num / example_num))
Exemple #11
0
def process_heart_stone_dataset():
    """Extract the grammar of the hard-coded dataset and write it to disk.

    The data file stores one code snippet per line with '§' in place of
    newlines. Each snippet is parsed; the resulting parse tree is converted
    to a Python AST and checked against ``ast.parse(code)`` by comparing the
    source text ``astor`` generates for both.

    Side effects:
        * writes the ``repr`` of every grammar rule to ``hs.grammar.txt``;
        * writes the ``repr`` of every parse tree to ``hs.parse_trees.txt``;
        * prints the average number of productions per example.

    Raises:
        RuntimeError: if a parse tree does not regenerate the reference code.
    """
    data_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/card_datasets/hearthstone/all_hs.out'
    parse_trees = []
    rule_num = 0.
    example_num = 0

    # ``with`` guarantees the handle is released when the check below raises.
    with open(data_file) as data:
        for line in data:
            code = line.replace('§', '\n').strip()
            parse_tree = parse(code)

            # sanity check: regenerate source both ways and compare
            pred_ast = parse_tree_to_python_ast(parse_tree)
            pred_code = astor.to_source(pred_ast)
            ref_ast = ast.parse(code)
            ref_code = astor.to_source(ref_ast)

            if pred_code != ref_code:
                raise RuntimeError('code mismatch!')

            rules, _ = parse_tree.get_productions(include_value_node=False)
            rule_num += len(rules)
            example_num += 1

            parse_trees.append(parse_tree)

    grammar = get_grammar(parse_trees)

    with open('hs.grammar.txt', 'w') as f:
        for rule in grammar:
            f.write(repr(rule) + '\n')

    with open('hs.parse_trees.txt', 'w') as f:
        for tree in parse_trees:
            f.write(repr(tree) + '\n')

    # NOTE: with an empty data file example_num is 0 and this division raises
    # ZeroDivisionError.
    print('avg. nums of rules: %f' % (rule_num / example_num))
Exemple #12
0
    f_test.close()

    f_train_rawid.close()
    f_dev_rawid.close()
    f_test_rawid.close()

    # print 'num. of decoding time steps distribution:'
    for k in sorted(decode_time_steps):
        print '%d\t%d' % (k, decode_time_steps[k])


if __name__ == '__main__':
    init_logging('py.log')
    # code = "return (  format_html_join ( '' , '_STR:0_' , sorted ( attrs . items ( ) ) ) +  format_html_join ( '' , ' {0}' , sorted ( boolean_attrs ) )  )"
    code = "call('{0}')"
    parse_tree = parse(code)

    # parse_tree = ASTNode('root', children=[
    #     ASTNode('lambda'),
    #     ASTNode('$0'),
    #     ASTNode('e', children=[
    #         ASTNode('and', children=[
    #             ASTNode('>', children=[ASTNode('$0')]),
    #             ASTNode('from', children=[ASTNode('$0'), ASTNode('ci0')]),
    #         ])
    #     ]),
    # ])

    original_parse_tree = parse_tree.copy()
    break_value_nodes(parse_tree)