import os
import pickle as pkl
from collections import OrderedDict
from typing import Any, Callable, Dict, List, Tuple


def parse_gen(code, grammar):
    tokens = filter_space(tokenize(round_trip(code), new_line=False))
    result = OrderedDict()
    result['code'] = code
    result['start_w_close_curly'] = start_w_close_curly(tokens)
    result['end_w_open_curly'] = end_w_open_curly(tokens)
    cur_idx = 0
    # a trailing ';' carries no information for the grammar atoms, so drop it
    if tokens and tokens[-1].value == ';':
        tokens = tokens[:-1]
    for atom in grammar:
        # atom.f returns (next_idx, content) on a successful match and
        # (None, None) on failure; only a required atom may abort the parse
        if cur_idx >= len(tokens):
            next_idx, content = None, None
        else:
            next_idx, content = atom.f(cur_idx, tokens)
        if next_idx is None and not atom.optional:
            raise parse_single_line_exception(
                'Error parsing atom %s, which is required' % atom.name)
        result[atom.name] = content
        if next_idx is not None:
            cur_idx = next_idx
    # a control line with no inline statement (e.g. "if ( x )" with the body
    # on the next line) opens a new scope
    if 'stmt' in result:
        result['new_scope'] = result['stmt'] is None
    else:
        result['new_scope'] = False
    return result
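# A hedged sketch of the grammar-atom interface parse_gen assumes: each atom
# exposes .name, .optional, and .f(cur_idx, tokens) -> (next_idx, content),
# returning (None, None) when the match fails. The real atom class is defined
# elsewhere in this repo; DemoAtom and keyword_atom are hypothetical stand-ins.
from collections import namedtuple

DemoAtom = namedtuple('DemoAtom', ['name', 'f', 'optional'])

def keyword_atom(word, optional=False):
    # build an atom that consumes exactly one token whose .value equals `word`
    def f(cur_idx, tokens):
        if tokens[cur_idx].value == word:
            return cur_idx + 1, word
        return None, None
    return DemoAtom(name=word, f=f, optional=optional)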
def pseudo_compile_check(code, indent, search_opt):
    # a program passes the check if its gold lines survive the structural
    # search constraint used by `search` below
    code_by_line = code.strip().split('\n')
    program_length = len(code_by_line)
    gold_sents_l = [[round_trip(c)] for c in code_by_line]
    gold_scores_l = [[0] for _ in range(program_length)]
    gold_groups = search_structured_groups(gold_sents_l, gold_scores_l,
                                           search_opt, indent)['groups']
    return gold_groups is not None
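# Hypothetical usage: verify that a hand-written three-line gold program
# satisfies the structural constraint; the indent list gives one nesting
# depth per line, and the search_opt string is whatever constraint option
# the surrounding pipeline uses (left as a placeholder here).
#   gold = 'int main ( ) {\n    return 0 ;\n}'
#   pseudo_compile_check(gold, indent=[0, 1, 0], search_opt=...)  # -> True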
def return_result_for_line_marker(code):
    # handles a line ending with ':' (e.g. a label or 'case' marker); see decompose_line
    tokens = filter_space(tokenize(round_trip(code), new_line=False))
    result = OrderedDict()
    result['code'] = code
    result['start_w_close_curly'] = start_w_close_curly(tokens)
    result['end_w_open_curly'] = end_w_open_curly(tokens)
    result['new_scope'] = result['end_w_open_curly']
    result['line_type'] = 'marker'
    return result
def get_cores(line_code):
    line_code = round_trip(line_code)
    try:
        # function headers and prototypes contribute no statement cores
        tokens = tokenize(line_code)
        parse_func_header(tokens)  # raises ParseChunkError on non-headers
        return []
    except ParseChunkError:
        result = decompose_line(line_code)
        if result is None:
            return []
        cores = []
        for key in result:
            if 'stmt' in key:
                stmt, depth = result[key]
                stmt = tokenize(stmt)
                segment_info = parse_chunk(stmt)
                cores += segment_info['nodes']
        return cores
def return_result_for_trivial_code(code):
    # only reached when code is in `trivial_code` (see decompose_line)
    tokens = filter_space(tokenize(round_trip(code), new_line=False))
    result = OrderedDict()
    result['code'] = code
    result['start_w_close_curly'] = start_w_close_curly(tokens)
    result['end_w_open_curly'] = end_w_open_curly(tokens)
    result['new_scope'] = result['end_w_open_curly']
    if code == '{':
        result['line_type'] = 'open_curly_only'
    elif code == '}':
        result['line_type'] = 'close_curly_only'
    elif code == '':
        result['line_type'] = 'empty'
    elif code == ';':
        result['line_type'] = 'line'
    return result
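# Illustration of the four trivial lines this helper classifies:
#   return_result_for_trivial_code('{')['line_type'] -> 'open_curly_only'
#   return_result_for_trivial_code('}')['line_type'] -> 'close_curly_only'
#   return_result_for_trivial_code('')['line_type']  -> 'empty'
#   return_result_for_trivial_code(';')['line_type'] -> 'line'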
def parse_dowhile(code):
    # split the line at the first 'while' keyword and parse the two halves
    while_idx = code.index('while')
    do_part, while_part = code[:while_idx], code[while_idx:]
    do_result = parse_doline(do_part)
    while_result = parse_whileline(while_part)
    # merge the two field dicts; 'while' fields win on key collisions
    result = OrderedDict()
    result.update(do_result)
    result.update(while_result)

    tokens = filter_space(tokenize(round_trip(code), new_line=False))
    result['code'] = code
    result['start_w_close_curly'] = start_w_close_curly(tokens)
    result['end_w_open_curly'] = end_w_open_curly(tokens)
    result['new_scope'] = False
    return result
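# Hedged example: parse_dowhile('do { i ++ ; } while ( i < n ) ;') splits at
# the first 'while' keyword, parses 'do { i ++ ; } ' with parse_doline and
# 'while ( i < n ) ;' with parse_whileline, then merges the resulting fields.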
def decompose_line(code):
    if not line_well_formed_brace(code):
        return None
    code = round_trip(code)
    if code in trivial_code:
        return return_result_for_trivial_code(code)
    if code[-1] == ':':
        return return_result_for_line_marker(code)

    result = None
    # detect which control-flow keywords appear on the line
    has_do, has_while, has_if, has_for, has_else = [
        has_key_word(code, kword)
        for kword in ['do', 'while', 'if', 'for', 'else']
    ]
    if has_if:
        result = parse_ifline(code)
        result['line_type'] = 'if' if not has_else else 'else if'
    elif has_else:
        result = parse_elseline(code)
        result['line_type'] = 'else'
    elif has_while and not has_do:
        result = parse_whileline(code)
        result['line_type'] = 'while'
    elif has_for:
        result = parse_forline(code)
        result['line_type'] = 'for'
    elif has_do and not has_while:
        result = parse_doline(code)
        result['line_type'] = 'do'
    elif has_do and has_while:
        result = parse_dowhile(code)
        result['line_type'] = 'dowhile'
    if result is not None:
        result = post_process_decomposition(result, 1)
    else:
        result = parse_simpleline(code)
        result = post_process_decomposition(result, 0)
        result['line_type'] = 'line'

    # when the module-level `debug` flag is set, print each parsed statement
    # and pause so the decomposition can be inspected interactively
    if result is not None and debug:
        for key in result:
            if 'stmt' in key:
                print(key, result[key])
        input()
    return result
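# A sketch of decompose_line's dispatch on hypothetical inputs (exact result
# keys depend on the parse_* helpers defined elsewhere in the repo):
#   decompose_line('if ( x > 0 ) {')['line_type']    -> 'if'
#   decompose_line('while ( x > 0 ) {')['line_type'] -> 'while'
#   decompose_line('x = x + 1 ;')['line_type']       -> 'line'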
def type_line(line_code):
    line_code = round_trip(line_code)
    atoms_declared, atoms_used, prototype = {}, {}, None
    forest = []
    try:
        try:
            tokens = tokenize(line_code, new_line=False)
            result = parse_func_header(tokens)
            line_type = 'prototype' if result['is_prototype'] else 'function'
            prototype = adddepth2func(result, atoms_declared)
            sw_close_curly = start_w_close_curly(tokens)
            ew_open_curly = end_w_open_curly(tokens)
        except ParseChunkError:
            result = decompose_line(line_code)
            if result is None:
                return None
            line_type = result['line_type']
            sw_close_curly = result['start_w_close_curly']
            ew_open_curly = result['end_w_open_curly']
            for key in result:
                if 'stmt' in key:
                    stmt, depth = result[key]
                    stmt = tokenize(stmt, new_line=False)
                    segment_info = parse_chunk(stmt)
                    addvar_decl2line(segment_info, atoms_declared, depth)
                    parse_var_used(segment_info['nodes'], atoms_used, depth)
                    forest += segment_info['nodes']

        return {
            # the following fields are used when reasoning about scope
            'line_type': line_type,
            'start_w_close_curly': sw_close_curly,
            'end_w_open_curly': ew_open_curly,
            'line_complete': len(line_code) > 0
            and line_code[-1] in ('}', ';'),
            'atoms_declared': atoms_declared,
            'atoms_used': atoms_used,
            'prototype': prototype,
            'forest': forest,
            'code': line_code
        }
    except ParseChunkError:
        return None
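# Hypothetical result for a simple declaration (the exact atoms_declared /
# atoms_used / forest contents depend on parse_chunk and friends):
#   info = type_line('int x = 0 ;')
#   info['line_type']     -> 'line'
#   info['line_complete'] -> True   # the line ends with ';'
#   info['prototype']     -> None   # only set for function headers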
def obtain_gold_program(f_name):
    # `program_dir` is a module-level path to the directory of .cc sources
    src_file_path = program_dir + f_name + '.cc'
    with open(src_file_path, 'r') as in_file:
        program_str = in_file.read()
    program_by_line = [round_trip(c) for c in program_str.split('\n')]
    return program_by_line
def search(translation_map: Callable[[str], Tuple[List[List[str]], List[List[float]]]],  # generates code pieces and scores per line (see documentation above)
           program_dict: Dict[str, Any],  # information about a program: pseudocode, indentation, etc.
           result_dir: str,  # the directory to dump the results
           budget: int,  # budget B
           search_opt: str,  # the constraint we use for searching
           structure_beam_size: int = 50,  # beam width W for the search
           structure_topk: int = 20,  # the top K scaffolds we use for the search
           regular: bool = False  # True for regular beam search, False for hierarchical
           ):
    # load program information
    f_name, indent = program_dict['f_name'], program_dict['indent']
    program_length = len(indent)

    # evaluation runs many test cases and is time consuming, so we memoize
    # all evaluation results and persist them to disk
    memo_dir = '../spoc/eval_memo/' + f_name
    memo = {}
    if os.path.exists(memo_dir):
        with open(memo_dir, 'rb') as f:
            memo = pkl.load(f)

    # the problem id (used to look up test cases) is the substring after the first '-'
    pid = f_name.split('-')[1]

    # the path we are dumping the search results and statistics
    search_result_dir = result_dir + f_name + '.pkl'
    search_stats_result_dir = result_dir + f_name + '.stats'

    # if the result path already exists, another process has handled (or is
    # handling) this program; otherwise dump a lock file to claim it
    if os.path.exists(search_result_dir):
        return
    with open(search_result_dir, 'wb') as f:
        pkl.dump('working', f)
    if verbose:
        print('searching for file %s.' % f_name)

    # check whether the gold program itself passes the constraint
    gold_sents_l = [[round_trip(c)] for c in program_dict['program_by_line']]
    gold_scores_l = [[0] for _ in range(program_length)]
    gold_groups = search_structured_groups(gold_sents_l, gold_scores_l,
                                           search_opt, indent)['groups']
    gold_passed = gold_groups is not None

    # load the per-line translations
    sents_l, scores_l = translation_map(f_name)

    # search the scaffold
    if not regular:
        search_info = search_structured_groups(sents_l, scores_l, search_opt, indent,
                                               beam_size=structure_beam_size, top_k=structure_topk)
    else:
        search_info = search_structured_groups(sents_l, scores_l, search_opt, indent,
                                               beam_size=budget * 2, top_k=budget, use_code=True)
    # under regular beam search, groups is a list of full candidate programs.
    # under hierarchical beam search, groups is a list of scaffolds, where
    # each scaffold is a Tuple[List[List[str]], List[List[float]], float]:
    # the first two elements have the same type as the translations above,
    # and the third is the scaffold's score; every translation within the
    # same scaffold shares the same configuration for each line.
    groups = search_info['groups']

    if groups is None:
        # no candidate satisfied the constraint; record an empty result
        with open(search_result_dir, 'wb') as f:
            pkl.dump([], f)
        return []

    # next_code returns a generator that yields the next candidate program
    if regular:
        def next_code():
            # regular beam search: groups already holds full programs in order
            idx = 0
            while True:
                if idx < len(groups):
                    yield groups[idx]
                    idx += 1
                else:
                    yield None
    else:
        def next_code():
            # hierarchical search: lazily merge the scaffold streams best-first
            mpq = Multipq(groups)
            while True:
                yield mpq.pop()
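    # a minimal sketch of the Multipq interface assumed above (the class is
    # defined elsewhere in the repo): it lazily merges the per-scaffold
    # candidate streams so that complete programs come out best-first, e.g.
    #   mpq = Multipq(groups)  # one (sents_l, scores_l, score) tuple per scaffold
    #   code = mpq.pop()       # next-best full program, or None when exhausted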

    cur_idx = 0
    return_val = []
    code_iter = next_code()
    while cur_idx < budget:
        code = next(code_iter)
        if code is None:
            break
        # check whether this piece of code was already evaluated in the past;
        # if so, reuse the cached verdict and avoid the expensive judging step
        if memo.get(code) is None:
            # if the braces do not match (e.g. more '{' than '}' in the program), then reject directly
            if braces_acceptable(code):
                j = Judge(problem_id=pid, judge_type='all', eager=True, judge_id=f_name + str(cur_idx))
                result = j.judge_program_str(code)
                return_val.append({'rank': cur_idx, 'code': code, 'status': result['Status'], 'gold_pass': gold_passed})
                memo[code] = result['Status']
            else:
                return_val.append({'rank': cur_idx, 'code': code, 'status': 'braces rejected', 'gold_pass': gold_passed})
        else:
            return_val.append({'rank': cur_idx, 'code': code, 'status': memo[code], 'gold_pass': gold_passed})
        # stop the search as soon as a candidate passes all test cases
        if code in memo and memo[code] == 'Passed':
            break

        cur_idx += 1

    # dump the search results, the search statistics, and the updated memo
    with open(search_result_dir, 'wb') as f:
        pkl.dump(return_val, f)
    with open(search_stats_result_dir, 'wb') as f:
        pkl.dump(search_info, f)
    with open(memo_dir, 'wb') as f:
        pkl.dump(memo, f)
    return return_val
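# Hypothetical driver for `search`; the stub translation map, the program
# dict fields, and the option string are illustrative only and assume the
# judge/testcase environment described above is available.
def _demo_run_search(result_dir, search_opt):
    def demo_translation_map(f_name):
        # one candidate (with score 0.0) per line of the program
        sents_l = [['int main ( ) {'], ['return 0 ;'], ['}']]
        scores_l = [[0.0], [0.0], [0.0]]
        return sents_l, scores_l
    program_dict = {
        'f_name': '123A-0',  # the problem id follows the first '-'
        'indent': [0, 1, 0],
        'program_by_line': ['int main ( ) {', 'return 0 ;', '}'],
    }
    return search(demo_translation_map, program_dict, result_dir,
                  budget=10, search_opt=search_opt, regular=True)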