Example #1
0
 def from_final_progress(progress):
     # type: (FixProgress) -> FixResult
     """Build the ``FixResult`` that summarises a finished ``FixProgress``.

     The final source text is regenerated by passing
     ``progress.tokenized_code_2`` and ``progress.name_dict`` to
     ``tokens_to_source``; every other field is copied from ``progress``.
     """
     return FixResult(
         raw_code=progress.raw_code,
         raw_error_count=progress.raw_error_count,
         final_code=tokens_to_source(progress.tokenized_code_2,
                                     progress.name_dict),
         final_error_count=progress.error_count,
         iteration_count=progress.iteration_count)
Example #2
0
 def get_program_source_from_vector(self,
                                    program_vector,
                                    name_dict,
                                    name_seq,
                                    keep_cursor=False,
                                    clang_format=False,
                                    get_tokens=False):
     """Devectorize ``program_vector`` and render it via ``tokens_to_source``.

     ``keep_cursor`` is forwarded to ``self.devectorize``; when it is truthy
     the string ``'EOF'`` is passed as the ``cursor`` argument of
     ``tokens_to_source``, otherwise ``None`` is passed.  ``name_dict``,
     ``clang_format`` and ``name_seq`` are handed over positionally (in that
     order) and ``get_tokens`` by keyword.
     """
     devectorized = self.devectorize(program_vector, keep_cursor)

     if keep_cursor:
         cursor_marker = 'EOF'
     else:
         cursor_marker = None

     return tokens_to_source(devectorized,
                             name_dict,
                             clang_format,
                             name_seq,
                             cursor=cursor_marker,
                             get_tokens=get_tokens)
Example #3
0
    def process(self, source_code_array, max_attempts=6):
        """Iteratively repair a batch of programs with ``self.network``.

        Every program in ``source_code_array`` is tokenized and then, for at
        most ``max_attempts`` rounds, vectorized, passed through the network
        and patched with the fix the network suggests.  A program leaves the
        batch as soon as vectorization or fix application fails, or when a
        non-empty fix increases its number of compilation errors.

        Args:
            source_code_array: iterable of program source strings.
            max_attempts: maximum number of network rounds per program.

        Returns:
            A tuple ``(fixes_to_return, repaired_programs, error_messages)``
            of dicts keyed by the position of the program in
            ``source_code_array``: human-readable descriptions of the applied
            fixes, the regenerated (stripped) source of the last program
            version, and the list of failure tags recorded for the program.
        """
        sequences_of_programs = {}
        entries = []
        errors = {}
        fixes_to_return = {}
        error_messages = {}
        # Detokenization data per program.  The final rendering must use each
        # program's own dictionaries, not whatever the loops below last
        # unpacked into their local variables.
        detokenize_info = {}

        # Wrap it up into a nice box
        for idx, source_code in enumerate(source_code_array):
            program, name_dict, name_sequence, literal_sequence = C_Tokenizer().tokenize(source_code)
            entries.append((idx, program, name_dict, name_sequence, literal_sequence))
            detokenize_info[idx] = (name_dict, literal_sequence)
            sequences_of_programs[idx] = [program]
            errors[idx], _ = compilation_errors(source_code)
            error_messages[idx] = []
            fixes_to_return[idx] = []

        network = self.network

        if self.task == 'ids':
            normalize_names = False
            fix_kind = 'insert'
        else:
            assert self.task == 'typo'
            normalize_names = True
            fix_kind = 'replace'

        try:
            for _ in range(max_attempts):
                to_delete = []
                input_ = []

                for i, entry in enumerate(entries):
                    idx = entry[0]

                    try:
                        program_vector = vectorize(sequences_of_programs[idx][-1], network['in_seq_length'], network['dictionary'], normalize_names=normalize_names, reverse=True, append_eos=False)
                    except VectorizationFailedException:
                        program_vector = None

                    if program_vector is not None:
                        input_.append(program_vector)
                    else:
                        to_delete.append(i)
                        error_messages[idx].append('VectorizationFailed')

                # Delete from the back so the remaining indices stay valid
                for i in sorted(to_delete, reverse=True):
                    del entries[i]

                assert len(input_) == len(entries)

                if len(input_) == 0:
                    break

                # Pass it through the network and devectorize the answers
                fix_vectors = get_fixes(network['session'], input_, network)
                fixes = [devectorize(fix_vector, network['dictionary']) for fix_vector in fix_vectors]

                to_delete = []

                # Apply fixes
                for i, (entry, fix) in enumerate(zip(entries, fixes)):
                    idx, _, name_dict, name_sequence, literal_sequence = entry
                    fix_tokens = fix.split()

                    try:
                        program = apply_fix(sequences_of_programs[idx][-1], fix, kind=fix_kind, check_literals=True)
                        sequences_of_programs[idx].append(program)
                        regen_source_code = tokens_to_source(program, name_dict, clang_format=True, literal_seq=literal_sequence)
                        this_errors, _ = compilation_errors(regen_source_code)

                        # NOTE(review): the patched program has already been
                        # appended above, so a fix rejected here still ends up
                        # in the returned program -- confirm this is intended.
                        if fix_tokens and len(this_errors) > len(errors[idx]):
                            to_delete.append(i)
                            error_messages[idx].append('ErrorsIncreased')
                        else:
                            errors[idx] = this_errors
                    except IndexError:
                        to_delete.append(i)
                        error_messages[idx].append('IndexError')
                    except VectorizationFailedException:
                        to_delete.append(i)
                        error_messages[idx].append('VectorizationFailed')
                    except InvalidFixLocationException:
                        to_delete.append(i)

                        # An '_eos_' fix is the network saying "nothing left
                        # to do"; guard against an empty fix string so this
                        # handler cannot itself raise IndexError.
                        if fix_tokens and fix_tokens[0] == '_eos_':
                            error_messages[idx].append('OK')
                        else:
                            error_messages[idx].append('InvalidFixLocation')
                    except SubstitutionFailedException:
                        to_delete.append(i)
                        error_messages[idx].append('SubstitutionFailed')
                    else:
                        assert not fix_tokens or fix_tokens[0] != '_eos_'

                        if fix_kind == 'insert':
                            # Drop the leading marker token before locating
                            # and rendering the inserted line.
                            fix_ = ' '.join(fix_tokens[1:])
                            fix_line = extract_line_number(fix_) + 1
                            fixes_to_return[idx].append('%s at line %d: %s' % (fix_kind, fix_line, ''.join(fix_to_source(fix_, program, name_dict, clang_format=True).split('\n'))))
                        else:
                            fix_line = extract_line_number(fix) + 1
                            fixes_to_return[idx].append('%s at line %d: %s' % (fix_kind, fix_line, ''.join(fix_to_source(fix, program, name_dict, name_seq=name_sequence, literal_seq=literal_sequence, clang_format=True).split('\n'))))

                # Delete from the back so the remaining indices stay valid
                for i in sorted(to_delete, reverse=True):
                    del entries[i]

        # Best effort: these failures end the repair loop early, and whatever
        # was repaired so far is still returned.
        except KeyError:
            pass

        except InvalidFixLocationException:
            pass

        except SubstitutionFailedException:
            pass
        # -----------

        repaired_programs = {}

        for idx, versions in sequences_of_programs.items():
            name_dict, literal_sequence = detokenize_info[idx]
            repaired_source = tokens_to_source(versions[-1], name_dict, clang_format=True, literal_seq=literal_sequence)
            repaired_programs[idx] = repaired_source.strip()

        return fixes_to_return, repaired_programs, error_messages
Example #4
0
 def process_many(self, sequence_of_code):
     # type: (Iterable[str]) -> List[FixResult]
     """Run up to five repair rounds over a batch of programs.

     ``FixProgress.from_code`` is called on every input; results that are
     ``FixProgress`` instances are repaired, the others are expected to be
     plain strings and are reported through ``FixResult.from_correct_code``.
     A program drops out of the batch when it cannot be vectorized, when
     applying the suggested fix raises, when the fix does not meet the
     'replace' criterion, or when the fix increases its error count.

     Returns:
         One ``FixResult`` per input, in input order.

     Raises:
         NotImplementedError: if a fix was applied successfully while
             ``self.get_task()`` is not ``'typo'``.
     """
     sequence_of_fix_status = [
         FixProgress.from_code(code) for code in sequence_of_code
     ]
     needed_to_fix = [
         fix_status for fix_status in sequence_of_fix_status
         if isinstance(fix_status, FixProgress)
     ]
     attempt_count = 0
     while needed_to_fix and attempt_count < 5:
         # Keep only the programs that can still be vectorized; the vectors
         # stay aligned with ``needed_to_fix``.
         vectors = []
         still_fixable = []
         for fix_progress in needed_to_fix:
             vector = self.vectorize(fix_progress.tokenized_code)
             if vector is not None:
                 vectors.append(vector)
                 still_fixable.append(fix_progress)
         needed_to_fix = still_fixable
         if not vectors:
             # Nothing left to repair: do not query the network with an
             # empty batch.
             break
         fixes = [
             devectorize(vector, self.get_dictionary())
             for vector in self._get_fixes_ported_from_initial(vectors)
         ]
         indices_unneeded_to_fix = []
         for i, (fix_progress, fix) in enumerate(zip(needed_to_fix, fixes)):
             try:
                 tokenized_fixed = apply_fix(fix_progress.tokenized_code,
                                             fix,
                                             self.get_fix_kind(),
                                             flag_replace_ids=False)
                 tokenized_fixed_2 = apply_fix(
                     fix_progress.tokenized_code_2, fix,
                     self.get_fix_kind())
             except Exception:
                 # Any failure to apply the fix ends the repair of this
                 # program; its last good state is what gets reported.
                 indices_unneeded_to_fix.append(i)
                 continue
             if self.get_task() != 'typo':
                 raise NotImplementedError
             if not meets_criterion(fix_progress.tokenized_code, fix,
                                    'replace'):
                 indices_unneeded_to_fix.append(i)
                 continue
             error_count_new = FixProgress.get_error_count(
                 tokens_to_source(tokenized_fixed_2, fix_progress.name_dict,
                                  False))
             if error_count_new > fix_progress.error_count:
                 indices_unneeded_to_fix.append(i)
                 continue
             fix_progress.tokenized_code = tokenized_fixed
             fix_progress.tokenized_code_2 = tokenized_fixed_2
             fix_progress.error_count = error_count_new
             fix_progress.iteration_count += 1
         # Indices were collected in ascending order; delete from the back
         # so the remaining ones stay valid.
         for i in reversed(indices_unneeded_to_fix):
             del needed_to_fix[i]
         attempt_count += 1
     return [
         FixResult.from_correct_code(fix_status)
         if isinstance(fix_status, str)
         else FixResult.from_final_progress(fix_status)
         for fix_status in sequence_of_fix_status
     ]
                else:
                    fixed = fixed.replace(str(l) + " ~ ", "", 1)
            fixed = fixed.replace("  ", " ")
            fixed = fixed.split()

            log = getTrace(source, fixed, getEditDistance(source, fixed))
            target = ["0" for i in range(len(source))]
            for l in log:
                if l[0] == "i":
                    target.insert(l[1], target_vocab["insert"][l[2]])
                elif l[0] == "r":
                    target[l[1]] = target[l[1]].replace(target[l[1]], target_vocab["replace"][l[2]])
                elif l[0] == "d":
                    target[l[1]] = target[l[1]].replace(target[l[1]], "-1")
                    
            assert (tokens_to_source(' '.join(fixed), inverse_vocab, False) == tokens_to_source(' '.join(apply_edits(source, target, inverse_vocab)), inverse_vocab, False))

        train.write("%s\t%s\n" % (" ".join(source), " ".join(target)))

train.close()

for k in tqdm(data['validation']):
    for i in data['validation'][k]:
        #source sequence
        source = i[0]
        lines = source.count('~')
        for l in range(lines):
            if l >= 10:
                source = source.replace(list(str(l))[0] + " " + list(str(l))[1] + " ~ ", "", 1)
            else:
                source = source.replace(str(l) + " ~ ", "", 1)
def undeclare_variable(rng,
                       old_program,
                       program_string,
                       deleted_ids,
                       name_dict=None,
                       print_debug_messages=False):
    if name_dict is not None:
        rev_name_dict = get_rev_dict(name_dict)

    # Lines
    orig_lines = get_lines(program_string)
    old_lines = get_lines(old_program)

    # Lines to ignore
    struct_lines = []
    structs_deep = 0

    for i, line in enumerate(orig_lines):
        if len(re.findall('_<keyword>_struct _<id>_\d@ _<op>_\{', line)) > 0 or \
           len(re.findall('_<keyword>_union _<id>_\d@ _<op>_\{', line)) > 0 or \
           len(re.findall('_<keyword>_enum _<id>_\d@ _<op>_\{', line)) > 0:
            structs_deep += len(re.findall('_<op>_\{', line))
        elif structs_deep > 0:
            structs_deep += len(re.findall('_<op>_\{', line))
            structs_deep -= len(re.findall('_<op>_\}', line))
            assert structs_deep >= 0, str(structs_deep) + " " + line
            struct_lines.append(i)

    global_lines = []
    brackets_deep = 0

    for i, line in enumerate(orig_lines):
        if len(re.findall('_<op>_\{', line)) > 0 or len(
                re.findall('_<op>_\}', line)) > 0:
            brackets_deep += len(re.findall('_<op>_\{', line))
            brackets_deep -= len(re.findall('_<op>_\}', line))
            assert brackets_deep >= 0, str(brackets_deep) + " " + line
        elif brackets_deep == 0:
            global_lines.append(i)

    if print_debug_messages:
        print 'Ignoring lines:', struct_lines
        print 'Ignoring lines:', global_lines

        for line in sorted(set(struct_lines + global_lines)):
            print "-", orig_lines[line]

    # Variables
    variables = []

    for token in program_string.split():
        if '_<id>_' in token:
            if token not in variables:
                variables.append(token)

    assert len(orig_lines) == len(old_lines)

    # Look for a declaration
    done = False

    rng.shuffle(variables)

    for to_undeclare in variables:
        if print_debug_messages:
            print 'Looking for:', rev_name_dict[to_undeclare], '...'

        # Find a location (scope) to undeclare it from
        shuffled_lines = list(
            set(range(len(orig_lines))) - set(struct_lines + global_lines))
        rng.shuffle(shuffled_lines)

        # NEW
        regex_alone_use = '(_<keyword>_(?:struct|enum|union) _<id>_\d+@|_<type>_\w+)((?: _<op>_\*)* %s(?: _<op>_\[(?: [^\]]+)? _<op>_\])*)(?: _<op>_= [^,;]+)(?: _<op>_;)' % to_undeclare
        regex_alone = '((?:_<keyword>_(?:struct|enum|union) _<id>_\d+@|_<type>_\w+)(?: _<op>_\*)* %s(?: _<op>_\[(?: [^\]]+)? _<op>_\])* _<op>_;)' % to_undeclare
        regex_group_leader = '((?:_<keyword>_(?:struct|enum|union) _<id>_\d+@|_<type>_\w+)(?: _<op>_\*)*)( %s(?: _<op>_\[(?: [^\]]+)? _<op>_\])*)(?: _<op>_= [^,;]+)?( _<op>_,)(?:(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)? _<op>_,)*(?:(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)? _<op>_;)' % to_undeclare
        regex_group = '(_<keyword>_(?:struct|enum|union) _<id>_\d+@|_<type>_\w+)(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)?(?: _<op>_,(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)?)*( _<op>_,(?: _<op>_\*)* %s(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)?)(?: _<op>_,(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)?)*(?: _<op>_;)' % to_undeclare

        fix_line = None
        declaration = None
        declaration_pos = None

        # Start our search upwards
        for i in shuffled_lines:
            if len(re.findall(regex_alone_use, orig_lines[i])) == 1:
                if print_debug_messages:
                    print("On line %d:" % i), tokens_to_source(
                        orig_lines[i], name_dict, clang_format=True)
                    print "Found Alone use", re.findall(
                        regex_alone_use, orig_lines[i])
                m = re.search(regex_alone_use, orig_lines[i])
                declaration = orig_lines[i][m.start(1):m.end(2)] + ' _<op>_;'
                declaration_pos = i

                # Mutate
                orig_lines[i] = orig_lines[i][:m.start(1)] + orig_lines[i][
                    m.end(1) + 1:]
                done = True
                break

            if len(re.findall(regex_alone, orig_lines[i])) == 1:
                if print_debug_messages:
                    print("On line %d:" % i), tokens_to_source(
                        orig_lines[i], name_dict, clang_format=True)
                    print "Found Alone", re.findall(regex_alone, orig_lines[i])
                m = re.search(regex_alone, orig_lines[i])
                declaration = orig_lines[i][m.start(1):m.end(1)]
                declaration_pos = i

                # Mutate
                orig_lines[i] = orig_lines[i][:m.start(1)] + orig_lines[i][
                    m.end(1) + 1:]
                done = True
                break

            elif len(re.findall(regex_group, orig_lines[i])) == 1:
                if print_debug_messages:
                    print("On line %d:" % i), tokens_to_source(
                        orig_lines[i], name_dict, clang_format=True)
                    print "Found Group", re.findall(regex_group, orig_lines[i])
                m = re.search(regex_group, orig_lines[i])
                declaration = orig_lines[i][m.start(1):m.end(1)] + orig_lines[
                    i][m.start(2):m.end(2)][8:] + ' _<op>_;'
                declaration_pos = i

                try:
                    end_of_declr = declaration.index('_<op>_=')
                    declaration = declaration[:end_of_declr]
                except ValueError:
                    pass

                # Mutate
                orig_lines[i] = orig_lines[i][:m.start(2) +
                                              1] + orig_lines[i][m.end(2) + 1:]
                done = True
                break

            elif len(re.findall(regex_group_leader, orig_lines[i])) == 1:
                if print_debug_messages:
                    print("On line %d:" % i), tokens_to_source(
                        orig_lines[i], name_dict, clang_format=True)
                    print "Found Group Leader", re.findall(
                        regex_group_leader, orig_lines[i])
                m = re.search(regex_group_leader, orig_lines[i])
                declaration = orig_lines[i][m.start(1):m.end(2)] + ' _<op>_;'
                declaration_pos = i

                # Mutate
                orig_lines[i] = orig_lines[i][:m.start(2) +
                                              1] + orig_lines[i][m.end(3) + 1:]
                done = True
                break

        if done:
            break

    if not done:
        # Failed to find something to undeclare
        raise NothingToMutateException

    # Find the function signature
    fn_regex = '(?:_<keyword>_(?:struct|union|enum) _<id>_\d+@|_<type>_\w+|_<keyword>_void)(?: _<op>_\*)* (?:_<id>_\d+@|_<APIcall>_main) _<op>_\('
    fn_start_regex = '_<op>_\{'
    inserted = False

    assert declaration_pos != None
    for i in range(declaration_pos, 0, -1):
        if len(re.findall(fn_regex, old_lines[i])) == 1:
            for j in range(i, len(old_lines)):
                if len(re.findall(fn_start_regex, old_lines[i])) >= 1:
                    fix_line = j
                    break
            inserted = True

        if inserted:
            break

    if not inserted:
        # print Failed to insert fix
        raise FailedToMutateException
    if fix_line is None:
        # Couldn't find { after function definition
        raise FailedToMutateException

    fix = '_<insertion>_ '

    assert fix_line is not None

    for digit in str(fix_line):
        fix += str(digit) + ' '

    fix += '~ ' + declaration

    to_delete = False

    if orig_lines[declaration_pos].strip() == '':
        to_delete = declaration_pos
        del orig_lines[to_delete]

    recomposed_program = ''

    for i, line in enumerate(orig_lines):
        for digit in str(i):
            recomposed_program += digit + ' '

        recomposed_program += '~ '
        recomposed_program += line + ' '

    return recomposed_program, fix, fix_line
Example #7
0
def do_problem(problem_id):
    global reconstruction, errors, errors_full, total_count, errors_test

    c = conn.cursor()

    reconstruction[problem_id] = {}
    errors[problem_id] = {}
    errors_full[problem_id] = {}
    errors_test[problem_id] = []
    candidate_programs = []

    for row in c.execute('SELECT user_id, prog_id, code, name_dict, name_seq FROM programs WHERE prob_id = ?', (problem_id,)):
        user_id, prog_id, initial = row[0], row[1], row[2]
        name_dict = json.loads(row[3])
        name_seq = json.loads(row[4])

        candidate_programs.append(
            (user_id, prog_id, initial, name_dict, name_seq,))

    for _, prog_id, initial, name_dict, name_seq in candidate_programs:
        fixes_suggested_by_typo_network = []
        fixes_suggested_by_undeclared_network = []

        for row in c.execute('SELECT fix FROM iterations WHERE prog_id=? AND network = \'typo\' ORDER BY iteration', (prog_id,)):
            fixes_suggested_by_typo_network.append(row[0])

        for row in c.execute('SELECT fix FROM iterations WHERE prog_id=? AND network = \'ids\' ORDER BY iteration', (prog_id,)):
            fixes_suggested_by_undeclared_network.append(row[0])

        reconstruction[problem_id][prog_id] = [initial]
        temp_errors, temp_errors_full = compilation_errors(
            tokens_to_source(initial, name_dict, False))
        errors[problem_id][prog_id] = [temp_errors]
        errors_full[problem_id][prog_id] = [temp_errors_full]

        try:
            for fix in fixes_suggested_by_typo_network:
                if meets_criterion(reconstruction[problem_id][prog_id][-1], fix, 'replace'):
                    temp_prog = apply_fix(
                        reconstruction[problem_id][prog_id][-1], fix, 'replace')
                    temp_errors, temp_errors_full = compilation_errors(
                        tokens_to_source(temp_prog, name_dict, False))

                    if len(temp_errors) > len(errors[problem_id][prog_id][-1]):
                        break
                    else:
                        reconstruction[problem_id][prog_id].append(temp_prog)
                        errors[problem_id][prog_id].append(temp_errors)
                        errors_full[problem_id][prog_id].append(
                            temp_errors_full)
                else:
                    break

        except InvalidFixLocationException:
            print 'Localization failed'

        while len(reconstruction[problem_id][prog_id]) <= 5:
            reconstruction[problem_id][prog_id].append(
                reconstruction[problem_id][prog_id][-1])
            errors[problem_id][prog_id].append(errors[problem_id][prog_id][-1])
            errors_full[problem_id][prog_id].append(
                errors_full[problem_id][prog_id][-1])

        already_fixed = []

        try:
            for fix in fixes_suggested_by_undeclared_network:
                if fix not in already_fixed:
                    temp_prog = apply_fix(
                        reconstruction[problem_id][prog_id][-1], fix, 'insert')
                    already_fixed.append(fix)
                    temp_errors, temp_errors_full = compilation_errors(
                        tokens_to_source(temp_prog, name_dict, False))

                    if len(temp_errors) > len(errors[problem_id][prog_id][-1]):
                        break
                    else:
                        reconstruction[problem_id][prog_id].append(temp_prog)
                        errors[problem_id][prog_id].append(temp_errors)
                        errors_full[problem_id][prog_id].append(
                            temp_errors_full)
                else:
                    pass

        except InvalidFixLocationException:
            print 'Localization failed'

        while len(reconstruction[problem_id][prog_id]) <= 10:
            reconstruction[problem_id][prog_id].append(
                reconstruction[problem_id][prog_id][-1])
            errors[problem_id][prog_id].append(errors[problem_id][prog_id][-1])
            errors_full[problem_id][prog_id].append(
                errors_full[problem_id][prog_id][-1])

        errors_test[problem_id].append(errors[problem_id][prog_id])

        if not args.is_timing_experiment:
            for k, errors_t, errors_full_t in zip(range(len(errors[problem_id][prog_id])), errors[problem_id][prog_id], errors_full[problem_id][prog_id]):
                c.execute("INSERT INTO error_message_strings VALUES(?, ?, ?, ?, ?)", (
                    prog_id, k, 'typo', errors_full_t.decode('utf-8', 'ignore'), len(errors_t)))

                for error_ in errors_t:
                    c.execute("INSERT INTO error_messages VALUES(?, ?, ?, ?)",
                              (prog_id, k, 'typo', error_.decode('utf-8', 'ignore'),))

    count_t = len(candidate_programs)
    total_count += count_t

    if not args.is_timing_experiment:
        print 'Committing changes to database...'
        conn.commit()
        print 'Done!'
    else:
        print 'Done problem with', count_t, 'programs'

    c.close()