def from_final_progress(progress):
    # type: (FixProgress) -> FixResult
    raw_code = progress.raw_code
    raw_error_count = progress.raw_error_count
    final_tokenized_code = progress.tokenized_code_2
    final_code = tokens_to_source(final_tokenized_code, progress.name_dict)
    final_error_count = progress.error_count
    iteration_count = progress.iteration_count
    return FixResult(raw_code=raw_code,
                     raw_error_count=raw_error_count,
                     final_code=final_code,
                     final_error_count=final_error_count,
                     iteration_count=iteration_count)
def get_program_source_from_vector(self, program_vector, name_dict, name_seq,
                                   keep_cursor=False, clang_format=False,
                                   get_tokens=False):
    return tokens_to_source(self.devectorize(program_vector, keep_cursor),
                            name_dict, clang_format, name_seq,
                            cursor=('EOF' if keep_cursor else None),
                            get_tokens=get_tokens)
def process(self, source_code_array, max_attempts=6):
    sequences_of_programs = {}
    fixes_suggested_by_network = {}
    entries = []
    entries_ids = []
    errors = {}
    fixes_to_return = {}
    error_messages = {}
    name_dicts = {}
    literal_seqs = {}

    # Tokenize each input program and record its initial compilation errors.
    for idx, source_code in enumerate(source_code_array):
        program, name_dict, name_sequence, literal_sequence = C_Tokenizer().tokenize(source_code)
        entries.append((idx, program, name_dict, name_sequence, literal_sequence))
        entries_ids.append((idx, program, name_dict, name_sequence, literal_sequence))
        sequences_of_programs[idx] = [program]
        fixes_suggested_by_network[idx] = []
        errors[idx], _ = compilation_errors(source_code)
        error_messages[idx] = []
        fixes_to_return[idx] = []
        name_dicts[idx] = name_dict
        literal_seqs[idx] = literal_sequence

    network = self.network

    if self.task == 'ids':
        normalize_names = False
        fix_kind = 'insert'
    else:
        assert self.task == 'typo'
        normalize_names = True
        fix_kind = 'replace'

    # Reinitialize `entries'
    entries = entries_ids

    try:
        for round_ in range(max_attempts):
            to_delete = []
            input_ = []

            # Vectorize the latest version of each surviving program.
            for i, entry in enumerate(entries):
                idx, program, name_dict, name_sequence, literal_sequence = entry
                try:
                    program_vector = vectorize(sequences_of_programs[idx][-1],
                                               network['in_seq_length'],
                                               network['dictionary'],
                                               normalize_names=normalize_names,
                                               reverse=True, append_eos=False)
                except VectorizationFailedException:
                    program_vector = None
                if program_vector is not None:
                    input_.append(program_vector)
                else:
                    to_delete.append(i)
                    error_messages[idx].append('VectorizationFailed')

            # Drop entries that could not be vectorized.
            to_delete = sorted(to_delete)[::-1]
            for i in to_delete:
                del entries[i]
            assert len(input_) == len(entries)

            if len(input_) == 0:
                break

            # Pass the batch through the network.
            fix_vectors = get_fixes(network['session'], input_, network)
            fixes = []

            # Devectorize the predicted fixes.
            for i, fix_vector in enumerate(fix_vectors):
                idx, _, _, _, _ = entries[i]
                fix = devectorize(fix_vector, network['dictionary'])
                fixes_suggested_by_network[idx].append(fix)
                fixes.append(fix)

            to_delete = []

            # Apply the predicted fixes and keep only those that do not increase
            # the number of compilation errors.
            for i, entry, fix in zip(range(len(fixes)), entries, fixes):
                idx, program, name_dict, name_sequence, literal_sequence = entry
                try:
                    program = sequences_of_programs[idx][-1]
                    program = apply_fix(program, fix, kind=fix_kind, check_literals=True)
                    sequences_of_programs[idx].append(program)
                    regen_source_code = tokens_to_source(program, name_dict, clang_format=True,
                                                         literal_seq=literal_sequence)
                    this_errors, _ = compilation_errors(regen_source_code)
                    if len(fix.strip().split()) > 0 and len(this_errors) > len(errors[idx]):
                        to_delete.append(i)
                        error_messages[idx].append('ErrorsIncreased')
                    else:
                        errors[idx] = this_errors
                except IndexError:
                    to_delete.append(i)
                    error_messages[idx].append('IndexError')
                except VectorizationFailedException:
                    to_delete.append(i)
                    error_messages[idx].append('VectorizationFailed')
                except InvalidFixLocationException:
                    to_delete.append(i)
                    if fix.strip().split()[0] == '_eos_':
                        error_messages[idx].append('OK')
                    else:
                        error_messages[idx].append('InvalidFixLocation')
                except SubstitutionFailedException:
                    to_delete.append(i)
                    error_messages[idx].append('SubstitutionFailed')
                else:
                    # The fix was applied successfully; record a human-readable
                    # description of it.
                    assert len(fix.strip().split()) == 0 or fix.strip().split()[0] != '_eos_'
                    if fix_kind == 'insert':
                        fix_ = ' '.join(fix.split()[1:])
                        fix_line = extract_line_number(fix_) + 1
                        fixes_to_return[idx].append('%s at line %d: %s' % (
                            fix_kind, fix_line,
                            ''.join(fix_to_source(fix_, program, name_dict,
                                                  clang_format=True).split('\n'))))
                    else:
                        fix_line = extract_line_number(fix) + 1
                        fixes_to_return[idx].append('%s at line %d: %s' % (
                            fix_kind, fix_line,
                            ''.join(fix_to_source(fix, program, name_dict,
                                                  name_seq=name_sequence,
                                                  literal_seq=literal_sequence,
                                                  clang_format=True).split('\n'))))

            # Drop entries whose fix could not be applied.
            to_delete = sorted(to_delete)[::-1]
            for i in to_delete:
                del entries[i]
    except KeyError:
        pass
    except InvalidFixLocationException:
        pass
    except SubstitutionFailedException:
        pass

    # Regenerate the final source for every program using its own name
    # dictionary and literal sequence.
    repaired_programs = {}
    for idx in sequences_of_programs:
        repaired_programs[idx] = tokens_to_source(sequences_of_programs[idx][-1],
                                                  name_dicts[idx], clang_format=True,
                                                  literal_seq=literal_seqs[idx])
        repaired_programs[idx] = repaired_programs[idx].strip()

    return fixes_to_return, repaired_programs, error_messages
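# Hedged usage sketch (not part of the original module): assuming `TypoFixer` is a
# hypothetical wrapper class that exposes the `process` method above with a loaded
# network and `task` set to 'typo', a caller would batch-repair programs and inspect
# the per-program fix descriptions roughly like this:
#
#   fixer = TypoFixer(task='typo')  # hypothetical constructor
#   broken_sources = [open(path).read() for path in ['prog1.c', 'prog2.c']]
#   fixes, repaired, messages = fixer.process(broken_sources, max_attempts=6)
#   for idx in sorted(repaired):
#       print 'Program %d: %d fixes, last status: %s' % (
#           idx, len(fixes[idx]), messages[idx][-1] if messages[idx] else 'n/a')
#       print repaired[idx]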
def process_many(self, sequence_of_code):
    # type: (Iterable[str]) -> List[FixResult]
    sequence_of_fix_status = [FixProgress.from_code(code) for code in sequence_of_code]
    needed_to_fix = [fix_status for fix_status in sequence_of_fix_status
                     if isinstance(fix_status, FixProgress)]
    attempt_count = 0
    while needed_to_fix and attempt_count < 5:
        # Vectorize the programs that still need fixing; drop the ones that
        # cannot be vectorized.
        indices_unneeded_to_fix = []
        vectors = []
        for i, fix_progress in enumerate(needed_to_fix):
            vector = self.vectorize(fix_progress.tokenized_code)
            if vector is None:
                indices_unneeded_to_fix.append(i)
            else:
                vectors.append(vector)
        for i in reversed(indices_unneeded_to_fix):
            del needed_to_fix[i]

        indices_unneeded_to_fix = []
        fixes = [devectorize(vector, self.get_dictionary())
                 for vector in self._get_fixes_ported_from_initial(vectors)]

        # Apply each predicted fix; keep it only if it meets the acceptance
        # criterion and does not increase the error count.
        for i, fix_progress, fix in zip(range(len(needed_to_fix)), needed_to_fix, fixes):
            try:
                tokenized_fixed = apply_fix(fix_progress.tokenized_code, fix,
                                            self.get_fix_kind(), flag_replace_ids=False)
                tokenized_fixed_2 = apply_fix(fix_progress.tokenized_code_2, fix,
                                              self.get_fix_kind())
            except Exception:
                indices_unneeded_to_fix.append(i)
                continue
            if self.get_task() != 'typo':
                raise NotImplementedError
            if not meets_criterion(fix_progress.tokenized_code, fix, 'replace'):
                indices_unneeded_to_fix.append(i)
                continue
            error_count_new = FixProgress.get_error_count(
                tokens_to_source(tokenized_fixed_2, fix_progress.name_dict, False))
            if error_count_new > fix_progress.error_count:
                indices_unneeded_to_fix.append(i)
                continue
            fix_progress.tokenized_code = tokenized_fixed
            fix_progress.tokenized_code_2 = tokenized_fixed_2
            fix_progress.error_count = error_count_new
            fix_progress.iteration_count += 1
        for i in reversed(indices_unneeded_to_fix):
            del needed_to_fix[i]
        attempt_count += 1

    # Programs that compiled from the start are returned unchanged; the rest
    # carry whatever progress the fixing loop made.
    results = []
    for fix_status in sequence_of_fix_status:
        if isinstance(fix_status, str):
            results.append(FixResult.from_correct_code(fix_status))
        else:
            results.append(FixResult.from_final_progress(fix_status))
    return results
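# Hedged usage sketch (assumption, not in the original code): given an object
# `fixer` implementing process_many above, the returned FixResult objects expose
# raw_code, raw_error_count, final_code, final_error_count and iteration_count
# (see from_final_progress), so a caller could summarize repairs like this:
#
#   results = fixer.process_many([broken_source_1, broken_source_2])
#   for result in results:
#       print 'errors: %d -> %d after %d iterations' % (
#           result.raw_error_count, result.final_error_count, result.iteration_count)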
            else:
                fixed = fixed.replace(str(l) + " ~ ", "", 1)
        # Collapse double spaces left over from stripping line-number prefixes.
        fixed = fixed.replace("  ", " ")
        fixed = fixed.split()

        # Derive the edit sequence that turns `source` into `fixed` and encode
        # it as a per-token target vector.
        log = getTrace(source, fixed, getEditDistance(source, fixed))
        target = ["0" for i in range(len(source))]
        for l in log:
            if l[0] == "i":
                target.insert(l[1], target_vocab["insert"][l[2]])
            elif l[0] == "r":
                target[l[1]] = target[l[1]].replace(target[l[1]], target_vocab["replace"][l[2]])
            elif l[0] == "d":
                target[l[1]] = target[l[1]].replace(target[l[1]], "-1")

        # Applying the encoded edits to `source` must reproduce `fixed`.
        assert (tokens_to_source(' '.join(fixed), inverse_vocab, False) ==
                tokens_to_source(' '.join(apply_edits(source, target, inverse_vocab)),
                                 inverse_vocab, False))
        train.write("%s\t%s\n" % (" ".join(source), " ".join(target)))

train.close()

for k in tqdm(data['validation']):
    for i in data['validation'][k]:
        # Source sequence: strip the "N ~" line-number prefixes.
        source = i[0]
        lines = source.count('~')
        for l in range(lines):
            if l >= 10:
                source = source.replace(list(str(l))[0] + " " + list(str(l))[1] + " ~ ", "", 1)
            else:
                source = source.replace(str(l) + " ~ ", "", 1)
def undeclare_variable(rng, old_program, program_string, deleted_ids,
                       name_dict=None, print_debug_messages=False):
    if name_dict is not None:
        rev_name_dict = get_rev_dict(name_dict)

    # Lines
    orig_lines = get_lines(program_string)
    old_lines = get_lines(old_program)

    # Lines to ignore: anything inside a struct/union/enum definition.
    struct_lines = []
    structs_deep = 0
    for i, line in enumerate(orig_lines):
        if len(re.findall('_<keyword>_struct _<id>_\d@ _<op>_\{', line)) > 0 or \
           len(re.findall('_<keyword>_union _<id>_\d@ _<op>_\{', line)) > 0 or \
           len(re.findall('_<keyword>_enum _<id>_\d@ _<op>_\{', line)) > 0:
            structs_deep += len(re.findall('_<op>_\{', line))
        elif structs_deep > 0:
            structs_deep += len(re.findall('_<op>_\{', line))
            structs_deep -= len(re.findall('_<op>_\}', line))
            assert structs_deep >= 0, str(structs_deep) + " " + line
            struct_lines.append(i)

    # Lines to ignore: global scope (outside any braces).
    global_lines = []
    brackets_deep = 0
    for i, line in enumerate(orig_lines):
        if len(re.findall('_<op>_\{', line)) > 0 or len(re.findall('_<op>_\}', line)) > 0:
            brackets_deep += len(re.findall('_<op>_\{', line))
            brackets_deep -= len(re.findall('_<op>_\}', line))
            assert brackets_deep >= 0, str(brackets_deep) + " " + line
        elif brackets_deep == 0:
            global_lines.append(i)

    if print_debug_messages:
        print 'Ignoring struct/union/enum lines:', struct_lines
        print 'Ignoring global-scope lines:', global_lines
        for line in sorted(set(struct_lines + global_lines)):
            print "-", orig_lines[line]

    # Variables
    variables = []
    for token in program_string.split():
        if '_<id>_' in token:
            if token not in variables:
                variables.append(token)

    assert len(orig_lines) == len(old_lines)

    # Look for a declaration to remove.
    done = False
    rng.shuffle(variables)
    for to_undeclare in variables:
        if print_debug_messages:
            print 'Looking for:', rev_name_dict[to_undeclare], '...'

        # Find a location (scope) to undeclare it from.
        shuffled_lines = list(
            set(range(len(orig_lines))) - set(struct_lines + global_lines))
        rng.shuffle(shuffled_lines)

        # Declaration patterns over the tokenized line format:
        # a lone declaration with an initializer, a lone declaration, a
        # declaration group led by the variable, and a declaration group
        # containing the variable.
        regex_alone_use = '(_<keyword>_(?:struct|enum|union) _<id>_\d+@|_<type>_\w+)((?: _<op>_\*)* %s(?: _<op>_\[(?: [^\]]+)? _<op>_\])*)(?: _<op>_= [^,;]+)(?: _<op>_;)' % to_undeclare
        regex_alone = '((?:_<keyword>_(?:struct|enum|union) _<id>_\d+@|_<type>_\w+)(?: _<op>_\*)* %s(?: _<op>_\[(?: [^\]]+)? _<op>_\])* _<op>_;)' % to_undeclare
        regex_group_leader = '((?:_<keyword>_(?:struct|enum|union) _<id>_\d+@|_<type>_\w+)(?: _<op>_\*)*)( %s(?: _<op>_\[(?: [^\]]+)? _<op>_\])*)(?: _<op>_= [^,;]+)?( _<op>_,)(?:(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)? _<op>_,)*(?:(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)? _<op>_;)' % to_undeclare
        regex_group = '(_<keyword>_(?:struct|enum|union) _<id>_\d+@|_<type>_\w+)(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)?(?: _<op>_,(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)?)*( _<op>_,(?: _<op>_\*)* %s(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)?)(?: _<op>_,(?: _<op>_\*)* _<id>_\d+@(?: _<op>_\[(?: [^\]]+)? _<op>_\])*(?: _<op>_= [^,;]+)?)*(?: _<op>_;)' % to_undeclare

        fix_line = None
        declaration = None
        declaration_pos = None

        # Start our search.
        for i in shuffled_lines:
            if len(re.findall(regex_alone_use, orig_lines[i])) == 1:
                if print_debug_messages:
                    print("On line %d:" % i), tokens_to_source(
                        orig_lines[i], name_dict, clang_format=True)
                    print "Found Alone use", re.findall(regex_alone_use, orig_lines[i])
                m = re.search(regex_alone_use, orig_lines[i])
                declaration = orig_lines[i][m.start(1):m.end(2)] + ' _<op>_;'
                declaration_pos = i
                # Mutate
                orig_lines[i] = orig_lines[i][:m.start(1)] + orig_lines[i][m.end(1) + 1:]
                done = True
                break
            if len(re.findall(regex_alone, orig_lines[i])) == 1:
                if print_debug_messages:
                    print("On line %d:" % i), tokens_to_source(
                        orig_lines[i], name_dict, clang_format=True)
                    print "Found Alone", re.findall(regex_alone, orig_lines[i])
                m = re.search(regex_alone, orig_lines[i])
                declaration = orig_lines[i][m.start(1):m.end(1)]
                declaration_pos = i
                # Mutate
                orig_lines[i] = orig_lines[i][:m.start(1)] + orig_lines[i][m.end(1) + 1:]
                done = True
                break
            elif len(re.findall(regex_group, orig_lines[i])) == 1:
                if print_debug_messages:
                    print("On line %d:" % i), tokens_to_source(
                        orig_lines[i], name_dict, clang_format=True)
                    print "Found Group", re.findall(regex_group, orig_lines[i])
                m = re.search(regex_group, orig_lines[i])
                declaration = orig_lines[i][m.start(1):m.end(1)] + \
                    orig_lines[i][m.start(2):m.end(2)][8:] + ' _<op>_;'
                declaration_pos = i
                try:
                    end_of_declr = declaration.index('_<op>_=')
                    declaration = declaration[:end_of_declr]
                except ValueError:
                    pass
                # Mutate
                orig_lines[i] = orig_lines[i][:m.start(2) + 1] + orig_lines[i][m.end(2) + 1:]
                done = True
                break
            elif len(re.findall(regex_group_leader, orig_lines[i])) == 1:
                if print_debug_messages:
                    print("On line %d:" % i), tokens_to_source(
                        orig_lines[i], name_dict, clang_format=True)
                    print "Found Group Leader", re.findall(regex_group_leader, orig_lines[i])
                m = re.search(regex_group_leader, orig_lines[i])
                declaration = orig_lines[i][m.start(1):m.end(2)] + ' _<op>_;'
                declaration_pos = i
                # Mutate
                orig_lines[i] = orig_lines[i][:m.start(2) + 1] + orig_lines[i][m.end(3) + 1:]
                done = True
                break
        if done:
            break

    if not done:
        # Failed to find something to undeclare.
        raise NothingToMutateException

    # Find the enclosing function signature, then the opening brace after it;
    # the fix reinserts the declaration right there.
    fn_regex = '(?:_<keyword>_(?:struct|union|enum) _<id>_\d+@|_<type>_\w+|_<keyword>_void)(?: _<op>_\*)* (?:_<id>_\d+@|_<APIcall>_main) _<op>_\('
    fn_start_regex = '_<op>_\{'
    inserted = False
    assert declaration_pos is not None
    for i in range(declaration_pos, 0, -1):
        if len(re.findall(fn_regex, old_lines[i])) == 1:
            for j in range(i, len(old_lines)):
                if len(re.findall(fn_start_regex, old_lines[j])) >= 1:
                    fix_line = j
                    break
            inserted = True
        if inserted:
            break

    if not inserted:
        # Failed to insert fix.
        raise FailedToMutateException
    if fix_line is None:
        # Couldn't find { after function definition.
        raise FailedToMutateException

    fix = '_<insertion>_ '
    assert fix_line is not None
    for digit in str(fix_line):
        fix += str(digit) + ' '
    fix += '~ ' + declaration

    # Drop the declaration line entirely if removing the declaration left it empty.
    if orig_lines[declaration_pos].strip() == '':
        to_delete = declaration_pos
        del orig_lines[to_delete]

    # Recompose the mutated program in the "N ~ <tokens>" line-numbered format.
    recomposed_program = ''
    for i, line in enumerate(orig_lines):
        for digit in str(i):
            recomposed_program += digit + ' '
        recomposed_program += '~ '
        recomposed_program += line + ' '

    return recomposed_program, fix, fix_line
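# Hedged usage sketch (assumption): undeclare_variable is typically driven by a
# mutation loop that corrupts a tokenized program and records the corresponding
# repair as a training pair. The helper below is illustrative only; `tokenized_program`
# is assumed to already be in the "0 ~ <tokens> 1 ~ ..." line-numbered format used above,
# and passing the same string for `old_program` and `program_string` reflects a first,
# as-yet-unmutated pass.
#
#   def try_undeclare(rng, tokenized_program, name_dict):
#       try:
#           mutated, fix, fix_line = undeclare_variable(
#               rng, tokenized_program, tokenized_program, [], name_dict=name_dict)
#       except (NothingToMutateException, FailedToMutateException):
#           return None
#       return mutated, fix, fix_line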
def do_problem(problem_id):
    global reconstruction, errors, errors_full, total_count, errors_test

    c = conn.cursor()

    reconstruction[problem_id] = {}
    errors[problem_id] = {}
    errors_full[problem_id] = {}
    errors_test[problem_id] = []

    # Load every candidate program for this problem.
    candidate_programs = []
    for row in c.execute('SELECT user_id, prog_id, code, name_dict, name_seq FROM programs WHERE prob_id = ?',
                         (problem_id,)):
        user_id, prog_id, initial = row[0], row[1], row[2]
        name_dict = json.loads(row[3])
        name_seq = json.loads(row[4])
        candidate_programs.append((user_id, prog_id, initial, name_dict, name_seq,))

    for _, prog_id, initial, name_dict, name_seq in candidate_programs:
        fixes_suggested_by_typo_network = []
        fixes_suggested_by_undeclared_network = []

        for row in c.execute('SELECT fix FROM iterations WHERE prog_id=? AND network = \'typo\' ORDER BY iteration',
                             (prog_id,)):
            fixes_suggested_by_typo_network.append(row[0])
        for row in c.execute('SELECT fix FROM iterations WHERE prog_id=? AND network = \'ids\' ORDER BY iteration',
                             (prog_id,)):
            fixes_suggested_by_undeclared_network.append(row[0])

        reconstruction[problem_id][prog_id] = [initial]
        temp_errors, temp_errors_full = compilation_errors(
            tokens_to_source(initial, name_dict, False))
        errors[problem_id][prog_id] = [temp_errors]
        errors_full[problem_id][prog_id] = [temp_errors_full]

        # Replay the typo network's fixes, stopping if errors increase.
        try:
            for fix in fixes_suggested_by_typo_network:
                if meets_criterion(reconstruction[problem_id][prog_id][-1], fix, 'replace'):
                    temp_prog = apply_fix(
                        reconstruction[problem_id][prog_id][-1], fix, 'replace')
                    temp_errors, temp_errors_full = compilation_errors(
                        tokens_to_source(temp_prog, name_dict, False))
                    if len(temp_errors) > len(errors[problem_id][prog_id][-1]):
                        break
                    else:
                        reconstruction[problem_id][prog_id].append(temp_prog)
                        errors[problem_id][prog_id].append(temp_errors)
                        errors_full[problem_id][prog_id].append(temp_errors_full)
                else:
                    break
        except InvalidFixLocationException:
            print 'Localization failed'

        # Pad to a fixed number of typo iterations.
        while len(reconstruction[problem_id][prog_id]) <= 5:
            reconstruction[problem_id][prog_id].append(reconstruction[problem_id][prog_id][-1])
            errors[problem_id][prog_id].append(errors[problem_id][prog_id][-1])
            errors_full[problem_id][prog_id].append(errors_full[problem_id][prog_id][-1])

        # Replay the undeclared-identifier network's fixes, skipping duplicates
        # and stopping if errors increase.
        already_fixed = []
        try:
            for fix in fixes_suggested_by_undeclared_network:
                if fix not in already_fixed:
                    temp_prog = apply_fix(
                        reconstruction[problem_id][prog_id][-1], fix, 'insert')
                    already_fixed.append(fix)
                    temp_errors, temp_errors_full = compilation_errors(
                        tokens_to_source(temp_prog, name_dict, False))
                    if len(temp_errors) > len(errors[problem_id][prog_id][-1]):
                        break
                    else:
                        reconstruction[problem_id][prog_id].append(temp_prog)
                        errors[problem_id][prog_id].append(temp_errors)
                        errors_full[problem_id][prog_id].append(temp_errors_full)
                else:
                    pass
        except InvalidFixLocationException:
            print 'Localization failed'

        # Pad to a fixed number of total iterations.
        while len(reconstruction[problem_id][prog_id]) <= 10:
            reconstruction[problem_id][prog_id].append(reconstruction[problem_id][prog_id][-1])
            errors[problem_id][prog_id].append(errors[problem_id][prog_id][-1])
            errors_full[problem_id][prog_id].append(errors_full[problem_id][prog_id][-1])

        errors_test[problem_id].append(errors[problem_id][prog_id])

        # Persist the per-iteration error messages unless we are only timing.
        if not args.is_timing_experiment:
            for k, errors_t, errors_full_t in zip(range(len(errors[problem_id][prog_id])),
                                                  errors[problem_id][prog_id],
                                                  errors_full[problem_id][prog_id]):
                c.execute("INSERT INTO error_message_strings VALUES(?, ?, ?, ?, ?)",
                          (prog_id, k, 'typo',
                           errors_full_t.decode('utf-8', 'ignore'), len(errors_t)))
                for error_ in errors_t:
                    c.execute("INSERT INTO error_messages VALUES(?, ?, ?, ?)",
                              (prog_id, k, 'typo', error_.decode('utf-8', 'ignore'),))

    count_t = len(candidate_programs)
    total_count += count_t

    if not args.is_timing_experiment:
        print 'Committing changes to database...'
        conn.commit()
        print 'Done!'
    else:
        print 'Done problem with', count_t, 'programs'

    c.close()