def generate_training_data(db_path, bins, validation_users, min_program_length, max_program_length,
                           max_fix_length, kind_mutations, max_mutations, max_variants, seed):
    """Collect correct programs and their mutated variants as labelled samples.

    Reads the rows of the ``Code`` table in the SQLite database at ``db_path``
    that have ``errorcount=0`` and a ``codelength`` strictly between the two
    length bounds, for every problem id found in ``bins``.  Each program whose
    token count lies in ``[min_program_length, max_program_length]``
    contributes one sample for the unmodified program (its target is a string
    of ``"0"`` tokens, one per source token) and one sample per accepted
    mutation (its target is whatever ``get_target`` returns).

    Args:
        db_path: Path handed to ``sqlite3.connect``.
        bins: Iterable of iterables of problem ids.
        validation_users: Mapping from problem id to a container of user ids;
            rows written by those users go to the ``'validation'`` split.
        min_program_length: Inclusive lower bound on token counts.
        max_program_length: Inclusive upper bound on token counts.
        max_fix_length: Largest accepted token count of a mutation's fix.
        kind_mutations: ``'typo'`` selects the typo mutator; any other value
            selects the undeclared-identifier mutator.
        max_mutations: Forwarded to the mutator.
        max_variants: Forwarded to the mutator.
        seed: Seed of the ``numpy.random.RandomState`` given to the mutators.

    Returns:
        ``{'train': {...}, 'validation': {...}}``; each inner dict maps a
        problem id to a list of tuples
        ``(source, name_dict, name_sequence, user_id, sample_id, target)``.

    Raises:
        Exception: when ``kind_mutations == 'typo'``, any mutator error other
            than ``FailedToMutateException`` or
            ``LoopCountThresholdExceededException`` is re-raised after being
            counted.
    """
    from contextlib import closing

    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import (
            LoopCountThresholdExceededException, FailedToMutateException,
            Typo_Mutate, typo_mutate)
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj)

        def rename_ids(x, y):
            # Typo mutations need no identifier renaming.
            return (x, y)
    else:
        from data_processing.undeclared_mutator import (
            LoopCountThresholdExceededException, FailedToMutateException,
            id_mutate)
        mutate = partial(id_mutate, rng)
        rename_ids = partial(rename_ids_, rng)

    def to_source(program):
        # Strip line numbers, then drop the last space-separated token.
        return ' '.join(remove_line_numbers(program).split(' ')[:-1])

    result = {'train': {}, 'validation': {}}
    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    query = ("SELECT user_id, code_id, tokenized_code, name_dict, name_seq FROM Code "
             "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;")

    # ``with connection`` alone only manages the transaction; ``closing``
    # makes sure the connection itself is released.
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()
        for problem_id in tqdm(problem_list):
            for row in cursor.execute(
                    query, (problem_id, min_program_length, max_program_length)):
                user_id, code_id, tokenized_code = map(str, row[:-2])
                name_dict, name_sequence = json.loads(row[3]), json.loads(row[4])
                key = 'validation' if user_id in validation_users[problem_id] else 'train'

                program_length = len(tokenized_code.split())
                program_lengths.append(program_length)
                if not min_program_length <= program_length <= max_program_length:
                    continue

                id_renamed_correct_program, _ = rename_ids(tokenized_code, '')

                # Correct pair: every token of the untouched program is labelled "0".
                source = to_source(id_renamed_correct_program)
                target = " ".join(["0"] * len(source.split()))
                samples = result[key].setdefault(problem_id, [])
                samples.append(
                    (source, name_dict, name_sequence, user_id, code_id, target))

                # Mutate
                total_mutate_calls += 1
                try:
                    iterator = mutate(tokenized_code, max_mutations, max_variants)
                except (FailedToMutateException, LoopCountThresholdExceededException):
                    exceptions_in_mutate_call += 1
                    continue
                except Exception:
                    exceptions_in_mutate_call += 1
                    if kind_mutations == 'typo':
                        raise
                    continue

                for i, (corrupt_program, fix) in enumerate(iterator, 1):
                    corrupt_program_length = len(corrupt_program.split())
                    fix_length = len(fix.split())
                    fix_lengths.append(fix_length)
                    if not (min_program_length <= corrupt_program_length <= max_program_length
                            and fix_length <= max_fix_length):
                        continue

                    try:
                        if kind_mutations == 'typo':
                            corrupt_program, _ = rename_ids(corrupt_program, fix)
                        else:
                            full_program, corrupt_program = rename_ids(
                                tokenized_code, corrupt_program)
                    except FixIDNotFoundInSource:
                        # Without a successful rename there is no matching
                        # ``full_program`` for this variant; skip it instead of
                        # pairing it with an unbound or stale reference program.
                        exceptions_in_mutate_call += 1
                        continue

                    corrupt_source = to_source(corrupt_program)
                    if kind_mutations == 'typo':
                        reference_tokens = source.split()
                    else:
                        reference_tokens = to_source(full_program).split()
                    target = get_target(corrupt_source.split(), reference_tokens)

                    samples.append(
                        (corrupt_source, name_dict, name_sequence, user_id,
                         code_id + "_" + str(i), target))

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length: Mean =', np.mean(program_lengths), '\t95th %ile =',
              program_lengths[int(0.95 * len(program_lengths))])
    else:
        # No row matched: indexing the empty array would raise IndexError.
        print('Program length: no programs processed')
    try:
        print('Mean fix length: Mean =', np.mean(fix_lengths), '\t95th %ile = ',
              fix_lengths[int(0.95 * len(fix_lengths))])
    except Exception as e:
        print(e)
        print('fix_lengths')
        print(fix_lengths)
    print('Total mutate calls:', total_mutate_calls)
    print('Exceptions in mutate() call:', exceptions_in_mutate_call, '\n')

    return result
def generate_seeded_test_data(db_path, bins, min_program_length, max_program_length,
                              max_fix_length, kind_mutations, max_mutations,
                              programs_per_problem, seed):
    """Create seeded (mutated) test programs, grouped by problem id.

    For every problem id in ``bins`` the matching rows of the ``Code`` table
    are mutated once each until ``programs_per_problem`` corrupt programs have
    been gathered for that problem.  Only problems that end up with exactly
    ``programs_per_problem`` entries are returned.

    Returns:
        A pair ``(seeded_test_data, mutation_distribution)``.  The first item
        maps a problem id to a list of
        ``(corrupt_program, name_dict, name_sequence, user_id, code_id)``
        tuples; the second is ``mutator_obj.get_mutation_distribution()`` for
        ``kind_mutations == 'typo'`` and ``{}`` otherwise.
    """
    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import LoopCountThresholdExceededException, FailedToMutateException, Typo_Mutate, typo_mutate
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj, just_one=True)
    else:
        from data_processing.undeclared_mutator import LoopCountThresholdExceededException, FailedToMutateException, id_mutate
        mutate = partial(id_mutate, rng)

    result = {}
    total_mutate_calls = 0
    exceptions_in_mutate_call = 0
    program_lengths = []
    fix_lengths = []

    # Flatten the bins into one ordered list of problem ids.
    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query = ("SELECT user_id, code_id, tokenized_code, name_dict, name_seq FROM Code "
                 "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;")

        for problem_id in problem_list:
            query_args = (problem_id, min_program_length, max_program_length)
            for row in cursor.execute(query, query_args):
                user_id, code_id, tokenized_code = map(str, row[:-2])
                name_dict = json.loads(row[3])
                name_sequence = json.loads(row[4])

                program_length = len(tokenized_code.split())
                program_lengths.append(program_length)

                if min_program_length <= program_length <= max_program_length:
                    # Mutate
                    total_mutate_calls += 1
                    try:
                        iterator = mutate(
                            tokenized_code, max_mutations, num_mutated_progs=1)
                    except (FailedToMutateException, LoopCountThresholdExceededException):
                        exceptions_in_mutate_call += 1
                    except Exception:
                        # Unexpected failures are fatal only for the typo mutator.
                        exceptions_in_mutate_call += 1
                        if kind_mutations == 'typo':
                            raise
                    else:
                        for corrupt_program, fix in iterator:
                            corrupt_program_length = len(corrupt_program.split())
                            fix_length = len(fix.split())
                            fix_lengths.append(fix_length)

                            length_ok = (min_program_length <= corrupt_program_length
                                         <= max_program_length)
                            if length_ok and fix_length <= max_fix_length:
                                result.setdefault(problem_id, []).append(
                                    (corrupt_program, name_dict, name_sequence,
                                     user_id, code_id))

                # Stop reading rows once this problem has enough programs.
                if problem_id in result and len(result[problem_id]) >= programs_per_problem:
                    break

    seeded_test_data = {
        problem_id: programs
        for problem_id, programs in result.items()
        if len(programs) == programs_per_problem
    }

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print('Statistics')
    print('----------')
    print('Program length: Mean =', np.mean(program_lengths), '\t95th %ile =',
          program_lengths[int(0.95 * len(program_lengths))])
    try:
        print('Mean fix length: Mean =', np.mean(fix_lengths), '\t95th %ile = ',
              fix_lengths[int(0.95 * len(fix_lengths))])
    except Exception as e:
        print(e)
        print('fix_lengths')
        print(fix_lengths)
    print('Total mutate calls:', total_mutate_calls)
    print('Exceptions in mutate() call:', exceptions_in_mutate_call, '\n')

    if kind_mutations == 'typo':
        mutation_distribution = mutator_obj.get_mutation_distribution()
    else:
        mutation_distribution = {}
    return seeded_test_data, mutation_distribution
def generate_training_data(db_path, bins, validation_users, min_program_length, max_program_length,
                           max_fix_length, max_mutations, max_variants, seed):
    """Pair typo-mutated programs with their original, split by user.

    Reads the rows of the ``Code`` table in the SQLite database at ``db_path``
    that have ``errorcount=0`` and a ``codelength`` strictly between the two
    length bounds, for every problem id in ``bins``.  Each program whose token
    count lies in ``[min_program_length, max_program_length]`` is mutated with
    ``typo_mutate``; every accepted variant is stored together with the
    original program, both passed through ``convert_to_new_line_format`` and
    ``remove_empty_new_lines``.

    Args:
        db_path: Path handed to ``sqlite3.connect``.
        bins: Iterable of iterables of problem ids.
        validation_users: Mapping from problem id to a container of user ids;
            rows written by those users go to the ``'validation'`` split.
        min_program_length: Inclusive lower bound on token counts.
        max_program_length: Inclusive upper bound on token counts.
        max_fix_length: Largest accepted token count of a mutation's fix.
        max_mutations: Forwarded to the mutator.
        max_variants: Forwarded to the mutator.
        seed: Seed of the ``numpy.random.RandomState`` given to the mutator.

    Returns:
        ``(token_strings, mutation_distribution)``.  ``token_strings`` is
        ``{'train': {...}, 'validation': {...}}`` with each inner dict mapping
        a problem id to a list of ``(code_id, corrupt_program, program)``
        tuples; the second item is ``mutator_obj.get_mutation_distribution()``.

    Raises:
        Exception: any mutator error other than ``FailedToMutateException`` or
            ``LoopCountThresholdExceededException`` is counted and re-raised.
    """
    from contextlib import closing

    def emit(*parts):
        # A single pre-joined string prints the same text under the Python 2
        # print statement and the Python 3 print function.
        print(' '.join(str(part) for part in parts))

    rng = np.random.RandomState(seed)
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format

    mutator_obj = Typo_Mutate(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}}
    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    query = ("SELECT user_id, code_id, tokenized_code FROM Code "
             "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;")

    # ``with connection`` alone only manages the transaction; ``closing``
    # makes sure the connection itself is released.
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()
        for problem_id in problem_list:
            for row in cursor.execute(
                    query, (problem_id, min_program_length, max_program_length)):
                user_id, code_id, tokenized_program = map(str, row)
                key = 'validation' if user_id in validation_users[problem_id] else 'train'

                program_length = len(tokenized_program.split())
                program_lengths.append(program_length)
                if not min_program_length <= program_length <= max_program_length:
                    continue

                # Mutate
                total_mutate_calls += 1
                try:
                    iterator = mutate(tokenized_program, max_mutations, max_variants)
                except (FailedToMutateException, LoopCountThresholdExceededException):
                    exceptions_in_mutate_call += 1
                    continue
                except Exception:
                    exceptions_in_mutate_call += 1
                    raise

                tokenized_program = remove_empty_new_lines(
                    convert_to_new_line_format(tokenized_program))

                for corrupt_program, fix in iterator:
                    corrupt_program_length = len(corrupt_program.split())
                    fix_length = len(fix.split())
                    fix_lengths.append(fix_length)
                    if (min_program_length <= corrupt_program_length <= max_program_length
                            and fix_length <= max_fix_length):
                        corrupt_program = remove_empty_new_lines(
                            convert_to_new_line_format(corrupt_program))
                        token_strings[key].setdefault(problem_id, []).append(
                            (code_id, corrupt_program, tokenized_program))

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    emit('Statistics')
    emit('----------')
    if len(program_lengths):
        emit('Program length: Mean =', np.mean(program_lengths), '\t95th %ile =',
             program_lengths[int(0.95 * len(program_lengths))])
    else:
        # No row matched: indexing the empty array would raise IndexError.
        emit('Program length: no programs processed')
    try:
        emit('Mean fix length: Mean =', np.mean(fix_lengths), '\t95th %ile = ',
             fix_lengths[int(0.95 * len(fix_lengths))])
    except Exception as e:
        emit(e)
        emit('fix_lengths')
        emit(fix_lengths)
    emit('Total mutate calls:', total_mutate_calls)
    emit('Exceptions in mutate() call:', exceptions_in_mutate_call, '\n')

    # Per-split, per-problem sample counts.
    for key in token_strings:
        emit(key)
        for problem_id in token_strings[key]:
            emit(problem_id, len(token_strings[key][problem_id]))

    return token_strings, mutator_obj.get_mutation_distribution()
def generate_training_data(db_path, bins, validation_users, min_program_length, max_program_length,
                           max_fix_length, kind_mutations, max_mutations, max_variants, seed):
    """Build (program, fix) token-string pairs for training and validation.

    Reads the rows of the ``Code`` table in the SQLite database at ``db_path``
    that have ``errorcount=0`` and a ``codelength`` strictly between the two
    length bounds, for every problem id in ``bins``.  Every row contributes
    one correct pair ``(program, '-1')`` plus one ``(corrupt_program, fix)``
    pair per accepted mutation.  The program length recorded for the
    statistics is the ``codelength`` column of the row.

    Args:
        db_path: Path handed to ``sqlite3.connect``.
        bins: Iterable of iterables of problem ids.
        validation_users: Mapping from problem id to a container of user ids;
            rows written by those users go to the ``'validation'`` split.
        min_program_length: Lower length bound (exclusive in the query,
            inclusive for mutated programs).
        max_program_length: Upper length bound (exclusive in the query,
            inclusive for mutated programs).
        max_fix_length: Largest accepted token count of a mutation's fix.
        kind_mutations: ``'typo'`` selects the typo mutator; any other value
            selects the undeclared-identifier mutator.
        max_mutations: Forwarded to the mutator.
        max_variants: Forwarded to the mutator.
        seed: Seed of the ``numpy.random.RandomState`` given to the mutators.

    Returns:
        ``(token_strings, mutation_distribution)``.  ``token_strings`` is
        ``{'train': {...}, 'validation': {...}}`` with each inner dict mapping
        a problem id to a list of ``(program, fix)`` tuples; the second item
        is ``mutator_obj.get_mutation_distribution()`` for the typo mutator
        and ``{}`` otherwise.

    Raises:
        Exception: when ``kind_mutations == 'typo'``, any mutator error other
            than ``FailedToMutateException`` or
            ``LoopCountThresholdExceededException`` is re-raised after being
            counted.
    """
    from contextlib import closing

    def emit(*parts):
        # A single pre-joined string prints the same text under the Python 2
        # print statement and the Python 3 print function.
        print(' '.join(str(part) for part in parts))

    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import (
            LoopCountThresholdExceededException, FailedToMutateException,
            Typo_Mutate, typo_mutate)
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj)

        def rename_ids(x, y):
            # Typo mutations need no identifier renaming.
            return x, y
    else:
        from data_processing.undeclared_mutator import (
            LoopCountThresholdExceededException, FailedToMutateException,
            id_mutate)
        mutate = partial(id_mutate, rng)
        rename_ids = partial(rename_ids_, rng)

    token_strings = {'train': {}, 'validation': {}}
    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    # NOTE(review): the original author questioned whether the bounds should
    # be ">=?" and "<=?"; the strict comparison is kept unchanged.
    query = ('SELECT user_id, tokenized_code, codelength FROM Code '
             'WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;')

    # ``with connection`` alone only manages the transaction; ``closing``
    # makes sure the connection itself is released.
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()
        for problem_id in problem_list:
            for row in cursor.execute(
                    query, (problem_id, min_program_length, max_program_length)):
                user_id, tokenized_code = map(str, row[:2])
                # Keep the length numeric: a list of strings would be sorted
                # lexicographically and make np.mean fail.
                program_length = int(row[2])
                key = 'validation' if user_id in validation_users[problem_id] else 'train'

                program_lengths.append(program_length)

                id_renamed_correct_program, _ = rename_ids(tokenized_code, '')

                # Correct pairs
                token_strings[key].setdefault(problem_id, []).append(
                    (id_renamed_correct_program, '-1'))

                # Mutate
                total_mutate_calls += 1
                try:
                    iterator = mutate(tokenized_code, max_mutations, max_variants)
                except (FailedToMutateException, LoopCountThresholdExceededException):
                    exceptions_in_mutate_call += 1
                except Exception:
                    exceptions_in_mutate_call += 1
                    if kind_mutations == 'typo':
                        raise
                else:
                    for corrupt_program, fix in iterator:
                        corrupt_program_length = len(corrupt_program.split())
                        fix_length = len(fix.split())
                        fix_lengths.append(fix_length)

                        if (min_program_length <= corrupt_program_length <= max_program_length
                                and fix_length <= max_fix_length):
                            try:
                                corrupt_program, fix = rename_ids(corrupt_program, fix)
                            except FixIDNotFoundInSource:
                                # The pair is still stored below, un-renamed.
                                exceptions_in_mutate_call += 1

                            token_strings[key].setdefault(problem_id, []).append(
                                (corrupt_program, fix))

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    emit('Statistics')
    emit('----------')
    if len(program_lengths):
        emit('Program length: Mean =', np.mean(program_lengths), '\t95th %ile =',
             program_lengths[int(0.95 * len(program_lengths))])
    else:
        # No row matched: indexing the empty array would raise IndexError.
        emit('Program length: no programs processed')
    try:
        emit('Mean fix length: Mean =', np.mean(fix_lengths), '\t95th %ile = ',
             fix_lengths[int(0.95 * len(fix_lengths))])
    except Exception as e:
        emit(e)
        emit('fix_lengths')
        emit(fix_lengths)
    emit('Total mutate calls:', total_mutate_calls)
    emit('Exceptions in mutate() call:', exceptions_in_mutate_call, '\n')

    return token_strings, (mutator_obj.get_mutation_distribution()
                           if kind_mutations == 'typo' else {})
def generate_training_data(db_path, bins, validation_users, min_program_length, max_program_length,
                           max_fix_length, kind_mutations, max_mutations, max_variants, seed):
    """Build (program, fix) token-string pairs for training and validation.

    Reads the rows of the ``Code`` table in the SQLite database at ``db_path``
    that have ``errorcount=0`` and a ``codelength`` strictly between the two
    length bounds, for every problem id in ``bins``.  Each program whose token
    count lies in ``[min_program_length, max_program_length]`` contributes one
    correct pair ``(program, '-1')`` plus one ``(corrupt_program, fix)`` pair
    per accepted mutation.

    Args:
        db_path: Path handed to ``sqlite3.connect``.
        bins: Iterable of iterables of problem ids.
        validation_users: Mapping from problem id to a container of user ids;
            rows written by those users go to the ``'validation'`` split.
        min_program_length: Inclusive lower bound on token counts.
        max_program_length: Inclusive upper bound on token counts.
        max_fix_length: Largest accepted token count of a mutation's fix.
        kind_mutations: ``'typo'`` selects the typo mutator; any other value
            selects the undeclared-identifier mutator.
        max_mutations: Forwarded to the mutator.
        max_variants: Forwarded to the mutator.
        seed: Seed of the ``numpy.random.RandomState`` given to the mutators.

    Returns:
        ``(token_strings, mutation_distribution)``.  ``token_strings`` is
        ``{'train': {...}, 'validation': {...}}`` with each inner dict mapping
        a problem id to a list of ``(program, fix)`` tuples; the second item
        is ``mutator_obj.get_mutation_distribution()`` for the typo mutator
        and ``{}`` otherwise.

    Raises:
        Exception: when ``kind_mutations == 'typo'``, any mutator error other
            than ``FailedToMutateException`` or
            ``LoopCountThresholdExceededException`` is re-raised after being
            counted.
    """
    from contextlib import closing

    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import (
            LoopCountThresholdExceededException, FailedToMutateException,
            Typo_Mutate, typo_mutate)
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj)

        def rename_ids(x, y):
            # Typo mutations need no identifier renaming.
            return (x, y)
    else:
        from data_processing.undeclared_mutator import (
            LoopCountThresholdExceededException, FailedToMutateException,
            id_mutate)
        mutate = partial(id_mutate, rng)
        rename_ids = partial(rename_ids_, rng)

    token_strings = {'train': {}, 'validation': {}}
    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    query = ("SELECT user_id, tokenized_code FROM Code "
             "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;")

    # ``with connection`` alone only manages the transaction; ``closing``
    # makes sure the connection itself is released.
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()
        for problem_id in problem_list:
            for row in cursor.execute(
                    query, (problem_id, min_program_length, max_program_length)):
                user_id, tokenized_code = map(str, row)
                key = 'validation' if user_id in validation_users[problem_id] else 'train'

                program_length = len(tokenized_code.split())
                program_lengths.append(program_length)
                if not min_program_length <= program_length <= max_program_length:
                    continue

                id_renamed_correct_program, _ = rename_ids(tokenized_code, '')

                # Correct pairs: '-1' is the dummy fix of an unmodified program.
                pairs = token_strings[key].setdefault(problem_id, [])
                pairs.append((id_renamed_correct_program, '-1'))

                # Mutate
                total_mutate_calls += 1
                try:
                    iterator = mutate(tokenized_code, max_mutations, max_variants)
                except (FailedToMutateException, LoopCountThresholdExceededException):
                    exceptions_in_mutate_call += 1
                    continue
                except Exception:
                    exceptions_in_mutate_call += 1
                    if kind_mutations == 'typo':
                        raise
                    continue

                for corrupt_program, fix in iterator:
                    corrupt_program_length = len(corrupt_program.split())
                    fix_length = len(fix.split())
                    fix_lengths.append(fix_length)

                    if (min_program_length <= corrupt_program_length <= max_program_length
                            and fix_length <= max_fix_length):
                        try:
                            corrupt_program, fix = rename_ids(corrupt_program, fix)
                        except FixIDNotFoundInSource:
                            # The pair is still stored below, un-renamed.
                            exceptions_in_mutate_call += 1

                        pairs.append((corrupt_program, fix))

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length: Mean =', np.mean(program_lengths), '\t95th %ile =',
              program_lengths[int(0.95 * len(program_lengths))])
    else:
        # No row matched: indexing the empty array would raise IndexError.
        print('Program length: no programs processed')
    try:
        print('Mean fix length: Mean =', np.mean(fix_lengths), '\t95th %ile = ',
              fix_lengths[int(0.95 * len(fix_lengths))])
    except Exception as e:
        print(e)
        print('fix_lengths')
        print(fix_lengths)
    print('Total mutate calls:', total_mutate_calls)
    print('Exceptions in mutate() call:', exceptions_in_mutate_call, '\n')

    return token_strings, (mutator_obj.get_mutation_distribution()
                           if kind_mutations == 'typo' else {})
def generate_training_data(bins, min_program_length, max_program_length, max_fix_length,
                           kind_mutations, max_mutations, max_variants, seed):
    """Build (program, fix) token-string pairs from ``get_cs_tokenized()``.

    Every ``(problem_id, tokenized_code)`` item returned by
    ``get_cs_tokenized()`` contributes one correct pair ``(program, '-1')``
    plus one ``(corrupt_program, fix)`` pair per accepted mutation.  All
    samples are stored under the ``'train'`` split; ``'validation'`` stays
    empty.

    Args:
        bins: Not used by this variant.
        min_program_length: Inclusive lower bound on a mutated program's
            token count.
        max_program_length: Inclusive upper bound on a mutated program's
            token count.
        max_fix_length: Largest accepted token count of a mutation's fix.
        kind_mutations: ``'typo'`` selects the typo mutator; any other value
            selects the undeclared-identifier mutator.
        max_mutations: Forwarded to the mutator.
        max_variants: Forwarded to the mutator.
        seed: Seed of the ``numpy.random.RandomState`` given to the mutators.

    Returns:
        ``(token_strings, mutation_distribution, rng)``.  ``token_strings`` is
        ``{'train': {...}, 'validation': {}}`` mapping a problem id to a list
        of ``(program, fix)`` tuples; ``mutation_distribution`` is
        ``mutator_obj.get_mutation_distribution()`` for the typo mutator and
        ``{}`` otherwise; ``rng`` is the random state created from ``seed``.

    Raises:
        Exception: when ``kind_mutations == 'typo'``, any mutator error other
            than ``FailedToMutateException`` or
            ``LoopCountThresholdExceededException`` is re-raised after being
            counted.
    """
    def emit(*parts):
        # A single pre-joined string prints the same text under the Python 2
        # print statement and the Python 3 print function.
        print(' '.join(str(part) for part in parts))

    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import (
            LoopCountThresholdExceededException, FailedToMutateException,
            Typo_Mutate, typo_mutate)
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj)

        def rename_ids(x, y):
            # Typo mutations need no identifier renaming.
            return x, y
    else:
        from data_processing.undeclared_mutator_cs import (
            LoopCountThresholdExceededException, FailedToMutateException,
            id_mutate)
        mutate = partial(id_mutate, rng)
        rename_ids = partial(rename_ids_, rng)

    token_strings = {'train': {}, 'validation': {}}
    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    for problem_id, tokenized_code in get_cs_tokenized().items():
        program_length = len(tokenized_code.split())
        key = 'train'

        program_lengths.append(program_length)

        id_renamed_correct_program, _ = rename_ids(tokenized_code, '')

        # Correct pairs
        token_strings[key].setdefault(problem_id, []).append(
            (id_renamed_correct_program, '-1'))

        # Mutate
        total_mutate_calls += 1
        try:
            iterator = mutate(tokenized_code, max_mutations, max_variants)
        except (FailedToMutateException, LoopCountThresholdExceededException):
            exceptions_in_mutate_call += 1
        except Exception:
            exceptions_in_mutate_call += 1
            if kind_mutations == 'typo':
                raise
        else:
            for corrupt_program, fix in iterator:
                corrupt_program_length = len(corrupt_program.split())
                fix_length = len(fix.split())
                fix_lengths.append(fix_length)

                if (min_program_length <= corrupt_program_length <= max_program_length
                        and fix_length <= max_fix_length):
                    try:
                        corrupt_program, fix = rename_ids(corrupt_program, fix)
                    except FixIDNotFoundInSource:
                        # The pair is still stored below, un-renamed.
                        exceptions_in_mutate_call += 1

                    token_strings[key].setdefault(problem_id, []).append(
                        (corrupt_program, fix))

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    emit('Statistics')
    emit('----------')
    if len(program_lengths):
        emit('Program length: Mean =', np.mean(program_lengths), '\t95th %ile =',
             program_lengths[int(0.95 * len(program_lengths))])
    else:
        # Nothing was tokenized: indexing the empty array would raise IndexError.
        emit('Program length: no programs processed')
    try:
        emit('Mean fix length: Mean =', np.mean(fix_lengths), '\t95th %ile = ',
             fix_lengths[int(0.95 * len(fix_lengths))])
    except Exception as e:
        emit(e)
        emit('fix_lengths')
        emit(fix_lengths)
    emit('Total mutate calls:', total_mutate_calls)
    emit('Exceptions in mutate() call:', exceptions_in_mutate_call, '\n')

    return token_strings, (mutator_obj.get_mutation_distribution()
                           if kind_mutations == 'typo' else {}), rng