import sqlite3
from functools import partial

import numpy as np

from data_processing.training_data_generator import load_dictionaries
# NOTE: the import below is an assumption about where the typo-mutation
# utilities live; adjust the module path to match your checkout.
from data_processing.typo_mutator import (Typo_Mutate, Typo_Mutate_Java, typo_mutate,
                                          FailedToMutateException,
                                          LoopCountThresholdExceededException)
from util.helpers import remove_empty_new_lines
from util.c_tokenizer import C_Tokenizer

deepfix_base_dir = 'data/deepfix-test-data/'
RLAssist_base_dir = 'data/network_inputs/RLAssist-seed-1189/'
iitk_db_path = 'data/iitk-dataset/dataset.db'
max_program_len = 45000
dummy_correct_program = '_eos_ -new-line- _pad_'

tokenize = C_Tokenizer().tokenize
convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format
convert_to_rla_format = lambda x: remove_empty_new_lines(
    convert_to_new_line_format(x))

raw_test_data = {}
seeded_test_data = {}


def vectorize(tokens, tldict, max_vector_length=max_program_len):
    """Map a whitespace-separated token string to dictionary ids.

    Returns None if any token is out of vocabulary or the vectorized
    program exceeds max_vector_length.
    """
    vec_tokens = []
    for token in tokens.split():
        try:
            vec_tokens.append(tldict[token])
        except KeyError:
            # Out-of-vocabulary token: the program cannot be vectorized.
            return None
    if len(vec_tokens) > max_vector_length:
        return None
    return vec_tokens
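# A minimal self-test sketching vectorize's contract on a toy dictionary.
# The dictionary below is hypothetical; real ones come from load_dictionaries().
def _vectorize_selftest():
    toy_dict = {'_pad_': 0, '_eos_': 1, '-new-line-': 2}
    assert vectorize('_eos_ -new-line- _pad_', toy_dict) == [1, 2, 0]
    assert vectorize('_eos_ some_unknown_token', toy_dict) is None      # OOV
    assert vectorize('_eos_ _pad_', toy_dict, max_vector_length=1) is None  # too long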
# Java variant (renamed from generate_training_data so it is not silently
# shadowed by the C variant of the same name defined below).
def generate_training_data_java(db_path, bins, min_program_length, max_program_length,
                                max_fix_length, max_mutations, max_variants, seed):
    rng = np.random.RandomState(seed)
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format
    mutator_obj = Typo_Mutate_Java(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}, 'test': {}}
    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        # NOTE: `bins` is unused in this variant; code ids are drawn directly
        # from the database and split randomly below. (The original built a
        # list from bins and then immediately overwrote it.)
        code_id_list = []
        query = "SELECT code_id FROM Code WHERE codelength>? and codelength<?;"
        for row in cursor.execute(query, (min_program_length, max_program_length)):
            code_id_list.append(row[0])

        rng.shuffle(code_id_list)

        # Split into train, validation and test: 80%, 10%, 10%.
        tenth = int(0.1 * len(code_id_list))
        validation_code_id_list = code_id_list[:tenth]
        test_code_id_list = code_id_list[tenth:2 * tenth]
        training_code_id_list = code_id_list[2 * tenth:]

        # Make sure the splits do not intersect.
        assert not (set(training_code_id_list) & set(validation_code_id_list))
        assert not (set(training_code_id_list) & set(test_code_id_list))
        assert not (set(validation_code_id_list) & set(test_code_id_list))

        # Sets for O(1) membership tests in the loop below.
        validation_code_ids = set(validation_code_id_list)
        test_code_ids = set(test_code_id_list)

        query = "SELECT code_id, tokenized_code, codelength FROM Code " \
                "WHERE codelength>? and codelength<?;"
        total_variant_cnt = 0
        for row in cursor.execute(query, (min_program_length, max_program_length)):
            code_id, tokenized_program = row[0], row[1]
            if code_id in validation_code_ids:
                key = 'validation'
            elif code_id in test_code_ids:
                key = 'test'
            else:
                key = 'train'

            program_length = row[2]  # codelength, i.e. number of tokens
            program_lengths.append(program_length)

            if program_length > min_program_length and program_length < max_program_length:
                # Start to mutate.
                total_mutate_calls += 1
                try:
                    iterator = mutate(tokenized_program, max_mutations, max_variants)
                except FailedToMutateException:
                    print code_id
                    exceptions_in_mutate_call += 1
                except LoopCountThresholdExceededException:
                    print code_id
                    exceptions_in_mutate_call += 1
                except ValueError:
                    print code_id
                    exceptions_in_mutate_call += 1
                    raise
                except AssertionError:
                    print code_id
                    exceptions_in_mutate_call += 1
                    raise
                except Exception:
                    print code_id
                    exceptions_in_mutate_call += 1
                    raise
                else:
                    tokenized_program = remove_empty_new_lines(
                        convert_to_new_line_format(tokenized_program))
                    for corrupt_program, fix in iterator:
                        corrupt_program_length = len(corrupt_program.split())
                        fix_length = len(fix.split())
                        fix_lengths.append(fix_length)
                        if corrupt_program_length >= min_program_length and \
                           corrupt_program_length <= max_program_length and \
                           fix_length <= max_fix_length:
                            corrupt_program = remove_empty_new_lines(
                                convert_to_new_line_format(corrupt_program))
                            total_variant_cnt += 1
                            try:
                                token_strings[key][code_id] += [
                                    (code_id, corrupt_program, tokenized_program)]
                            except KeyError:
                                token_strings[key][code_id] = [
                                    (code_id, corrupt_program, tokenized_program)]

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print 'Statistics'
    print '----------'
    print 'Program length: Mean =', np.mean(program_lengths), \
        '\t95th %ile =', program_lengths[int(0.95 * len(program_lengths))]
    try:
        print 'Mean fix length: Mean =', np.mean(fix_lengths), \
            '\t95th %ile = ', fix_lengths[int(0.95 * len(fix_lengths))]
    except Exception as e:
        print e
        print 'fix_lengths'
        print fix_lengths
    print 'Total mutate calls:', total_mutate_calls
    print 'Exceptions in mutate() call:', exceptions_in_mutate_call, '\n'

    return token_strings, mutator_obj.get_mutation_distribution()
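# Sketch of how the Java variant might be driven. The database path and all
# hyperparameters below are illustrative assumptions, not values from the
# original pipeline:
#
#   token_strings, mutation_dist = generate_training_data_java(
#       'data/java-dataset/dataset.db', bins=[],
#       min_program_length=100, max_program_length=450,
#       max_fix_length=25, max_mutations=5, max_variants=4, seed=1189)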
def generate_training_data(db_path, bins, validation_users, min_program_length,
                           max_program_length, max_fix_length, max_mutations,
                           max_variants, seed):
    rng = np.random.RandomState(seed)
    tokenize = C_Tokenizer().tokenize
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format
    mutator_obj = Typo_Mutate(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}}
    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = []
    for bin_ in bins:
        for problem_id in bin_:
            problem_list.append(problem_id)

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query = "SELECT user_id, code_id, tokenized_code FROM Code " \
                "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;"
        for problem_id in problem_list:
            for row in cursor.execute(query, (problem_id, min_program_length,
                                              max_program_length)):
                user_id, code_id, tokenized_program = map(str, row)
                key = 'validation' if user_id in validation_users[problem_id] else 'train'

                program_length = len(tokenized_program.split())
                program_lengths.append(program_length)

                if program_length >= min_program_length and program_length <= max_program_length:
                    # Mutate
                    total_mutate_calls += 1
                    try:
                        iterator = mutate(tokenized_program, max_mutations, max_variants)
                    except FailedToMutateException:
                        exceptions_in_mutate_call += 1
                    except LoopCountThresholdExceededException:
                        exceptions_in_mutate_call += 1
                    except ValueError:
                        exceptions_in_mutate_call += 1
                        raise
                    except AssertionError:
                        exceptions_in_mutate_call += 1
                        raise
                    except Exception:
                        exceptions_in_mutate_call += 1
                        raise
                    else:
                        tokenized_program = remove_empty_new_lines(
                            convert_to_new_line_format(tokenized_program))
                        for corrupt_program, fix in iterator:
                            corrupt_program_length = len(corrupt_program.split())
                            fix_length = len(fix.split())
                            fix_lengths.append(fix_length)
                            if corrupt_program_length >= min_program_length and \
                               corrupt_program_length <= max_program_length and \
                               fix_length <= max_fix_length:
                                corrupt_program = remove_empty_new_lines(
                                    convert_to_new_line_format(corrupt_program))
                                try:
                                    token_strings[key][problem_id] += [
                                        (code_id, corrupt_program, tokenized_program)]
                                except KeyError:
                                    token_strings[key][problem_id] = [
                                        (code_id, corrupt_program, tokenized_program)]

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print 'Statistics'
    print '----------'
    print 'Program length: Mean =', np.mean(program_lengths), \
        '\t95th %ile =', program_lengths[int(0.95 * len(program_lengths))]
    try:
        print 'Mean fix length: Mean =', np.mean(fix_lengths), \
            '\t95th %ile = ', fix_lengths[int(0.95 * len(fix_lengths))]
    except Exception as e:
        print e
        print 'fix_lengths'
        print fix_lengths
    print 'Total mutate calls:', total_mutate_calls
    print 'Exceptions in mutate() call:', exceptions_in_mutate_call, '\n'

    for key in token_strings:
        print key
        for problem_id in token_strings[key]:
            print problem_id, len(token_strings[key][problem_id])

    return token_strings, mutator_obj.get_mutation_distribution()
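if __name__ == '__main__':
    # Minimal driver sketch for the C variant. Every value below is an
    # assumption for illustration (problem ids, validation split,
    # hyperparameters); only iitk_db_path comes from this module.
    example_bins = [[1001, 1002]]                           # hypothetical problem ids
    example_validation_users = {1001: set(), 1002: set()}   # hypothetical split
    token_strings, mutation_dist = generate_training_data(
        iitk_db_path, example_bins, example_validation_users,
        min_program_length=100, max_program_length=450,
        max_fix_length=25, max_mutations=5, max_variants=4, seed=1189)
    print 'mutation distribution:', mutation_dist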