def generate_training_data(db_path, bins, validation_users, min_program_length,
                           max_program_length, max_fix_length, kind_mutations,
                           max_mutations, max_variants, seed):
    """Build per-problem training/validation examples from error-free programs.

    Every selected program contributes one "correct" example, whose target is
    a space-joined string of ``"0"`` (one per source token), followed by one
    example per accepted mutated variant, whose target comes from
    ``get_target``.

    Args:
        db_path: Path of the SQLite database containing the ``Code`` table.
        bins: Iterable of iterables of problem ids; flattened in order.
        validation_users: Mapping ``problem_id -> container of user ids``.
            Programs written by those users go to the ``'validation'`` split,
            all others to ``'train'``.
        min_program_length: Inclusive lower bound on the whitespace-token
            count of original and mutated programs (the SQL pre-filter on
            ``codelength`` uses strict bounds).
        max_program_length: Inclusive upper bound, same conventions.
        max_fix_length: Inclusive upper bound on the token count of a fix.
        kind_mutations: ``'typo'`` selects the typo mutator; any other value
            selects the undeclared-identifier mutator.
        max_mutations: Forwarded unchanged to the mutator.
        max_variants: Forwarded unchanged to the mutator.
        seed: Seed of the ``numpy.random.RandomState`` shared by the mutator
            and the identifier renamer.

    Returns:
        ``{'train': {...}, 'validation': {...}}``; each inner dict maps a
        problem id to a list of tuples
        ``(source, name_dict, name_sequence, user_id, code_id, target)``.
        For mutated variants ``code_id`` is suffixed with ``_<i>`` where
        ``i`` is the 1-based position of the variant.

    Raises:
        Exception: In ``'typo'`` mode any mutator failure other than
            ``FailedToMutateException`` / ``LoopCountThresholdExceededException``
            is re-raised after being counted.
    """
    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import LoopCountThresholdExceededException, FailedToMutateException, Typo_Mutate, typo_mutate
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj)

        def rename_ids(x, y):
            # Typo mutations need no identifier renaming.
            return (x, y)
    else:
        from data_processing.undeclared_mutator import LoopCountThresholdExceededException, FailedToMutateException, id_mutate
        mutate = partial(id_mutate, rng)
        rename_ids = partial(rename_ids_, rng)

    def strip_line_numbers(program):
        # Remove line numbers, then drop the last space-separated piece
        # (presumably a trailing empty token -- TODO confirm).
        return ' '.join(remove_line_numbers(program).split(' ')[:-1])

    result = {'train': {}, 'validation': {}}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    query = "SELECT user_id, code_id, tokenized_code, name_dict, name_seq FROM Code " +\
        "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;"

    # ``with sqlite3.connect(...)`` only manages transactions; close the
    # connection explicitly so the handle is not leaked.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for problem_id in tqdm(problem_list):
            params = (problem_id, min_program_length, max_program_length)
            for row in cursor.execute(query, params):
                user_id, code_id, tokenized_code = map(str, row[:-2])
                name_dict, name_sequence = json.loads(row[3]), json.loads(
                    row[4])
                key = 'validation' if user_id in validation_users[
                    problem_id] else 'train'

                program_length = len(tokenized_code.split())
                program_lengths.append(program_length)
                if not (min_program_length <= program_length <=
                        max_program_length):
                    continue

                # Correct pair. The renamer is called before the mutator so
                # the shared RNG is consumed in a fixed order.
                id_renamed_correct_program, _ = rename_ids(tokenized_code, '')
                source = strip_line_numbers(id_renamed_correct_program)
                target = " ".join(["0"] * len(source.split()))
                examples = result[key].setdefault(problem_id, [])
                examples.append((source, name_dict, name_sequence, user_id,
                                 code_id, target))

                # Mutate
                total_mutate_calls += 1
                try:
                    variants = mutate(tokenized_code, max_mutations,
                                      max_variants)
                except (FailedToMutateException,
                        LoopCountThresholdExceededException):
                    exceptions_in_mutate_call += 1
                    continue
                except Exception:
                    # Unexpected failures are fatal only for the typo mutator.
                    exceptions_in_mutate_call += 1
                    if kind_mutations == 'typo':
                        raise
                    continue

                for i, (corrupt_program, fix) in enumerate(variants, 1):
                    corrupt_program_length = len(corrupt_program.split())
                    fix_length = len(fix.split())
                    fix_lengths.append(fix_length)

                    if not (min_program_length <= corrupt_program_length <=
                            max_program_length
                            and fix_length <= max_fix_length):
                        continue

                    if kind_mutations == 'typo':
                        corrupt_program, _ = rename_ids(corrupt_program, fix)
                        corrupt_source = strip_line_numbers(corrupt_program)
                        reference_source = source
                    else:
                        try:
                            full_program, corrupt_program = rename_ids(
                                tokenized_code, corrupt_program)
                        except FixIDNotFoundInSource:
                            # Without a renamed reference program there is
                            # nothing valid to align against; skip the
                            # variant instead of using an unbound or stale
                            # ``full_program``.
                            exceptions_in_mutate_call += 1
                            continue
                        corrupt_source = strip_line_numbers(corrupt_program)
                        reference_source = strip_line_numbers(full_program)

                    target = get_target(corrupt_source.split(),
                                        reference_source.split())
                    examples.append((corrupt_source, name_dict, name_sequence,
                                     user_id, code_id + "_" + str(i), target))
    finally:
        conn.close()

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length:  Mean =', np.mean(program_lengths),
              '\t95th %ile =',
              program_lengths[int(0.95 * len(program_lengths))])
    else:
        print('Program length:  n/a (no programs were selected)')
    if len(fix_lengths):
        print('Mean fix length: Mean =', np.mean(fix_lengths),
              '\t95th %ile = ', fix_lengths[int(0.95 * len(fix_lengths))])
    else:
        print('Mean fix length: n/a (no fixes were generated)')
    print('Total mutate calls:', total_mutate_calls)
    print('Exceptions in mutate() call:', exceptions_in_mutate_call, '\n')

    return result
# Example #2
# 0
def generate_seeded_test_data(db_path, bins, min_program_length, max_program_length, max_fix_length,
                              kind_mutations, max_mutations, programs_per_problem, seed):
    """Generate mutated ("seeded") test programs for each problem.

    For every problem, error-free programs are read from the ``Code`` table
    and mutated until ``programs_per_problem`` accepted variants have been
    collected for that problem. Problems that do not end up with exactly
    ``programs_per_problem`` variants are left out of the returned mapping.

    Args:
        db_path: Path of the SQLite database containing the ``Code`` table.
        bins: Iterable of iterables of problem ids; flattened in order.
        min_program_length: Inclusive lower bound on the whitespace-token
            count of original and mutated programs (the SQL pre-filter on
            ``codelength`` uses strict bounds).
        max_program_length: Inclusive upper bound, same conventions.
        max_fix_length: Inclusive upper bound on the token count of a fix.
        kind_mutations: ``'typo'`` selects the typo mutator; any other value
            selects the undeclared-identifier mutator.
        max_mutations: Forwarded unchanged to the mutator.
        programs_per_problem: Number of variants required per problem.
        seed: Seed of the ``numpy.random.RandomState`` given to the mutator.

    Returns:
        A pair ``(seeded_test_data, mutation_distribution)``.
        ``seeded_test_data`` maps a problem id to a list of tuples
        ``(corrupt_program, name_dict, name_sequence, user_id, code_id)``;
        ``mutation_distribution`` is
        ``mutator_obj.get_mutation_distribution()`` in ``'typo'`` mode and
        ``{}`` otherwise.

    Raises:
        Exception: In ``'typo'`` mode any mutator failure other than
            ``FailedToMutateException`` / ``LoopCountThresholdExceededException``
            is re-raised after being counted.
    """
    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import LoopCountThresholdExceededException, FailedToMutateException, Typo_Mutate, typo_mutate
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj, just_one=True)
    else:
        from data_processing.undeclared_mutator import LoopCountThresholdExceededException, FailedToMutateException, id_mutate
        mutate = partial(id_mutate, rng)

    result = {}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    query = "SELECT user_id, code_id, tokenized_code, name_dict, name_seq FROM Code " +\
        "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;"

    # ``with sqlite3.connect(...)`` only manages transactions; close the
    # connection explicitly so the handle is not leaked.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for problem_id in problem_list:
            params = (problem_id, min_program_length, max_program_length)
            for row in cursor.execute(query, params):
                user_id, code_id, tokenized_code = map(str, row[:-2])
                name_dict, name_sequence = json.loads(
                    row[3]), json.loads(row[4])

                program_length = len(tokenized_code.split())
                program_lengths.append(program_length)
                if min_program_length <= program_length <= max_program_length:
                    # Mutate
                    total_mutate_calls += 1
                    try:
                        variants = mutate(
                            tokenized_code, max_mutations, num_mutated_progs=1)
                    except (FailedToMutateException,
                            LoopCountThresholdExceededException):
                        exceptions_in_mutate_call += 1
                    except Exception:
                        # Unexpected failures are fatal only for the typo
                        # mutator.
                        exceptions_in_mutate_call += 1
                        if kind_mutations == 'typo':
                            raise
                    else:
                        for corrupt_program, fix in variants:
                            corrupt_program_length = len(
                                corrupt_program.split())
                            fix_length = len(fix.split())
                            fix_lengths.append(fix_length)

                            if (min_program_length <= corrupt_program_length <=
                                    max_program_length
                                    and fix_length <= max_fix_length):
                                result.setdefault(problem_id, []).append(
                                    (corrupt_program, name_dict, name_sequence,
                                     user_id, code_id))

                # Stop reading programs for this problem once enough variants
                # have been collected.
                if problem_id in result and len(result[problem_id]) >= programs_per_problem:
                    break
    finally:
        conn.close()

    # NOTE(review): a problem that overshoots the quota (possible only if one
    # mutate call yields several variants) is dropped by this exact match.
    seeded_test_data = {problem_id: variants_ for problem_id, variants_ in result.items()
                        if len(variants_) == programs_per_problem}

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length:  Mean =', np.mean(program_lengths), '\t95th %ile =',
              program_lengths[int(0.95 * len(program_lengths))])
    else:
        print('Program length:  n/a (no programs were selected)')
    if len(fix_lengths):
        print('Mean fix length: Mean =', np.mean(fix_lengths), '\t95th %ile = ',
              fix_lengths[int(0.95 * len(fix_lengths))])
    else:
        print('Mean fix length: n/a (no fixes were generated)')
    print('Total mutate calls:', total_mutate_calls)
    print('Exceptions in mutate() call:', exceptions_in_mutate_call, '\n')

    mutation_distribution = (mutator_obj.get_mutation_distribution()
                             if kind_mutations == 'typo' else {})
    return seeded_test_data, mutation_distribution
def generate_training_data(db_path, bins, validation_users, min_program_length, max_program_length, \
                                    max_fix_length, max_mutations, max_variants, seed):
    """Generate (corrupt, correct) program pairs using the typo mutator.

    Args:
        db_path: Path of the SQLite database containing the ``Code`` table.
        bins: Iterable of iterables of problem ids; flattened in order.
        validation_users: Mapping ``problem_id -> container of user ids``.
            Programs written by those users go to the ``'validation'`` split,
            all others to ``'train'``.
        min_program_length: Inclusive lower bound on the whitespace-token
            count of original and mutated programs (the SQL pre-filter on
            ``codelength`` uses strict bounds).
        max_program_length: Inclusive upper bound, same conventions.
        max_fix_length: Inclusive upper bound on the token count of a fix.
        max_mutations: Forwarded unchanged to ``typo_mutate``.
        max_variants: Forwarded unchanged to ``typo_mutate``.
        seed: Seed of the ``numpy.random.RandomState`` given to the mutator.

    Returns:
        A pair ``(token_strings, mutation_distribution)``.
        ``token_strings`` is ``{'train': {...}, 'validation': {...}}`` with
        each inner dict mapping a problem id to a list of tuples
        ``(code_id, corrupt_program, tokenized_program)``, both programs
        converted with ``convert_to_new_line_format`` and
        ``remove_empty_new_lines``.

    Raises:
        Exception: Any mutator failure other than ``FailedToMutateException``
            / ``LoopCountThresholdExceededException`` is re-raised after
            being counted.
    """
    rng = np.random.RandomState(seed)
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format

    mutator_obj = Typo_Mutate(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    query = "SELECT user_id, code_id, tokenized_code FROM Code " + "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;"

    # ``with sqlite3.connect(...)`` only manages transactions; close the
    # connection explicitly so the handle is not leaked.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for problem_id in problem_list:
            params = (problem_id, min_program_length, max_program_length)
            for row in cursor.execute(query, params):
                user_id, code_id, tokenized_program = map(str, row)
                key = 'validation' if user_id in validation_users[problem_id] else 'train'

                program_length = len(tokenized_program.split())
                program_lengths.append(program_length)
                if not (min_program_length <= program_length <= max_program_length):
                    continue

                # Mutate
                total_mutate_calls += 1
                try:
                    variants = mutate(tokenized_program, max_mutations, max_variants)
                except (FailedToMutateException, LoopCountThresholdExceededException):
                    exceptions_in_mutate_call += 1
                    continue
                except Exception:
                    # Anything else is unexpected: count it and propagate.
                    exceptions_in_mutate_call += 1
                    raise

                tokenized_program = remove_empty_new_lines(convert_to_new_line_format(tokenized_program))

                for corrupt_program, fix in variants:
                    corrupt_program_length = len(corrupt_program.split())
                    fix_length = len(fix.split())
                    fix_lengths.append(fix_length)

                    if (min_program_length <= corrupt_program_length <= max_program_length
                            and fix_length <= max_fix_length):
                        corrupt_program = remove_empty_new_lines(convert_to_new_line_format(corrupt_program))
                        token_strings[key].setdefault(problem_id, []).append(
                            (code_id, corrupt_program, tokenized_program))
    finally:
        conn.close()

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    # Each print takes one pre-formatted string, so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length:  Mean = %s \t95th %%ile = %s' % (
            np.mean(program_lengths), program_lengths[int(0.95 * len(program_lengths))]))
    else:
        print('Program length:  n/a (no programs were selected)')
    if len(fix_lengths):
        print('Mean fix length: Mean = %s \t95th %%ile =  %s' % (
            np.mean(fix_lengths), fix_lengths[int(0.95 * len(fix_lengths))]))
    else:
        print('Mean fix length: n/a (no fixes were generated)')
    print('Total mutate calls: %s' % total_mutate_calls)
    print('Exceptions in mutate() call: %s \n' % exceptions_in_mutate_call)

    # Per-split summary: one line per problem with its number of pairs.
    for split_name in token_strings:
        print(split_name)
        for problem_id in token_strings[split_name]:
            print('%s %s' % (problem_id, len(token_strings[split_name][problem_id])))

    return token_strings, mutator_obj.get_mutation_distribution()
# Example #4
# 0
def generate_training_data(db_path, bins, validation_users, min_program_length,
                           max_program_length, max_fix_length, kind_mutations,
                           max_mutations, max_variants, seed):
    """Generate (program, fix) training pairs from error-free programs.

    Every selected program contributes the pair ``(program, '-1')`` plus one
    ``(corrupt_program, fix)`` pair per accepted mutated variant.

    Args:
        db_path: Path of the SQLite database containing the ``Code`` table.
        bins: Iterable of iterables of problem ids; flattened in order.
        validation_users: Mapping ``problem_id -> container of user ids``.
            Programs written by those users go to the ``'validation'`` split,
            all others to ``'train'``.
        min_program_length: Exclusive lower bound on ``codelength`` in the
            SQL filter; inclusive lower bound on the token count of mutated
            programs.
        max_program_length: Upper bound, same conventions.
        max_fix_length: Inclusive upper bound on the token count of a fix.
        kind_mutations: ``'typo'`` selects the typo mutator; any other value
            selects the undeclared-identifier mutator.
        max_mutations: Forwarded unchanged to the mutator.
        max_variants: Forwarded unchanged to the mutator.
        seed: Seed of the ``numpy.random.RandomState`` shared by the mutator
            and the identifier renamer.

    Returns:
        A pair ``(token_strings, mutation_distribution)``.
        ``token_strings`` is ``{'train': {...}, 'validation': {...}}`` with
        each inner dict mapping a problem id to a list of
        ``(program, fix)`` tuples; ``mutation_distribution`` is
        ``mutator_obj.get_mutation_distribution()`` in ``'typo'`` mode and
        ``{}`` otherwise.

    Raises:
        Exception: In ``'typo'`` mode any mutator failure other than
            ``FailedToMutateException`` / ``LoopCountThresholdExceededException``
            is re-raised after being counted.
    """
    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import LoopCountThresholdExceededException, FailedToMutateException, Typo_Mutate, typo_mutate
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj)

        def rename_ids(x, y):
            # Typo mutations need no identifier renaming.
            return x, y
    else:
        from data_processing.undeclared_mutator import LoopCountThresholdExceededException, FailedToMutateException, id_mutate
        mutate = partial(id_mutate, rng)
        rename_ids = partial(rename_ids_, rng)

    token_strings = {'train': {}, 'validation': {}}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    # Should be ">=?" and "<=?"???
    query = 'SELECT user_id, tokenized_code, codelength FROM Code ' \
            'WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;'

    # ``with sqlite3.connect(...)`` only manages transactions; close the
    # connection explicitly so the handle is not leaked.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for problem_id in problem_list:
            params = (problem_id, min_program_length, max_program_length)
            for row in cursor.execute(query, params):
                user_id, tokenized_code = map(str, row[:2])
                # Keep the length numeric: stringifying it made the sort
                # lexicographic and broke np.mean on the statistics below.
                program_length = int(row[2])
                key = 'validation' if user_id in validation_users[
                    problem_id] else 'train'

                program_lengths.append(program_length)

                # The renamer is called before the mutator so the shared RNG
                # is consumed in a fixed order.
                id_renamed_correct_program, _ = rename_ids(tokenized_code, '')

                # Correct pairs
                pairs = token_strings[key].setdefault(problem_id, [])
                pairs.append((id_renamed_correct_program, '-1'))

                # Mutate
                total_mutate_calls += 1
                try:
                    variants = mutate(tokenized_code, max_mutations,
                                      max_variants)
                except (FailedToMutateException,
                        LoopCountThresholdExceededException):
                    exceptions_in_mutate_call += 1
                    continue
                except Exception:
                    # Unexpected failures are fatal only for the typo mutator.
                    exceptions_in_mutate_call += 1
                    if kind_mutations == 'typo':
                        raise
                    continue

                for corrupt_program, fix in variants:
                    corrupt_program_length = len(corrupt_program.split())
                    fix_length = len(fix.split())
                    fix_lengths.append(fix_length)
                    if not (min_program_length <= corrupt_program_length <=
                            max_program_length
                            and fix_length <= max_fix_length):
                        continue
                    try:
                        corrupt_program, fix = rename_ids(corrupt_program, fix)
                    except FixIDNotFoundInSource:
                        # NOTE(review): the pair is still stored, un-renamed,
                        # after a failed rename; confirm this is intended.
                        exceptions_in_mutate_call += 1
                    pairs.append((corrupt_program, fix))
    finally:
        conn.close()

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    # Each print takes one pre-formatted string, so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length:  Mean = %s \t95th %%ile = %s' % (
            np.mean(program_lengths),
            program_lengths[int(0.95 * len(program_lengths))]))
    else:
        print('Program length:  n/a (no programs were selected)')
    if len(fix_lengths):
        print('Mean fix length: Mean = %s \t95th %%ile =  %s' % (
            np.mean(fix_lengths),
            fix_lengths[int(0.95 * len(fix_lengths))]))
    else:
        print('Mean fix length: n/a (no fixes were generated)')
    print('Total mutate calls: %s' % total_mutate_calls)
    print('Exceptions in mutate() call: %s \n' % exceptions_in_mutate_call)

    mutation_distribution = (mutator_obj.get_mutation_distribution()
                             if kind_mutations == 'typo' else {})
    return token_strings, mutation_distribution
def generate_training_data(db_path, bins, validation_users, min_program_length,
                           max_program_length, max_fix_length, kind_mutations,
                           max_mutations, max_variants, seed):
    """Generate (program, fix) training pairs from error-free programs.

    Every selected program contributes the pair ``(program, '-1')`` plus one
    ``(corrupt_program, fix)`` pair per accepted mutated variant.

    Args:
        db_path: Path of the SQLite database containing the ``Code`` table.
        bins: Iterable of iterables of problem ids; flattened in order.
        validation_users: Mapping ``problem_id -> container of user ids``.
            Programs written by those users go to the ``'validation'`` split,
            all others to ``'train'``.
        min_program_length: Inclusive lower bound on the whitespace-token
            count of original and mutated programs (the SQL pre-filter on
            ``codelength`` uses strict bounds).
        max_program_length: Inclusive upper bound, same conventions.
        max_fix_length: Inclusive upper bound on the token count of a fix.
        kind_mutations: ``'typo'`` selects the typo mutator; any other value
            selects the undeclared-identifier mutator.
        max_mutations: Forwarded unchanged to the mutator.
        max_variants: Forwarded unchanged to the mutator.
        seed: Seed of the ``numpy.random.RandomState`` shared by the mutator
            and the identifier renamer.

    Returns:
        A pair ``(token_strings, mutation_distribution)``.
        ``token_strings`` is ``{'train': {...}, 'validation': {...}}`` with
        each inner dict mapping a problem id to a list of
        ``(program, fix)`` tuples; ``mutation_distribution`` is
        ``mutator_obj.get_mutation_distribution()`` in ``'typo'`` mode and
        ``{}`` otherwise.

    Raises:
        Exception: In ``'typo'`` mode any mutator failure other than
            ``FailedToMutateException`` / ``LoopCountThresholdExceededException``
            is re-raised after being counted.
    """
    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import LoopCountThresholdExceededException, FailedToMutateException, Typo_Mutate, typo_mutate
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj)

        def rename_ids(x, y):
            # Typo mutations need no identifier renaming.
            return (x, y)
    else:
        from data_processing.undeclared_mutator import LoopCountThresholdExceededException, FailedToMutateException, id_mutate
        mutate = partial(id_mutate, rng)
        rename_ids = partial(rename_ids_, rng)

    token_strings = {'train': {}, 'validation': {}}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    # Placeholder fix stored alongside unmodified programs.
    dummy_fix_for_correct_program = '-1'

    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    query = "SELECT user_id, tokenized_code FROM Code " +\
        "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;"

    # ``with sqlite3.connect(...)`` only manages transactions; close the
    # connection explicitly so the handle is not leaked.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for problem_id in problem_list:
            params = (problem_id, min_program_length, max_program_length)
            for row in cursor.execute(query, params):
                user_id, tokenized_code = map(str, row)
                key = 'validation' if user_id in validation_users[
                    problem_id] else 'train'

                program_length = len(tokenized_code.split())
                program_lengths.append(program_length)
                if not (min_program_length <= program_length <=
                        max_program_length):
                    continue

                # The renamer is called before the mutator so the shared RNG
                # is consumed in a fixed order.
                id_renamed_correct_program, _ = rename_ids(tokenized_code, '')

                # Correct pairs
                pairs = token_strings[key].setdefault(problem_id, [])
                pairs.append((id_renamed_correct_program,
                              dummy_fix_for_correct_program))

                # Mutate
                total_mutate_calls += 1
                try:
                    variants = mutate(tokenized_code, max_mutations,
                                      max_variants)
                except (FailedToMutateException,
                        LoopCountThresholdExceededException):
                    exceptions_in_mutate_call += 1
                    continue
                except Exception:
                    # Unexpected failures are fatal only for the typo mutator.
                    exceptions_in_mutate_call += 1
                    if kind_mutations == 'typo':
                        raise
                    continue

                for corrupt_program, fix in variants:
                    corrupt_program_length = len(corrupt_program.split())
                    fix_length = len(fix.split())
                    fix_lengths.append(fix_length)

                    if not (min_program_length <= corrupt_program_length <=
                            max_program_length
                            and fix_length <= max_fix_length):
                        continue

                    try:
                        corrupt_program, fix = rename_ids(corrupt_program, fix)
                    except FixIDNotFoundInSource:
                        # NOTE(review): the pair is still stored, un-renamed,
                        # after a failed rename; confirm this is intended.
                        exceptions_in_mutate_call += 1

                    pairs.append((corrupt_program, fix))
    finally:
        conn.close()

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length:  Mean =', np.mean(program_lengths),
              '\t95th %ile =',
              program_lengths[int(0.95 * len(program_lengths))])
    else:
        print('Program length:  n/a (no programs were selected)')
    if len(fix_lengths):
        print('Mean fix length: Mean =', np.mean(fix_lengths),
              '\t95th %ile = ', fix_lengths[int(0.95 * len(fix_lengths))])
    else:
        print('Mean fix length: n/a (no fixes were generated)')
    print('Total mutate calls:', total_mutate_calls)
    print('Exceptions in mutate() call:', exceptions_in_mutate_call, '\n')

    mutation_distribution = (mutator_obj.get_mutation_distribution()
                             if kind_mutations == 'typo' else {})
    return token_strings, mutation_distribution
def generate_training_data(bins, min_program_length, max_program_length,
                           max_fix_length, kind_mutations, max_mutations,
                           max_variants, seed):
    """Generate (program, fix) training pairs from ``get_cs_tokenized()``.

    Every program returned by ``get_cs_tokenized()`` contributes the pair
    ``(program, '-1')`` plus one ``(corrupt_program, fix)`` pair per accepted
    mutated variant. All pairs go to the ``'train'`` split; the
    ``'validation'`` split is returned empty.

    Args:
        bins: Not used by this function; kept in the signature for callers.
        min_program_length: Inclusive lower bound on the whitespace-token
            count of mutated programs (originals are not length-filtered).
        max_program_length: Inclusive upper bound, same conventions.
        max_fix_length: Inclusive upper bound on the token count of a fix.
        kind_mutations: ``'typo'`` selects the typo mutator; any other value
            selects the mutator from ``undeclared_mutator_cs``.
        max_mutations: Forwarded unchanged to the mutator.
        max_variants: Forwarded unchanged to the mutator.
        seed: Seed of the ``numpy.random.RandomState`` shared by the mutator
            and the identifier renamer.

    Returns:
        A triple ``(token_strings, mutation_distribution, rng)``.
        ``token_strings`` is ``{'train': {...}, 'validation': {}}`` with the
        ``'train'`` dict mapping a problem id to a list of ``(program, fix)``
        tuples; ``mutation_distribution`` is
        ``mutator_obj.get_mutation_distribution()`` in ``'typo'`` mode and
        ``{}`` otherwise; ``rng`` is the random state after generation.

    Raises:
        Exception: In ``'typo'`` mode any mutator failure other than
            ``FailedToMutateException`` / ``LoopCountThresholdExceededException``
            is re-raised after being counted.
    """
    rng = np.random.RandomState(seed)

    if kind_mutations == 'typo':
        from data_processing.typo_mutator import LoopCountThresholdExceededException, FailedToMutateException, Typo_Mutate, typo_mutate
        mutator_obj = Typo_Mutate(rng)
        mutate = partial(typo_mutate, mutator_obj)

        def rename_ids(x, y):
            # Typo mutations need no identifier renaming.
            return x, y
    else:
        from data_processing.undeclared_mutator_cs import LoopCountThresholdExceededException, FailedToMutateException, id_mutate
        mutate = partial(id_mutate, rng)
        rename_ids = partial(rename_ids_, rng)

    token_strings = {'train': {}, 'validation': {}}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    # Every program goes to the training split in this variant.
    key = 'train'

    for problem_id, tokenized_code in get_cs_tokenized().items():
        program_lengths.append(len(tokenized_code.split()))

        # The renamer is called before the mutator so the shared RNG is
        # consumed in a fixed order.
        id_renamed_correct_program, _ = rename_ids(tokenized_code, '')

        # Correct pairs
        pairs = token_strings[key].setdefault(problem_id, [])
        pairs.append((id_renamed_correct_program, '-1'))

        # Mutate
        total_mutate_calls += 1
        try:
            variants = mutate(tokenized_code, max_mutations, max_variants)
        except (FailedToMutateException, LoopCountThresholdExceededException):
            exceptions_in_mutate_call += 1
            continue
        except Exception:
            # Unexpected failures are fatal only for the typo mutator.
            exceptions_in_mutate_call += 1
            if kind_mutations == 'typo':
                raise
            continue

        for corrupt_program, fix in variants:
            corrupt_program_length = len(corrupt_program.split())
            fix_length = len(fix.split())
            fix_lengths.append(fix_length)
            if not (min_program_length <= corrupt_program_length <=
                    max_program_length and fix_length <= max_fix_length):
                continue
            try:
                corrupt_program, fix = rename_ids(corrupt_program, fix)
            except FixIDNotFoundInSource:
                # NOTE(review): the pair is still stored, un-renamed, after a
                # failed rename; confirm this is intended.
                exceptions_in_mutate_call += 1
            pairs.append((corrupt_program, fix))

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    # Each print takes one pre-formatted string, so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length:  Mean = %s \t95th %%ile = %s' % (
            np.mean(program_lengths),
            program_lengths[int(0.95 * len(program_lengths))]))
    else:
        print('Program length:  n/a (no programs were selected)')
    if len(fix_lengths):
        print('Mean fix length: Mean = %s \t95th %%ile =  %s' % (
            np.mean(fix_lengths),
            fix_lengths[int(0.95 * len(fix_lengths))]))
    else:
        print('Mean fix length: n/a (no fixes were generated)')
    print('Total mutate calls: %s' % total_mutate_calls)
    print('Exceptions in mutate() call: %s \n' % exceptions_in_mutate_call)

    mutation_distribution = (mutator_obj.get_mutation_distribution()
                             if kind_mutations == 'typo' else {})
    return token_strings, mutation_distribution, rng