def generate_training_data(db_path, bins, min_program_length,
                           max_program_length, max_fix_length, max_mutations,
                           max_variants, seed):
    rng = np.random.RandomState(seed)
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format

    mutator_obj = Typo_Mutate_Java(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}, 'test': {}}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    # Note: the bins argument is not used to build the id list in this Java
    # variant; code ids are read directly from the database below.

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()

        code_id_list = []
        query = "SELECT code_id FROM Code WHERE codelength>? and codelength<?;"
        for row in cursor.execute(query,
                                  (min_program_length, max_program_length)):
            # collect every code_id in the admissible length range
            code_id_list.append(row[0])

        rng.shuffle(code_id_list)

        # split into train, validation and test sets: 80%, 10%, 10%
        n_validation = int(0.1 * len(code_id_list))
        validation_code_id_list = code_id_list[:n_validation]
        test_code_id_list = code_id_list[n_validation:2 * n_validation]
        training_code_id_list = code_id_list[2 * n_validation:]
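        # e.g. with 100 shuffled ids: validation = ids[0:10],
        # test = ids[10:20], train = ids[20:100]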

        # make sure the three splits do not intersect
        assert not set(training_code_id_list) & set(validation_code_id_list)
        assert not set(training_code_id_list) & set(test_code_id_list)
        assert not set(validation_code_id_list) & set(test_code_id_list)

        query = "SELECT code_id, tokenized_code, codelength FROM Code WHERE codelength>? and codelength<?;"
        total_variant_cnt = 0
        for row in cursor.execute(query,
                                  (min_program_length, max_program_length)):
            code_id = row[0]
            tokenized_program = row[1]

            if code_id in validation_code_id_list:
                key = 'validation'
            elif code_id in test_code_id_list:
                key = 'test'
            else:
                key = 'train'

            # number of tokens
            program_length = row[2]  # row[2] is codelength
            program_lengths.append(program_length)

            if program_length > min_program_length and program_length < max_program_length:
                # start to mutate

                total_mutate_calls += 1
                try:
                    iterator = mutate(tokenized_program, max_mutations,
                                      max_variants)
                except (FailedToMutateException,
                        LoopCountThresholdExceededException):
                    # recoverable mutation failures: log the id and skip
                    print code_id
                    exceptions_in_mutate_call += 1
                except Exception:
                    # anything unexpected: log, count, and re-raise
                    print code_id
                    exceptions_in_mutate_call += 1
                    raise
                else:
                    tokenized_program = remove_empty_new_lines(
                        convert_to_new_line_format(tokenized_program))
                    for corrupt_program, fix in iterator:
                        corrupt_program_length = len(corrupt_program.split())
                        fix_length = len(fix.split())
                        fix_lengths.append(fix_length)
                        if (min_program_length <= corrupt_program_length <=
                                max_program_length and
                                fix_length <= max_fix_length):
                            corrupt_program = remove_empty_new_lines(
                                convert_to_new_line_format(corrupt_program))
                            total_variant_cnt += 1
                            if code_id not in token_strings[key]:
                                token_strings[key][code_id] = []
                            token_strings[key][code_id].append(
                                (code_id, corrupt_program, tokenized_program))

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print 'Statistics'
    print '----------'
    print 'Program length:  Mean =', np.mean(program_lengths), \
        '\t95th %ile =', program_lengths[int(0.95 * len(program_lengths))]
    try:
        print 'Mean fix length: Mean =', np.mean(fix_lengths), \
            '\t95th %ile =', fix_lengths[int(0.95 * len(fix_lengths))]
    except Exception as e:
        print e
        print 'fix_lengths:', fix_lengths
    print 'Total mutate calls:', total_mutate_calls
    print 'Exceptions in mutate() call:', exceptions_in_mutate_call, '\n'

    return token_strings, mutator_obj.get_mutation_distribution()
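
# A minimal usage sketch (the database path, bin contents and limits below are
# hypothetical, not taken from the original source):
#
# token_strings, mutation_dist = generate_training_data(
#     'data/java-dataset/dataset.db', bins=[['prob1001', 'prob1002']],
#     min_program_length=100, max_program_length=450, max_fix_length=25,
#     max_mutations=5, max_variants=4, seed=1189)
# for split in token_strings:
#     print split, sum(len(v) for v in token_strings[split].values())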
Example #2
    def __init__(self,
                 dataset,
                 step_penalty,
                 seed,
                 GE_ratio=None,
                 top_down_movement=True,
                 single_delete=True,
                 reject_spurious_edits=True,
                 compilation_error_store=None,
                 train_data_size=0,
                 valid_data_size=0,
                 test_data_size=0,
                 GE_code_ids=None,
                 actions=None,
                 verbose=False,
                 single_program=None,
                 sparse_rewards=True):

        # exactly one of GE_ratio and GE_code_ids must be supplied
        assert (GE_ratio is None) != (GE_code_ids is None)

        Env_engine.__init__(self,
                            dataset.get_tl_dict(),
                            seed,
                            step_penalty=step_penalty,
                            top_down_movement=top_down_movement,
                            reject_spurious_edits=reject_spurious_edits,
                            compilation_error_store=compilation_error_store,
                            single_delete=single_delete,
                            actions=actions,
                            sparse_rewards=sparse_rewards)

        if single_program is not None:
            td = self.tl_dict
            tokenized_program, name_dict, name_sequence = C_Tokenizer(
            ).tokenize_single_program(single_program)
            single_ex_dataset = namedtuple('single_ex_dataset',
                                           ['single_ex', 'name_dict_store'],
                                           verbose=True)
            self.dataset = single_ex_dataset(
                single_ex={
                    'single': (self.vectorize(tokenized_program),
                               [td['EOF'], td['-new-line-'], td['_pad_']])
                },
                name_dict_store={'single': (name_dict, name_sequence)})
            self.data_sizes = {'single': 1}
            self.code_ids = {'single': ['single']}
        else:
            self.verbose = verbose
            self.dataset = dataset
            # a size of 0 means "use the full split"; otherwise clamp to it
            train_data_size = (dataset.data_size[0] if train_data_size == 0
                               else min(train_data_size, dataset.data_size[0]))
            valid_data_size = (dataset.data_size[1] if valid_data_size == 0
                               else min(valid_data_size, dataset.data_size[1]))
            test_data_size = (dataset.data_size[2] if test_data_size == 0
                              else min(test_data_size, dataset.data_size[2]))

            train_code_ids = self.dataset.train_ex.keys()[:train_data_size]

            guided_train_data_size = (int(GE_ratio * train_data_size)
                                      if GE_code_ids is None
                                      else len(GE_code_ids))
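            # the guided-exploration ("GE") subset of training ids is either
            # sampled uniformly via GE_ratio or supplied directly as GE_code_ids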
            if GE_code_ids is None:
                guided_train_code_ids = set(
                    self.rng.choice(train_code_ids,
                                    guided_train_data_size,
                                    replace=False))
            else:
                guided_train_code_ids = GE_code_ids

            # raw test dataset
            real_data_size = 0
            real_data_keys = []
            try:
                self.real_test_data = self.dataset.real_test_data
            except AttributeError:
                pass
            else:
                real_data_size = len(self.real_test_data)
                real_data_keys = self.real_test_data.keys()

            # seeded test dataset
            seeded_data_size = 0
            seeded_data_keys = []
            try:
                self.seeded_test_data = self.dataset.seeded_test_data
            except AttributeError:
                pass
            else:
                seeded_data_size = len(self.seeded_test_data)
                seeded_data_keys = self.seeded_test_data.keys()

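            # bookkeeping for the six evaluation splits: train/valid/test from
            # the mutated dataset, the optional real and seeded test sets, and
            # the guided-exploration subset of train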
            self.data_sizes = {
                'train': train_data_size,
                'valid': valid_data_size,
                'test': test_data_size,
                'real': real_data_size,
                'seeded': seeded_data_size,
                'GE_train': guided_train_data_size
            }
            self.code_ids = {
                'train': train_code_ids,
                'GE_train': guided_train_code_ids,
                'valid': self.dataset.valid_ex.keys(),
                'test': self.dataset.test_ex.keys(),
                'real': real_data_keys,
                'seeded': seeded_data_keys
            }
            self.rng.shuffle(self.code_ids['train'])
Example #3
import os
import sqlite3
import json
from util.c_tokenizer import C_Tokenizer
tokenize = C_Tokenizer().tokenize

db_path = 'C:\\UNI\\projects\\rlassist\\data\\iitk-dataset\\prutor_b.db'
#
# with sqlite3.connect(db_path) as conn:
#     conn.execute('''ALTER TABLE Code ADD tokenized_code text;''')
#     conn.execute('''ALTER TABLE Code ADD name_dict;''')
#     conn.execute('''ALTER TABLE Code ADD name_seq;''')
#     conn.execute('''ALTER TABLE Code ADD codelength integer;''')

tuples = []
with sqlite3.connect(db_path) as conn:
    cursor = conn.cursor()
    for row in cursor.execute("SELECT code_id, code FROM Code;"):
        code_id = str(row[0])
        if code_id == "prog56277":
            print 'code id:', code_id
            code = row[1].encode('utf-8')
            print(code)
            tokenized_code, name_dict, name_seq = tokenize(code)
            print(tokenized_code)
            print(name_dict)
            print(name_seq)
            codelength = len(tokenized_code.split())
            tuples.append((tokenized_code, json.dumps(name_dict),
                           json.dumps(name_seq), codelength, code_id))
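
# The write-back step is missing from this snippet; given the columns added by
# the commented ALTER TABLEs above, it would presumably look like:
#
# with sqlite3.connect(db_path) as conn:
#     conn.executemany(
#         "UPDATE Code SET tokenized_code=?, name_dict=?, name_seq=?, "
#         "codelength=? WHERE code_id=?;", tuples)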
import argparse
import sqlite3
import numpy as np

from data_processing.training_data_generator import load_dictionaries
from util.helpers import remove_empty_new_lines
from util.c_tokenizer import C_Tokenizer

deepfix_base_dir = 'data/deepfix-test-data/'
RLAssist_base_dir = 'data/network_inputs/RLAssist-seed-1189/'
iitk_db_path = 'data/iitk-dataset/dataset.db'
max_program_len = 45000

dummy_correct_program = '_eos_ -new-line- _pad_'

tokenize = C_Tokenizer().tokenize
convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format
convert_to_rla_format = lambda x: remove_empty_new_lines(
    convert_to_new_line_format(x))

raw_test_data = {}
seeded_test_data = {}


def vectorize(tokens, tldict, max_vector_length=max_program_len):
    vec_tokens = []
    for token in tokens.split():
        try:
            vec_tokens.append(tldict[token])
        except Exception:
            # token missing from the dictionary: vectorization fails
            return None
    # assumed completion -- the original snippet is truncated here: reject
    # programs longer than max_vector_length, otherwise return the id list
    if len(vec_tokens) > max_vector_length:
        return None
    return vec_tokens
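
# Quick sanity check with a toy token dictionary (illustrative ids only):
# every token must appear in tldict, otherwise vectorize returns None.
#
# _toy_tldict = {'int': 0, 'main': 1, '(': 2, ')': 3}
# assert vectorize('int main ( )', _toy_tldict) == [0, 1, 2, 3]
# assert vectorize('float main', _toy_tldict) is None
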
def generate_training_data(db_path, bins, validation_users,
                           min_program_length, max_program_length,
                           max_fix_length, max_mutations, max_variants, seed):
    rng = np.random.RandomState(seed)
    tokenize = C_Tokenizer().tokenize
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format

    mutator_obj = Typo_Mutate(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = []
    for bin_ in bins:
        for problem_id in bin_:
            problem_list.append(problem_id)

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query = "SELECT user_id, code_id, tokenized_code FROM Code WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;"
        for problem_id in problem_list:
            for row in cursor.execute(query, (problem_id, min_program_length, max_program_length)):
                user_id, code_id, tokenized_program = map(str, row)
                key = 'validation' if user_id in validation_users[problem_id] else 'train'

                program_length = len(tokenized_program.split())
                program_lengths.append(program_length)

                if program_length >= min_program_length and program_length <= max_program_length:

                    # Mutate
                    total_mutate_calls += 1
                    try:
                        iterator = mutate(tokenized_program, max_mutations,
                                          max_variants)
                    except (FailedToMutateException,
                            LoopCountThresholdExceededException):
                        # recoverable mutation failures: skip this program
                        exceptions_in_mutate_call += 1
                    except Exception:
                        # anything unexpected: count it and re-raise
                        exceptions_in_mutate_call += 1
                        raise
                    else:
                        tokenized_program = remove_empty_new_lines(convert_to_new_line_format(tokenized_program))

                        for corrupt_program, fix in iterator:
                            corrupt_program_length = len(corrupt_program.split())
                            fix_length = len(fix.split())
                            fix_lengths.append(fix_length)

                            if (min_program_length <= corrupt_program_length <=
                                    max_program_length and
                                    fix_length <= max_fix_length):

                                corrupt_program = remove_empty_new_lines(convert_to_new_line_format(corrupt_program))
                                if problem_id not in token_strings[key]:
                                    token_strings[key][problem_id] = []
                                token_strings[key][problem_id].append(
                                    (code_id, corrupt_program, tokenized_program))

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print 'Statistics'
    print '----------'
    print 'Program length:  Mean =', np.mean(program_lengths), '\t95th %ile =', program_lengths[int(0.95 * len(program_lengths))]
    try:
        print 'Mean fix length: Mean =', np.mean(fix_lengths), \
            '\t95th %ile =', fix_lengths[int(0.95 * len(fix_lengths))]
    except Exception as e:
        print e
        print 'fix_lengths'
        print fix_lengths
    print 'Total mutate calls:', total_mutate_calls
    print 'Exceptions in mutate() call:', exceptions_in_mutate_call, '\n'

    for key in token_strings:
        print key
        for problem_id in token_strings[key]:
            print problem_id, len(token_strings[key][problem_id])

    return token_strings, mutator_obj.get_mutation_distribution()