import numpy as np

from data_processing.training_data_generator import load_dictionaries
from util.helpers import remove_empty_new_lines
from util.c_tokenizer import C_Tokenizer

# Filesystem locations for the DeepFix test data, the RLAssist network
# inputs, and the IITK dataset database.
deepfix_base_dir = 'data/deepfix-test-data/'
RLAssist_base_dir = 'data/network_inputs/RLAssist-seed-1189/'
iitk_db_path = 'data/iitk-dataset/dataset.db'

# Upper bound on the number of tokens accepted when vectorizing a program.
max_program_len = 45000

# Placeholder token string used where a correct program is required.
dummy_correct_program = '_eos_ -new-line- _pad_'

tokenize = C_Tokenizer().tokenize
convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format


def convert_to_rla_format(x):
    """Convert tokenized code to RLAssist's new-line format, dropping empty lines."""
    return remove_empty_new_lines(convert_to_new_line_format(x))


raw_test_data = {}
seeded_test_data = {}


def vectorize(tokens, tldict, max_vector_length=None):
    """Map a whitespace-separated token string to a list of integer ids.

    Args:
        tokens: whitespace-separated token string.
        tldict: dict mapping token -> integer id.
        max_vector_length: maximum allowed token count; defaults to the
            module-level ``max_program_len`` when None.

    Returns:
        The list of ids, or None if any token is out-of-vocabulary or
        the program exceeds ``max_vector_length`` tokens.
    """
    if max_vector_length is None:
        max_vector_length = max_program_len

    vec_tokens = []
    for token in tokens.split():
        try:
            vec_tokens.append(tldict[token])
        except KeyError:
            # Out-of-vocabulary token: the whole program is unusable.
            return None

    if len(vec_tokens) > max_vector_length:
        return None
    # BUG FIX: the original fell off the end here and returned None on
    # every input; return the vectorized tokens on success.
    return vec_tokens
def generate_training_data(db_path, bins, min_program_length, max_program_length, \
                                    max_fix_length, max_mutations, max_variants, seed):
    rng = np.random.RandomState(seed)
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format

    mutator_obj = Typo_Mutate_Java(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}, 'test': {}}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    code_id_list = []
    for bin_ in bins:
        for problem_id in bin_:
            code_id_list.append(problem_id)

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()

        code_id_list = []
        query = "SELECT code_id FROM Code WHERE codelength>? and codelength<?;"
        for row in cursor.execute(query,
                                  (min_program_length, max_program_length)):
            # get all the code_id
            code_id_list.append(row[0])

        rng.shuffle(code_id_list)

        # split into train, valiation and test test: 80%, 10%, 10%
        validation_code_id_list = code_id_list[0:int(0.1 * len(code_id_list))]
        test_code_id_list = code_id_list[int(0.1 * len(code_id_list)
                                             ):int(0.1 * len(code_id_list)) *
                                         2]
        training_code_id_list = code_id_list[int(0.1 * len(code_id_list)) * 2:]

        # make sure they do not intersect
        assert list(set(training_code_id_list)
                    & set(validation_code_id_list)) == []
        assert list(set(training_code_id_list) & set(test_code_id_list)) == []
        assert list(set(validation_code_id_list)
                    & set(test_code_id_list)) == []

        query = "SELECT code_id, tokenized_code, codelength FROM Code " + "WHERE codelength>? and codelength<?;"
        total_variant_cnt = 0
        for row in cursor.execute(query,
                                  (min_program_length, max_program_length)):
            code_id = row[0]
            tokenized_program = row[1]

            if code_id in validation_code_id_list:
                key = 'validation'
            if code_id in test_code_id_list:
                key = 'test'
            if code_id in training_code_id_list:
                key = 'train'

            # number of tokens
            program_length = row[2]  # row[2] is codelength
            program_lengths.append(program_length)

            if program_length > min_program_length and program_length < max_program_length:
                # start to mutate

                total_mutate_calls += 1
                try:
                    iterator = mutate(tokenized_program, max_mutations,
                                      max_variants)
                except FailedToMutateException:
                    print code_id
                    exceptions_in_mutate_call += 1
                except LoopCountThresholdExceededException:
                    print code_id
                    exceptions_in_mutate_call += 1
                except ValueError:
                    print code_id
                    exceptions_in_mutate_call += 1
                    raise
                except AssertionError:
                    print code_id
                    exceptions_in_mutate_call += 1
                    raise
                except Exception:
                    print code_id
                    exceptions_in_mutate_call += 1
                    raise
                else:
                    tokenized_program = remove_empty_new_lines(
                        convert_to_new_line_format(tokenized_program))
                    for corrupt_program, fix in iterator:
                        corrupt_program_length = len(corrupt_program.split())
                        fix_length = len(fix.split())
                        fix_lengths.append(fix_length)
                        if corrupt_program_length >= min_program_length and \
                        corrupt_program_length <= max_program_length and fix_length <= max_fix_length:
                            corrupt_program = remove_empty_new_lines(
                                convert_to_new_line_format(corrupt_program))
                            total_variant_cnt += 1
                            try:
                                token_strings[key][code_id] += [
                                    (code_id, corrupt_program,
                                     tokenized_program)
                                ]
                            except:
                                token_strings[key][code_id] = [
                                    (code_id, corrupt_program,
                                     tokenized_program)
                                ]

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print 'Statistics'
    print '----------'
    print 'Program length:  Mean =', np.mean(
        program_lengths), '\t95th %ile =', program_lengths[int(
            0.95 * len(program_lengths))]
    try:
        print 'Mean fix length: Mean =', np.mean(
            fix_lengths), '\t95th %ile = ', fix_lengths[int(0.95 *
                                                            len(fix_lengths))]
    except Exception as e:
        print e
        print 'fix_lengths'
    print 'Total mutate calls:', total_mutate_calls
    print 'Exceptions in mutate() call:', exceptions_in_mutate_call, '\n'

    return token_strings, mutator_obj.get_mutation_distribution()
def generate_training_data(db_path, bins, validation_users, min_program_length, max_program_length, \
                                    max_fix_length, max_mutations, max_variants, seed):
    rng = np.random.RandomState(seed)
    tokenize = C_Tokenizer().tokenize
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format

    mutator_obj = Typo_Mutate(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}}

    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    problem_list = []
    for bin_ in bins:
        for problem_id in bin_:
            problem_list.append(problem_id)

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query = "SELECT user_id, code_id, tokenized_code FROM Code " + "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;"
        for problem_id in problem_list:
            for row in cursor.execute(query, (problem_id, min_program_length, max_program_length)):
                user_id, code_id, tokenized_program = map(str, row)
                key = 'validation' if user_id in validation_users[problem_id] else 'train'

                program_length = len(tokenized_program.split())
                program_lengths.append(program_length)

                if program_length >= min_program_length and program_length <= max_program_length:

                    # Mutate
                    total_mutate_calls += 1
                    try:
                        iterator = mutate(tokenized_program, max_mutations, max_variants)

                    except FailedToMutateException:
                        exceptions_in_mutate_call += 1
                    except LoopCountThresholdExceededException:
                        exceptions_in_mutate_call += 1
                    except ValueError:
                        exceptions_in_mutate_call += 1
                        raise
                    except AssertionError:
                        exceptions_in_mutate_call += 1
                        raise
                    except Exception:
                        exceptions_in_mutate_call += 1
                        raise
                    else:
                        tokenized_program = remove_empty_new_lines(convert_to_new_line_format(tokenized_program))

                        for corrupt_program, fix in iterator:
                            corrupt_program_length = len(corrupt_program.split())
                            fix_length             = len(fix.split())
                            fix_lengths.append(fix_length)

                            if corrupt_program_length >= min_program_length and \
                            corrupt_program_length <= max_program_length and fix_length <= max_fix_length:

                                corrupt_program = remove_empty_new_lines(convert_to_new_line_format(corrupt_program))
                                try:
                                    token_strings[key][problem_id] += [(code_id, corrupt_program, tokenized_program)]
                                except:
                                    token_strings[key][problem_id] = [(code_id, corrupt_program, tokenized_program)]

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print 'Statistics'
    print '----------'
    print 'Program length:  Mean =', np.mean(program_lengths), '\t95th %ile =', program_lengths[int(0.95 * len(program_lengths))]
    try:
        print 'Mean fix length: Mean =', np.mean(fix_lengths), '\t95th %ile = ', fix_lengths[int(0.95 * len(fix_lengths))]
    except Exception as e:
        print e
        print 'fix_lengths'
        print fix_lengths
    print 'Total mutate calls:', total_mutate_calls
    print 'Exceptions in mutate() call:', exceptions_in_mutate_call, '\n'

    for key in token_strings:
        print key
        for problem_id in token_strings[key]:
            print problem_id, len(token_strings[key][problem_id])

    return token_strings, mutator_obj.get_mutation_distribution()