import os
import time
from collections import Counter

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# NOTE: the remaining helpers used below (dimension_reduction, pca_show,
# mix_normal_attack_and_label, run_gan_main, svm_evalution, test_SVM,
# show_figures, show_figures_2) are defined elsewhere in this repo.


def mix_data(normal_file, attack_file, label_dict, output_dir):
    """Load normal and attack samples, label them via label_dict, and balance
    the two classes by truncating both to the smaller class size.

    Note: output_dir is accepted for interface consistency but is not used here.
    """

    def open_file(input_file):  # local CSV reader; distinct from the module-level open_file
        X = []
        with open(input_file, 'r') as in_hdl:
            for line in in_hdl:
                line_arr = line.strip().split(',')
                X.append(line_arr)

        return X

    X_normal = open_file(normal_file)
    y_normal = [label_dict['normal']] * len(X_normal)

    X_attack = open_file(attack_file)
    y_attack = [label_dict['attack']] * len(X_attack)

    # balance the classes: keep only the first min_len samples of each
    min_len = min(len(y_normal), len(y_attack))

    X = []
    y = []
    X_normal = X_normal[:min_len]
    y_normal = y_normal[:min_len]
    X_attack = X_attack[:min_len]
    y_attack = y_attack[:min_len]

    X.extend(X_normal)
    y.extend(y_normal)
    X.extend(X_attack)
    y.extend(y_attack)

    return X, y
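

# normalizate_data is defined elsewhere in this repo. The version below is a
# minimal min-max-scaling sketch inferred from its call sites in main(), which
# unpack (norm_X, min_val, max_val, range_val) and reuse min_val/range_val to
# scale the val and test sets. It is an assumption, not the repo's actual
# implementation.
def normalizate_data(X):
    """Column-wise min-max normalization of X into [0, 1].

    Returns the scaled array plus the per-column min, max, and range so the
    same transform can be applied to validation and test data.
    """
    X = np.asarray(X, dtype=float)
    min_val = np.min(X, axis=0)
    max_val = np.max(X, axis=0)
    range_val = max_val - min_val
    range_val[range_val == 0] = 1e-12  # guard against constant columns
    norm_X = (X - min_val) / range_val
    return norm_X, min_val, max_val, range_val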


def plot_data(gan_loss_file, gan_decision_file, name='gan'):
    """Plot the GAN's loss curves and discriminator decisions."""
    X, _ = open_file(gan_loss_file, has_y_flg=False)
    show_figures(X[:, 0], X[:, 1], name)
    X, _ = open_file(input_f=gan_decision_file, has_y_flg=False)
    show_figures_2(X, name)
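

# open_file and save_numpy_data are also defined elsewhere in this repo; the
# sketches below are inferred from the call sites only. open_file loads a
# comma-separated file into numpy arrays, treating the last column as the
# label when has_y_flg is True; save_numpy_data writes (X, y) back out as
# "feat_1,...,feat_n,label" rows and returns the output path. Both bodies are
# assumptions, not the repo's actual implementations.
def open_file(input_f, has_y_flg=True):
    """Load a comma-separated file; return (X, y), with y=None if has_y_flg is False."""
    data = np.loadtxt(input_f, delimiter=',', dtype=float, ndmin=2)
    if has_y_flg:
        return data[:, :-1], data[:, -1].astype(int)
    return data, None


def save_numpy_data(data, output_f):
    """Write (X, y) to output_f, one 'features,label' row per sample; return the path."""
    X, y = data
    with open(output_f, 'w') as out_hdl:
        for row, label in zip(np.asarray(X, dtype=float), np.asarray(y)):
            out_hdl.write(','.join(str(v) for v in row) + ',%s\n' % label)
    return output_f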


def main(normal_f='',
         attack_f='',
         gan_type='dcgan',
         epochs=10,
         label_dict=None,
         output_dir='./log',
         select_train_size=0.01,
         show_flg=False,
         random_state=1,
         tp_tn_train_flg=True,
         time_str=None):
    """

    :param normal_f:
    :param attack_f:
    :param output_dir:
    :param gan_type= 'naive_gan' or 'dcgan'
    :param tp_tn_train_flg: train normal and attack gan only the svm predict correctly values (tp and tn) from case2 train set.
    :return:
    """

    # step 1: load the raw normal and attack data, and balance them (attack size == normal size).
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    orig_mix_data = mix_data(normal_f, attack_f, label_dict, output_dir)
    X, y = orig_mix_data
    print(
        f"original data shape: {np.asarray(X, dtype=float).shape, np.asarray(y,dtype=int).shape} and y_label:{Counter(y)}"
    )
    if show_flg:
        pca_show(X, y, name='pca n_components=2 on all data')

    # step 1.2 feature reduction
    orig_mix_data = dimension_reduction(X, y, n_components=3)
    print()

    # step 1.3 split train, val and test
    print('case1...')
    # 70% train, 10% val and 20% test
    train_70, val_10, test_20 = split_train_val_test_data(
        orig_mix_data, train_val_test_ratio=[0.7, 0.1, 0.2])
    case1_data = {
        'train_set': train_70,
        'val_set': val_10,
        'test_set': test_20
    }
    norm_X_train_70, min_val, max_val, range_val = normalizate_data(
        case1_data['train_set'][0])
    norm_X_val_10 = (case1_data['val_set'][0] - min_val) / range_val
    print(
        f'after normalization, val_set range is {np.max(norm_X_val_10, axis=0)- np.min(norm_X_val_10, axis=0)}'
    )
    norm_X_test_20 = (case1_data['test_set'][0] - min_val) / range_val
    print(
        f'after normalization, test_set range is {np.max(norm_X_test_20, axis=0)- np.min(norm_X_test_20, axis=0)}'
    )
    case1_norm_data = {
        'train_set': (norm_X_train_70, train_70[1]),
        'val_set': (norm_X_val_10, case1_data['val_set'][1]),
        'test_set': (norm_X_test_20, case1_data['test_set'][1])
    }
    print(
        f"case1 train_set:{case1_norm_data['train_set'][0].shape, case1_norm_data['train_set'][1].shape}, "
        f"val_set:{case1_norm_data['val_set'][0].shape, case1_norm_data['val_set'][1].shape}, "
        f"test_set:{case1_norm_data['test_set'][0].shape, case1_norm_data['test_set'][1].shape}"
    )
    print(f"case1 y_train:{Counter(case1_norm_data['train_set'][1])}, "
          f"y_val:{Counter(case1_norm_data['val_set'][1])}, "
          f"y_test:{Counter(case1_norm_data['test_set'][1])}\n")

    case1_train_f = save_numpy_data(case1_norm_data['train_set'],
                                    output_f=os.path.join(
                                        output_dir, 'case1_train_file.txt'))
    case1_val_f = save_numpy_data(case1_norm_data['val_set'],
                                  output_f=os.path.join(
                                      output_dir, 'case1_val_file.txt'))
    case1_test_f = save_numpy_data(case1_norm_data['test_set'],
                                   output_f=os.path.join(
                                       output_dir, 'case1_test_file.txt'))

    print('case2...')
    # 70% * select_train_size train, 10% val and 20% test
    X_train, y_train = train_70
    X_train, _, y_train, _ = train_test_split(X_train,
                                              y_train,
                                              train_size=select_train_size,
                                              random_state=random_state)
    case2_data = {
        'train_set': (X_train, y_train),
        'val_set': val_10,
        'test_set': test_20
    }
    norm_X_train_70_03, min_val, max_val, range_val = normalizate_data(
        np.asarray(
            case2_data['train_set'][0],
            dtype=float))  # min/max/range may differ from case1's because the train set changed.
    norm_X_val_10 = (case2_data['val_set'][0] - min_val) / range_val
    print(
        f'after normalization, val_set range is {np.max(norm_X_val_10, axis=0)- np.min(norm_X_val_10, axis=0)}'
    )
    norm_X_test_20 = (case2_data['test_set'][0] - min_val) / range_val
    print(
        f'after normalization, test_set range is {np.max(norm_X_test_20, axis=0)- np.min(norm_X_test_20, axis=0)}'
    )
    case2_norm_data = {
        'train_set': (norm_X_train_70_03, y_train),
        'val_set': (norm_X_val_10, case2_data['val_set'][1]),
        'test_set': (norm_X_test_20, case2_data['test_set'][1])
    }
    print(
        f"case2 train_set:{case2_norm_data['train_set'][0].shape, case2_norm_data['train_set'][1].shape}, "
        f"val_set:{case2_norm_data['val_set'][0].shape, case2_norm_data['val_set'][1].shape}, "
        f"test_set:{case2_norm_data['test_set'][0].shape, case2_norm_data['test_set'][1].shape}"
    )
    print(f"case2 y_train:{Counter(case2_norm_data['train_set'][1])}, "
          f"y_val:{Counter(case2_norm_data['val_set'][1])}, "
          f"y_test:{Counter(case2_norm_data['test_set'][1])}\n")

    case2_train_f = save_numpy_data(case2_norm_data['train_set'],
                                    output_f=os.path.join(
                                        output_dir, 'case2_train_file.txt'))
    # case2's val and test sets are identical to case1's and case3's, but the
    # normalization range_val differs because it comes from the smaller train set.
    save_numpy_data(
        case2_norm_data['val_set'],
        output_f=os.path.join(output_dir, 'case2_val_file.txt'))
    save_numpy_data(case2_norm_data['test_set'],
                    output_f=os.path.join(output_dir, 'case2_test_file.txt'))
    X_normal, y_normal, X_attack, y_attack = split_mix_data(
        case2_norm_data['train_set'], label_dict)
    save_numpy_data((X_normal, y_normal),
                    output_f=os.path.join(output_dir,
                                          'case2_normal_train_file.txt'))
    save_numpy_data((X_attack, y_attack),
                    output_f=os.path.join(output_dir,
                                          'case2_attack_train_file.txt'))

    # step 2: train and evaluate a traditional ML classifier (SVM)
    print('case1 train svm on 70% train-set, 30% test-set')
    # svm_evalution(case1_norm_data['train_set'], case1_norm_data['val_set'], case1_norm_data['test_set'],
    #               name='svm on 70% train set')
    print(
        f'case2 train svm on 70%*{select_train_size*100}% train-set, 30% test-set'
    )
    tp_normal, tn_attack = svm_evalution(
        case2_norm_data['train_set'],
        case2_norm_data['val_set'],
        case2_norm_data['test_set'],
        name=f'svm on 70%*{select_train_size*100}% train set')
    if tp_tn_train_flg:
        print(
            'the normal and attack GANs will be trained on train sets of different sizes, '
            'determined by the confusion matrix of the SVM.'
        )
        X_normal, y_normal = tp_normal  # keep only the normal samples the SVM classified correctly (TP)
        X_attack, y_attack = tn_attack  # keep only the attack samples the SVM classified correctly (TN)
    else:
        print('the normal and attack GANs will be trained on train sets of equal size.')
    tp_normal_train_f = save_numpy_data(
        (X_normal, y_normal),
        output_f=os.path.join(output_dir, 'case2_tp_normal_train_file.txt'))
    tn_attack_train_f = save_numpy_data(
        (X_attack, y_attack),
        output_f=os.path.join(output_dir, 'case2_tn_attack_train_file.txt'))

    # step 3: build separate GANs for the normal and attack data, each trained on the case2 subset.
    num = 10000  # number of samples each GAN will generate
    print(
        f'\nnormal_gan on train set:{np.asarray(X_normal, dtype=float).shape, np.asarray(y_normal,dtype=int).shape}'
    )
    #   3.1 normal_gan
    new_gen_normal_f, normal_gan_loss_file, normal_gan_decision_file = run_gan_main(
        input_f=tp_normal_train_f,
        name='normal',
        generated_num=num,
        output_dir=output_dir,
        epochs=epochs,
        show_flg=show_flg,
        gan_type=gan_type,
        time_str=time_str)
    print(
        f'\nattack_gan on train set:{np.asarray(X_attack, dtype=float).shape, np.asarray(y_attack,dtype=int).shape}'
    )
    #   3.2 attack_gan
    new_gen_attack_f, attack_gan_loss_file, attack_gan_decision_file = run_gan_main(
        input_f=tn_attack_train_f,
        name='attack',
        generated_num=num,
        output_dir=output_dir,
        epochs=epochs,
        show_flg=show_flg,
        gan_type=gan_type,
        time_str=time_str)
    #   3.3 merge the generated data (normal and attack data)
    (_, _), generate_train_f = mix_normal_attack_and_label(
        new_gen_normal_f,
        new_gen_attack_f,
        label_dict=label_dict,
        start_feat_idx=[0, '-'],
        output_f=os.path.join(output_dir,
                              'case3_generated_%d_mix_data.csv' % num))
    # step 4: mix the original train data with the newly generated data
    new_train_f = mix_two_files(case2_train_f,
                                generate_train_f,
                                output_f=os.path.join(
                                    output_dir, 'case3_new_train_set.csv'))
    print(
        "'new train set' (original train set plus generated data) is saved in '%s'.\n"
        % new_train_f)

    # step 5: train and test the ML classifier on the new data
    print('case3...')
    X_new_train, y_new_train = open_file(
        new_train_f
    )  # the GAN generator ends in a sigmoid, so its outputs are already in [0, 1]; no renormalization needed.
    # case3 reuses case2's normalized val and test sets: both cases share the
    # same original train subset (70% * select_train_size) and hence the same
    # normalization range. np.random.shuffle cannot shuffle X and y in
    # parallel, so sklearn's shuffle is used below instead.
    X_new_train, y_new_train = shuffle(X_new_train,
                                       y_new_train,
                                       random_state=random_state)

    case3_norm_data = {
        'train_set': (X_new_train, y_new_train),
        'val_set': case2_norm_data['val_set'],
        'test_set': case2_norm_data['test_set']
    }
    print(
        f"case3 train_set:{case3_norm_data['train_set'][0].shape, case3_norm_data['train_set'][1].shape}, "
        f"val_set:{case3_norm_data['val_set'][0].shape, case3_norm_data['val_set'][1].shape}, "
        f"test_set:{case3_norm_data['test_set'][0].shape, case3_norm_data['test_set'][1].shape}"
    )
    print(f"case3 y_train:{Counter(case3_norm_data['train_set'][1])}, "
          f"y_val:{Counter(case3_norm_data['val_set'][1])}, "
          f"y_test:{Counter(case3_norm_data['test_set'][1])}\n")
    print('case3 train svm on the new train set and evaluate on the 30% test set')
    svm_evalution(case3_norm_data['train_set'],
                  case3_norm_data['val_set'],
                  case3_norm_data['test_set'],
                  name='svm on new train set')

    return case1_train_f, case2_train_f, new_train_f, case1_val_f, case1_test_f
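

# split_train_val_test_data and split_mix_data are defined elsewhere in this
# repo. The sketches below are minimal versions consistent with how they are
# called in main(): the first chains two sklearn train_test_split calls to
# produce (X, y) tuples in the requested proportions; the second separates a
# mixed (X, y) set back into normal and attack samples using label_dict. Both
# are assumptions; the actual helpers may shuffle or stratify differently.
def split_train_val_test_data(data, train_val_test_ratio=(0.7, 0.1, 0.2), random_state=1):
    """Split (X, y) into train/val/test (X, y) tuples by the given ratios."""
    X, y = data
    train_ratio, val_ratio, test_ratio = train_val_test_ratio
    # first carve off the train portion ...
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size=train_ratio, random_state=random_state)
    # ... then split the remainder into val and test
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, train_size=val_ratio / (val_ratio + test_ratio),
        random_state=random_state)
    return (X_train, y_train), (X_val, y_val), (X_test, y_test)


def split_mix_data(data, label_dict):
    """Separate a mixed (X, y) set into its normal and attack parts."""
    X, y = data
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    normal_mask = y.astype(int) == int(label_dict['normal'])
    X_normal, y_normal = X[normal_mask], y[normal_mask]
    X_attack, y_attack = X[~normal_mask], y[~normal_mask]
    return X_normal, y_normal, X_attack, y_attack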


def main_legacy(normal_f='',
                attack_f='',
                epochs=10,
                label_dict=None,
                output_dir='./log',
                select_train_size=0.03,
                show_flg=False):
    """Earlier, simpler variant of the pipeline: a single 70/30 train/test
    split with no validation set and no case1/case2/case3 bookkeeping.

    :param normal_f: path to the normal-traffic feature file (CSV)
    :param attack_f: path to the attack-traffic feature file (CSV)
    :param epochs: number of GAN training epochs
    :param label_dict: class-label mapping; defaults to {'normal': '0', 'attack': '1'}
    :param output_dir: directory for all intermediate and result files
    :param select_train_size: fraction of the train set kept for the small-train run
    :param show_flg: if True, show PCA visualizations
    :return: paths of the original, selected, and new train files, and the test file
    """
    if label_dict is None:
        label_dict = {'normal': '0', 'attack': '1'}

    # step 1: load the data (attack data size == normal data size)
    # step 1.1 mix normal (label=0) and attack (label=1) data, then normalize the mixed data to [0, 1]
    (X, y), output_f = mix_normal_attack_and_label(
        normal_f,
        attack_f,
        label_dict=label_dict,
        start_feat_idx=[0, '-'],
        output_f=os.path.join(output_dir, 'original_mix_data.csv'))
    if show_flg:
        pca_show(X, y, name='pca n_components=2 on all data')
    X, _, _, _ = normalizate_data(np.asarray(X, dtype=float))
    X, y = dimension_reduction(X, y, n_components=5)
    if show_flg:
        pca_show(X, y, name='pca n_components=2 on dimension reduction data')
    X, _, _, _ = normalizate_data(np.asarray(X, dtype=float))

    y = np.asarray(y, dtype=int)
    _ = save_numpy_data(
        (X, y),
        output_f=os.path.join(output_dir, 'original_normalized_mix_data.txt'))
    # t_sne_show(X, y)

    # step 1.2. split train (70%) and test (30%) on original mixed data
    X_train, X_test, y_train, y_test = train_test_split(np.asarray(
        X, dtype=float),
                                                        np.asarray(y,
                                                                   dtype=int),
                                                        test_size=0.3,
                                                        random_state=1)
    train_set = (X_train, y_train)
    if show_flg:
        pca_show(X_train, y_train, name='pca n_components=2, 70% train_set')
    test_set = (X_test, y_test)
    if show_flg:
        pca_show(X_test, y_test, name='pca n_components=2, 30% test_set')
    original_train_f = save_numpy_data(
        train_set,
        output_f=os.path.join(output_dir, 'original_normalized_train_set.csv'))
    original_test_f = save_numpy_data(
        test_set,
        output_f=os.path.join(
            output_dir, 'original_normalized_test_set.csv'))  # for evaluation
    print('test svm on 70% train-set, 30% test-set')
    test_SVM(X_train, y_train, X_test, y_test, name='svm on 70% train set')

    # step 4. separate normal and attack data in original train set
    _, X_selected, _, y_selected = train_test_split(
        X_train, y_train, test_size=select_train_size, random_state=1)
    original_select_train_f = save_numpy_data(
        (X_selected, y_selected),
        output_f=os.path.join(output_dir, 'original_selected_mix_data.csv'))

    print('test svm on %.2f%% train-set, 30%% test-set' %
          (select_train_size * 100))
    X_predict_normal, y_predict_normal, X_predict_attack, y_predict_attack = test_SVM(
        X_selected, y_selected, X_test, y_test,
        name='svm on %.2f%% train set' % (select_train_size * 100))
    normal_train_set_f = save_numpy_data(
        (X_predict_normal, y_predict_normal),
        output_f=os.path.join(output_dir, 'normal_data_for_GAN.csv'))
    attack_train_set_f = save_numpy_data(
        (X_predict_attack, y_predict_attack),
        output_f=os.path.join(output_dir, 'attack_data_for_GAN.csv'))

    # step 5: build separate GANs for the normal and attack data, each trained on the selected subset.
    num = 10000
    #   5.1 normal_gan
    new_gen_normal_f, normal_gan_loss_file, normal_gan_decision_file = run_gan_main(
        input_f=normal_train_set_f,
        name='normal',
        generated_num=num,
        output_dir=output_dir,
        epochs=epochs,
        show_flg=show_flg)
    #   5.2 attack_gan
    new_gen_attack_f, attack_gan_loss_file, attack_gan_decision_file = run_gan_main(
        input_f=attack_train_set_f,
        name='attack',
        generated_num=num,
        output_dir=output_dir,
        epochs=epochs,
        show_flg=show_flg)
    #   5.3 merge the generated data (normal and attack data)
    (_, _), generate_train_f = mix_normal_attack_and_label(
        new_gen_normal_f,
        new_gen_attack_f,
        label_dict=label_dict,
        start_feat_idx=[0, '-'],
        output_f=os.path.join(output_dir, 'generated_%d_mix_data.csv' % num))
    # step 6: mix the original train data with the newly generated data
    new_train_set_f = mix_two_files(original_select_train_f,
                                    generate_train_f,
                                    output_f=os.path.join(
                                        output_dir, '0_new_train_set.csv'))

    print(
        "'new train set' (original train set plus generated train set) is saved in '%s'."
        % new_train_set_f)

    print('test svm on new train-set, 30% test-set')
    X_new_train, y_new_train = open_file(new_train_set_f)
    test_SVM(X_new_train,
             y_new_train,
             X_test,
             y_test,
             name='svm on new train set')

    return original_train_f, original_select_train_f, new_train_set_f, original_test_f
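

# mix_two_files is defined elsewhere in this repo; the following is a minimal
# sketch consistent with its call sites, assuming it simply concatenates the
# rows of the two input CSV files into output_f and returns the output path.
def mix_two_files(file_1, file_2, output_f):
    """Append the rows of file_1 and file_2 into output_f; return its path."""
    with open(output_f, 'w') as out_hdl:
        for input_f in (file_1, file_2):
            with open(input_f, 'r') as in_hdl:
                for line in in_hdl:
                    line = line.strip()
                    if line:  # skip blank lines
                        out_hdl.write(line + '\n')
    return output_f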