Example #1
def make_preprocess():
    '''
        Read interim.csv and perform additional cleaning.
        1. Parse StartTime as DateTime
        2. Perform binning on source and destination ports
        3. Add attribute indicating direction of flow
        4. Write to preprocessed.csv
    '''
    config = load_yaml(CONFIG_PATH)
    interim_output_path = config['interim_output_path']
    preprocessed_output_path = config['preprocessed_output_path']
    proto_dict = load_json(config['proto_dict_path'])
    dir_dict = load_json(config['dir_dict_path'])
    state_dict = load_json(config['state_dict_path'])
    # Well-known ports range from 0 through 1023
    # Registered ports are 1024 to 49151
    # Dynamic ports (also called private ports) are 49152 to 65535
    port_bins = [0, 1023, 49151, 65535]
    port_labels = [0, 1, 2]

    interim_df = pd.read_csv(interim_output_path, sep=',', escapechar='\\')
    preprocessed_df = interim_df
    preprocessed_df['StartTime'] = pd.to_datetime(preprocessed_df['StartTime'])

    preprocessed_df['Proto_Int'] = preprocessed_df['Proto'].map(proto_dict)
    preprocessed_df['Proto_Int'] = preprocessed_df['Proto_Int'].fillna(
        proto_dict['Unknown'])
    preprocessed_df['Proto_Int'] = preprocessed_df['Proto_Int'].astype(
        'category')

    preprocessed_df['Sport_Int'] = pd.cut(preprocessed_df['Sport'],
                                          bins=port_bins,
                                          labels=port_labels,
                                          include_lowest=True)
    preprocessed_df['Sport_Int'] = preprocessed_df['Sport_Int'].astype(
        'category')

    preprocessed_df['Dir_Int'] = preprocessed_df['Dir'].map(dir_dict)
    preprocessed_df['Dir_Int'] = preprocessed_df['Dir_Int'].fillna(
        dir_dict['Unknown'])
    preprocessed_df['Dir_Int'] = preprocessed_df['Dir_Int'].astype('category')

    preprocessed_df['Dport_Int'] = pd.cut(preprocessed_df['Dport'],
                                          bins=port_bins,
                                          labels=port_labels,
                                          include_lowest=True)
    preprocessed_df['Dport_Int'] = preprocessed_df['Dport_Int'].astype(
        'category')

    preprocessed_df['State_Int'] = preprocessed_df['State'].map(state_dict)
    preprocessed_df['State_Int'] = preprocessed_df['State_Int'].fillna(
        state_dict['Unknown'])
    preprocessed_df['State_Int'] = preprocessed_df['State_Int'].astype(
        'category')

    # Flag a flow as "forward" when the source port is ephemeral (>= 1024)
    preprocessed_df['is_fwd'] = preprocessed_df['Sport']
    preprocessed_df.loc[preprocessed_df['Sport'] >= 1024, 'is_fwd'] = 1
    preprocessed_df.loc[preprocessed_df['Sport'] < 1024, 'is_fwd'] = 0

    makedirs(dirname(preprocessed_output_path), exist_ok=True)
    preprocessed_df.to_csv(preprocessed_output_path, index=False)
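As a minimal standalone sketch of the port binning above (the sample ports are illustrative; bins and labels mirror make_preprocess()), pd.cut maps each port into the well-known (0), registered (1), or dynamic (2) range:

import pandas as pd

# Sample ports chosen only for illustration
ports = pd.Series([22, 80, 8080, 49152, 65535])
port_bins = [0, 1023, 49151, 65535]
port_labels = [0, 1, 2]

# include_lowest=True keeps port 0 inside the first (well-known) bin
binned = pd.cut(ports, bins=port_bins, labels=port_labels, include_lowest=True)
print(binned.tolist())  # [0, 0, 1, 2, 2]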
Example #2
def make_raw_data():
    ''' create input.csv in project/data/raw/ directory '''
    config = load_yaml(CONFIG_PATH)
    binetflow_path = config['binet_output_path']
    raw_output_path = config['raw_output_path']
    dataset_path = config['dataset_path']
    dataset_json = load_json(dataset_path)
    dict_mal_hosts = dict_infected_hosts(dataset_json)
    file_list = get_file_list(binetflow_path)
    create_input_csv(file_list, binetflow_path, raw_output_path,
                     dict_mal_hosts)
Example #3
def main():
    '''main'''
    config = load_yaml(CONFIG_PATH)
    processed_csv_path = config['processed_path']
    figure_output_path = config['figure_output_path']

    sns.set(style='ticks', color_codes=True)
    print('Creating Figures....')
    processed_df = pd.read_csv(processed_csv_path)
    plot_base_frequencies(processed_df, figure_output_path)
    plot_base_features(processed_df, figure_output_path)
    plot_engineered_features(processed_df, figure_output_path)
    print(f'Creation Completed. Find output at: {figure_output_path}')
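plot_base_frequencies, plot_base_features and plot_engineered_features are project helpers that are not shown in these examples. As a rough, hypothetical sketch of what one such helper could look like (the function name and output file name are assumptions; only the Label column is taken from the data set):

import os
import seaborn as sns
import matplotlib.pyplot as plt

def plot_label_frequency(df, figure_output_path):
    # Hypothetical helper: count plot of the Label column, saved as a PNG
    os.makedirs(figure_output_path, exist_ok=True)
    ax = sns.countplot(x='Label', data=df)
    ax.set_title('Label frequency')
    ax.figure.savefig(os.path.join(figure_output_path, 'label_frequency.png'))
    plt.close(ax.figure)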
Example #4
def main():
    '''main'''
    start_time = time.time()
    config = load_yaml(CONFIG_PATH)
    model_path = config['model_path']
    val_processed_path = config['val_processed_path']
    validation_path = config['validation_path']
    metric_path = config['metric_path']
    val_df = pd.read_csv(val_processed_path)
    val_df.State_Int = val_df.State_Int.astype('category')
    val_df.Dir_Int = val_df.Dir_Int.astype('category')
    val_df.Dport_Int = val_df.Dport_Int.astype('category')
    val_df.Sport_Int = val_df.Sport_Int.astype('category')
    val_df.loc[val_df.Label == 0, 'Label'] = -1

    print('ONE CLASS')
    with open(f'{model_path}oc_scaler.pickle', 'rb') as file:
        oc_scaler = pickle.load(file)
    with open(f'{model_path}oneclass.pickle', 'rb') as file:
        oc_model = pickle.load(file)
    results = oc_model.predict(oc_scaler.transform(df_train_subset(val_df)))
    save_performance(val_df.Label, results, metric_path, 'oneclass', 'validate')
    save_confuse_matrix(val_df.Label, results, metric_path, 'oneclass', 'validate')

    print('LOGISTIC REGRESSION')
    with open(f'{model_path}lr_scaler.pickle', 'rb') as file:
        lr_scaler = pickle.load(file)
    with open(f'{model_path}lr.pickle', 'rb') as file:
        lr_model = pickle.load(file)
    conf_score = oc_model.decision_function(oc_scaler.transform(df_train_subset(val_df)))
    true_label = val_df.Label
    # Re-append Label so it sits at the end of the frame, next to the prediction columns
    val_df.drop(columns=['Label'], inplace=True, axis=1)
    val_df['Label'] = true_label
    val_df['Predicted_Label'] = results
    val_df['Confidence_Score'] = conf_score
    results = lr_model.predict(lr_scaler.transform(df_train_subset(val_df)))
    save_performance(val_df.Label, results, metric_path, 'lr', 'validate')
    save_confuse_matrix(val_df.Label, results, metric_path, 'lr', 'validate')
    print(f'Time To Predict: {time.time() - start_time}')
    start_time = time.time()
    ncs = lr_model.predict_proba(lr_scaler.transform(df_train_subset(val_df)))
    lr_classes = lr_model.classes_
    val_df[f'CS_LR_{lr_classes[0]}'] = [prob[0] for prob in ncs]
    val_df[f'CS_LR_{lr_classes[1]}'] = [prob[1] for prob in ncs]
    print(f'Normalize Results: {time.time() - start_time}')
    start_time = time.time()
    makedirs(dirname(validation_path), exist_ok=True)
    val_df.to_csv(validation_path, index=False)
    print(f'Saving CSV to Validation: {time.time() - start_time}')
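save_performance and save_confuse_matrix are repo helpers that are not reproduced here. A minimal sketch of a metrics writer along those lines, assuming the same (y_true, y_pred, metric_path, model_name, stage) signature and a simple text-file layout:

import os
from sklearn.metrics import precision_score, recall_score, f1_score

def save_performance(y_true, y_pred, metric_path, model_name, stage):
    # Hypothetical sketch: write precision/recall/F1 for one model and stage to a text file
    os.makedirs(metric_path, exist_ok=True)
    out_path = os.path.join(metric_path, f'{model_name}_{stage}_performance.txt')
    with open(out_path, 'w') as out_file:
        out_file.write(f'precision: {precision_score(y_true, y_pred)}\n')
        out_file.write(f'recall: {recall_score(y_true, y_pred)}\n')
        out_file.write(f'f1: {f1_score(y_true, y_pred)}\n')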
Example #5
def main():
    '''Retrieves dataset_json and sends a request to download the binetflow
    for each object'''
    config = load_yaml(CONFIG_PATH)
    json_path = config['dataset_path']
    output_base_path = config['binet_output_path']
    dataset_json = get_dataset_json(json_path)
    urllib3.disable_warnings()
    for obj in dataset_json:
        download_url = obj['source']
        file_name = download_url.split('/')[-2]
        if not os.path.isfile(output_base_path + '/' + file_name + '.csv'):
            binet_flow = download_binetflow(download_url)
            if binet_flow:
                write_binetflow_to_file(output_base_path + '/' + file_name,
                                        binet_flow)
            time.sleep(8)
        else:
            print(file_name + ' already exists')
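download_binetflow and write_binetflow_to_file come from the same project and are not shown. A minimal sketch under the assumption that each source URL is a plain HTTPS download (verify=False matches the urllib3.disable_warnings() call above, and the .csv suffix matches the existence check in main()):

import requests

def download_binetflow(download_url):
    # Hypothetical sketch: fetch the binetflow text, or return None on failure
    try:
        response = requests.get(download_url, verify=False, timeout=60)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None

def write_binetflow_to_file(output_path_without_ext, binet_flow):
    # Hypothetical sketch: persist the downloaded text as a .csv next to the base path
    with open(output_path_without_ext + '.csv', 'w') as out_file:
        out_file.write(binet_flow)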
Example #6
def make_interim():
    '''
        Read input.csv and remove rows with null {srcaddr, dstaddr, srcport, dstport} values
    '''
    config = load_yaml(CONFIG_PATH)
    raw_output_path = config['raw_output_path']
    interim_output_path = config['interim_output_path']
    makedirs(dirname(interim_output_path), exist_ok=True)
    with open(raw_output_path, 'r') as input_file:
        with open(interim_output_path, 'w') as interim_file:
            line = input_file.readline()
            row_l = remove_srcu_dstu(line)
            interim_file.write(','.join(row_l))
            line = input_file.readline()
            while line:
                row_l = remove_srcu_dstu(line)
                if not mising_addr_info(row_l[3], row_l[4], row_l[6], row_l[7]):
                    try:
                        socket.inet_aton(row_l[3])
                        socket.inet_aton(row_l[6])
                        interim_file.write(','.join(row_l))
                    except OSError:
                        pass
                line = input_file.readline()
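socket.inet_aton is used above purely as an IPv4 validity check: it raises OSError for strings that cannot be parsed as dotted-quad addresses, so those rows are silently skipped. A tiny illustration:

import socket

def is_valid_ipv4(addr):
    # True only for strings that inet_aton accepts as an IPv4 address
    try:
        socket.inet_aton(addr)
        return True
    except OSError:
        return False

print(is_valid_ipv4('192.168.0.1'))     # True
print(is_valid_ipv4('not-an-address'))  # False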
Example #7
'''This is a Python program used to automate the process of updating the
IOC JSON file with the latest API information; no new rows are added.'''
import argparse
import os.path as path
import pathlib
import sys
import datetime
import bulk_json_generator as generator
sys.path.append('../')
import utils.validator as validator
import utils.file_util as util

# Declaring globals
FILE_PATH = pathlib.Path(__file__).parent.absolute()
# Holds VT API information
CONFIG = util.load_yaml('{}/config.yml'.format(FILE_PATH))


def argparser():
    '''Parses the user arguments and checks that the given paths are valid'''
    parser = argparse.ArgumentParser(
        description="Updating/Refreshing existing JSON latest API information")
    parser.add_argument('-path', '--path', required=True, dest='json_path', metavar='',
                        help='Path to bulk api JSON to be updated\
                            (e.g. C:/GitHub-Projects/Ransomware-Detection-Mechanism/ioc_list.json)',
                        action='store')

    args = parser.parse_args()
    # Checking validity of paths
    if args.json_path and not path.exists(args.json_path):
        parser.error(f"The file {args.json_path} does not exist!")
    return args
Example #8
def main():
    """
        Retrieves an already preprocessed version of the dataset.
        The final CSV will be sorted on StartTime.
        Build DstBytes
        Build features based on the following:
            Total flows in the forward direction in the window
            Total flows in the backward direction in the window
            Total size of netflows in forward direction in the window
            Total size of netflows in backward direction in the window
            Minimum size of flow in forward direction in the window
            Minimum size of flow in backward direction in the window
            Maximum size of flow in forward direction in the window
            Maximum size of flow in backward direction in the window
            Mean size of flow in forward direction in the window
            Mean size of flow in backward direction in the window
            Standard Deviation size of flow in forward direction in the window
            Standard Deviation size of flow in backward direction in the window
            Time between 2 flows in the window in the forward direction
            Time between 2 flows in the window in the backward direction
        A similar approach is done on TotBytes, TotPkts, SrcBytes.
        Window is 10k elements and 10 Minutes.
        This window is done again with focus on source and destination addresses
        A sample containing the first 50 rows will be saved.
        A new CSV with raw + discretized + engineered will be saved.
    """
    start = time.time()
    config = load_yaml(CONFIG_PATH)
    preprocessed_path = config['preprocessed_path']
    processed_output_path = config['processed_path']
    sample_output_path = config['sample_processed_path']
    sample_size = config['sample_size']
    #Window for N elements
    num_window_size = config['num_window_size']
    minutes_window_size = config['minutes_window_size']
    pd_roll_time_size = config['pd_roll_time_size']
    feature_df = pd.read_csv(preprocessed_path)
    feature_df['StartTime'] = pd.to_datetime(feature_df['StartTime'])
    feature_df = feature_df.sort_values('StartTime').reset_index(drop=True)
    feature_df['epoch'] = ((feature_df['StartTime'] - pd.Timestamp('1970-01-01'))
                           // pd.Timedelta('1ms')) / 1000
    #Roughly 191 extra columns should be added

    #Bytes
    feature_df['DstBytes'] = feature_df['TotBytes'] - feature_df['SrcBytes']

    print(f'Building Total Flow in {minutes_window_size} minutes')
    build_gen_total_flows(feature_df, pd_roll_time_size)

    print(f'Building Total Flows on SrcAddr and DstAddr over {minutes_window_size} minutes and {num_window_size} elements')
    build_addr_total_flows(feature_df, 'SrcAddr', pd_roll_time_size)
    build_addr_total_flows(feature_df, 'DstAddr', pd_roll_time_size)
    build_addr_total_flows(feature_df, 'SrcAddr', num_window_size)
    build_addr_total_flows(feature_df, 'DstAddr', num_window_size)

    print(f'Building Time Between 2 Flows with {minutes_window_size} minutes')
    feature_df = build_time_between_2_flow_time(feature_df, minutes_window_size)
    print(f'Building Time Between 2 Flows with {num_window_size} elements')
    feature_df = build_time_between_2_flow_num(feature_df, num_window_size)

    print(f'Building TotBytes, TotPkts, SrcBytes, metrics in {minutes_window_size} minutes')
    build_gen_features(feature_df, 'TotBytes', pd_roll_time_size)
    build_gen_features(feature_df, 'TotPkts', pd_roll_time_size)
    build_gen_features(feature_df, 'SrcBytes', pd_roll_time_size)

    print(f'Building TotBytes, TotPkts, SrcBytes with {minutes_window_size} minutes on SrcAddr')
    build_addr_features(feature_df, 'SrcAddr', 'TotBytes', pd_roll_time_size)
    build_addr_features(feature_df, 'SrcAddr', 'TotPkts', pd_roll_time_size)
    build_addr_features(feature_df, 'SrcAddr', 'SrcBytes', pd_roll_time_size)

    print(f'Building TotBytes, TotPkts, SrcBytes with {minutes_window_size} minutes on DstAddr')
    build_addr_features(feature_df, 'DstAddr', 'TotBytes', pd_roll_time_size)
    build_addr_features(feature_df, 'DstAddr', 'TotPkts', pd_roll_time_size)
    build_addr_features(feature_df, 'DstAddr', 'SrcBytes', pd_roll_time_size)

    print(f'Building TotBytes, TotPkts, SrcBytes with {num_window_size} elements on Src and Dst')
    build_addr_features(feature_df, 'SrcAddr', 'TotBytes', num_window_size)
    build_addr_features(feature_df, 'SrcAddr', 'TotPkts', num_window_size)
    build_addr_features(feature_df, 'SrcAddr', 'SrcBytes', num_window_size)
    build_addr_features(feature_df, 'DstAddr', 'TotBytes', num_window_size)
    build_addr_features(feature_df, 'DstAddr', 'TotPkts', num_window_size)
    build_addr_features(feature_df, 'DstAddr', 'SrcBytes', num_window_size)

    #Write Sample to CSV
    makedirs(dirname(processed_output_path), exist_ok=True)
    feature_df.drop(columns=['epoch'], inplace=True, axis=1)
    feature_df.head(sample_size).to_csv(sample_output_path, index=False)
    #Write Raw and Features to CSV file.
    feature_df.to_csv(processed_output_path, index=False)
    print(time.time() - start)
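build_gen_features, build_gen_total_flows and the build_addr_* builders are project functions not reproduced here. A minimal sketch of the kind of time-window aggregation the docstring describes, assuming the frame is already sorted on StartTime and using a hypothetical helper name (the real builders also handle element-count windows and per-address grouping):

import pandas as pd

def build_window_stats(df, column, window='10min'):
    # Hypothetical sketch: rolling sum/mean/std/min/max of `column` over a time window
    rolled = df.rolling(window, on='StartTime')[column]
    df[f'{column}_Sum_{window}'] = rolled.sum()
    df[f'{column}_Mean_{window}'] = rolled.mean()
    df[f'{column}_Std_{window}'] = rolled.std()
    df[f'{column}_Min_{window}'] = rolled.min()
    df[f'{column}_Max_{window}'] = rolled.max()
    return df

# Usage against the frame above (the '10min' window string is an assumption for pd_roll_time_size):
# feature_df = build_window_stats(feature_df, 'TotBytes', '10min')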
Example #9
def main():
    '''main'''
    total_start_time = time.time()
    config = load_yaml(CONFIG_PATH)
    metric_path = config['metric_path']
    model_path = config['model_path']
    processed_path = config['processed_path']
    trained_path = config['trained_path']
    feature_df = pd.read_csv(processed_path)
    feature_df['StartTime'] = pd.to_datetime(feature_df['StartTime'])
    feature_df.loc[feature_df.Label == 0, 'Label'] = -1
    feature_df.Proto_Int = feature_df.Proto_Int.astype('category')
    feature_df.Sport_Int = feature_df.Sport_Int.astype('category')
    feature_df.Dir_Int = feature_df.Dir_Int.astype('category')
    feature_df.Dport_Int = feature_df.Dport_Int.astype('category')
    feature_df.State_Int = feature_df.State_Int.astype('category')
    malicious_df = feature_df.loc[feature_df.Label == 1]
    mal_forward_df = malicious_df.loc[malicious_df.is_fwd == 1]
    mal_back_df = malicious_df.loc[malicious_df.is_fwd == 0]
    benign_df = feature_df.loc[feature_df.Label == -1]
    del feature_df, malicious_df
    X_fwd_train, X_fwd_test, y_fwd_train, y_fwd_test = train_test_split(
        mal_forward_df, mal_forward_df['Label'], test_size=0.2, random_state=0)
    X_bwd_train, X_bwd_test, y_bwd_train, y_bwd_test = train_test_split(
        mal_back_df, mal_back_df['Label'], test_size=0.2, random_state=0)
    del mal_forward_df, mal_back_df
    X_train = pd.concat([X_fwd_train, X_bwd_train])

    X_test = pd.concat([X_fwd_test, X_bwd_test])
    X_test = pd.concat([X_test, benign_df])

    y_train = X_train.Label
    y_test = X_test.Label

    del X_fwd_train, X_fwd_test, y_fwd_train, y_fwd_test
    del X_bwd_train, X_bwd_test, y_bwd_train, y_bwd_test
    del benign_df
    # Hyper Tuning One Class
    # sample_size = 100000
    # if len(X_train) < sample_size:
    #     sample_size = len(X_train)
    # X_train_sample = X_train.sample(sample_size, random_state=0)
    # y_train_sample = X_train_sample.Label
    # start_time = time.time()
    # print(f'Hyper Tune with Size {sample_size}')
    # oc_params = tune_oneclass(df_train_subset(X_train_sample), y_train_sample, 'f1')
    # print(f'Time (param search) {sample_size} size. 3 Folds. 18 tot Fits: {time.time()-start_time}')
    oc_kernel = 'rbf'
    oc_nu = 1e-2
    oc_gamma = 1e-6
    oc_clf = OneClassSVM(kernel=oc_kernel,
                         nu=oc_nu,
                         gamma=oc_gamma,
                         cache_size=7000,
                         verbose=True)
    oc_model_name = 'oneclass'
    oc_scaler = preprocessing.StandardScaler()
    oc_scaler.fit(df_train_subset(X_train))
    save_model(oc_scaler, model_path, 'oc_scaler')

    #Fit One Class
    start_time = time.time()
    oc_predict_train = oc_clf.fit_predict(oc_scaler.transform(
        df_train_subset(X_train)),
                                          y=y_train)
    print(
        f'Time One Class Train Size {len(X_train)} :{time.time() - start_time}'
    )
    save_model(oc_clf, model_path, oc_model_name)

    #Confusion Matrix
    save_confuse_matrix(y_train, oc_predict_train, metric_path, oc_model_name,
                        'train')
    oc_predict_test = oc_clf.predict(
        oc_scaler.transform(df_train_subset(X_test)))
    save_confuse_matrix(y_test, oc_predict_test, metric_path, oc_model_name,
                        'test')

    #Performance
    save_performance(y_train, oc_predict_train, metric_path, oc_model_name,
                     'train')
    save_performance(y_test, oc_predict_test, metric_path, oc_model_name,
                     'test')

    #Get confidence scores
    start_time = time.time()
    data_f = pd.concat([X_train, X_test])
    data_f.sort_values('StartTime', inplace=True)
    oc_conf_score = oc_clf.decision_function(
        oc_scaler.transform(df_train_subset(data_f)))
    print(f'Time Confidence Scores: {time.time() - start_time}')
    del data_f, oc_kernel, oc_nu, oc_gamma, oc_clf, oc_scaler

    #Saving to CSV
    start_time = time.time()
    x_test_label = X_test['Label']
    X_test.drop(columns=['Label'], inplace=True, axis=1)
    X_test['Label'] = x_test_label
    X_test['Predicted_Label'] = oc_predict_test

    mal_train_label = X_train['Label']
    X_train.drop(columns=['Label'], inplace=True, axis=1)
    X_train['Label'] = mal_train_label
    X_train['Predicted_Label'] = oc_predict_train

    final_df = pd.concat([X_train, X_test])
    del X_train, X_test, y_train, y_test
    final_df.sort_values('StartTime', inplace=True)

    final_df['Confidence_Score'] = oc_conf_score
    makedirs(dirname(f'{trained_path}'), exist_ok=True)
    final_df.to_csv(f'{trained_path}{oc_model_name}.csv', index=False)
    print(f'Saving one_class_features csv: {time.time() - start_time}')

    # Train Logistic Regression
    # Hyper tune with 10 percent of data.
    # start_time = time.time()
    # lr_train_size = 0.1
    # if len(final_df) < 100000:
    #     lr_train_size = 0.95
    # final_df, X_test_sample, y_train_s, y_test_s = train_test_split(final_df,
    #                                                                 final_df.Label,
    #                                                                 train_size=lr_train_size,
    #                                                                 stratify=final_df.Label)
    # del X_test_sample, y_train_s, y_test_s
    # lr_params = tune_log_reg(df_train_subset(final_df), final_df.Label, 'average_precision')
    # print(f'Time Hyper Tuning LR: {time.time() - start_time}')
    lr_params = {'C': 69.54618247583652, 'tol': 0.0009555227427965779}
    lr_clf = LogisticRegression(solver='saga',
                                penalty='l2',
                                dual=False,
                                tol=lr_params['tol'],
                                C=lr_params['C'],
                                max_iter=80000)
    lr_model_name = 'lr'
    lr_scaler = preprocessing.StandardScaler()
    lr_scaler.fit(df_train_subset(final_df))
    #Save LR Scaler
    save_model(lr_scaler, model_path, 'lr_scaler')

    #Fit Logistic Regression
    start_time = time.time()
    lr_train_transformed = lr_scaler.transform(df_train_subset(final_df))
    lr_clf.fit(lr_train_transformed, y=final_df.Label)
    save_model(lr_clf, model_path, lr_model_name)
    print(f'Time Train LR Size {len(final_df)}: {time.time() - start_time}')

    #Performance (Write afterwards)
    lr_predicted = lr_clf.predict(lr_train_transformed)
    save_performance(final_df.Label, lr_predicted, metric_path, lr_model_name,
                     'train')

    #Confusion Matrix
    save_confuse_matrix(final_df.Label, lr_predicted, metric_path,
                        lr_model_name, 'train')

    #Normalize Confidence Score
    start_time = time.time()
    ncs = normalize_confidence_score(lr_clf, lr_scaler,
                                     df_train_subset(final_df))
    final_df['LR_Predicted'] = lr_predicted
    lr_classes = lr_clf.classes_
    final_df[f'CS_LR_{lr_classes[0]}'] = [prob[0] for prob in ncs]
    final_df[f'CS_LR_{lr_classes[1]}'] = [prob[1] for prob in ncs]
    print(f'Time Normalize Conf Score: {time.time() - start_time}')

    #Save to CSV
    start_time = time.time()
    final_df.to_csv(f'{trained_path}{lr_model_name}.csv', index=False)
    print(f'Time Saving Normalized DF to CSV: {time.time() - start_time}')
    print(
        f'Training Complete - Time Elapsed: {time.time() - total_start_time}')
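normalize_confidence_score is another project helper. Given that Example #4 later reads the same CS_LR_* columns straight out of lr_model.predict_proba, a plausible minimal sketch (the wrapper itself is an assumption) is simply:

def normalize_confidence_score(model, scaler, features):
    # Hypothetical sketch: per-class probabilities for the scaled features,
    # one row per sample, column order matching model.classes_
    return model.predict_proba(scaler.transform(features))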