def evaluator(args):
    """Run flow-level evaluation over all K folds for one benign threshold.

    For every fold the fold's classifier is loaded, per-record predictions
    are computed with ``predict_proba_per_record`` and ``evaluate_per_flow``
    turns them into three flow-level predictions (any / majority / all).
    One confusion matrix per variant is logged for each fold, and the
    element-wise sums over all folds are logged under the ``fold_avg_``
    prefix at the end.

    Args:
        args: 3-tuple ``(samplerdir, classifier_name, benign_threshold)``.
    """
    is_flow_cache_experiment = True
    K = 10
    samplerdir, classifier_name, benign_threshold = args
    print('treshold at ', benign_threshold)

    clf_dir = get_classifier_dir(samplerdir,
                                 classifier_name,
                                 class_weight=None)
    # The set of ground-truth classes is taken from fold 0 only.
    gt_classes_str = pd.read_csv(join(samplerdir, '{}fold_0.csv'.format(K)),
                                 usecols=['Label'])['Label'].unique()
    gt_classes = sorted(encode_label(gt_classes_str))

    C = len(gt_classes)
    cm_any_sum = np.zeros((C, C), dtype=float)
    cm_majority_sum = np.zeros((C, C), dtype=float)
    cm_all_sum = np.zeros((C, C), dtype=float)

    # Same output location for the per-fold and the aggregated results.
    result_dir = join(clf_dir,
                      'K_{}_benign_threshold_{}'.format(K, benign_threshold))

    col_names = ['Timestamp'] + get_cols4eval()
    for test_index in range(K):
        runs_dir = join(clf_dir, 'K_{}/log/{}'.format(K, test_index))
        if is_flow_cache_experiment:
            runs_dir = replace_w_unlimited_FC(runs_dir)
        clf = load_classifier(classifier_name, runs_dir)

        test_csv_file = join(samplerdir, '{}fold_{}.csv'.format(K, test_index))
        df = pd.read_csv(test_csv_file,
                         usecols=col_names,
                         dtype=get_dtype4normalized())
        # 'Day' is the first two characters of the timestamp string.
        df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)
        # Ordering is relied upon when deriving the flow-level metrics.
        df = df.sort_values(by=['Flow ID', 'Day', 'Label'])

        pred_per_record = predict_proba_per_record(df, clf, benign_threshold)
        flowids, flowlabels_str, grouped = group_data(df)

        y = encode_label(flowlabels_str)
        pred_any, pred_maj, pred_all = evaluate_per_flow(
            grouped, y, pred_per_record)

        # Pin the label set so every fold yields a (C, C) matrix, even when a
        # class is absent from this fold; otherwise the accumulation below
        # fails on a shape mismatch.
        # NOTE(review): assumes confusion_matrix is sklearn's - confirm.
        any_cm = confusion_matrix(y, pred_any, labels=gt_classes)
        majority_cm = confusion_matrix(y, pred_maj, labels=gt_classes)
        all_cm = confusion_matrix(y, pred_all, labels=gt_classes)

        cm_any_sum += any_cm
        cm_majority_sum += majority_cm
        cm_all_sum += all_cm

        result_logger_ids18(result_dir, gt_classes,
                            (any_cm, majority_cm, all_cm),
                            'fold_{}_'.format(test_index))

    result_logger_ids18(result_dir, gt_classes,
                        (cm_any_sum, cm_majority_sum, cm_all_sum),
                        'fold_avg_')
Beispiel #2
0
def load_dataset(title):
    """Load one of the supported datasets by name.

    Args:
        title: Dataset identifier. Supported values are
            ``'glass-identification'``, ``'soybean-large'``,
            ``'primary-tumor'`` and ``'winequality-red'``.

    Returns:
        The ``(X, y)`` pair produced by ``utils.load_data``; for
        ``'soybean-large'`` the targets are additionally passed through
        ``utils.encode_label``.

    Raises:
        ValueError: If ``title`` is not one of the supported names.
    """
    if title == 'glass-identification':
        #Glass Identification Data Set [214,9, 7, N, N]
        #URL: https://archive.ics.uci.edu/ml/datasets/glass+identification
        X, y = utils.load_data(dir_path + '/glass/glass.data',
                               9,
                               index_col=None)
    elif title == 'soybean-large':
        #Soybeans (Large) Data Set [307, 35, 19, Y, C]
        #https://archive.ics.uci.edu/ml/datasets/Soybean+(Large)
        X, y = utils.load_data(dir_path + '/soybean/soybean-large.data',
                               35,
                               last=False)
        y = utils.encode_label(y)
    elif title == 'primary-tumor':
        #Primary Tumor Data Set [339x17x22xY,N]
        #https://archive.ics.uci.edu/ml/datasets/primary+tumor
        X, y = utils.load_data(dir_path + '/primary-tumor/primary-tumor.data',
                               17,
                               last=False)
    elif title == 'winequality-red':
        #7a. Wine Quality Red Data Set [1599,11, 10, N, N]
        #[accuracy is not upto 54% with red and white wine data]
        #URL: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
        X, y = utils.load_data(dir_path + '/wine_quality/winequality-red.csv',
                               11,
                               header='infer',
                               sep=';',
                               col_name=True,
                               target=['quality'])
    else:
        # Previously this branch only printed a message and then crashed with
        # UnboundLocalError on the return statement; fail explicitly instead.
        raise ValueError(
            'No dataset found for loading, please check again... '
            '(unknown title: {!r})'.format(title))
    return X, y
Beispiel #3
0
 def __getitem__(self, item) -> Dict[str, torch.Tensor]:
     """Return the sample at position ``item`` as a dict.

     The ``'label'`` entry is the encoded label wrapped in a tensor; the
     remaining entries are taken unchanged from ``self.data``.
     """
     sample = {
         'label': torch.tensor(encode_label(self.data['labels'][item])),
     }
     # Fields that are passed through as stored, in output order.
     for field in ('input_ids', 'token_type_ids', 'attention_mask'):
         sample[field] = self.data[field][item]
     return sample
Beispiel #4
0
def per_record_evaluation(df, pred_per_record):
    """Print the balanced accuracy of per-record predictions.

    Args:
        df: Frame whose ``Label`` column holds the ground-truth labels.
        pred_per_record: Predicted (encoded) label for each record of ``df``.
    """
    print("----------per record analyusis-----------")
    true_labels = encode_label(df.Label.values)
    print(metrics.balanced_accuracy_score(true_labels, pred_per_record))
    print("end of per_record analysis")
Beispiel #5
0
def load_folds(dataroot, fold_prefix, K):
    """Read ``K`` fold CSV files and split each into features and labels.

    Args:
        dataroot: Directory containing the fold files.
        fold_prefix: Format string for the file name; ``.format(i)`` is
            applied with the fold index ``i``.
        K: Number of folds to read (indices ``0 .. K-1``).

    Returns:
        A list with one ``(features, encoded_labels)`` tuple per fold, where
        ``features`` are the values of all columns except ``Label``.
    """
    # Phase 1: read every fold into memory.
    frames = []
    for fold_idx in range(K):
        csv_path = join(dataroot, fold_prefix.format(fold_idx))
        frames.append(
            pd.read_csv(csv_path,
                        usecols=get_cols4ml(),
                        dtype=get_dtype4normalized()))

    # Phase 2: separate the feature matrix from the encoded target.
    fold_data = []
    for frame in frames:
        features = frame.drop(columns=['Label']).values
        targets = encode_label(frame.Label.values)
        fold_data.append((features, targets))
    return fold_data
def classify(dataroot, classifier_name):
    """Train a classifier on folds 1..K-1 and pickle the fitted model.

    Fold 0 is skipped while reading. The remaining folds are read in chunks
    of 10**6 rows, concatenated, and used to fit the classifier returned by
    ``get_classifier``. The fitted model is written to
    ``<runs_dir>.pkl``.

    Args:
        dataroot: Directory containing the fold CSV files.
        classifier_name: Name passed to ``get_args`` and used to build the
            output directory.

    Raises:
        ValueError: If ``get_balancing_technique()`` returns a value for
            which no fold file pattern is known.
    """
    K = 5
    balance = get_balancing_technique()
    train_data = []
    #single fold 29M records
    # 4 folds 120M records
    # if 20M records require 5% RAM
    # then 120M records require 30% memory
    print("Reading the data...")
    tick = time.time()
    # Result is unused here; the call is kept in case it has side effects.
    get_ids18_mappers()
    num_train_records = 0
    print("Reading 4 folds ")

    if balance in ('with_loss', 'no', 'with_loss_sub'):
        regex = 'r_fold_{}.csv'
    elif balance == 'explicit':
        regex = 'bal_fold_{}.csv'
    else:
        # Previously 'regex' stayed unbound and the loop below crashed with
        # an unrelated-looking UnboundLocalError.
        raise ValueError(
            'Unknown balancing technique: {!r}'.format(balance))

    for fold_index in tqdm(range(K)):
        if fold_index == 0:
            # Fold 0 is not used for training.
            continue
        # 10**6 rows read in 9min, total is 29*10**6
        reader = pd.read_csv(join(dataroot, regex.format(fold_index)),
                             chunksize=10**6,
                             usecols=get_cols4ml(),
                             dtype=get_dtype4normalized())
        for df in tqdm(reader):
            y_str = df.Label.values
            x = df.drop(columns=['Label']).values
            train_data.append((x, encode_label(y_str)))
            num_train_records += df.shape[0]
            print(df.memory_usage(deep=True).sum() * (799902) /
                  (1024 * 1024 * 1024))
    tock = time.time()
    print("read data in {:.2f}".format(tock - tick))  # 24min

    classifier_args, config = get_args(classifier_name,
                                       num_class='dummy',
                                       class_weight=None)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))

    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)

    X_train = np.concatenate([fold[0] for fold in train_data], axis=0)
    y_train = np.concatenate([fold[1] for fold in train_data], axis=0)
    classifier_args['runs_dir'] = logdir

    print("Start training")
    tick = time.time()
    clf = get_classifier(classifier_args)
    print("classes")
    print(np.unique(y_train))
    clf.fit(X_train, y_train)
    fn = classifier_args['runs_dir'] + '.pkl'
    # Context manager guarantees the file is flushed and closed.
    with open(fn, 'wb') as model_file:
        pickle.dump(clf, model_file)
    print("Done training {} flow records in {:.2f} sec".format(
        y_train.shape[0], time.time() - tick))
def evaluator(dataroot, classifier_name):
    """Evaluate a stored classifier at flow level on ``fold_0.csv``.

    The classifier is loaded through ``ClassifierLoader``, the test fold is
    grouped into flows, and ``evaluate_per_flow`` yields three flow-level
    predictions (any / majority / all). Their balanced accuracies are
    printed and the confusion matrices are passed to
    ``result_logger_ids18``.

    Args:
        dataroot: Directory containing ``fold_0.csv`` and the classifier
            output directory.
        classifier_name: Name passed to ``get_args`` and used to locate the
            classifier directory.
    """
    print('evaluating ', ntpath.basename(dataroot))

    test_csv_file = join(dataroot, 'fold_0.csv')

    # load Classifier
    class_weight = get_class_weights(
        dataroot)  # because it is not important for evaluation
    num_class = 14  # because we remove Label,FlowID,Timestamp columns from X
    classifier_args, config = get_args(classifier_name, num_class,
                                       class_weight)

    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    print('clf fingerprint', ntpath.basename(fingerprint))
    classifier_args['runs_dir'] = join(fingerprint, 'log')
    clf = ClassifierLoader().load(classifier_args)
    # classifier loaded

    # load data
    # Build a new list instead of appending to whatever get_cols4eval()
    # returned, so a shared list is never modified.
    col_names = get_cols4eval() + ['Timestamp']
    df = pd.read_csv(test_csv_file,
                     usecols=col_names,
                     dtype=get_dtype4normalized())
    print("Record distribution:")
    print(df.Label.value_counts())
    # 'Day' is the first two characters of the timestamp string.
    df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)

    #group data
    # replaces ordering task in per_flow_eval
    df = df.sort_values(by=['Flow ID', 'Label'])
    flowids, flowlabels, grouped = group_data(df)
    y = encode_label(flowlabels)
    print("data is grouped and labels are encoded")
    pred_any, pred_maj, pred_all, _ = evaluate_per_flow(clf, y, grouped, df)

    any_cm = confusion_matrix(y, pred_any)
    maj_cm = confusion_matrix(y, pred_maj)
    all_cm = confusion_matrix(y, pred_all)

    any_acc = metrics.balanced_accuracy_score(y, pred_any)
    maj_acc = metrics.balanced_accuracy_score(y, pred_maj)
    all_acc = metrics.balanced_accuracy_score(y, pred_all)
    print(any_acc, maj_acc, all_acc)

    result_logger_ids18(fingerprint, np.unique(y), (any_cm, maj_cm, all_cm),
                        'test')
 def __getitem__(self, item):
     """Return ``(frames, label)`` for the video at index ``item``.

     ``self.t`` frame positions are chosen at equal spacing over the frame
     count reported by OpenCV; each frame is read, converted to a
     ``torch.FloatTensor`` and the frames are stacked along a new first
     dimension. The label is derived from the file name via
     ``utils.get_label_from_filename`` and ``utils.encode_label``.

     Raises:
         RuntimeError: If a selected frame cannot be read from the video.
     """
     item_path = self.videos_list[item]
     cap = cv2.VideoCapture(item_path)
     try:
         nframes = cap.get(cv2.CAP_PROP_FRAME_COUNT)
         frames_indices = [
             int(x * (nframes / self.t)) for x in range(self.t)
         ]
         x = []
         for frame_idx in frames_indices:
             cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
             ret, frame = cap.read()
             if not ret:
                 # Fail with a clear message instead of an opaque tensor
                 # construction error on a missing frame.
                 raise RuntimeError(
                     'Could not read frame {} from {}'.format(
                         frame_idx, item_path))
             x.append(torch.FloatTensor(frame))
     finally:
         # Always free the capture handle, even when decoding fails.
         cap.release()
     x = torch.stack(x, dim=0)
     y = utils.get_label_from_filename(item_path)
     y = utils.encode_label([y])
     return x, y
Beispiel #9
0
    def load_val_dataset(self, val_filename, num_records):
        """Build a data loader over the first 400000 rows of a validation CSV.

        Args:
            val_filename: Path of the CSV file to read.
            num_records: Not used by the current implementation.

        Returns:
            A ``utils.DataLoader`` with batch size ``1024 * 4`` over a
            ``utils.TensorDataset`` of float features and long labels.
        """
        print("=========loading validation dataset========")
        frame = pd.read_csv(val_filename, usecols=get_cols4ml(), nrows=400000)
        print(frame.Label.value_counts())

        # Targets: encoded labels as an integer tensor.
        targets = torch.LongTensor(encode_label(frame.Label.values))
        # Features: every column except the label.
        features = torch.FloatTensor(frame.drop(columns=['Label']).values)

        val_set = utils.TensorDataset(features, targets)
        return utils.DataLoader(val_set, batch_size=1024 * 4)
def per_record_evaluation(df, pred):
    """Print the per-record balanced accuracy, formatted to two decimals.

    Args:
        df: Frame whose ``Label`` column holds the ground-truth labels.
        pred: Predicted (encoded) label for each record of ``df``.
    """
    true_labels = np.array(encode_label(df.Label.values))
    score = metrics.balanced_accuracy_score(true_labels, pred)
    message = "----------per record acc: {:.2f}-----------"
    print(message.format(score))
Beispiel #11
0
def evaluator(dataroot, classifier_name):
    """Evaluate a stored classifier per record and per flow on ``fold_0.csv``.

    The classifier is loaded through ``ClassifierLoader``; per-record
    predictions are scored with ``per_record_evaluation``; the records are
    then grouped into flows and ``evaluate_per_flow`` yields three
    flow-level predictions (any / majority / all). Their balanced accuracies
    are printed and the confusion matrices are passed to
    ``result_logger_ids18``.

    Args:
        dataroot: Directory containing ``fold_0.csv`` and the classifier
            output directory.
        classifier_name: Name passed to ``get_args`` and used to locate the
            classifier directory.
    """
    # Developer toggles (previously written as constant string comparisons).
    mem_const_exp = True  # load the classifier from the 'CSVs_r_1.0' tree
    print_clf_attr = False  # plot the decision tree and stop
    print_rf_attr = False  # print random-forest depth statistics and stop

    print(ntpath.basename(dataroot))
    test_csv_file = join(dataroot, 'fold_0.csv')

    # load Classifier
    classifier_args, config = get_args(classifier_name,
                                       num_class='dummy',
                                       class_weight=None)
    print("Balancing technique: ", classifier_args['balance'])
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')

    if mem_const_exp:
        # for mem constraint exp: swap the path segment between 'CSVs_r' and
        # '_m_' for 'CSVs_r_1.0'.
        start = logdir.find('CSVs_r')
        end = logdir.find('_m_')
        CSV_dirname = logdir[start:end]
        logdir = logdir.replace(CSV_dirname, 'CSVs_r_1.0')

    print(logdir)
    classifier_args['runs_dir'] = logdir

    loader = ClassifierLoader()
    clf = loader.load(classifier_args)

    if print_clf_attr and 'tree' in classifier_name:
        print("maximum depth of the tree ", clf.tree_.max_depth)
        import matplotlib.pyplot as plt
        from sklearn.tree import plot_tree
        plt.figure()
        plot_tree(clf, filled=True)
        plt.savefig(join(logdir, 'tree_plot.png'), dpi=1000)
        return
    if print_rf_attr and 'forest' in classifier_name:
        depth = [est.tree_.max_depth for est in clf.estimators_]
        print(depth)
        depth = np.array(depth)
        print("forest depth", depth.mean(), depth.max(), depth.min())
        print("maximum depth of the tree ", clf.base_estimator_.max_depth)
        return

    print("Classifier Loaded!")
    # classifier loaded

    # load data
    # Build a new list instead of appending to whatever get_cols4eval()
    # returned, so a shared list is never modified.
    col_names = get_cols4eval() + ['Timestamp']
    df = pd.read_csv(test_csv_file,
                     usecols=col_names,
                     dtype=get_dtype4normalized())
    # 'Day' is the first two characters of the timestamp string.
    df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)
    df = df.sort_values(by=['Flow ID', 'Day', 'Label'])
    print(df.Label.value_counts())

    # Per-record predictions and their score.
    pred_per_record = predict_per_record(df, clf)
    per_record_evaluation(df, pred_per_record)

    tick = time.time()
    flowids, flowlabels_str, grouped = group_data(df)
    print("Grouped in {:.2f} min".format((time.time() - tick) / 60))
    y = encode_label(flowlabels_str)
    print("data is grouped and labels are encoded")

    # evaluate_per_flow also returns the flow labels; use those from here on.
    pred_any, pred_maj, pred_all, y = evaluate_per_flow(
        clf, y, grouped, df, pred_per_record)

    gt_classes = np.unique(y)
    pred_classes = np.unique(pred_any)
    nunique_gt = len(gt_classes)
    nunique_pred = len(pred_classes)

    assert nunique_gt >= nunique_pred, "should not predict non existing class(es), but \n{} < \n{}".format(
        gt_classes, pred_classes)
    any_cm = confusion_matrix(y, pred_any)
    majority_cm = confusion_matrix(y, pred_maj)
    all_cm = confusion_matrix(y, pred_all)

    any_acc = metrics.balanced_accuracy_score(y, pred_any)
    maj_acc = metrics.balanced_accuracy_score(y, pred_maj)
    all_acc = metrics.balanced_accuracy_score(y, pred_all)
    print(any_acc, maj_acc, all_acc)
    result_logger_ids18(fingerprint, gt_classes, (any_cm, majority_cm, all_cm),
                        'test')
    if pressed:
        #Select interesting columns
        data = data.loc[:, [session_state.X, session_state.y]]

        # BINARIZE LABEL
        if data[session_state.y].dtype in ["object", "str"]:
            #First value is transorfmed to 1 with Binary Labelization
            val_base = data.iloc[0, 1]
            #Save other values
            val_others = [
                lab for lab in list(set(data[session_state.y].values))
                if lab != val_base
            ]
            assert len(data[
                session_state.y].value_counts()) <= 2, "y doit être binaire"
            data[session_state.y] = utils.encode_label(data[session_state.y])
            st.sidebar.write("Label Encoding: ")
            st.sidebar.write(f"- {val_base}: transform to 1")
            st.sidebar.write(f"- {val_others[0]}: transform to 0")
            # Improvment: Word2vec to similarity
            if str.lower(val_base) in [
                    "good", "positive", "pos", "p", "kind", "cool"
            ]:
                session_state.good = 1
                session_state.hate = 0
            else:
                session_state.good = 0
                session_state.hate = 1

        else:
            # Define label by selecting good and bad values for comment