def evaluator(args):
    """Evaluate per-flow predictions over K folds and log the confusion matrices.

    For every fold the matching classifier is loaded, per-record predictions
    are made with ``benign_threshold`` and aggregated per flow in three ways
    ("any", "majority", "all"). The per-fold confusion matrices and their sum
    over all folds are handed to ``result_logger_ids18``.

    Args:
        args: tuple ``(samplerdir, classifier_name, benign_threshold)``.
    """
    is_flow_cache_experiment = True
    K = 10
    samplerdir, classifier_name, benign_threshold = args
    print('treshold at ', benign_threshold)

    clf_dir = get_classifier_dir(samplerdir, classifier_name, class_weight=None)
    result_dir = join(clf_dir,
                      'K_{}_benign_threshold_{}'.format(K, benign_threshold))

    # The reference class set is taken from the labels present in fold 0.
    gt_classes_str = pd.read_csv(join(samplerdir, '{}fold_0.csv'.format(K)),
                                 usecols=['Label'])['Label'].unique()
    gt_classes = sorted(encode_label(gt_classes_str))
    C = len(gt_classes)

    cm_any_sum = np.zeros((C, C), dtype=float)
    cm_majority_sum = np.zeros((C, C), dtype=float)
    cm_all_sum = np.zeros((C, C), dtype=float)

    col_names = ['Timestamp'] + get_cols4eval()
    for test_index in range(K):
        runs_dir = join(clf_dir, 'K_{}/log/{}'.format(K, test_index))
        if is_flow_cache_experiment:
            runs_dir = replace_w_unlimited_FC(runs_dir)
        clf = load_classifier(classifier_name, runs_dir)

        test_csv_file = join(samplerdir, '{}fold_{}.csv'.format(K, test_index))
        df = pd.read_csv(test_csv_file, usecols=col_names,
                         dtype=get_dtype4normalized())
        df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)
        # Sorted order is used when deriving the flow-level metric.
        df = df.sort_values(by=['Flow ID', 'Day', 'Label'])

        pred_per_record = predict_proba_per_record(df, clf, benign_threshold)
        flowids, flowlabels_str, grouped = group_data(df)
        y = encode_label(flowlabels_str)
        pred_any, pred_maj, pred_all = evaluate_per_flow(
            grouped, y, pred_per_record)

        # Pin the label set so every matrix is C x C even when a fold lacks a
        # class; otherwise the in-place accumulation below fails on a shape
        # mismatch.
        any_cm = confusion_matrix(y, pred_any, labels=gt_classes)
        majority_cm = confusion_matrix(y, pred_maj, labels=gt_classes)
        all_cm = confusion_matrix(y, pred_all, labels=gt_classes)

        cm_any_sum += any_cm
        cm_majority_sum += majority_cm
        cm_all_sum += all_cm

        result_logger_ids18(result_dir, gt_classes,
                            (any_cm, majority_cm, all_cm),
                            'fold_{}_'.format(test_index))

    # NOTE: despite the 'fold_avg_' prefix, these are sums over the K folds.
    result_logger_ids18(result_dir, gt_classes,
                        (cm_any_sum, cm_majority_sum, cm_all_sum),
                        'fold_avg_')
def load_dataset(title):
    """Load one of the known datasets by its title.

    Args:
        title: one of ``'glass-identification'``, ``'soybean-large'``,
            ``'primary-tumor'`` or ``'winequality-red'``.

    Returns:
        The ``(X, y)`` pair produced by ``utils.load_data`` (for
        ``'soybean-large'`` ``y`` is additionally passed through
        ``utils.encode_label``).

    Raises:
        ValueError: if ``title`` is not a known dataset. Previously this case
            printed a message and then failed on the unbound ``X``/``y``.
    """
    if title == 'glass-identification':
        # Glass Identification Data Set [214,9, 7, N, N]
        # URL: https://archive.ics.uci.edu/ml/datasets/glass+identification
        X, y = utils.load_data(dir_path + '/glass/glass.data', 9,
                               index_col=None)
    elif title == 'soybean-large':
        # Soybeans (Large) Data Set [307, 35, 19, Y, C]
        # https://archive.ics.uci.edu/ml/datasets/Soybean+(Large)
        X, y = utils.load_data(dir_path + '/soybean/soybean-large.data', 35,
                               last=False)
        y = utils.encode_label(y)
    elif title == 'primary-tumor':
        # Primary Tumor Data Set [339x17x22xY,N]
        # https://archive.ics.uci.edu/ml/datasets/primary+tumor
        X, y = utils.load_data(dir_path + '/primary-tumor/primary-tumor.data',
                               17, last=False)
    elif title == 'winequality-red':
        # 7a. Wine Quality Red Data Set [1599,11, 10, N, N]
        # [accuracy is not up to 54% with red and white wine data]
        # URL: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
        X, y = utils.load_data(dir_path + '/wine_quality/winequality-red.csv',
                               11, header='infer', sep=';', col_name=True,
                               target=['quality'])
    else:
        print('No dataset found for loading, please check again...')
        raise ValueError('Unknown dataset title: {!r}'.format(title))
    return X, y
def __getitem__(self, item) -> Dict[str, torch.Tensor]:
    """Return the sample at index ``item`` as a dict.

    The ``'label'`` entry is the encoded label wrapped in a tensor; the
    ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'`` entries are
    taken unchanged from ``self.data``.
    """
    sample = {
        'label': torch.tensor(encode_label(self.data['labels'][item])),
    }
    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        sample[field] = self.data[field][item]
    return sample
def per_record_evaluation(df, pred_per_record):
    """Print the balanced accuracy of per-record predictions.

    Args:
        df: frame whose ``Label`` column holds the ground-truth labels.
        pred_per_record: predictions compared against the encoded labels.
    """
    print("----------per record analyusis-----------")
    truth = encode_label(df.Label.values)
    print(metrics.balanced_accuracy_score(truth, pred_per_record))
    print("end of per_record analysis")
def load_folds(dataroot, fold_prefix, K):
    """Read K fold CSVs and return them as ``(features, labels)`` pairs.

    Args:
        dataroot: directory containing the fold files.
        fold_prefix: filename template formatted with the fold index.
        K: number of folds to read (indices ``0 .. K-1``).

    Returns:
        A list with one ``(X, y)`` tuple per fold, where ``X`` is the value
        array of every column except ``Label`` and ``y`` is the encoded
        ``Label`` column.
    """
    # Phase 1: read every fold into memory.
    frames = []
    for fold_idx in range(K):
        csv_path = join(dataroot, fold_prefix.format(fold_idx))
        frames.append(
            pd.read_csv(csv_path,
                        usecols=get_cols4ml(),
                        dtype=get_dtype4normalized()))

    # Phase 2: split each frame into features and encoded labels.
    fold_data = []
    for frame in frames:
        features = frame.drop(columns=['Label']).values
        labels = encode_label(frame.Label.values)
        fold_data.append((features, labels))
    return fold_data
def classify(dataroot, classifier_name):
    """Train a classifier on folds 1..K-1 and pickle it next to its log dir.

    Fold 0 is skipped when reading. The remaining fold CSVs are read in
    chunks, concatenated, and used to fit the classifier returned by
    ``get_classifier``. The fitted model is pickled to ``<runs_dir>.pkl``.

    Args:
        dataroot: directory containing the fold CSV files.
        classifier_name: name passed to ``get_args`` to build the classifier.

    Raises:
        ValueError: if the balancing technique is not one of the known values
            (previously this surfaced as an unbound ``regex`` variable).
    """
    K = 5
    balance = get_balancing_technique()
    train_data = []

    # single fold 29M records
    # 4 folds 120M records
    # if 20M records require 5% RAM
    # then 120M records require 30% memory
    print("Reading the data...")
    tick = time.time()
    # Result is unused here; the call is kept in case it has side effects.
    # TODO(review): confirm and drop.
    get_ids18_mappers()

    print("Reading 4 folds ")
    if balance in ('with_loss', 'no', 'with_loss_sub'):
        regex = 'r_fold_{}.csv'
    elif balance == 'explicit':
        regex = 'bal_fold_{}.csv'
    else:
        raise ValueError(
            'Unknown balancing technique: {!r}'.format(balance))

    for fold_index in tqdm(range(K)):
        if fold_index == 0:
            continue
        # 10**6 rows read in 9min, total is 29*10**6
        reader = pd.read_csv(join(dataroot, regex.format(fold_index)),
                             chunksize=10**6,
                             usecols=get_cols4ml(),
                             dtype=get_dtype4normalized())
        for df in tqdm(reader):
            y_str = df.Label.values
            x = df.drop(columns=['Label']).values
            train_data.append((x, encode_label(y_str)))
        # Memory figure based on the last chunk of this fold, scaled by a
        # hard-coded factor and expressed in GiB.
        print(df.memory_usage(deep=True).sum() * (799902) /
              (1024 * 1024 * 1024))

    tock = time.time()
    print("read data in {:.2f}".format(tock - tick))  # 24min

    classifier_args, config = get_args(classifier_name,
                                       num_class='dummy',
                                       class_weight=None)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)

    X_train = np.concatenate([fold[0] for fold in train_data], axis=0)
    y_train = np.concatenate([fold[1] for fold in train_data], axis=0)
    classifier_args['runs_dir'] = logdir

    print("Start training")
    tick = time.time()
    clf = get_classifier(classifier_args)
    print("classes")
    print(np.unique(y_train))
    clf.fit(X_train, y_train)

    fn = classifier_args['runs_dir'] + '.pkl'
    # Context manager guarantees the file is flushed and closed.
    with open(fn, 'wb') as model_file:
        pickle.dump(clf, model_file)
    print("Done training {} flow records in {:.2f} sec".format(
        y_train.shape[0], time.time() - tick))
def evaluator(dataroot, classifier_name):
    """Evaluate a stored classifier per flow on ``fold_0.csv`` and log results.

    Loads the classifier identified by ``classifier_name`` and its config
    fingerprint, groups the test records into flows, obtains the "any",
    "majority" and "all" flow-level predictions, prints their balanced
    accuracies and passes the confusion matrices to ``result_logger_ids18``.

    Args:
        dataroot: directory containing ``fold_0.csv`` and the classifier dirs.
        classifier_name: name used to build the classifier arguments.
    """
    print('evaluating ', ntpath.basename(dataroot))
    test_csv_file = join(dataroot, 'fold_0.csv')
    # The former `wc -l` subprocess call was dropped: its result was never
    # used, and it scanned the whole file and required a Unix `wc` binary.

    # load Classifier
    class_weight = get_class_weights(dataroot)  # not important for evaluation
    # NOTE(review): hard-coded class count; the original comment linked it to
    # removing the Label/FlowID/Timestamp columns from X. Confirm.
    num_class = 14
    classifier_args, config = get_args(classifier_name, num_class,
                                       class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    print('clf fingerprint', ntpath.basename(fingerprint))
    classifier_args['runs_dir'] = join(fingerprint, 'log')
    clf = ClassifierLoader().load(classifier_args)
    # classifier loaded

    # load data; build a new list so the helper's return value is not mutated
    col_names = list(get_cols4eval()) + ['Timestamp']
    df = pd.read_csv(test_csv_file, usecols=col_names,
                     dtype=get_dtype4normalized())
    print("Record distribution:")
    print(df.Label.value_counts())
    df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)

    # group data; sorting replaces the ordering task in per-flow evaluation
    df = df.sort_values(by=['Flow ID', 'Label'])
    flowids, flowlabels, grouped = group_data(df)
    y = encode_label(flowlabels)
    print("data is grouped and labels are encoded")

    pred_any, pred_maj, pred_all, _ = evaluate_per_flow(clf, y, grouped, df)

    any_cm = confusion_matrix(y, pred_any)
    maj_cm = confusion_matrix(y, pred_maj)
    all_cm = confusion_matrix(y, pred_all)

    any_acc = metrics.balanced_accuracy_score(y, pred_any)
    maj_acc = metrics.balanced_accuracy_score(y, pred_maj)
    all_acc = metrics.balanced_accuracy_score(y, pred_all)
    print(any_acc, maj_acc, all_acc)

    result_logger_ids18(fingerprint, np.unique(y), (any_cm, maj_cm, all_cm),
                        'test')
def __getitem__(self, item):
    """Return ``(frames, label)`` for the video at index ``item``.

    ``self.t`` frames are read at evenly spaced positions across the video
    and stacked along a new leading dimension. The label is derived from the
    file name via ``utils.get_label_from_filename`` and encoded with
    ``utils.encode_label``.

    Raises:
        RuntimeError: if a frame cannot be read from the video file.
    """
    item_path = self.videos_list[item]
    cap = cv2.VideoCapture(item_path)
    try:
        nframes = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        frames_indices = [int(x * (nframes / self.t)) for x in range(self.t)]
        frames = []
        for frame_idx in frames_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                # Fail with the offending path instead of an opaque error
                # from converting a missing frame to a tensor.
                raise RuntimeError(
                    'Could not read frame {} of {!r}'.format(
                        frame_idx, item_path))
            frames.append(torch.FloatTensor(frame))
    finally:
        # Always release the capture handle, even if reading fails.
        cap.release()

    x = torch.stack(frames, dim=0)
    y = utils.get_label_from_filename(item_path)
    y = utils.encode_label([y])
    return x, y
def load_val_dataset(self, val_filename, num_records):
    """Build a data loader over the first 400000 rows of the validation CSV.

    Args:
        val_filename: path of the validation CSV file.
        num_records: currently unused.

    Returns:
        A ``utils.DataLoader`` over a ``utils.TensorDataset`` of the feature
        columns (float tensor) and the encoded ``Label`` column (long
        tensor), with a batch size of 4096.
    """
    print("=========loading validation dataset========")
    val_df = pd.read_csv(val_filename, usecols=get_cols4ml(), nrows=400000)
    print(val_df.Label.value_counts())

    labels = torch.LongTensor(encode_label(val_df.Label.values))
    features = torch.FloatTensor(val_df.drop(columns=['Label']).values)

    return utils.DataLoader(utils.TensorDataset(features, labels),
                            batch_size=1024 * 4)
def per_record_evaluation(df, pred):
    """Print the balanced accuracy of ``pred`` against the encoded ``df.Label``.

    Args:
        df: frame whose ``Label`` column holds the ground-truth labels.
        pred: per-record predictions to score.
    """
    truth = np.array(encode_label(df.Label.values))
    score = metrics.balanced_accuracy_score(truth, pred)
    report = "----------per record acc: {:.2f}-----------"
    print(report.format(score))
def evaluator(dataroot, classifier_name):
    """Evaluate a stored classifier per record and per flow on ``fold_0.csv``.

    Loads the classifier for ``classifier_name``, prints the per-record
    balanced accuracy, groups the records into flows, obtains the "any",
    "majority" and "all" flow-level predictions, prints their balanced
    accuracies and passes the confusion matrices to ``result_logger_ids18``.

    Three developer toggles at the top of the body replace the former
    string-equals-string conditions; their values match the previous
    behaviour.

    Args:
        dataroot: directory containing ``fold_0.csv`` and the classifier dirs.
        classifier_name: name used to build the classifier arguments.
    """
    # Developer toggles (formerly `'a' == 'b'` style conditions).
    mem_const_exp = True      # load the classifier from the 'CSVs_r_1.0' dir
    print_clf_attr = False    # dump/plot a single tree and stop
    print_rf_attr = False     # print forest depth statistics and stop

    print(ntpath.basename(dataroot))
    test_csv_file = join(dataroot, 'fold_0.csv')

    # load Classifier
    classifier_args, config = get_args(classifier_name,
                                       num_class='dummy',
                                       class_weight=None)
    print("Balancing technique: ", classifier_args['balance'])
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')

    if mem_const_exp:
        # For the memory-constraint experiment: swap the 'CSVs_r...' path
        # component (everything up to the following '_m_') for 'CSVs_r_1.0'.
        start = logdir.find('CSVs_r')
        end = logdir.find('_m_', start) if start >= 0 else -1
        if 0 <= start < end:
            logdir = logdir.replace(logdir[start:end], 'CSVs_r_1.0')
        else:
            # Without this guard an empty slice would make str.replace insert
            # the replacement between every character of the path.
            print("'CSVs_r..._m_' not found in log dir; path left unchanged")
    print(logdir)

    classifier_args['runs_dir'] = logdir
    loader = ClassifierLoader()
    clf = loader.load(classifier_args)

    if print_clf_attr and 'tree' in classifier_name:
        print("maximum depth of the tree ", clf.tree_.max_depth)
        import matplotlib.pyplot as plt
        from sklearn.tree import plot_tree
        plt.figure()
        plot_tree(clf, filled=True)
        plt.savefig(join(logdir, 'tree_plot.png'), dpi=1000)
        return

    if print_rf_attr and 'forest' in classifier_name:
        depth = [est.tree_.max_depth for est in clf.estimators_]
        print(depth)
        depth = np.array(depth)
        print("forest depth", depth.mean(), depth.max(), depth.min())
        print("maximum depth of the tree ", clf.base_estimator_.max_depth)
        return

    print("Classifier Loaded!")
    # classifier loaded

    # load data; build a new list so the helper's return value is not mutated
    col_names = list(get_cols4eval()) + ['Timestamp']
    df = pd.read_csv(test_csv_file, usecols=col_names,
                     dtype=get_dtype4normalized())
    df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)
    df = df.sort_values(by=['Flow ID', 'Day', 'Label'])
    print(df.Label.value_counts())

    pred_per_record = predict_per_record(df, clf)
    per_record_evaluation(df, pred_per_record)

    tick = time.time()
    flowids, flowlabels_str, grouped = group_data(df)
    print("Grouped in {:.2f} min".format((time.time() - tick) / 60))
    y = encode_label(flowlabels_str)
    print("data is grouped and labels are encoded")

    # evaluate_per_flow also returns the labels to score against, which
    # replace the freshly encoded ones.
    pred_any, pred_maj, pred_all, y = evaluate_per_flow(
        clf, y, grouped, df, pred_per_record)

    gt_classes = np.unique(y)
    pred_classes = np.unique(pred_any)
    nunique_gt = len(gt_classes)
    nunique_pred = len(pred_classes)
    assert nunique_gt >= nunique_pred, "should not predict non existing class(es), but \n{} < \n{}".format(
        gt_classes, pred_classes)

    any_cm = confusion_matrix(y, pred_any)
    majority_cm = confusion_matrix(y, pred_maj)
    all_cm = confusion_matrix(y, pred_all)

    any_acc = metrics.balanced_accuracy_score(y, pred_any)
    maj_acc = metrics.balanced_accuracy_score(y, pred_maj)
    all_acc = metrics.balanced_accuracy_score(y, pred_all)
    print(any_acc, maj_acc, all_acc)

    result_logger_ids18(fingerprint, gt_classes,
                        (any_cm, majority_cm, all_cm), 'test')
if pressed: #Select interesting columns data = data.loc[:, [session_state.X, session_state.y]] # BINARIZE LABEL if data[session_state.y].dtype in ["object", "str"]: #First value is transorfmed to 1 with Binary Labelization val_base = data.iloc[0, 1] #Save other values val_others = [ lab for lab in list(set(data[session_state.y].values)) if lab != val_base ] assert len(data[ session_state.y].value_counts()) <= 2, "y doit être binaire" data[session_state.y] = utils.encode_label(data[session_state.y]) st.sidebar.write("Label Encoding: ") st.sidebar.write(f"- {val_base}: transform to 1") st.sidebar.write(f"- {val_others[0]}: transform to 0") # Improvment: Word2vec to similarity if str.lower(val_base) in [ "good", "positive", "pos", "p", "kind", "cool" ]: session_state.good = 1 session_state.hate = 0 else: session_state.good = 0 session_state.hate = 1 else: # Define label by selecting good and bad values for comment