def result_logger_ids18(fingerprint, cm_ids, cm_tuple, fold_index):
    """Persist confusion-matrix plots and evaluation/recall reports for one fold.

    Plots go to <fingerprint>/cm, evaluation reports to <fingerprint>/eval and
    absolute-recall reports to <fingerprint>/recall; `fold_index` is embedded
    in every filename.  `cm_tuple` holds the (any, majority, all) confusion
    matrices; `cm_ids` are the class ids labelling their rows/columns.
    """
    cm_dir = join(fingerprint, 'cm')
    recall_dir = join(fingerprint, 'recall')
    eval_dir = join(fingerprint, 'eval')
    ensure_dirs([cm_dir, recall_dir, eval_dir])

    _, id_to_label, _ = get_ids18_mappers()
    cm_labels = np.array([id_to_label[cm_id] for cm_id in cm_ids])

    # One pass per aggregation scheme: raw plot, normalized plot, then reports.
    for tag, cm in zip(('any', 'majority', 'all'), cm_tuple):
        plot_confusion_matrix(join(cm_dir, '{}_{}.jpg'.format(tag, fold_index)),
                              [], [], cm=cm, classes=cm_labels,
                              id_to_label=id_to_label)
        plot_confusion_matrix(join(cm_dir, '{}_norm_{}.jpg'.format(tag, fold_index)),
                              [], [], cm=cm, classes=cm_labels,
                              id_to_label=id_to_label, normalize=True)
        print_evaluation(cm, cm_labels, eval_dir, fold_index, tag)
        print_absolute_recall(cm, cm_labels, recall_dir, fold_index, tag)
def get_class_weights(dataroot, p=1):
    """Compute per-class loss weights from <dataroot>/label_dist.csv.

    For every class id in the IDS-2018 mapping the weight is
    1 - count/total, so frequent classes get smaller weights.  Classes
    missing from the distribution file get count 0 (weight 1).  `p` is
    unused and kept only for interface compatibility.

    Returns: np.ndarray of shape (num_classes,), indexed by class id.
    """
    df = pd.read_csv(join(dataroot, 'label_dist.csv'), names=['Label', 'Count'])
    label_to_id, id_to_label, _ = get_ids18_mappers()
    # order of weights must follow the class-id order used on the train data
    counts = []
    for i in range(len(id_to_label)):
        label = id_to_label[i]
        if label in df['Label'].values:
            counts.append(df[df['Label'] == label]['Count'].iloc[0])
        else:
            print('not found', label)
            counts.append(0)
    counts = np.array(counts)
    # NOTE: the original contained an unreachable inverse-frequency weighting
    # branch after this return (dead code) plus a debug print; both removed.
    normed_weights = [1 - (count / sum(counts)) for count in counts]
    return np.array(normed_weights)
def classify(dataroot, classifier_name):
    """Train `classifier_name` on folds 1..4 under `dataroot` and pickle it.

    Fold files are chosen by the configured balancing technique
    ('r_fold_N.csv' vs 'bal_fold_N.csv'); fold 0 is skipped (held out).
    The data is read in 10**6-row chunks, concatenated in memory, fitted,
    and the trained classifier is dumped to <runs_dir>.pkl.

    Raises:
        ValueError: if the balancing technique is not one of the known ones
            (the original fell through and raised a confusing NameError).
    """
    K = 5
    balance = get_balancing_technique()
    train_data = []
    # single fold ~29M records; 4 folds ~120M records.
    # if 20M records require 5% RAM then 120M records require ~30% memory.
    print("Reading the data...")
    tick = time.time()
    label_to_id, id_to_label, _ = get_ids18_mappers()
    num_train_records = 0
    print("Reading 4 folds ")
    if balance == 'with_loss' or balance == 'no' or balance == 'with_loss_sub':
        fname_template = 'r_fold_{}.csv'
    elif balance == 'explicit':
        fname_template = 'bal_fold_{}.csv'
    else:
        raise ValueError('Unknown balancing technique: {}'.format(balance))
    for fold_index in tqdm(range(K)):
        if fold_index == 0:
            continue  # fold 0 is reserved for evaluation
        # 10**6 rows read in ~9min, total is 29*10**6
        reader = pd.read_csv(join(dataroot, fname_template.format(fold_index)),
                             chunksize=10**6, usecols=get_cols4ml(),
                             dtype=get_dtype4normalized())
        for df in tqdm(reader):
            y_str = df.Label.values
            x = df.drop(columns=['Label']).values
            train_data.append((x, encode_label(y_str)))
            num_train_records += df.shape[0]
            print(df.memory_usage(deep=True).sum() * (799902) / (1024 * 1024 * 1024))
    tock = time.time()
    print("read data in {:.2f}".format(tock - tick))  # ~24min

    classifier_args, config = get_args(classifier_name, num_class='dummy',
                                       class_weight=None)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)

    X_train = np.concatenate([fold[0] for fold in train_data], axis=0)
    y_train = np.concatenate([fold[1] for fold in train_data], axis=0)

    classifier_args['runs_dir'] = logdir
    print("Start training")
    tick = time.time()
    clf = get_classifier(classifier_args)
    print("classes")
    print(np.unique(y_train))
    clf.fit(X_train, y_train)
    fn = classifier_args['runs_dir'] + '.pkl'
    # close the pickle file deterministically (original leaked the handle)
    with open(fn, 'wb') as f:
        pickle.dump(clf, f)
    print("Done training {} flow records in {:.2f} sec".format(
        y_train.shape[0], time.time() - tick))
def __init__(self, csv_file, chunksize=10**4):
    """Set up a chunked CSV reader over normalized flow records.

    csv_file: path of the CSV to stream.
    chunksize: number of rows yielded per chunk.
    """
    self.csv_file = csv_file
    self.chunksize = chunksize
    self.seen_so_far = 0   # number of flow records seen so far
    self.seen_chunks = 0   # number of chunks consumed so far
    self.iterableReader = pd.read_csv(
        csv_file,
        engine='c',
        usecols=get_cols4ml(),
        dtype=get_dtype4normalized(),
        chunksize=chunksize)
    mapping, _, _ = get_ids18_mappers()
    self.label_to_id = mapping
def get_class_weights(dataroot, p=1):
    """Compute per-class loss weights from <dataroot>/label_dist.csv.

    For every class id in the IDS-2018 mapping the weight is
    1 - count/total, so frequent classes get smaller weights.  Classes
    missing from the distribution file get count 0 (weight 1).  `p` is
    unused and kept only for interface compatibility.

    Returns: np.ndarray of shape (num_classes,), indexed by class id.
    """
    df = pd.read_csv(join(dataroot, 'label_dist.csv'), names=['Label', 'Count'])
    label_to_id, id_to_label, _ = get_ids18_mappers()
    # Build a label->count map in one pass instead of scanning the DataFrame
    # once per class; setdefault keeps the FIRST occurrence, matching the
    # original .iloc[0] semantics on duplicate labels.
    label_to_count = {}
    for lbl, cnt in zip(df['Label'], df['Count']):
        label_to_count.setdefault(lbl, cnt)
    # order of weights must follow the class-id order used on the train data
    counts = []
    for i in range(len(id_to_label)):
        label = id_to_label[i]
        if label in label_to_count:
            counts.append(label_to_count[label])
        else:
            print('not found', label)
            counts.append(0)
    counts = np.array(counts)
    total = sum(counts)  # hoisted: original recomputed the sum per element
    normed_weights = [1 - (count / total) for count in counts]
    return np.array(normed_weights)
def classify(dataroot, classifier_name):
    """Load the saved classifier for this configuration and time inference.

    Reads all K fold CSVs under the fraction directory, rebuilds the run
    fingerprint/config, loads the classifier from its runs_dir and calls
    time_inference over the concatenated data.
    """
    K = 5
    fraction = 1
    # total_records = 6907705  # in fold fraction after removing classes < K
    fold_root = join(dataroot, 'folds_fraction_{}'.format(fraction))
    print("Reading the data...")
    folds_df = []
    ds_list = []
    for fold_index in range(K):
        fold_df = pd.read_csv(join(fold_root, 'fold_{}.csv'.format(fold_index)))
        folds_df.append(fold_df)
        ds_list.append(fold_df.Label)
    total_df = pd.concat(folds_df)
    total_label_df = pd.concat(ds_list)
    labels = total_label_df.sort_values().unique()
    total_records = total_label_df.shape[0]

    label_to_id, id_to_label, _ = get_ids18_mappers()
    class_weight = get_class_weights(
        encode_label(total_label_df.values, label_to_id))
    balance = get_balancing_technique()
    # Label and FlowID columns are dropped from X
    input_dim = folds_df[0].shape[1] - 2
    gt_num_class = len(label_to_id)
    num_class = len(labels)
    assert gt_num_class == num_class, \
        'all classess should be observed gt_classes!=observed_classes {}!={}'.format(
            gt_num_class, num_class)

    classifier_args, config = get_args(classifier_name, total_records,
                                       gt_num_class, input_dim, class_weight,
                                       balance)
    pre_fingerprint = join(
        dataroot, 'r_{}_c_{}_k_{}'.format(fraction, classifier_name, str(K)))
    fingerprint = pre_fingerprint + '_mem_constrained' + config
    logdir = join(pre_fingerprint + config, 'log')
    runs_dir = get_runs_dir(logdir)
    classifier_args['runs_dir'] = runs_dir
    clf = get_classifier(classifier_args)
    time_inference(classifier_name, clf, total_df, dataroot)
def classify(dataroot, classifier_name='cnn'):
    """Train `classifier_name` on the prepared train/validation CSVs.

    Chooses balanced vs raw CSVs from the configured balancing technique,
    counts records with `wc -l`, builds class weights, and fits the
    classifier for a fixed number of epochs, logging under the run
    fingerprint directory.
    """
    balance = get_balancing_technique()
    print('balancing technique ', balance)
    if balance == 'explicit':
        train_csv = join(dataroot, 'bal_train.csv')
        # no need to use bal__fold because it is shuffled
        val_csv = join(dataroot, 'bal_fold_1.csv')
    else:
        train_csv = join(dataroot, 'r_train.csv')
        val_csv = join(dataroot, 'r_fold_1.csv')

    # record counts via `wc -l`; subtract 1 for the CSV header row
    result_val = subprocess.run(['wc', '-l', val_csv], stdout=subprocess.PIPE)
    result_train = subprocess.run(['wc', '-l', train_csv],
                                  stdout=subprocess.PIPE)
    train_records = int(result_train.stdout.split()[0]) - 1
    val_records = int(result_val.stdout.split()[0]) - 1
    print("Number of train and val records ({},{})".format(
        train_records, val_records))

    num_epochs = 40
    label_to_id, id_to_label, _ = get_ids18_mappers()
    # Compute class weights exactly once.  The original called
    # get_class_weights(dataroot) twice, discarding the first (costly,
    # CSV-reading) result.
    class_weight = get_class_weights(dataroot)
    if balance == 'with_loss_inverse':
        class_weight = 1. / class_weight

    num_class = len(label_to_id)  # we assume all the categories are observed
    classifier_args, config = get_args(classifier_name, num_class, class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)
    classifier_args['runs_dir'] = logdir
    clf = get_classifier(classifier_args)
    clf.fit(train_csv, val_csv, num_epochs, train_records, val_records)
def classify(dataroot, classifier_name):
    """Evaluate saved per-fold classifiers for `classifier_name` over K folds.

    For each fold: loads the persisted classifier, groups the fold's test
    data by flow, predicts with the any/majority/all aggregation schemes,
    logs per-fold confusion matrices, and accumulates them.  After the loop
    the accumulated ('avg_') matrices are logged, and for forests the summed
    feature importances are dumped to feature_selection.csv.
    """
    K = 5
    fraction = 1
    label_to_id, id_to_label, _ = get_ids18_mappers()
    #class_weight = get_class_weights(encode_label(total_label_df.values,label_to_id))
    class_weight = None
    balance = get_balancing_technique()
    input_dim = 78  # because we remove Label and FlowID columns from X
    gt_num_class = len(label_to_id)

    classifier_args, config = get_args(classifier_name, gt_num_class,
                                       input_dim, class_weight, balance)
    pre_fingerprint = join(
        dataroot, 'r_{}_c_{}_k_{}'.format(fraction, classifier_name, str(K)))
    fingerprint = pre_fingerprint + '_mem_constrained' + config
    logdir = join(pre_fingerprint + config, 'log')

    cm_any = np.zeros((gt_num_class, gt_num_class), dtype=float)
    cm_majority = np.zeros((gt_num_class, gt_num_class), dtype=float)
    cm_all = np.zeros((gt_num_class, gt_num_class), dtype=float)
    # np.float was removed in NumPy 1.24; the builtin float is the documented
    # replacement and is what this alias always meant.
    kfold_feature_importance = np.zeros(input_dim, dtype=float)

    for fold_index in range(K):
        print('###################################')
        print("Fold ", fold_index)
        test_df = pd.read_csv(join(dataroot, 'fold_{}.csv'.format(fold_index)))
        runs_dir = join(logdir, 'fold_{}'.format(fold_index))
        # for mem constrained experiemnt II, we need same classifier CSVs_r_1
        # for all memories
        start = runs_dir.find('CSVs_r_')
        end = runs_dir.find('SR_10')
        CSV_dirname = runs_dir[start:end - 1]
        #runs_dir = runs_dir.replace(CSV_dirname,'CSVs_r_1.0')
        classifier_args['runs_dir'] = runs_dir
        #----------------
        loader = ClassifierLoader()
        clf = loader.load(classifier_args)
        print("Loaded Classifier!")

        if classifier_name == 'forest':
            kfold_feature_importance += clf.feature_importances_

        flowids_test, y_flowid_test, grouped = group_data(test_df)
        y_flowid_test = encode_label(y_flowid_test, label_to_id)
        pred_any, pred_majority, pred_all, duration = predict_fold(
            classifier_name, clf, test_df, y_flowid_test, grouped, dataroot)
        assert pred_any.shape == pred_majority.shape, \
            "any and majority shapes should be same {},{}".format(
                pred_any.shape, pred_majority.shape)

        acc_pred_any = 100 * metrics.balanced_accuracy_score(
            y_flowid_test, pred_any)
        acc_pred_majority = 100 * metrics.balanced_accuracy_score(
            y_flowid_test, pred_majority)
        acc_pred_all = 100 * metrics.balanced_accuracy_score(
            y_flowid_test, pred_all)
        print(
            "Fold Local Balanced accuracy(any,majority,all): ({:.2f},{:.2f},{:.2f})"
            .format(acc_pred_any, acc_pred_majority, acc_pred_all))

        # NOTE(review): confusion_matrix is called without labels=; if a fold
        # is missing a class, the per-fold matrix would be smaller than
        # (gt_num_class, gt_num_class) and the accumulation below would fail.
        # Confirm every fold contains all classes, or pass
        # labels=range(gt_num_class).
        any_cm_i = confusion_matrix(y_flowid_test, pred_any)
        majority_cm_i = confusion_matrix(y_flowid_test, pred_majority)
        all_cm_i = confusion_matrix(y_flowid_test, pred_all)
        result_logger_ids18(fingerprint, y_flowid_test,
                            (any_cm_i, majority_cm_i, all_cm_i), id_to_label,
                            str(fold_index) + '_')
        cm_any += any_cm_i
        cm_majority += majority_cm_i
        cm_all += all_cm_i

    if classifier_name == 'forest':
        print_feature_importance(
            kfold_feature_importance,
            join(dataroot, 'folds_fraction_{}'.format(fraction),
                 'feature_selection.csv'))
    print(dataroot, classifier_name)
    # uses y_flowid_test from the LAST fold; result_logger_ids18 presumably
    # only needs it for class ids — TODO confirm
    result_logger_ids18(fingerprint, y_flowid_test,
                        (cm_any, cm_majority, cm_all), id_to_label, 'avg_')
def encode_label(self, str_labels):
    """Map an iterable of string class labels to their IDS-2018 integer ids."""
    mapping, _, _ = get_ids18_mappers()
    return [mapping[s] for s in str_labels]
def get_num_ws_classes():
    """Return how many classes the IDS-2018 label mapping defines."""
    mapping, _, _ = get_ids18_mappers()
    return len(mapping)