def run():
    # Merge two datasets, either adding new metric columns (-add_metrics)
    # or appending rows (-add_rows).
    add_metrics_mode = "-add_metrics" in sys.argv
    add_rows_mode = "-add_rows" in sys.argv
    data = loading.load_dataset(sys.argv[1])
    print_dtype(data)
    data2 = loading.load_dataset(sys.argv[2])
    print_dtype(data2)
    if add_metrics_mode and add_rows_mode:
        print("Choose only one option: -add_metrics, -add_rows")
        exit()
    if add_metrics_mode:
        data = merge_datasets_new_metrics(data, data2)
        print_dtype(data)
    elif add_rows_mode:
        data = merge_datasets(data, data2)
        print_dtype(data)
    else:
        print("Add one of the flags: -add_metrics, -add_rows")
        exit()
    loading.save_binary(data, sys.argv[3])
def run():
    # Sweep the TRUE-class weight and plot how the correct and incorrect
    # rejection rates respond on the test set.
    train_set = loading.load_dataset(sys.argv[1])
    test_set = loading.load_dataset(sys.argv[2])
    (index_labels, unique_labels) = teach_model.labels_to_int(train_set, "label")
    features_labels = ft.get_train_feature_labels()
    correct_rejections = list()
    incorrect_rejections = list()
    weights = [i * 10 + 1 for i in range(0, 50)]
    for weight in weights:
        params = teach_model.Parameters()
        params.classes_weights = dict()
        params.classes_weights[b"TRUE"] = weight
        params.classes_weights[b"FALSE"] = 1
        params.criterion = "gini"
        params.max_depth = 7
        params.min_samples_leaf = 4000
        params.min_impurity_decrease = 0.0
        classifier = teach_model.teach_tree(train_set, features_labels, params,
                                            sys.argv[3])
        (error_matrix, unique_labels) = quality.classification_quality(
            test_set, classifier, "label")
        correct_rejections.append(
            quality.compute_correct_rejection_rate(error_matrix, unique_labels))
        incorrect_rejections.append(
            quality.compute_incorrect_rejection_rate(error_matrix, unique_labels))
    # data_analysis.show_multidata(correct_rejections)
    print(correct_rejections)
    print(incorrect_rejections)
    matplotlib.pyplot.plot(weights, correct_rejections)
    matplotlib.pyplot.plot(weights, incorrect_rejections)
    matplotlib.pyplot.xlabel("TRUE label weight")
    matplotlib.pyplot.ylabel("Quality")
    matplotlib.pyplot.show()
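# The parameter names above (criterion, max_depth, min_samples_leaf,
# min_impurity_decrease, class weights) mirror sklearn's DecisionTreeClassifier,
# so one sweep step could be reproduced roughly as below. This is a sketch
# under that assumption, not the project's teach_model.teach_tree.
from sklearn.tree import DecisionTreeClassifier

def fit_weighted_tree(features, labels, true_weight):
    clf = DecisionTreeClassifier(criterion="gini",
                                 max_depth=7,
                                 min_samples_leaf=4000,
                                 min_impurity_decrease=0.0,
                                 class_weight={b"TRUE": true_weight, b"FALSE": 1})
    return clf.fit(features, labels)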
def main_loop(config):
    global screen
    global esc_pressed
    global scene_names
    data = loading.load_dataset(config.dataset)
    full_images = select_rows(data, config)
    scene_names = [
        should_accept.get_scene_name(row["image"].decode('UTF-8'))
        for row in data
    ]
    load_next_row(data, full_images)
    screen = numpy.zeros(image.shape, numpy.uint8)
    cv2.namedWindow('Crops labeling')
    cv2.setMouseCallback('Crops labeling', mouse_select)
    while True:
        screen = apply_mask(image, selected_crops)
        draw_grid_lines(screen)
        cv2.imshow('Crops labeling', screen)
        key = cv2.waitKey(20)
        if key == ord('m'):
            show_hide_mask()
        elif key == ord('d'):
            update_crops_selection(data, config)
            load_next_row(data, full_images)
        elif key == ord('a'):
            update_crops_selection(data, config)
            load_previous_row(data, full_images)
        elif key == ord("h"):
            print_help()
        elif key == ord("l"):
            show_grid_lines()
        elif key == ord("\r"):  # Enter: save labeling without quitting.
            update_crops_selection(data, config)
            data_filtering.save_binary(data, config.dataset)
        elif key == ord("y"):
            if esc_pressed:
                update_crops_selection(data, config)
                data_filtering.save_binary(data, config.dataset)
                break
        elif key == ord("n"):
            if esc_pressed:
                break
        elif key == 27:  # Esc: ask for save confirmation before quitting.
            print("Do you want to save your dataset labeling? (y/n)")
            esc_pressed = True
    cv2.destroyAllWindows()
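# cv2.setMouseCallback above expects a callback with OpenCV's signature
# (event, x, y, flags, param). The real mouse_select is defined elsewhere in
# the project; the body below is only an illustrative assumption.
def mouse_select_sketch(event, x, y, flags, param):
    if event == cv2.EVENT_LBUTTONDOWN:
        pass  # e.g. toggle the crop under (x, y) in selected_crops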
def load_damaged_accepted_set(data_file):
    data = loading.load_dataset(data_file)
    damaged = filter_damaged_images(data)
    accepted = filter_accepted_images(data)
    return (damaged, accepted)
def add_variance_difference(data_file_path, output_path):
    data = load_dataset(data_file_path)
    variance_differences = [get_variance_difference(row) for row in data]
    output_data = append_fields(data, "variance_difference",
                                variance_differences, usemask=False)
    numpy.save(output_path, output_data)
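# Standalone illustration of the numpy.lib.recfunctions.append_fields call
# used above, on toy data; the column values here are invented for the example.
import numpy
from numpy.lib.recfunctions import append_fields

toy = numpy.array([(1.0,), (2.5,)], dtype=[("variance", numpy.float32)])
out = append_fields(toy, "variance_difference", [0.1, -0.3], usemask=False)
print(out.dtype.names)  # ('variance', 'variance_difference')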
def label_data(src_file, dest_file):
    data = loading.load_dataset(src_file)
    labeled_data = label_dataset(data)
    print("Saving file [" + dest_file + "]")
    save_binary(labeled_data, dest_file)
def load_datasets(csv_file):
    data = loading.load_dataset(csv_file)
    damage = data[data["samples"] == data["samples_reference"]]
    different_sampling = data[data["samples"] != data["samples_reference"]]
    return (damage, different_sampling)
def add_image_sizes(data_file_path, output_path, images_database_path):
    data = load_dataset(data_file_path)
    image_sizes = [
        get_number_of_pixels(row, images_database_path) for row in data
    ]
    output_data = append_fields(data, "number_of_pixels", image_sizes,
                                usemask=False)
    numpy.save(output_path, output_data)
def run():
    # Keep only rows the tree accepts (TRUE) while the ground truth says
    # FALSE, i.e. the classifier's false accepts, and save them for inspection.
    classifier = classifiers.decision_tree.DecisionTree.load(sys.argv[2])
    data = loading.load_dataset(sys.argv[1])
    results = classifier.classify(data)
    result_mask = (results == b"TRUE") & (data["label"] == b"FALSE")
    filtered_data = data[result_mask]
    data_analysis.save_filtered_dataset(filtered_data, sys.argv[3], sys.argv[4])
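# Toy illustration of the false-accept mask above: two boolean arrays built
# from structured-array fields, combined element-wise with &. Data is invented.
import numpy

preds = numpy.array([(b"TRUE", b"FALSE"), (b"TRUE", b"TRUE")],
                    dtype=[("pred", "S5"), ("label", "S5")])
mask = (preds["pred"] == b"TRUE") & (preds["label"] == b"FALSE")
print(preds[mask])  # keeps only the first row: predicted TRUE, labeled FALSE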
def split_train_test_set(data_file, output_dir, split_fun, params):
    data = loading.load_dataset(data_file)
    train_set, test_set = split_fun(data, params)
    basename = os.path.basename(data_file)
    (name, ext) = os.path.splitext(basename)
    test_name = os.path.join(output_dir, name + "_test" + ext)
    train_name = os.path.join(output_dir, name + "_train" + ext)
    loading.save_binary(train_set, train_name)
    loading.save_binary(test_set, test_name)
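# split_fun above is any callable returning (train_set, test_set). A minimal
# hypothetical implementation, assuming params is a dict carrying the test
# fraction; neither the name random_split nor "test_ratio" comes from the project.
import numpy

def random_split(data, params):
    indices = numpy.random.permutation(len(data))
    cut = int(len(data) * params["test_ratio"])
    return data[indices[cut:]], data[indices[:cut]]

# split_train_test_set("dataset.npy", "out_dir", random_split, {"test_ratio": 0.2})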
def run():
    filters = [
        "blured", "watermark", "noise", "noise_colored", "noise_peak",
        "enhancedcolor", "enhancedcontrast", "enhancedbrightness",
        "randomobjects", "wavelet_denoise", "channelsswitched", "smoothed",
        "sharpened", "scaled"
    ]
    # classifier = classifiers.ssim_threshold.ThresholdSSIM(0.92)
    classifier = classifiers.decision_tree.DecisionTree.load(sys.argv[2])
    classifier.print_info()
    data = loading.load_dataset(sys.argv[1])
    data = data[data["is_cropped"] == True]
    # Replace infinite PSNR values (identical images) with the largest
    # representable float32 so downstream thresholds stay finite.
    num_inf = 0
    for i, row in enumerate(data):
        if math.isinf(row["psnr"]):
            num_inf = num_inf + 1
            data["psnr"][i] = numpy.finfo(numpy.float32).max
    label = "label"
    print_scenes(data)
    # print(data.dtype.names)
    # load_and_classify(sys.argv[1], classifier, label)
    classify_and_print(data, classifier, label)
    for filter_name in filters:
        filtered_data = data[data[filter_name] == True]
        classify_and_print_only_rate(filtered_data, classifier, label,
                                     filter_name)
    filtered_data = data[(data["samples"] != data["samples_reference"])
                         & (data["label"] == b"FALSE")]
    classify_and_print_only_rate(filtered_data, classifier, label,
                                 "different samples")
def run():
    data_file = sys.argv[1]
    target_file = sys.argv[2]
    features = sys.argv[3:]
    data = loading.load_dataset(data_file)
    metrics_obj = [
        metrics.ssim.MetricSSIM, metrics.psnr.MetricPSNR,
        metrics.variance.ImageVariance, metrics.edges.MetricEdgeFactor,
        metrics.wavelet.MetricWavelet,
        metrics.histograms_correlation.MetricHistogramsCorrelation,
        metrics.mass_center_distance.MetricMassCenterDistance
    ]
    # fix_psnr_infs(data)
    # rescale_wavelets_to_resolution(data, 13)
    data = overwrite_features(data, metrics_obj, features)
    loading.save_binary(data, target_file)
def run():
    data = loading.load_dataset(sys.argv[1])
    data = data[data["is_cropped"] == True]
    (index_labels, unique_labels) = labels_to_int(data, "label")
    features_labels = [
        "ssim", "psnr", "max_x_mass_center_distance", "histograms_correlation",
        "max_y_mass_center_distance", "edge_difference", "comp_edge_factor",
        "wavelet_mid", "wavelet_low", "wavelet_high", "wavelet_sym2_base",
        "wavelet_sym2_low", "wavelet_sym2_mid", "wavelet_sym2_high",
        "wavelet_db4_base", "wavelet_db4_low", "wavelet_db4_mid",
        "wavelet_db4_high", "wavelet_haar_base", "wavelet_haar_low",
        "wavelet_haar_mid", "wavelet_haar_high", "wavelet_haar_freq_x1",
        "wavelet_haar_freq_x2", "wavelet_haar_freq_x3"
    ]
    params = dict()
    params["criterion"] = "gini"
    params["splitter"] = "best"
    params["max_depth"] = 8
    params["min_samples_leaf"] = 3000
    params["min_impurity_decrease"] = 0.0
    params["classes_weights"] = dict()
    params["classes_weights"][b"TRUE"] = 10
    params["classes_weights"][b"FALSE"] = 1
    print("Teaching: number True labels: " +
          str(len(data[data["label"] == b"TRUE"])))
    print("Teaching: number False labels: " +
          str(len(data[data["label"] == b"FALSE"])))
    clf = teach_tree(data, features_labels, params, sys.argv[2])
    clf.save_graph(sys.argv[3])
    clf.print_info()
def load_and_classify(data_file, classifier, label):
    data = loading.load_dataset(data_file)
    # data = data[data["ref_edge_factor"] > 25]
    classify_and_print(data, classifier, label)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold as kfold
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import roc_auc_score as ras
import pickle

from loading import load_dataset  # project loader, as in the other snippets

###############################################################################
###############################################################################
# Load data:
Acc_test = pd.Series(np.zeros((100)))
AUC = pd.Series(np.zeros((100)))
for j in range(100):
    np.random.seed(j + 31)
    # IMD: Intensidad Media Diaria (average daily traffic intensity).
    [data_original, data, output, features] = load_dataset()
    X_train, X_test, y_train, y_test = train_test_split(data, output,
                                                        test_size=0.3,
                                                        random_state=31)
    X_train = X_train.reset_index()
    X_train = X_train.drop('index', axis=1)
    y_train = y_train.reset_index()
    y_train = y_train.drop('index', axis=1)
    # Normalization of IMD and GPS:
    average_x = X_train['GPS_x'].mean()
    stderror_x = X_train['GPS_x'].std()
    average_y = X_train['GPS_y'].mean()
    stderror_y = X_train['GPS_y'].std()
    average_z = X_train['GPS_z'].mean()
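# The loop above computes per-column train statistics by hand; the same
# train-set z-scoring can be packaged as a helper. The column names come from
# the snippet above, everything else in this sketch is an assumption.
def standardize(train_df, test_df, columns):
    for col in columns:
        mean, std = train_df[col].mean(), train_df[col].std()
        train_df[col] = (train_df[col] - mean) / std
        test_df[col] = (test_df[col] - mean) / std  # reuse train statistics
    return train_df, test_df

# X_train, X_test = standardize(X_train, X_test, ['GPS_x', 'GPS_y', 'GPS_z'])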
def run():
    data = loading.load_dataset(sys.argv[1])
    change_paths(data)
    loading.save_binary(data, sys.argv[2])
from loading import load_dataset
import selection as s

print
print "Loading a test dataset"
train, validation, test = load_dataset('test_data')

print
print "With all features..."
acc, f1, auc = s.train_test_eval(train.data, train.target,
                                 test.data, test.target)
s.print_metrics(acc, f1, auc)

features = train.feature_names
kfeatures = 2

print
print "With Chi-squared..."
selected = s.select_and_eval(s.metric_chi2, kfeatures,
                             train.data, train.target,
                             test.data, test.target)
s.print_selected(features, selected)

print
print "With random..."
selected = s.select_and_eval(s.metric_random, kfeatures,
                             train.data, train.target,
                             test.data, test.target)
s.print_selected(features, selected)

print
print "With infogain..."
selected = s.select_and_eval(s.metric_infogain, kfeatures,
                             train.data, train.target,
                             test.data, test.target)
s.print_selected(features, selected)
def experiment(dataset_directory):
    train, validation, test = loading.load_dataset(dataset_directory)
    features = train.feature_names
    values_of_k = range(5, 50, 5)
    metrics = [
        selection.metric_chi2, selection.metric_random,
        selection.metric_infogain
    ]
    results = []
    for kfeatures in values_of_k:
        for metric in metrics:
            metric_name = metric.__name__
            print
            print "Testing k=%d, metric=%s" % (kfeatures, metric_name)
            selector = selection.feature_selector(metric, kfeatures,
                                                  train.data, train.target)
            selected_indices = selection.get_selected_feature_indices(selector)
            train_data_selected = selection.filter_features(selector, train.data)
            test_data_selected = selection.filter_features(selector, test.data)
            train_acc, train_f1, train_pr_auc = selection.train_test_eval(
                train_data_selected, train.target,
                train_data_selected, train.target)
            test_acc, test_f1, test_pr_auc = selection.train_test_eval(
                train_data_selected, train.target,
                test_data_selected, test.target)
            selection.print_metrics(train_acc, train_f1, train_pr_auc,
                                    test_acc, test_f1, test_pr_auc)
            results.append(dict(
                kfeatures=kfeatures,
                metric=metric_name,
                train_accuracy=train_acc,
                train_f1=train_f1,
                train_pr_auc=train_pr_auc,
                test_accuracy=test_acc,
                test_f1=test_f1,
                test_pr_auc=test_pr_auc,
            ))
            # Parenthesize the format so it binds before the path join
            # (/ and % share precedence and associate left-to-right).
            output_name = dataset_directory / ("features_%s_%d.csv"
                                               % (metric_name, kfeatures))
            with open(output_name, 'wb') as out:
                selection.list_selected(features, selected_indices, out=out)
            print "Features saved to %s" % output_name

    print
    print "Using all the features (Logistic Regression):"
    train_acc, train_f1, train_pr_auc = selection.train_test_eval(
        train.data, train.target, train.data, train.target)
    test_acc, test_f1, test_pr_auc = selection.train_test_eval(
        train.data, train.target, test.data, test.target)
    selection.print_metrics(train_acc, train_f1, train_pr_auc,
                            test_acc, test_f1, test_pr_auc)
    results.append(dict(
        kfeatures=len(features),
        metric='logreg',
        train_accuracy=train_acc,
        train_f1=train_f1,
        train_pr_auc=train_pr_auc,
        test_accuracy=test_acc,
        test_f1=test_f1,
        test_pr_auc=test_pr_auc,
    ))

    print
    print "Using all the features (SVM):"
    train_acc, train_f1, train_pr_auc = selection.train_test_eval(
        train.data, train.target, train.data, train.target, model='svm')
    test_acc, test_f1, test_pr_auc = selection.train_test_eval(
        train.data, train.target, test.data, test.target, model='svm')
    selection.print_metrics(train_acc, train_f1, train_pr_auc,
                            test_acc, test_f1, test_pr_auc)
    results.append(dict(
        kfeatures=len(features),
        metric='svm',
        train_accuracy=train_acc,
        train_f1=train_f1,
        train_pr_auc=train_pr_auc,
        test_accuracy=test_acc,
        test_f1=test_f1,
        test_pr_auc=test_pr_auc,
    ))

    experiment_stats = dataset_directory / "feature_selection_experiment.csv"
    with open(experiment_stats, 'wb') as stats:
        writer = csv.DictWriter(stats, fieldnames=(
            'metric', 'kfeatures', 'train_accuracy', 'train_f1',
            'train_pr_auc', 'test_accuracy', 'test_f1', 'test_pr_auc'))
        writer.writeheader()
        writer.writerows(results)
    print "Saved results in %s" % experiment_stats
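# selection.feature_selector above is project code; if it wraps sklearn's
# SelectKBest (an assumption), the chi-squared branch would look roughly like:
from sklearn.feature_selection import SelectKBest, chi2

def chi2_selector_sketch(kfeatures, train_data, train_target):
    selector = SelectKBest(chi2, k=kfeatures).fit(train_data, train_target)
    # selector.get_support(indices=True) -> indices of the k kept features
    # selector.transform(X) -> X restricted to those columns
    return selector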
def run():
    data = loading.load_dataset(sys.argv[1])
    print_funked_scenes(data)
def load_damaged_images(data_file):
    data = loading.load_dataset(data_file)
    return filter_damaged_images(data)
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from loading import load_dataset
from keras.models import load_model

batch_size = 256
epochs = 25

X_train, X_test, y_train, y_test, num_classes = load_dataset()
print(num_classes, 'classes')

num_rows, num_cols = X_train[0].shape
input_shape = (num_rows, num_cols, 1)

# Reshape to (N, rows, cols, 1) and scale pixel values to [0, 1].
X_train = X_train.reshape(X_train.shape[0], num_rows, num_cols,
                          1).astype('float32') / 255.0
X_test = X_test.reshape(X_test.shape[0], num_rows, num_cols,
                        1).astype('float32') / 255.0
print(X_train.shape)
print(y_train.shape)

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu',
                 input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
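# The snippet above stops mid-model. A conventional head that matches the
# unused Flatten/Dense imports and the batch_size/epochs settings would be the
# following; this is an assumed continuation (it also assumes y_train and
# y_test are one-hot encoded), not the original file's ending.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(X_test, y_test))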