#!/usr/bin/env python
# encoding: utf-8
"""
Backward elimination on the Hallem data set with a correlation-based
pre-processing step: from every pair of odorants whose response profiles
correlate strongly, the odorant with the lower standard deviation is
removed before the feature selection runs.
"""
import numpy as np
import scipy as sp
import scipy.stats

# module paths assumed by analogy with the other example scripts
from _core import featureselection, plotting

# NOTE: the opening of this script (loading of the Hallem data) was lost;
# before the transpose, rows are odorants and columns are receptors.
# data, feature_names, features = ...

removables = []
for i, rowA in enumerate(data):
    for j, rowB in enumerate(data):
        # visit every unordered pair of odorants exactly once
        if j > i:
            c = sp.stats.pearsonr(rowA, rowB)[0]
            if c > 0.92:
                # keep the odorant with the larger standard deviation
                if np.std(rowA) > np.std(rowB):
                    removables.append(j)
                else:
                    removables.append(i)

removables = np.unique(removables)
print "remove (%s)" % (len(removables)), removables

data = np.delete(data, removables, axis=0)
feature_names = np.delete(feature_names, removables)
data = np.transpose(data)

feature_list, scores = featureselection.backward_elimination(data)

title = 'Backward Elimination with preprocessing of Hallem with %s features' \
    % str(features)
sub_list = feature_list[:features]
print title
print "Score:", scores[features]
print sub_list
print feature_names[sub_list]

#path = "../figures/hallem/be/hallem_be_" + str(features) + "_performance_preprocessing.png"
#plotting.plot_progress_results(scores, features, path=path)
#
#path = "../figures/hallem/be/hallem_be_" + str(features) + "_preprocessing.png"
#plotting.plot_fingerprints(title,
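For reference, the pruning step above can be written as a small self-contained helper. The sketch below uses only NumPy on random data; prune_correlated is an illustrative name, not part of the project API.

# Minimal, self-contained sketch of the correlation pruning; names are
# illustrative and the data is random.
import numpy as np

def prune_correlated(data, cutoff=0.92):
    # pairwise Pearson correlations between the rows (odorants)
    c = np.corrcoef(data)
    stds = data.std(axis=1)
    removables = set()
    n = data.shape[0]
    for i in range(n):
        for j in range(i + 1, n):
            if c[i, j] > cutoff:
                # keep the row with the larger standard deviation
                removables.add(j if stds[i] > stds[j] else i)
    keep = [k for k in range(n) if k not in removables]
    return data[keep], keep

pruned, kept = prune_correlated(np.random.rand(20, 24))
print len(kept), "of 20 odorants kept"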
#!/usr/bin/env python
# encoding: utf-8
import numpy as np

from example.mouse.data import MouseData
from _core import plotting
from _core.featureselection import backward_elimination

mouse = MouseData()
data = mouse.response
feature_names = mouse.feature_names
data_names = mouse.data_names

feature_list, score = backward_elimination(data)

features = 6
title = 'Backward Elimination on Mouse'
print title
print "Score:", score[-1]
print feature_names[feature_list]

path = "/Users/marcus/Desktop/mouse_be_" + str(features) + "_performance.png"
plotting.plot_progress_results(score, features, path)

#path = "/Users/marcus/Desktop/mouse_be_" + str(features) + ".png"
#plotting.plot_fingerprints(title, feature_names[feature_list],
#                           data[:, feature_list],
#                           data_names)
""" Analysis of stability of backward prediction on DoOR dorsal dataset. """ import datetime import numpy as np import matplotlib.pyplot as pl from example.door import DoOR from _core import featureselection, validation door = DoOR() data, ors, odorants = door.get_dorsal_data() # compute features print data.shape feature_list, scores = featureselection.backward_elimination(data) # creating a list of feature-lists with increasing size top = [] for i in range(15): top.append((feature_list[:i])[::-1]) print top # # levels of noise which will be added sd_range = np.arange(0, 0.15, 0.01) results = validation.validate(data, top, noise=sd_range) x = sd_range y = np.asarray(range(1, results.shape[0] + 1)) X, Y = np.meshgrid(x, y)
"""
Benchmark of backward elimination on randomly generated data sets of
increasing size; averaged running times and scores are written to CSV.
"""
import csv
import time

import numpy as np

# module paths assumed by analogy with the other scripts
from _core import sample_generator
from _core.featureselection import backward_elimination

# NOTE: the grid definitions were lost in this fragment; the values below
# are illustrative placeholders, not the original benchmark settings.
size_data = np.array([10, 20, 50, 100])   # number of receptors (rows)
size_feat = np.array([10, 20, 50, 100])   # number of odorants (columns)

times = np.zeros((len(size_data), len(size_feat)))
scores = np.zeros((len(size_data), len(size_feat)))

rounds = 5

def write_csv(f, matrix):
    x = np.column_stack((np.transpose(size_data), matrix))
    v = np.hstack(["Size", size_feat])
    with open(f, 'wb') as ff:
        writer = csv.writer(ff, delimiter=";")
        writer.writerow(v)
        writer.writerows(x)

for k in range(1, rounds + 1):
    for i, v_i in enumerate(size_data):
        for j, v_j in enumerate(size_feat):
            d = sample_generator.generate_random_data(v_i, v_j)
            start = time.time()
            feature_list, score = backward_elimination(d, 4)
            stop = time.time()
            delta = stop - start
            times[i, j] += delta
            scores[i, j] += score
            print "Round %s, data: %s, features: %s, time: %s" \
                % (k, v_i, v_j, round(delta, 4))
    # persist the running averages after every round
    write_csv("../results/be_times_pew.csv", times / k)
    write_csv("../results/be_scores_pew.csv", scores / k)
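If sample_generator is not available, a minimal stand-in keeps the benchmark self-contained. This is an assumption about its contract (a receptors-by-odorants matrix of random responses), not the project's actual generator.

# Hypothetical stand-in for sample_generator.generate_random_data; the real
# generator's distribution is unknown, uniform noise in [0, 1) is assumed.
import numpy as np

def generate_random_data(n_data, n_features):
    # n_data receptors (rows) x n_features odorants (columns)
    return np.random.rand(n_data, n_features)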
def optimize(data, number_features, threshold, corr=None, batch=False):
    '''
    Computes an optimal set of odorants with a more efficient approach
    that applies up to two pre-processing steps to the data before the
    linear optimization runs.

    First, highly correlated pairs of odorants can be omitted from the
    data set, since these normally carry only redundant information; to
    activate this step, pass a correlation threshold via corr. In a
    second step, a fast backward elimination reduces the data to a
    larger intermediate set of size threshold, i.e. all other odorants
    are removed. Gurobi is then applied to the remaining odorants only.
    If the values are chosen carefully, this drastically speeds up the
    computation of the optimal number_features. In my thesis I showed
    that backward elimination is often close to optimal but much faster
    than Gurobi, so theoretically a combination of both should speed up
    the computation without losing stability.

    Args:
        data (numpy.array): The olfactory data matrix.
            x-dimension: receptors, y-dimension: odorants.
        number_features (int): The desired size of the optimal set.
        threshold (int): How many features backward_elimination keeps
            for the subsequent Gurobi run.
        corr (float): (default=None). If set, pairwise correlations
            between features are computed and, from every pair
            correlating above this threshold, the feature with the
            lower standard deviation is removed.

    Returns:
        f_list (numpy.array): The optimal odorant set. Odorants are
            sorted in increasing order of their standard deviation.
        score (float): The score. Higher scores are better.
    '''
    if threshold <= number_features:
        raise ValueError("threshold must be larger than number_features")
    removables = []
    if corr:
        # analysis.correlation is assumed to return an (n, 3) array of
        # (index_a, index_b, correlation) triples
        c = analysis.correlation(np.transpose(data))
        l = c[c[:, 2] > corr]
        for a, b, _ in l:
            # keep the feature with the larger standard deviation
            if np.std(data[:, int(a)]) > np.std(data[:, int(b)]):
                removables.append(int(b))
            else:
                removables.append(int(a))
    f_list = np.setdiff1d(range(data.shape[1]), removables)
    f_tmp, score = featureselection.backward_elimination(data[:, f_list],
                                                         threshold)
    # keep only the features selected by backward elimination
    f_list = np.intersect1d(f_list, f_list[f_tmp])
    f, s = gurobi.optimize(data[:, f_list], number_features=number_features,
                           batch=batch)
    # map the indices back into the original data matrix
    for i, v in enumerate(f):
        f[i] = f_list[v]
    return f, s
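A minimal usage sketch on random data, mirroring the call in batch_compute below; the import path is an assumption, and the final step needs a working Gurobi installation.

# Hypothetical usage of the pipeline on random data; the import path is an
# assumption, and gurobi.optimize requires a working Gurobi installation.
import numpy as np
from _core import pipelining

data = np.random.rand(24, 110)  # 24 receptors x 110 odorants
f, s = pipelining.optimize(data, number_features=6, threshold=20, corr=0.94)
print "optimal set:", f
print "score:", s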
def batch_compute(job_id, features, methods):
    current_app.logger.debug("Computation for " + job_id)
    matrix = model.load_job_data(job_id)
    data = np.asarray(matrix[1:, 1:], dtype=float)
    for m in methods:
        if m == methodsmap.backward_elimination["key"]:
            try:
                f, r = featureselection.backward_elimination(data)
                f = [str(f[:i].tolist()) for i in range(features)]
                save_results(job_id, f[:features], r[:features], features, m)
            except Exception as inst:
                current_app.logger.error(inst)
                abort(500)
        elif m == methodsmap.forward_selection["key"]:
            try:
                f, r = featureselection.forward_selection(data)
                f = [str(f[:i].tolist()) for i in range(features)]
                save_results(job_id, f[:features], r[:features], features, m)
            except Exception as inst:
                current_app.logger.error(inst)
                abort(500)
        elif m == methodsmap.gurobi["key"]:
            try:
                f, r = gurobi.optimize(data, number_features=features,
                                       batch=True)
                # convert arrays to string
                f = [str(i.tolist()) for i in f]
                save_results(job_id, f, r, features, m)
            except Exception as inst:
                current_app.logger.error(inst)
                abort(500)
        elif m == methodsmap.pipelining["key"]:
            try:
                threshold = features + 1 if len(data[0]) > features \
                    else len(data[0])
                f, r = pipelining.optimize(data, number_features=features,
                                           threshold=threshold, corr=0.94,
                                           batch=True)
                # convert arrays to string
                f = [str(i.tolist()) for i in f]
                save_results(job_id, f, r, features, m)
            except Exception as inst:
                current_app.logger.error(inst)
                abort(500)
        else:
            pass
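The dispatcher only relies on each methodsmap entry exposing a "key". A hypothetical shape of that module, inferred from the usage above; the concrete key and name values are invented for illustration.

# Hypothetical sketch of the methodsmap module implied by batch_compute;
# only the "key" entries are read above, all concrete values are invented.
backward_elimination = {"key": "be", "name": "Backward Elimination"}
forward_selection = {"key": "fs", "name": "Forward Selection"}
gurobi = {"key": "gurobi", "name": "Gurobi"}
pipelining = {"key": "pipelining", "name": "Pipelining (BE + Gurobi)"}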