def get_classifier(classifier):
    """Build the classifier named by ``classifier['name']``.

    The constructed estimator receives ``classifier['params']`` through
    ``set_params`` before being returned.

    Raises:
        NameError: if the name does not match a known classifier.
    """
    # Lazy factories: the class name is only resolved for the entry that is
    # actually selected, mirroring the original if/elif chain.
    factories = {
        'linear-ridge': lambda: RidgeClassifier(),
        'SVC': lambda: SVC(),
        'l2-SVC': lambda: L2KernelClassifier(),
        'fredholm': lambda: L2FredholmClassifier(),
        'TSVM': lambda: SVMLight(),
        'Lap-RLSC': lambda: LapRLSC(),
        'fred_kernel_appr': lambda: FredholmKernelApprClassifier(),
    }
    name = classifier["name"]
    if name not in factories:
        raise NameError('Not existing classifier: ' + name + '.')
    model = factories[name]()
    model.set_params(**classifier["params"])
    return model
def get_optimal_blend_weigth(exp_, best_param_, folder, fname, model_fname):
    """Fit a RidgeClassifier and persist its linear blend weights and model.

    Writes two artifacts under ``Config.get_string('data.path')/folder``:
    a one-row CSV of ``intercept`` + per-column coefficients (``fname``) and
    the fitted estimator as a gzipped pickle (``model_fname``).

    NOTE(review): the model is fit on the *test* split
    (``exp_.get_test_data()``) — presumably intentional for a
    stacking/blending stage; confirm with callers.
    NOTE(review): this exact function is defined twice in this file; the
    later definition shadows this one at import time.

    Returns:
        True on success.
    """
    clf = RidgeClassifier()
    X_test, y_test = exp_.get_test_data()
    clf.set_params(**best_param_)
    clf.fit(X_test, y_test)
    # dump2csv optimal linear weight
    # First column is the intercept, remaining columns follow X_test's order.
    names = np.append(np.array(['intercept'], dtype='S100'), X_test.columns.values)
    coefs = np.append(clf.intercept_, clf.coef_).astype(np.float64)
    optimal_linear_weight = pd.DataFrame(coefs.reshape(1,len(coefs)), columns=names)
    optimal_linear_weight.to_csv(os.path.join(Config.get_string('data.path'), folder, fname), index=False)
    # dump2cpkle for ridge model
    model_fname = os.path.join(Config.get_string('data.path'), folder, model_fname)
    with gzip.open(model_fname, 'wb') as gf:
        cPickle.dump(clf, gf, cPickle.HIGHEST_PROTOCOL)
    return True
def get_optimal_blend_weigth(exp_, best_param_, folder, fname, model_fname):
    """Fit a RidgeClassifier and persist its linear blend weights and model.

    NOTE(review): byte-for-byte duplicate of an earlier definition with the
    same name in this file; since it appears later, this is the binding that
    wins at import time. One of the two should be removed.

    Writes a one-row CSV (``intercept`` + coefficients) and a gzipped pickle
    of the fitted estimator under ``Config.get_string('data.path')/folder``.
    NOTE(review): fits on the *test* split — presumably intentional for
    blending; confirm with callers.

    Returns:
        True on success.
    """
    clf = RidgeClassifier()
    X_test, y_test = exp_.get_test_data()
    clf.set_params(**best_param_)
    clf.fit(X_test, y_test)
    # dump2csv optimal linear weight
    names = np.append(np.array(['intercept'], dtype='S100'), X_test.columns.values)
    coefs = np.append(clf.intercept_, clf.coef_).astype(np.float64)
    optimal_linear_weight = pd.DataFrame(coefs.reshape(1,len(coefs)), columns=names)
    optimal_linear_weight.to_csv(os.path.join(Config.get_string('data.path'), folder, fname), index=False)
    # dump2cpkle for ridge model
    model_fname = os.path.join(Config.get_string('data.path'), folder, model_fname)
    with gzip.open(model_fname, 'wb') as gf:
        cPickle.dump(clf, gf, cPickle.HIGHEST_PROTOCOL)
    return True
def get_ridge_plot(best_param_, experiment_, param_keys_, param_vals_, png_folder, png_fname, score_threshold=0.8):
    """Plot the ridge regularization path and the PDF/CDF of the decision
    function, saving the figure as a PNG under the configured data path.

    Args:
        best_param_: dict of best RidgeClassifier params; must contain
            'alpha' and 'fit_intercept'. Not mutated (a local copy is used
            for the alpha sweep).
        experiment_: project experiment object providing train/test data,
            column names, and get_proba().
        param_keys_, param_vals_: parallel key/value lists of the searched
            grid; must include 'model_type' and 'alpha' (list of alphas).
        png_folder, png_fname: output location relative to
            Config.get_string('data.path').
        score_threshold: unused; kept for interface compatibility.

    Returns:
        True on success.
    """
    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']

    # Baseline fit with the best parameters.
    clf = RidgeClassifier()
    X_train, y_train = experiment_.get_train_data()
    clf.set_params(**best_param_)
    clf.fit(X_train, y_train)
    best_alpha = best_param_['alpha']

    # 1. regularization path: refit for every alpha in the searched grid.
    # BUGFIX: the original deleted/reassigned 'alpha' inside the caller's
    # best_param_ dict, leaving it set to the last grid alpha afterwards;
    # sweep on a copy instead.
    sweep_param = dict(best_param_)
    n_coef = len(X_train.columns.values) + 1  # +1 for the intercept slot
    result = {'alphas': [],
              'coefs': np.zeros((len(parameters['alpha']), n_coef)),
              'scores': [],
              'score': None}
    for i, alpha in enumerate(parameters.get('alpha', None)):
        result['alphas'].append(alpha)
        sweep_param['alpha'] = alpha
        clf.set_params(**sweep_param)
        clf.fit(X_train, y_train)
        tmp = np.zeros(n_coef, dtype=np.float32)
        if sweep_param['fit_intercept']:
            tmp = np.append(clf.intercept_, clf.coef_)
        else:
            # BUGFIX: was `tmp[1:] = clf.intercept_`, which broadcast the
            # intercept into every coefficient slot; store the coefficients.
            tmp[1:] = clf.coef_
        result['coefs'][i, :] = tmp
        result['scores'].append(experiment_.get_proba(clf, X_train))
    del X_train, y_train
    # NOTE: clf now holds the fit for the *last* grid alpha; the decision-
    # function plots below use that fit, matching the original behavior.
    # (The original also built an unused `index2feature` map here; removed.)

    # 2. plot
    gs = GridSpec(2, 2)
    ax1 = plt.subplot(gs[:, 0])
    ax2 = plt.subplot(gs[0, 1])
    ax3 = plt.subplot(gs[1, 1])

    # 2.1 regularization path: one curve per coefficient across alphas
    labels = np.append(np.array(['intercept'], dtype='S100'), experiment_.get_data_col_name())
    nrows, ncols = result['coefs'].shape
    for ncol in xrange(ncols):
        ax1.plot(np.array(result['alphas']), result['coefs'][:, ncol], label=labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 2.2 PDF of the decision function on the test split
    X_test, y_test = experiment_.get_test_data()
    result['score'] = clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")

    # 2.3 CDF of the decision function
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins, normed=True)
    except TypeError:
        # numpy versions without this `normed` + `bins` combination
        # (was a bare `except:`; narrowed to the signature error).
        counts, bin_edges = np.histogram(result['score'], normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True
def get_ridge_plot(best_param_, experiment_, param_keys_, param_vals_, png_folder, png_fname, score_threshold=0.8):
    """Plot the ridge regularization path and the PDF/CDF of the decision
    function, saving the figure as a PNG under the configured data path.

    NOTE(review): byte-for-byte duplicate of the earlier get_ridge_plot in
    this file; being later, this definition wins at import time.
    NOTE(review): score_threshold is never used.

    Returns:
        True on success.
    """
    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']
    clf = RidgeClassifier()
    X_train, y_train = experiment_.get_train_data()
    clf.set_params(**best_param_)
    clf.fit(X_train, y_train)
    best_alpha = best_param_['alpha']
    # One row of coefficients per grid alpha; +1 column for the intercept.
    result = {'alphas':[], 'coefs':np.zeros( (len(parameters['alpha']), len(X_train.columns.values) + 1) ), 'scores':[], 'score':None}
    for i, alpha in enumerate(parameters.get('alpha',None)):
        result['alphas'].append(alpha)
        # NOTE(review): mutates the caller's best_param_ in place; after the
        # loop it holds the last grid alpha, not best_alpha — confirm no
        # caller relies on best_param_ afterwards.
        del best_param_['alpha']
        best_param_['alpha'] = alpha
        clf.set_params(**best_param_)
        clf.fit(X_train, y_train)
        # regularization path
        tmp = np.array([0 for j in xrange(len(X_train.columns.values) + 1)], dtype=np.float32)
        if best_param_['fit_intercept']:
            tmp = np.append(clf.intercept_, clf.coef_)
        else:
            # NOTE(review): broadcasts the intercept into the coefficient
            # slots — looks like it should be clf.coef_; confirm.
            tmp[1:] = clf.intercept_
        result['coefs'][i,:] = tmp
        result['scores'].append(experiment_.get_proba(clf, X_train))
    del X_train, y_train
    # 2.
    tmp_len = len(experiment_.get_data_col_name())
    # NOTE(review): index2feature is built but never used below.
    index2feature = dict(zip(np.arange(1, tmp_len + 1), experiment_.get_data_col_name()))
    if best_param_['fit_intercept']:
        index2feature[0] = 'intercept'
    # 3. plot
    gs = GridSpec(2,2)
    ax1 = plt.subplot(gs[:,0])
    ax2 = plt.subplot(gs[0,1])
    ax3 = plt.subplot(gs[1,1])
    # 3.1 feature importance (regularization path, one curve per coefficient)
    labels = np.append(np.array(['intercept'], dtype='S100'), experiment_.get_data_col_name())
    nrows, ncols = result['coefs'].shape
    for ncol in xrange(ncols):
        ax1.plot(np.array(result['alphas']), result['coefs'][:,ncol], label = labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)
    # 3.2 PDF of the decision function on the test split
    # (clf here is the fit for the *last* grid alpha from the loop above)
    X_test, y_test = experiment_.get_test_data()
    result['score'] = clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")
    # 3.3 CDF of the decision function
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins, normed=True)
    except:
        # NOTE(review): bare except — presumably guards numpy versions
        # lacking this `normed`/`bins` combination; confirm and narrow.
        counts, bin_edges = np.histogram(result['score'], normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)
    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True