def gradient_descent(CF, theta, eps, opt=1, verbose=False):
    """
    Gradient descent

    Iterates until the absolute change of the cost between two successive
    iterations is not larger than eps.

    CF: cost-function object. It must provide cost_and_gradient(theta),
        returning (cost, gradient), and the attributes l and m (used in the
        shrinkage factor 1 - alpha * l / m applied to theta[1:]; presumably
        the regularization coefficient and the number of examples - confirm
        against the CostFunction class). With opt = 1 it must also provide
        compute_hessian(theta); with verbose, the attributes n, x and y.
    theta: parameter array; it is updated IN PLACE and also returned
    eps: stopping threshold on the change of the cost
    Options:
    opt = 0: learning rate alpha is a constant (0.05)
    opt = 1: learning rate alpha is re-estimated at each iteration as
             g.T g / (g.T H g), where g is the gradient and H the Hessian
             divided by CF.m
    Default is opt = 1
    verbose: if True, plot the decision boundary at the end (CF.n == 2 or 3)

    Returns (theta, min_cost); min_cost lists the cost evaluated at the
    beginning of each iteration.
    """
    alpha = 0.05
    prev_cost = 1000
    cost = 0
    diff = np.abs(prev_cost - cost)
    min_cost = []
    while diff > eps:
        cost, delta = CF.cost_and_gradient(theta)
        min_cost.append(cost)

        # The intercept theta[0] is not shrunk by the regularization factor
        theta[0] = theta[0] - alpha * delta[0]
        theta[1:] = (1 - alpha * CF.l * 1. / CF.m) * theta[1:] - alpha * delta[1:]

        if opt == 1:
            # Update alpha at each iteration (speeds up convergence).
            # The previous code read an undefined variable `A` here; the
            # Hessian returned by CF.compute_hessian is what was meant.
            hess = np.asarray(CF.compute_hessian(theta), dtype=float) / CF.m
            grad = np.asarray(delta, dtype=float).ravel()
            # NOTE(review): the shape of the Hessian is not visible from
            # here. If it excludes the intercept, drop delta[0] so that the
            # dimensions agree; otherwise use the full gradient.
            if hess.shape[0] == grad.size - 1:
                grad = grad[1:]
            curvature = np.dot(grad, np.dot(hess, grad))
            # Keep the previous alpha when the formula is undefined (zero
            # gradient at the optimum) or would give a non-descent step.
            if np.isfinite(curvature) and curvature > 0:
                alpha = float(np.dot(grad, grad) / curvature)

        diff = np.abs(prev_cost - cost)
        prev_cost = cost

    if verbose:
        if CF.n == 2:
            plot_db(CF.x, CF.y, theta)
        if CF.n == 3:
            plot_db_3d(CF.x, CF.y, theta)
        plt.show()

    return theta, min_cost
def gradient_descent(CF, theta, eps, opt=1, verbose=False):
    """
    Gradient descent

    Iterates until the absolute change of the cost between two successive
    iterations is not larger than eps.

    CF: cost-function object. It must provide cost_and_gradient(theta),
        returning (cost, gradient), and the attributes l and m (used in the
        shrinkage factor 1 - alpha * l / m applied to theta[1:]; presumably
        the regularization coefficient and the number of examples - confirm
        against the CostFunction class). With opt = 1 it must also provide
        compute_hessian(theta); with verbose, the attributes n, x and y.
    theta: parameter array; it is updated IN PLACE and also returned
    eps: stopping threshold on the change of the cost
    Options:
    opt = 0: learning rate alpha is a constant (0.05)
    opt = 1: learning rate alpha is re-estimated at each iteration as
             g.T g / (g.T H g), where g is the gradient and H the Hessian
             divided by CF.m
    Default is opt = 1
    verbose: if True, plot the decision boundary at the end (CF.n == 2 or 3)

    Returns (theta, min_cost); min_cost lists the cost evaluated at the
    beginning of each iteration.
    """
    alpha = 0.05
    prev_cost = 1000
    cost = 0
    diff = np.abs(prev_cost - cost)
    min_cost = []
    while diff > eps:
        cost, delta = CF.cost_and_gradient(theta)
        min_cost.append(cost)

        # The intercept theta[0] is not shrunk by the regularization factor
        theta[0] = theta[0] - alpha * delta[0]
        theta[1:] = (1 - alpha * CF.l * 1. / CF.m) * theta[1:] - alpha * delta[1:]

        if opt == 1:
            # Update alpha at each iteration (speeds up convergence).
            # The previous code read an undefined variable `A` here; the
            # Hessian returned by CF.compute_hessian is what was meant.
            hess = np.asarray(CF.compute_hessian(theta), dtype=float) / CF.m
            grad = np.asarray(delta, dtype=float).ravel()
            # NOTE(review): the shape of the Hessian is not visible from
            # here. If it excludes the intercept, drop delta[0] so that the
            # dimensions agree; otherwise use the full gradient.
            if hess.shape[0] == grad.size - 1:
                grad = grad[1:]
            curvature = np.dot(grad, np.dot(hess, grad))
            # Keep the previous alpha when the formula is undefined (zero
            # gradient at the optimum) or would give a non-descent step.
            if np.isfinite(curvature) and curvature > 0:
                alpha = float(np.dot(grad, grad) / curvature)

        diff = np.abs(prev_cost - cost)
        prev_cost = cost

    if verbose:
        if CF.n == 2:
            plot_db(CF.x, CF.y, theta)
        if CF.n == 3:
            plot_db_3d(CF.x, CF.y, theta)
        plt.show()

    return theta, min_cost
def logistic_reg(x,y,theta,l=0,verbose=0,method='g'): """ Determines theta vector for a given polynomial degree and lambda x is a panda DataFrame y is a panda DataFrame l = 0: regularization coefficient / default is no regularization Methods for cost function minimization (default is gradient descent): 'g': gradient descent 'cg': conjugate gradient 'bfgs': BFGS (Broyden Fletcher Goldfarb Shanno) """ # Number of features n = x.shape[1] # Number of training set examples m = x.shape[0] # Number of classes K = len(y.columns) if len(theta[1]) != n+1: print "In logistic_reg.py:\nproblem of dimension between number of features and number of parameters !!" print "Number of features:", n print "Length of theta vector:", len(theta[1]) sys.exit() for k in range(1,K+1): theta[k] = np.array(theta[k],dtype=float) CF = CostFunction(x,y.values[:,k-1],l) if verbose: if n == 1: from plot_functions import plot_hyp_func_1f, plot_sep_1f syn, hyp = hypothesis(x.min(),x.max(),theta[k]) plot_hyp_func_1f(x,y[k],syn,hyp,threshold=.5) if n == 2: from plot_functions import plot_db plot_db(x,y[k],theta[k],lim=3,title='Initial decision boundary') if n == 3: from plot_functions import plot_db_3d plot_db_3d(x,y[k],theta[k],lim=3,title='Initial decision boundary') method = 'bfgs' stop = 10**-5 if method == 'cg': # Conjugate gradient from scipy.optimize import fmin_cg theta[k],allvecs = fmin_cg(CF.compute_cost,theta[k],fprime=CF.compute_gradient,gtol=stop,disp=False,retall=True)#,maxiter=1000) elif method == 'bfgs': # BFGS (Broyden Fletcher Goldfarb Shanno) from scipy.optimize import fmin_bfgs theta[k],allvecs = fmin_bfgs(CF.compute_cost,theta[k],fprime=CF.compute_gradient,gtol=stop,disp=False,retall=True) elif method == 'g': # Gradient descent theta[k],min_cost = gradient_descent(CF,theta[k],stop,opt=0) allvecs = None if verbose: if allvecs: min_cost = [] for vec in allvecs: min_cost.append(CF.compute_cost(vec)) nb_iter = len(min_cost) #plot_cost_function_iter(nb_iter,min_cost) #plt.show() if 
verbose: if n == 1 and K == 1: from plot_functions import plot_hyp_func_1f syn, hyp = hypothesis(x.min(),x.max(),theta[1]) plot_hyp_func_1f(x,y[1],syn,hyp,threshold=.5) if n == 2: if K != 1: from plot_functions import plot_multiclass_2d plot_multiclass_2d(x,theta) else: from plot_functions import plot_db plot_db(x,y,theta[1],title='Decision boundary') if n == 3: if K != 1: from plot_functions import plot_multiclass_3d plot_multiclass_3d(x,theta) else: from plot_functions import plot_db_3d plot_db_3d(x,y,theta[1],title='Decision boundary') return theta
def logistic_reg(x, y, theta, l=0, verbose=0, method='g'): """ Determines theta vector for a given polynomial degree and lambda x is a panda DataFrame y is a panda DataFrame l = 0: regularization coefficient / default is no regularization Methods for cost function minimization (default is gradient descent): 'g': gradient descent 'cg': conjugate gradient 'bfgs': BFGS (Broyden Fletcher Goldfarb Shanno) """ # Number of features n = x.shape[1] # Number of training set examples m = x.shape[0] # Number of classes K = len(y.columns) if len(theta[1]) != n + 1: print "In logistic_reg.py:\nproblem of dimension between number of features and number of parameters !!" print "Number of features:", n print "Length of theta vector:", len(theta[1]) sys.exit() for k in range(1, K + 1): theta[k] = np.array(theta[k], dtype=float) CF = CostFunction(x, y.values[:, k - 1], l) if verbose: if n == 1: from plot_functions import plot_hyp_func_1f, plot_sep_1f syn, hyp = hypothesis(x.min(), x.max(), theta[k]) plot_hyp_func_1f(x, y[k], syn, hyp, threshold=.5) if n == 2: from plot_functions import plot_db plot_db(x, y[k], theta[k], lim=3, title='Initial decision boundary') if n == 3: from plot_functions import plot_db_3d plot_db_3d(x, y[k], theta[k], lim=3, title='Initial decision boundary') method = 'bfgs' stop = 10**-5 if method == 'cg': # Conjugate gradient from scipy.optimize import fmin_cg theta[k], allvecs = fmin_cg(CF.compute_cost, theta[k], fprime=CF.compute_gradient, gtol=stop, disp=False, retall=True) #,maxiter=1000) elif method == 'bfgs': # BFGS (Broyden Fletcher Goldfarb Shanno) from scipy.optimize import fmin_bfgs theta[k], allvecs = fmin_bfgs(CF.compute_cost, theta[k], fprime=CF.compute_gradient, gtol=stop, disp=False, retall=True) elif method == 'g': # Gradient descent theta[k], min_cost = gradient_descent(CF, theta[k], stop, opt=0) allvecs = None if verbose: if allvecs: min_cost = [] for vec in allvecs: min_cost.append(CF.compute_cost(vec)) nb_iter = len(min_cost) 
#plot_cost_function_iter(nb_iter,min_cost) #plt.show() if verbose: if n == 1 and K == 1: from plot_functions import plot_hyp_func_1f syn, hyp = hypothesis(x.min(), x.max(), theta[1]) plot_hyp_func_1f(x, y[1], syn, hyp, threshold=.5) if n == 2: if K != 1: from plot_functions import plot_multiclass_2d plot_multiclass_2d(x, theta) else: from plot_functions import plot_db plot_db(x, y, theta[1], title='Decision boundary') if n == 3: if K != 1: from plot_functions import plot_multiclass_3d plot_multiclass_3d(x, theta) else: from plot_functions import plot_db_3d plot_db_3d(x, y, theta[1], title='Decision boundary') return theta
def classifier(opt):
    """
    Classification of the different types of events.

    opt is an object of the class Options()

    For every key of opt.xs (in sorted order) the function draws
    training / cross-validation / test sets opt.opdict['boot'] times, runs
    the classifier selected by opt.opdict['method'] ('1b1', 'ova', 'svm',
    'svm_nl', 'lrsk' or 'lr'), prints the confusion-matrix rates,
    optionally plots confusion matrices and decision boundaries, and
    gathers the results of each draw in a dictionary. For the methods
    'lr', 'lrsk', 'svm' and 'svm_nl' that dictionary is written to
    opt.opdict['result_path'] with write_binary_file.

    Nothing is returned. opt is modified along the way (x, y, train_x,
    train_y, out, possibly map, and opdict['boot'] for '1b1'/'ova').
    The program exits (sys.exit) when the x and y parts of a set do not
    have the same number of rows.

    NOTE(review): the statement nesting below was reconstructed from a
    flattened source; spots where the nesting is ambiguous are marked.
    """

    # Build the dataset first if opt does not hold one yet
    list_attr = opt.__dict__.keys()
    if not 'x' in list_attr:
        opt.do_tri()

    # NOTE(review): X, Y, X_TRAIN and Y_TRAIN are not read again below
    X = opt.x
    Y = opt.y

    # Refresh the attribute list: 'train_x' marks a pre-defined training set
    list_attr = opt.__dict__.keys()
    if 'train_x' in list_attr:
        X_TRAIN = opt.train_x
        Y_TRAIN = opt.train_y

    dic_results = {}
    for isc in sorted(opt.xs):

        print "==========",opt.trad[isc],"=========="
        subdic = {}

        # marker_sta = 1 when the first element of opt.trad[isc] is the same
        # as for the previous key (presumably the station name - confirm);
        # in that case the previously drawn event lists are reused below.
        # NOTE(review): sta_prev is read before being bound if the first
        # sorted key is > 0.
        if isc > 0:
            if opt.trad[isc][0] == sta_prev:
                marker_sta = 1
            else:
                marker_sta = 0
                sta_prev = opt.trad[isc][0]
        else:
            marker_sta = 0
            sta_prev = opt.trad[isc][0]

        # Nothing to classify for this key
        if len(opt.xs[isc]) == 0:
            continue

        # About the training set
        # Single station with several draws and no pre-defined training set:
        # reuse the draws stored in 'train_file' if it exists, otherwise
        # prepare one empty record per draw.
        if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1 and 'train_x' not in list_attr:
            if os.path.exists(opt.opdict['train_file']):
                print opt.opdict['train_file']
                TRAIN_Y = read_binary_file(opt.opdict['train_file'])
            else:
                TRAIN_Y = {}
                for tir in range(opt.opdict['boot']):
                    TRAIN_Y[tir] = {}

        # Pre-defined training set: load it into opt so that the Options
        # methods (compute_pdfs, classname2number) operate on it.
        elif 'train_x' in list_attr:
            opt.x = opt.xs_train[isc]
            opt.y = opt.ys_train[isc]
            if opt.opdict['plot_pdf']:
                opt.compute_pdfs()
                # Keep the training-set gaussians aside; opt.gaussians is
                # recomputed for the test set below.
                g_train = opt.gaussians
                del opt.gaussians
            opt.classname2number()
            x_ref_train = opt.x
            y_ref_train = opt.y

        # About the test set
        opt.x = opt.xs[isc]
        opt.y = opt.ys[isc]
        if opt.opdict['plot_pdf']:
            opt.compute_pdfs()

        # NOTE(review): `set` shadows the builtin of the same name
        set = pd.DataFrame(index=opt.ys[isc].index,columns=['Otime'])
        set['Otime'] = opt.xs[isc].index

        opt.classname2number()
        x_test = opt.x
        y_ref = opt.y
        x_ref = opt.x

        if opt.opdict['plot_dataset']:
            opt.composition_dataset()

        #K = len(opt.types)

        ### ITERATE OVER TRAINING SET DRAWS ###
        for b in range(opt.opdict['boot']):
            print "\n-------------------- # iter: %d --------------------\n"%(b+1)

            subsubdic = {}

            print "WHOLE SET", x_ref.shape, y_ref.shape

            ### if there is no pre-defined training set ###
            if 'train_x' not in list_attr:
                x_train = x_test.copy()
                if len(opt.opdict['stations']) == 1 and opt.opdict['boot'] > 1:
                    if len(TRAIN_Y[b]) > 0:
                        # Draw number b was stored earlier: rebuild the sets
                        # from the stored event lists, dropping events that
                        # are absent from y_ref.
                        y_train = y_ref.reindex(index=TRAIN_Y[b]['training_set'])
                        y_train = y_train.dropna(how='any')
                        y_cv = y_ref.reindex(index=TRAIN_Y[b]['cv_set'])
                        y_cv = y_cv.dropna(how='any')
                        y_test = y_ref.reindex(index=TRAIN_Y[b]['test_set'])
                        y_test = y_test.dropna(how='any')
                    else:
                        # New draw: remember its event lists
                        y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
                        TRAIN_Y[b]['training_set'] = map(int,list(y_train.index))
                        TRAIN_Y[b]['cv_set'] = map(int,list(y_cv.index))
                        TRAIN_Y[b]['test_set'] = map(int,list(y_test.index))

                ### multi-stations case ###
                else:
                    if marker_sta == 0:
                        y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref)
                        list_ev_train = y_train.index
                        list_ev_cv = y_cv.index
                        list_ev_test = y_test.index
                    else:
                        # Same first element of opt.trad as the previous key:
                        # reuse the event lists drawn for it.
                        y_train = y_ref.reindex(index=list_ev_train)
                        y_train = y_train.dropna(how='any')
                        y_cv = y_ref.reindex(index=list_ev_cv)
                        y_cv = y_cv.dropna(how='any')
                        y_test = y_ref.reindex(index=list_ev_test)
                        y_test = y_test.dropna(how='any')

                x_train = x_ref.reindex(index=y_train.index)

            ### if a training set was pre-defined ###
            else:
                x_train = x_ref_train.copy()
                y_train = y_ref_train.copy()
                y_train, y_cv, y_test = generate_datasets(opt.opdict['proportions'],opt.numt,y_ref,y_train=y_train)

            x_cv = x_ref.reindex(index=y_cv.index)
            x_test = x_ref.reindex(index=y_test.index)

            # Keep the original event indexes, then renumber each set from 0
            i_train = y_train.index
            x_train.index = range(x_train.shape[0])
            y_train.index = range(y_train.shape[0])
            print "TRAINING SET", x_train.shape, y_train.shape
            if x_train.shape[0] != y_train.shape[0]:
                print "Training set: Incoherence in x and y dimensions"
                sys.exit()

            i_cv = y_cv.index
            x_cv.index = range(x_cv.shape[0])
            y_cv.index = range(y_cv.shape[0])
            print "CROSS-VALIDATION SET", x_cv.shape, y_cv.shape
            if x_cv.shape[0] != y_cv.shape[0]:
                print "Cross-validation set: Incoherence in x and y dimensions"
                sys.exit()

            # Stored before the test index is renumbered
            subsubdic['list_ev'] = np.array(y_test.index)

            i_test = y_test.index
            x_test.index = range(x_test.shape[0])
            y_test.index = range(y_test.shape[0])
            print "TEST SET", x_test.shape, y_test.shape
            if x_test.shape[0] != y_test.shape[0]:
                print "Test set: Incoherence in x and y dimensions"
                sys.exit()

            opt.train_x = x_train
            opt.x = x_test
            opt.train_y = y_train
            opt.y = y_test

            # NOTE(review): nesting reconstructed; the inner if/else is
            # assumed to belong to the 'plot_pdf' branch because g_train
            # only exists when 'plot_pdf' is set. As written, the else
            # branch calls plot_all_pdfs a second time - confirm.
            if opt.opdict['plot_pdf']:
                opt.plot_all_pdfs(save=opt.opdict['save_pdf'])
                if 'train_x' in list_attr:
                    opt.plot_superposed_pdfs(g_train,save=opt.opdict['save_pdf'])
                else:
                    opt.plot_all_pdfs(save=opt.opdict['save_pdf'])

            if opt.opdict['method'] == '1b1':
                # Extractors
                print "********** EXTRACTION 1-BY-1 **********"
                opt.opdict['boot'] = 1
                one_by_one(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
                # NOTE(review): the range of the enclosing loop was computed
                # before 'boot' was set to 1, so `continue` still runs the
                # remaining draws.
                continue

            elif opt.opdict['method'] == 'ova':
                print "********** EXTRACTION 1-VS-ALL **********"
                opt.opdict['boot'] = 1
                one_vs_all(opt,x_ref,y_ref,set['Otime'],boot=10,method='svm')
                continue

            elif opt.opdict['method'] in ['svm','svm_nl']:
                # SVM
                print "********** SVM **********"
                if opt.opdict['method'] == 'svm':
                    kern = 'Lin'
                else:
                    kern = 'NonLin'

                out = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern=kern,proba=opt.opdict['probas'])

                if 'map' in sorted(out):
                    opt.map = out['map']

                if 'thetas' in sorted(out):
                    theta_vec = out['thetas']
                    theta,threshold = {},{}
                    for it in range(len(theta_vec)):
                        # Move the last coefficient to the front and index
                        # the classes from 1
                        theta[it+1] = np.append(theta_vec[it][-1],theta_vec[it][:-1])
                        threshold[it+1] = 0.5
                    out['thetas'] = theta
                    out['threshold'] = threshold

            elif opt.opdict['method'] == 'lrsk':
                # LOGISTIC REGRESSION (scikit learn)
                print "********* Logistic regression (sklearn) **********"
                out = implement_lr_sklearn(x_train,x_test,y_train,y_test)
                threshold, theta = {},{}
                for it in range(len(out['thetas'])):
                    threshold[it+1] = 0.5
                    # Same reordering as for the SVM coefficients
                    theta[it+1] = np.append(out['thetas'][it][-1],out['thetas'][it][:-1])
                out['threshold'] = threshold
                out['thetas'] = theta

            elif opt.opdict['method'] == 'lr':
                # LOGISTIC REGRESSION
                print "********* Logistic regression **********"
                from LR_functions import do_all_logistic_regression
                out = do_all_logistic_regression(x_train,x_test,x_cv,y_train,y_test,y_cv)
                theta = out['thetas']
                threshold = out['threshold']
                # NOTE(review): nesting reconstructed; this 'learn_file'
                # block may originally have been outside the 'lr' branch.
                if 'learn_file' in sorted(opt.opdict):
                    learn_filename = opt.opdict['learn_file']
                    if not os.path.exists(learn_filename):
                        wtr = write_binary_file(learn_filename,i_train)

            CLASS_test = out['label_test']
            CLASS_train = out['label_train']

            # TRAINING SET
            print "\t *TRAINING SET"
            y_train_np = y_train.NumType.values.ravel()
            from sklearn.metrics import confusion_matrix
            cmat_train = confusion_matrix(y_train_np,CLASS_train)
            p_tr = dic_percent(cmat_train,opt.types,verbose=True)
            out['rate_train'] = p_tr
            print " Global : %.2f%%"%p_tr['global']
            if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
                plot_confusion_mat(cmat_train,opt.types,'Training',opt.opdict['method'].upper())
                if opt.opdict['save_confusion']:
                    savefig = '%s/training_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
                    print "Confusion matrix saved in %s"%savefig
                    plt.savefig(savefig)

            # TEST SET
            print "\t *TEST SET"
            y_test_np = y_test.NumType.values.ravel()
            cmat_test = confusion_matrix(y_test_np,CLASS_test)
            p_test = dic_percent(cmat_test,opt.types,verbose=True)
            out['rate_test'] = p_test
            print " Global : %.2f%%"%p_test['global']
            if opt.opdict['plot_confusion'] or opt.opdict['save_confusion']:
                plot_confusion_mat(cmat_test,opt.types,'Test',opt.opdict['method'].upper())
                if opt.opdict['save_confusion']:
                    savefig = '%s/test_%s.png'%(opt.opdict['fig_path'],opt.opdict['result_file'])
                    print "Confusion matrix saved in %s"%savefig
                    plt.savefig(savefig)
                # NOTE(review): nesting reconstructed; show/close assumed to
                # run only when a confusion matrix was plotted.
                if opt.opdict['plot_confusion']:
                    plt.show()
                else:
                    plt.close()

            # PLOT PRECISION AND RECALL
            if opt.opdict['plot_prec_rec']:
                from LR_functions import normalize,plot_precision_recall
                x_train, x_test = normalize(x_train,x_test)
                plot_precision_recall(x_train,y_train.NumType,x_test,y_test.NumType,theta)

            # Global success rates (training, test)
            pourcentages = (p_tr['global'],p_test['global'])
            out['method'] = opt.opdict['method']
            out['types'] = opt.types
            opt.out = out

            # PLOT DECISION BOUNDARIES
            n_feat = x_train.shape[1] # number of features
            if n_feat < 4:
                if opt.opdict['plot_sep'] or opt.opdict['save_sep']:
                    print "\nPLOTTING"
                    print "Theta values:",theta
                    print "Threshold:", threshold

                    # COMPARE AND PLOT LR AND SVM RESULTS
                    out_svm, out_nl = {},{}
                    # NOTE(review): `dir` shadows the builtin of the same name
                    dir = '%s_SEP'%opt.opdict['method'].upper()
                    if opt.opdict['method']=='lr' and opt.opdict['compare']:
                        dir = 'LR_SVM_SEP'
                        out_svm = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='Lin')
                        cmat_svm_tr = confusion_matrix(y_train_np,out_svm['label_train'])
                        cmat_svm_test = confusion_matrix(y_test_np,out_svm['label_test'])
                        svm_ptr = dic_percent(cmat_svm_tr,opt.types)
                        svm_pt = dic_percent(cmat_svm_test,opt.types)
                        theta_svm,t_svm = {},{}
                        for it in range(len(out_svm['thetas'])):
                            theta_svm[it+1] = np.append(out_svm['thetas'][it][-1],out_svm['thetas'][it][:-1])
                            t_svm[it+1] = 0.5
                        out_svm['thetas'] = theta_svm
                        out_svm['threshold'] = t_svm
                        out_svm['rate_test'] = svm_pt
                        out_svm['rate_train'] = svm_ptr
                        out_svm['method'] = 'SVM'

                    if opt.opdict['method'] in ['lr','svm'] and opt.opdict['compare_nl']:
                        dir = '%s_NL_SEP'%opt.opdict['method'].upper()
                        out_nl = implement_svm(x_train,x_test,y_train,y_test,opt.types,opt.opdict,kern='NonLin')
                        cmat_svm_tr = confusion_matrix(y_train_np,out_nl['label_train'])
                        cmat_svm_test = confusion_matrix(y_test_np,out_nl['label_test'])
                        svm_ptr = dic_percent(cmat_svm_tr,opt.types)
                        svm_pt = dic_percent(cmat_svm_test,opt.types)
                        out_nl['rate_test'] = svm_pt
                        out_nl['rate_train'] = svm_ptr
                        out_nl['method'] = 'SVM_NL'

                    save_dir = os.path.join(opt.opdict['fig_path'],dir)
                    opt.verify_and_create(save_dir)

                    from LR_functions import normalize
                    x_train, x_test = normalize(x_train,x_test)

                    # Split the examples into well / badly classified ones
                    x_train_good = x_train.reindex(index=y_train[y_train.NumType.values==CLASS_train].index)
                    x_train_bad = x_train.reindex(index=y_train[y_train.NumType.values!=CLASS_train].index)
                    good_train = y_train.reindex(index=x_train_good.index)

                    x_test_good = x_test.reindex(index=y_test[y_test.NumType.values==CLASS_test].index)
                    x_test_bad = x_test.reindex(index=y_test[y_test.NumType.values!=CLASS_test].index)

                    # PLOT FOR 1 ATTRIBUTE AND 2 CLASSES
                    # NOTE(review): `name` stays unbound when none of the
                    # three branches below is taken, yet it is used when
                    # 'save_sep' is set.
                    if n_feat == 1 and len(opt.opdict['types']) == 2:
                        name = opt.opdict['feat_list'][0]
                        from plot_functions import plot_hyp_func_1f, histo_pdfs
                        if opt.opdict['method']=='lr' and opt.opdict['compare']:
                            plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,th_comp=theta_svm,cmat_test=cmat_test,cmat_svm=cmat_svm_test,cmat_train=cmat_train)
                        else:
                            #histo_pdfs(x_test,y_test,x_train=x_train,y_train=y_train)
                            plot_hyp_func_1f(x_train,y_train,theta,opt.opdict['method'],threshold=threshold,x_ok=x_test_good,x_bad=x_test_bad,cmat_test=cmat_test,cmat_train=cmat_train)

                    # PLOT FOR 2 ATTRIBUTES AND 2 to 3 CLASSES
                    elif n_feat == 2:
                        name = '%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1])
                        if opt.opdict['method'] in ['lr','svm']:
                            from plot_2features import plot_2f_all
                            plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad)
                        # NOTE(review): unreachable - method 'lr' is already
                        # caught by the branch above, so the comparison plot
                        # is never drawn.
                        elif opt.opdict['method']=='lr' and opt.opdict['compare']:
                            from plot_2features import plot_2f_all
                            plot_2f_all(out,x_train,y_train,x_test,y_test,x_test_bad,out_comp=out_svm,map_nl=out_nl)
                        elif opt.opdict['method'] == 'svm_nl':
                            from plot_2features import plot_2f_nonlinear
                            # NOTE(review): y_train is passed both by position
                            # and by keyword - check against the signature of
                            # plot_2f_nonlinear.
                            plot_2f_nonlinear(out,x_train,y_train,x_test,y_test,y_train=y_train)

                    # PLOT FOR 3 ATTRIBUTES
                    elif n_feat == 3:
                        from plot_functions import plot_db_3d
                        plot_db_3d(x_train,y_train.NumType,theta[1],title='Training set')
                        plot_db_3d(x_test,y_test.NumType,theta[1],title='Test set')
                        name = '%s_%s_%s'%(opt.opdict['feat_list'][0],opt.opdict['feat_list'][1],opt.opdict['feat_list'][2])

                    if opt.opdict['save_sep']:
                        savename = '%s/CL_sep_%s.png'%(save_dir,name)
                        print "Figure saved in %s"%savename
                        plt.savefig(savename)

                    if opt.opdict['plot_sep']:
                        plt.show()
                    else:
                        plt.close()

            # WRITE RESULTS INTO A DICTIONARY
            subsubdic['%'] = pourcentages
            # Translate the numeric labels back into class names
            trad_CLASS_test = []
            for i in CLASS_test:
                i = int(i)
                trad_CLASS_test.append(opt.types[i])
            subsubdic['classification'] = trad_CLASS_test
            if opt.opdict['probas']:
                subsubdic['proba'] = out['probas']
            if opt.opdict['plot_var']:
                subsubdic['out'] = out
            subdic[b] = subsubdic

        # Variability of the results over all draws (2 features, 2 classes)
        if opt.opdict['plot_var'] and opt.opdict['method'] in ['lr','svm','lrsk'] and n_feat==2 and len(opt.opdict['types'])==2:
            from plot_2features import plot_2f_variability
            plot_2f_variability(subdic,x_train,y_train,x_test,y_test)
            plt.savefig('%s/%s_variability_pas.png'%(opt.opdict['fig_path'],opt.opdict['method'].upper()))
            plt.show()

        dic_results[opt.trad[isc]] = subdic

    # Description of the run, stored next to the results
    dic_results['header'] = {}
    dic_results['header']['features'] = opt.opdict['feat_list']
    dic_results['header']['types'] = opt.opdict['types']
    dic_results['header']['catalog'] = opt.opdict['label_test']

    if opt.opdict['method'] in ['lr','lrsk','svm','svm_nl']:
        print "Save results in file %s"%opt.opdict['result_path']
        write_binary_file(opt.opdict['result_path'],dic_results)

    # Store the draws so that a later run can reuse them.
    # NOTE(review): TRAIN_Y is only bound in the single-station branch at
    # the top of the main loop; it may be unbound here otherwise.
    if 'train_file' in sorted(opt.opdict):
        if not os.path.exists(opt.opdict['train_file']) and opt.opdict['boot'] > 1:
            write_binary_file(opt.opdict['train_file'],TRAIN_Y)