def output_graph(self, geography="nyc", filename=None):
    """Output a PDF of a plot showing the grades over time in a certain
    geography, either 'nyc' for the whole city, or each borough
    ('bronx', 'queens', 'brooklyn', 'manhattan', 'staten')."""
    print "Outputting figure...%s" % filename
    sys.stdout.flush()

    # Get counts and percents
    counts, pcts = self.get_grade_counts_by_year(self.cut_to_geography(geography))
    counts.fillna(value=0, inplace=True)
    pcts.fillna(value=0, inplace=True)

    # Create axes and figure
    fig, axes = plt.subplots(2, 1, sharex=False)
    fig.set_size_inches(7, 11)
    fig.subplots_adjust(right=.8)
    percentage_graph(pcts, axes[0])
    bar_graph(counts, axes[1])

    if filename is not None:
        fig.savefig(filename)
    else:
        plt.show()
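# The plotting code above expects two year-indexed DataFrames: raw grade counts
# and per-year percentages, with missing year/grade combinations filled with zero.
# Below is a minimal, self-contained sketch of building those two frames with
# pandas; the column names "year" and "grade" are assumptions for illustration,
# not necessarily what get_grade_counts_by_year uses internally.
import pandas as pd

toy = pd.DataFrame({
    "year":  [2012, 2012, 2012, 2013, 2013],
    "grade": ["A", "B", "A", "A", "C"],
})

# Counts of each grade per year; crosstab fills missing combinations with 0.
toy_counts = pd.crosstab(toy["year"], toy["grade"])

# Per-year percentages: divide each row of counts by its row total.
toy_pcts = toy_counts.div(toy_counts.sum(axis=1), axis=0) * 100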
def select_best_model(training_X, training_Y, data_title=None):
    ##print training_X.shape[0]
    ## split data into 10 interleaved slices (every 10th item)
    x_slices = [training_X[i::10] for i in range(10)]
    y_slices = [training_Y[i::10] for i in range(10)]

    ## slices 2-7 train, slice 8 validates, slices 2-8 retrain, slices 1, 9, 10 test
    train_set_X = numpy.concatenate(x_slices[1:7])
    train_set_Y = numpy.concatenate(y_slices[1:7])
    validate_set_X = x_slices[7]
    validate_set_Y = y_slices[7]
    full_train_set_X = numpy.concatenate(x_slices[1:8])
    full_train_set_Y = numpy.concatenate(y_slices[1:8])
    test_set_X = numpy.concatenate([x_slices[0]] + x_slices[8:])
    test_set_Y = numpy.concatenate([y_slices[0]] + y_slices[8:])

    ## evaluate and find the best alpha for each model
    ## decision tree
    start_time = time.clock()
    dict_alpha0 = optimize_alpha("dc_tree", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha0 = int(argmax(dict_alpha0)[0])
    print "\talpha: " + str(best_alpha0)
    train_time0 = time.clock() - start_time
    start_time = time.clock()
    y_test0 = scikit_dc_tree(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha0)
    predict0 = time.clock() - start_time
    print

    ## knn is stubbed out in this version: placeholder alpha and score
    best_alpha1 = 0.5
    y_test1 = 0.5

    ## log_reg is stubbed out in this version: placeholder validation dict and score
    ## NOTE: the placeholder only has the key "5"; the alpha_X lookups below
    ## ("30", "60", "90") will raise KeyError until real log_reg results are plugged in
    start_time = time.clock()
    dict_alpha2 = {"5": 0.5}
    best_alpha2 = float(argmax(dict_alpha2)[0])
    print "\talpha: " + str(best_alpha2)
    train_time2 = time.clock() - start_time
    start_time = time.clock()
    y_test2 = 0.5
    predict2 = time.clock() - start_time
    print

    ## ensemble classifiers:
    start_time = time.clock()
    dict_alpha3 = optimize_alpha("ran_forest", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha3 = int(argmax(dict_alpha3)[0])
    best_beta3 = int(argmax(dict_alpha3)[1])
    print "\talpha: " + str(best_alpha3) + " beta: " + str(best_beta3)
    train_time3 = time.clock() - start_time
    start_time = time.clock()
    y_test3 = scikit_ran_forest(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha3, beta=best_beta3)
    predict3 = time.clock() - start_time
    print

    start_time = time.clock()
    dict_alpha4 = optimize_alpha("ada_boost", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha4 = int(argmax(dict_alpha4)[0])
    print "\talpha: " + str(best_alpha4)
    train_time4 = time.clock() - start_time
    start_time = time.clock()
    y_test4 = scikit_ada_boost(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha4)
    predict4 = time.clock() - start_time
    print

    """
    SVM extras too slow
    start_time = time.clock()
    dict_alpha2 = optimize_alpha("svm", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha2 = float(argmax(dict_alpha2))
    print "\t" + str(best_alpha2)
    train_time2 = time.clock() - start_time
    start_time = time.clock()
    y_test2 = scikit_onevsrest(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha0)
    predict2 = time.clock() - start_time
    print
    """

    ## consolidate and output the final result
    dict_choices = {"dc_tree": y_test0, "knn": y_test1, "log_reg": y_test2, "ran_forest": y_test3, "ada_boost": y_test4}
    dict_alpha = {"dc_tree": best_alpha0, "knn": best_alpha1, "log_reg": best_alpha2, "ran_forest": best_alpha3, "ada_boost": best_alpha4}
    dict_beta = {"ran_forest": best_beta3}
    print dict_choices
    final_model = argmax(dict_choices)[0]

    ## prep for bar graphs: convert test accuracies to errors (copy so the raw scores survive)
    dict_errors = {}
    for key, value in dict_choices.iteritems():
        dict_errors[key] = 1 - value
    errors = [dict_errors["dc_tree"], dict_errors["log_reg"]]
    traintimes = [train_time0, train_time2]
    predicttimes = [predict0, predict2]

    ## plot error
    plotting.bar_graph(data_title, "decisiontree", "log_reg", [1, 2], errors, maxy=max(errors) * 1.1, ylabel="errors")
    ## plot training time
    plotting.bar_graph(data_title, "decisiontree", "log_reg", [1, 2], traintimes, maxy=max(traintimes) * 1.1, ylabel="training times(s)")
    ## plot prediction time
    plotting.bar_graph(data_title, "decisiontree", "log_reg", [1, 2], predicttimes, maxy=max(predicttimes) * 1.1, ylabel="prediction times(s)")

    # prep for line graphs: the alpha values that were searched (30, 60, 90)
    alpha_X = []
    alpha_X_ints = []
    for i in range(1, 2):
        for factor in range(3, 10, 3):
            alpha = 10 ** i * factor
            if alpha > 1:
                alpha_X.append(str(alpha))
                alpha_X_ints.append(alpha)

    ## convert validation accuracies to errors for each plotted model
    alpha_Y_dc = []
    alpha_Y_log_reg = []
    dict_error0 = {}
    for key, value in dict_alpha0.iteritems():
        dict_error0[key] = 1.0 - value
    dict_error2 = {}
    for key, value in dict_alpha2.iteritems():
        dict_error2[key] = 1.0 - value
    for xvalue in alpha_X:
        alpha_Y_dc.append(dict_error0[xvalue])
        alpha_Y_log_reg.append(dict_error2[xvalue])
    alpha_Y = alpha_Y_dc, alpha_Y_log_reg

    # plot line graph of validation error vs alpha
    plotting.line_graph_alpha_error(data_title, "dc_tree", "log_reg", alpha_X_ints, alpha_Y)
    return final_model, dict_alpha[final_model], best_beta3
def select_best_model(training_X, training_Y, data_title=None):
    ##print training_X.shape[0]
    ## split data into 10 interleaved slices (every 10th item)
    x_slices = [training_X[i::10] for i in range(10)]
    y_slices = [training_Y[i::10] for i in range(10)]

    ##split = training_X.shape[0] * .90
    seed = 1

    ## slices 2-7 train, slice 8 validates, slices 2-8 retrain, slices 1, 9, 10 test
    train_set_X = numpy.concatenate(x_slices[1:7])
    train_set_Y = numpy.concatenate(y_slices[1:7])
    validate_set_X = x_slices[7]
    validate_set_Y = y_slices[7]
    full_train_set_X = numpy.concatenate(x_slices[1:8])
    full_train_set_Y = numpy.concatenate(y_slices[1:8])
    test_set_X = numpy.concatenate([x_slices[0]] + x_slices[8:])
    test_set_Y = numpy.concatenate([y_slices[0]] + y_slices[8:])

    # tables keyed by model name
    dict_choices = defaultdict(float)   # test accuracy per model
    dict_alpha = defaultdict(float)     # best alpha per model
    dict_beta = defaultdict(float)      # best beta per model (ensembles only)
    traintimes = defaultdict(float)
    predicttime = defaultdict(float)

    ## evaluate and find the best alpha for each model
    ## decision tree
    start_time = time.clock()
    dict_alpha0 = optimize_alpha("dc_tree", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha0 = int(argmax(dict_alpha0)[0])
    dict_alpha["dc_tree"] = best_alpha0
    print "\talpha: " + str(best_alpha0)
    traintimes["dc_tree"] = time.clock() - start_time
    start_time = time.clock()
    dict_choices["dc_tree"] = scikit_dc_tree(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha0)
    predicttime["dc_tree"] = time.clock() - start_time
    print

    ## k-nearest neighbors
    start_time = time.clock()
    dict_alpha1 = optimize_alpha("knn", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha1 = int(argmax(dict_alpha1)[0])
    dict_alpha["knn"] = best_alpha1
    print "\talpha: " + str(best_alpha1)
    traintimes["knn"] = time.clock() - start_time
    start_time = time.clock()
    dict_choices["knn"] = scikit_knn_model(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha1)
    predicttime["knn"] = time.clock() - start_time
    print

    """
    logistic regression, disabled in this version
    start_time = time.clock()
    dict_alpha2 = optimize_alpha("log_reg", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha2 = float(argmax(dict_alpha2)[0])
    dict_alpha["log_reg"] = best_alpha2
    print "\talpha: " + str(best_alpha2)
    traintimes["log_reg"] = time.clock() - start_time
    start_time = time.clock()
    dict_choices["log_reg"] = scikit_log_reg(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha2)
    predicttime["log_reg"] = time.clock() - start_time
    print
    """

    ## ensemble classifiers:
    start_time = time.clock()
    dict_alpha3 = optimize_alpha("ran_forest", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha3 = int(argmax(dict_alpha3)[0])
    best_beta3 = int(argmax(dict_alpha3)[1])
    dict_alpha["ran_forest"] = best_alpha3
    dict_beta["ran_forest"] = best_beta3
    print "\talpha: " + str(best_alpha3) + " beta: " + str(best_beta3)
    traintimes["ran_forest"] = time.clock() - start_time
    start_time = time.clock()
    dict_choices["ran_forest"] = scikit_ran_forest(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha3, beta=best_beta3)
    predicttime["ran_forest"] = time.clock() - start_time
    print

    """
    ada_boost, disabled in this version
    start_time = time.clock()
    dict_alpha4 = optimize_alpha("ada_boost", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha4 = int(argmax(dict_alpha4)[0])
    dict_alpha["ada_boost"] = best_alpha4
    print "\talpha: " + str(best_alpha4)
    traintimes["ada_boost"] = time.clock() - start_time
    start_time = time.clock()
    dict_choices["ada_boost"] = scikit_ada_boost(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha4)
    predicttime["ada_boost"] = time.clock() - start_time
    print
    """

    """
    SVM extras too slow
    start_time = time.clock()
    dict_alpha2 = optimize_alpha("svm", train_set_X, train_set_Y, validate_set_X, validate_set_Y)
    best_alpha2 = float(argmax(dict_alpha2))
    print "\t" + str(best_alpha2)
    train_time2 = time.clock() - start_time
    start_time = time.clock()
    y_test2 = scikit_onevsrest(full_train_set_X, full_train_set_Y, test_set_X, test_set_Y, alpha=best_alpha0)
    predict2 = time.clock() - start_time
    print
    """

    ## consolidate and output the final result
    print dict_choices
    final_model = argmax(dict_choices)[0]

    ## prep for bar graphs: convert test accuracies to errors
    dict_errors = {}
    for key, value in dict_choices.iteritems():
        dict_errors[key] = 1 - value
    errors = [dict_errors["dc_tree"], dict_errors["knn"], dict_errors["ran_forest"]]
    traintimes = [traintimes["dc_tree"], traintimes["knn"], traintimes["ran_forest"]]
    predicttimes = [predicttime["dc_tree"], predicttime["knn"], predicttime["ran_forest"]]

    ## plot error
    plotting.bar_graph(data_title, "decisiontree", "knn", "ran_forest", [1, 2, 3], errors, maxy=max(errors) * 1.1, ylabel="errors")
    ## plot training time
    plotting.bar_graph(data_title, "decisiontree", "knn", "ran_forest", [1, 2, 3], traintimes, maxy=max(traintimes) * 1.1, ylabel="training times(s)")
    ## plot prediction time
    plotting.bar_graph(data_title, "decisiontree", "knn", "ran_forest", [1, 2, 3], predicttimes, maxy=max(predicttimes) * 1.1, ylabel="prediction times(s)")

    # prep for line graphs: the alpha values that were searched (3, 6, 9, 30, 60, 90)
    alpha_X = []
    alpha_X_ints = []
    for i in range(0, 2):
        for factor in range(3, 10, 3):
            alpha = 10 ** i * factor
            if alpha > 1:
                alpha_X.append(str(alpha))
                alpha_X_ints.append(alpha)

    ## convert validation accuracies to errors for each plotted model
    alpha_Y_dc = []
    alpha_Y_knn = []
    alpha_Y_ran_forest = []
    dict_error0 = {}
    for key, value in dict_alpha0.iteritems():
        dict_error0[key] = 1.0 - value
    dict_error1 = {}
    for key, value in dict_alpha1.iteritems():
        dict_error1[key] = 1.0 - value
    dict_error3 = {}
    for key, value in dict_alpha3.iteritems():
        dict_error3[key] = 1.0 - value
    for xvalue in alpha_X:
        alpha_Y_dc.append(dict_error0[xvalue])
        alpha_Y_knn.append(dict_error1[xvalue])
        # random forest keys are "alpha,beta"; alpha is scaled by 3 and beta fixed at 50
        alpha_Y_ran_forest.append(dict_error3[str(int(xvalue) * 3) + "," + str(50)])
    alpha_Y = alpha_Y_dc, alpha_Y_knn, alpha_Y_ran_forest

    # plot line graph of validation error vs alpha
    plotting.line_graph_alpha_error(data_title, "dc_tree", "knn", "ran_forest", alpha_X_ints, alpha_Y)
    return final_model, dict_alpha[final_model], best_beta3
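# optimize_alpha is not defined in this file. From its call sites it takes a
# model name plus train/validation arrays and returns a dict mapping each tried
# hyperparameter (as a string key, e.g. "30" or "30,50") to validation accuracy.
# Below is a self-contained sketch of that grid-search pattern for the
# decision-tree case using scikit-learn directly; the alpha grid and the
# reading of alpha as max_depth are assumptions, not the original code.
from sklearn.tree import DecisionTreeClassifier

def optimize_alpha_dc_tree(train_X, train_Y, validate_X, validate_Y,
                           alphas=(3, 6, 9, 30, 60, 90)):
    """Fit one tree per alpha and score it on the hold-out validation set.

    Returns {str(alpha): validation accuracy}, matching the string-keyed
    dicts consumed by select_best_model above.
    """
    scores = {}
    for alpha in alphas:
        model = DecisionTreeClassifier(max_depth=alpha)
        model.fit(train_X, train_Y)
        scores[str(alpha)] = model.score(validate_X, validate_Y)
    return scores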