def progressive(system_val):
    """Run the progressive-sampling baseline for one system.

    Repeatedly trains a model (CART or SVR, per ``configs.model``) on training
    sets of size 10, 20, 30, ... and records prediction accuracy, then scans
    the resulting error curve for the size at which the cost equation stops
    decreasing (the "optimal" stopping point).

    Parameters:
        system_val: system identifier (key into ``details_map``); also stored
            in the module-level ``system`` global.

    Returns:
        (data_list, opt_cost, real_cost) where
        data_list  - list of (x, y) learning-curve point lists (only populated
                     when ``configs.show_actual_lc`` is on),
        opt_cost   - cost at the first detected cost-curve minimum,
        real_cost  - true minimum of all computed costs.

    NOTE(review): relies on many module globals (configs, plot, load_data,
    details_map, cart/predict/SVR, print_detail, base_dir_out, strategy, mean);
    raises at ``min(cost_list)`` if ``configs.calc_prog_opt`` is off — presumably
    it is always on for this strategy; confirm with callers.
    """
    global system
    system = system_val
    if configs.strategy == 'progressive':
        configs.show_actual_lc = False
    if configs.plot is True or configs.plot_real_cost is True:
        plot.curr_system = system_val

    # Load the configuration matrix and measured performance values; the raw
    # data encodes options as 'Y'/'N' strings which become a boolean matrix.
    data = load_data()
    perf_values = load_perf_values()
    data[data == 'Y'] = 1
    data[data == 'N'] = 0
    data = data.astype(bool)

    repeat = configs.repeat
    # Number of sample-size steps scales with the system's population size
    # (details_map[system][1]) divided by the threshold parameter.
    if configs.th == 2 or configs.th == 3:
        total_range = range((details_map[system][1] // 10) // configs.th)
    else:
        total_range = range(int((details_map[system][1] // 10) / configs.th))

    results = np.empty((len(total_range), repeat))
    data_list = []

    # Phase 1: for each repetition, evaluate every training-set size.
    for j in range(repeat):
        for i in total_range:
            np.random.seed(j)  # same seed per repetition -> reproducible draws
            if configs.fix_test_set is True:
                test_set_indices = np.random.choice(
                    data.shape[0],
                    details_map[system][1] // configs.fix_test_ratio,
                    replace=False)
            curr_size = 10 * (i + 1)
            if configs.fix_test_set is True:
                # Train only on rows not reserved for the fixed test set.
                train_opt_indices = set(range(data.shape[0])) - set(test_set_indices)
                training_set_indices = np.random.choice(
                    np.array(list(train_opt_indices)), curr_size, replace=False)
            else:
                training_set_indices = np.random.choice(
                    data.shape[0], curr_size, replace=False)
            diff_indices = set(range(data.shape[0])) - set(training_set_indices)
            training_set = data[training_set_indices]
            if configs.fix_test_set is not True:
                # Fresh disjoint test set of the same size as the training set.
                test_set_indices = np.random.choice(
                    np.array(list(diff_indices)), curr_size, replace=False)
            test_set = data[test_set_indices]
            X = training_set
            y = perf_values[training_set_indices]
            # BUG FIX: original compared with `is 'cart'` (identity on a str
            # literal) — implementation-dependent; use equality instead.
            if configs.model == 'cart':
                built_tree = cart(X, y)
                out = predict(built_tree, test_set, perf_values[test_set_indices])
            else:
                clf = SVR(C=1.0, epsilon=0.2)
                clf.fit(X, y)
                out = predict(clf, test_set, perf_values[test_set_indices])
            results[i][j] = calc_accuracy(out, perf_values[test_set_indices])
        if print_detail is True:
            print('[' + system + ']' + " iteration :" + str(j + 1))
            print()

    # Phase 2: average the per-size errors, dump them to a file, and walk the
    # cost curve to find its first local minimum.
    out_file = open(os.path.join(base_dir_out, system) + "_out_" + strategy
                    + "_" + str(configs.model), 'w')
    out_file.truncate()
    cost_prev = sys.maxsize
    size_prev = 0
    acc_prev = 0
    opt_cost = 0
    cost_list = []
    if configs.show_actual_lc is True:
        local_xdata = []
        local_ydata = []
    opt_found = False
    for i in range(results.shape[0]):
        size = (i + 1) * 10
        error = mean(results[i])
        out_file.write(str(size) + "," + str(error))
        out_file.write('\n')
        if configs.plot is True or configs.plot_real_cost is True:
            plot.x_data_prog.append(size)
            if configs.show_actual_lc is True:
                local_xdata.append(size)
            if configs.plot_real_cost is False:
                # Clamp accuracy at 0 when error exceeds 100%.
                if error > 100:
                    plot.y_data_prog.append(100 - 100)
                    if configs.show_actual_lc is True:
                        local_ydata.append(100 - 100)
                else:
                    plot.y_data_prog.append(100 - error)
                    if configs.show_actual_lc is True:
                        local_ydata.append(100 - error)
        if configs.calc_prog_opt is True:
            R = configs.r
            S = configs.details_map[system][1] // 3
            cost_curr = cost_eqn(configs.th, size, error, S, R)
            cost_list.append(cost_curr)
            if configs.plot_real_cost is True:
                plot.y_data_prog.append(cost_curr)
            if cost_curr > cost_prev and opt_found is False:
                # Cost just started rising: previous point was the optimum.
                plot.opt_size = size_prev
                plot.opt_accu = acc_prev
                plot.opt_cost = cost_prev
                opt_cost = cost_prev
                opt_found = True
            else:
                cost_prev = cost_curr
                size_prev = size
                acc_prev = 100 - error
    # BUG FIX: file handle was never closed in the original.
    out_file.close()
    if configs.show_actual_lc is True:
        data_list.append((local_xdata, local_ydata))
    real_cost = min(cost_list)
    plot.real_min_cost = real_cost
    if configs.print_detail is True:
        print('Accuracy at optimal:', acc_prev)
    if configs.plot is True and configs.strategy == 'progressive':
        plot.plot_now()
    return data_list, opt_cost, real_cost
def sample(system):
    """Fit learning-curve models to a t-way sampled training set.

    Builds a "lambda set" of (sample size, accuracy) points by training CART
    models on progressively larger prefixes of the t-way sample, then fits
    each candidate learning-curve family via linear regression in transformed
    axes and projects the optimal sample size and cost for each.

    Parameters:
        system: system identifier (key into ``configs.details_map``).

    Returns:
        Result of ``base.mean_corr_list`` over the per-repetition
        correlation-data dicts (curve-id -> projection details).

    NOTE(review): depends on module globals (configs, plot, base, sp, np,
    random, print_detail, load_data, load_perf_values, get_projected_accuracy).
    """
    configs.extend_lambda = False
    # Separate train/test pools; raw 'Y'/'N' matrices become boolean.
    data_train = load_data(True)
    perf_values_train = load_perf_values(True)
    data_test = load_data(False)
    perf_values_test = load_perf_values(False)
    data_train[data_train == 'Y'] = 1
    data_train[data_train == 'N'] = 0
    data_train = data_train.astype(bool)
    data_test[data_test == 'Y'] = 1
    data_test[data_test == 'N'] = 0
    data_test = data_test.astype(bool)

    repeat = configs.repeat
    if print_detail is True:
        print('Size of ' + str(system) + ' ' + str(configs.tway)
              + '-way sample is: ' + str(data_train.shape[0]))
    corr_list = []
    for s in range(repeat):
        if print_detail is True:
            print('Iteration', s)
        results = dict()
        # One random base seed per repetition; reused for every sample size.
        j = random.randint(1, 30 * 100100)
        if configs.fix_test_set is True:
            test_set_indices = np.random.choice(
                data_test.shape[0],
                configs.details_map[system][1] // configs.fix_test_ratio,
                replace=False)
        # IMPROVED: original hand-rolled this with `i=0; while True: ... break`;
        # an explicit range over 1..N is equivalent and clearer.
        for curr_size in range(1, data_train.shape[0] + 1):
            np.random.seed(j)
            training_set_indices = np.random.choice(
                data_train.shape[0], curr_size, replace=False)
            training_set = data_train[training_set_indices]
            if configs.fix_test_set is not True:
                test_set_indices = np.random.choice(
                    data_test.shape[0], curr_size, replace=False)
            test_set = data_test[test_set_indices]
            X = training_set
            y = perf_values_train[training_set_indices]
            built_tree = base.cart(X, y)
            out = base.predict(built_tree, test_set,
                               perf_values_test[test_set_indices])
            if curr_size in results:
                # Should be unreachable: each curr_size is visited once.
                # (String was split by source mangling; rejoined — confirm.)
                print('%%%%%%%%%%%%%%%%%%%% SHOCK!! &&&&&&&&&&&&&&&&&&&')
            else:
                accu = base.calc_accuracy(out, perf_values_test[test_set_indices])
                if accu <= 100:
                    results[curr_size] = accu

        result_in_cluster = base.check_result_cluster(results)
        if configs.add_origin_to_lambda is True and result_in_cluster is True:
            results[0] = 100  # anchor the curve at (0, 100% error)
        if configs.transform_lambda is True:
            results = base.transform_lambda_set(results)
        if print_detail is True:
            print('Size of lambda set: ' + str(len(results)))

        # Transform the axes and calculate the pearson correlation with each
        # candidate learning curve.
        curve_data = base.transform_axes(base.smooth(base.dict_to_array(results)))
        parameter_dict = dict()
        correlation_data = dict()
        # keys: individual curve families. Values: 2-d arrays with
        # x = transformed sample counts, y = transformed accuracy.
        for keys in curve_data:
            slope, intercept, rvalue, pvalue, stderr = sp.stats.linregress(
                curve_data[keys][configs.ignore_initial:, 0],
                curve_data[keys][configs.ignore_initial:, 1])
            if print_detail is True:
                print(keys, intercept, slope)
            value_a = base.get_intercept(intercept, keys)
            value_b = base.get_slope(slope, keys)
            parameter_dict[keys] = {'a': value_a, 'b': value_b}
            value_r = configs.r
            value_s = configs.details_map[system][1] / 3
            optimal_size = base.get_optimal(value_a, value_b, value_r, value_s, keys)
            estimated_error = 100
            # Weiss curve parameters outside +/-100 are treated as divergent.
            weiss_within_range = True
            if keys == 'weiss' and (abs(value_a) + abs(value_b)) > 100:
                weiss_within_range = False
            if (optimal_size <= (data_train.shape[0] + data_test.shape[0]) // configs.th
                    and optimal_size > 1 and weiss_within_range is True):
                mean_accu, sd = get_projected_accuracy(
                    optimal_size, data_train, perf_values_train,
                    data_test, perf_values_test, test_set_indices)
                r = configs.r
                th = configs.th
                total_cost = base.cost_eqn(
                    th, optimal_size, 100 - float(mean_accu),
                    configs.details_map[system][1] // 3, r)
                estimated_error = base.get_error_from_curve(
                    value_a, value_b, optimal_size, keys)
                estimated_cost = base.cost_eqn(
                    th, optimal_size, estimated_error,
                    configs.details_map[system][1] // 3, r)
            else:
                mean_accu, sd, total_cost, estimated_cost = (None, None, None, None)
            correlation_data[keys] = {
                'correlation': rvalue,
                'p-value': str(pvalue),
                'optimal sample size': optimal_size,
                'accuracy': mean_accu,
                'estimated accuracy': 100 - estimated_error,
                'standard deviation': sd,
                'total cost': total_cost,
                'estimated cost': estimated_cost,
                'a': value_a,
                'b': value_b,
                'lambda size': len(results),
            }

        selected_curve = base.select_curve(correlation_data)
        if print_detail is True:
            print()
            print('Detailed learning projections:')
            print('<curve-id> : {<details>}')
            print()
        for keys in correlation_data:
            if keys in selected_curve:
                correlation_data[keys]['selected'] = True
                if print_detail is True:
                    print(str(keys) + "**:" + str(correlation_data[keys]))
            else:
                correlation_data[keys]['selected'] = False
                if print_detail is True:
                    print(str(keys) + ":" + str(correlation_data[keys]))
        if print_detail is True:
            print("-----------------------------------------------")
            print()
        corr_list.append(correlation_data)
        if configs.plot is True and configs.sense_curve is True:
            plot.curr_system = system
            plot.prog_data.append((results, correlation_data))

    # IMPROVED: original returned the same expression in both branches.
    if configs.plot is True and configs.sense_curve is True:
        plot.plot_now()
    return base.mean_corr_list(corr_list)
def projective(system_val):
    """Run the projective (learning-curve projection) strategy for one system.

    Builds a lambda set of data points via ``build_data_points``, fits each
    candidate learning-curve family by linear regression in transformed axes,
    projects optimal sample size / accuracy / cost per curve, and selects a
    curve per ``configs.curve_selection`` ('dynamic' / 'static' / composite).

    Parameters:
        system_val: system identifier (key into ``details_map``); also stored
            in the module-level ``system`` global.

    Returns:
        ``mean_corr_list(corr_list)`` — aggregated per-curve projection data
        over all repetitions.

    NOTE(review): depends on module globals (configs, plot, np, sp, details_map,
    print_detail, and many helper functions imported at module level).
    """
    if print_detail is True:
        print('System-id : ' + system_val)
        print('R value : ' + str(configs.r))
        print('th value : ' + str(configs.th))
    global system
    if configs.plot is not True:
        configs.show_actual_lc = False
    system = system_val

    # Load and booleanize the configuration matrix.
    data = load_data()
    perf_values = load_perf_values()
    data[data == 'Y'] = 1
    data[data == 'N'] = 0
    data = data.astype(bool)

    repeat = configs.repeat
    corr_list = []
    if configs.plot is True and configs.show_actual_lc is True:
        # Overlay the real (progressive) learning curve on the plot.
        real_curve_points = progressive(system_val)
        plot.real_curve_pts = real_curve_points[0]

    for s in range(repeat):
        if print_detail is True:
            print('Running iteration :' + str(s))
        if configs.fix_test_set is True:
            test_set_indices = np.random.choice(
                data.shape[0],
                details_map[system][1] // configs.fix_test_ratio,
                replace=False)
        else:
            test_set_indices = []
        results = dict()
        results = build_data_points(results, repeat, data, perf_values,
                                    True, None, test_set_indices)
        if print_detail is True:
            print('Size of lambda set: ' + str(len(results)))

        # Transform the axes and calculate the pearson correlation with each
        # candidate learning curve.
        if configs.smooth is True:
            curve_data = transform_axes(smooth(dict_to_array(results)))
        else:
            curve_data = transform_axes(dict_to_array(results))
        parameter_dict = dict()
        correlation_data = dict()
        # keys: individual curve families. Values: 2-d arrays with
        # x = transformed sample counts, y = transformed accuracy.
        for keys in curve_data:
            slope, intercept, rvalue, pvalue, stderr = sp.stats.linregress(
                curve_data[keys][configs.ignore_initial:, 0],
                curve_data[keys][configs.ignore_initial:, 1])
            value_a = get_intercept(intercept, keys)
            value_b = get_slope(slope, keys)
            parameter_dict[keys] = {'a': value_a, 'b': value_b}
            value_r = configs.r
            value_s = details_map[system][1] / 3
            optimal_size = get_optimal(value_a, value_b, value_r, value_s, keys)
            estimated_error = 100
            # Weiss curve parameters outside +/-100 are treated as divergent.
            weiss_within_range = True
            if keys == 'weiss' and (abs(value_a) + abs(value_b)) > 100:
                weiss_within_range = False
            if (optimal_size <= data.shape[0] // configs.th
                    and optimal_size > 1 and weiss_within_range is True):
                mean_accu, sd = get_projected_accuracy(
                    optimal_size, repeat, data, perf_values, test_set_indices)
                r = configs.r
                th = configs.th
                total_cost = cost_eqn(th, optimal_size, 100 - float(mean_accu),
                                      details_map[system][1] // 3, r)
                estimated_error = get_error_from_curve(
                    value_a, value_b, optimal_size, keys)
                estimated_cost = cost_eqn(th, optimal_size, estimated_error,
                                          details_map[system][1] // 3, r)
            else:
                mean_accu, sd, total_cost, estimated_cost, optimal_size = (
                    None, None, None, None, None)
            correlation_data[keys] = {
                'correlation': rvalue,
                'p-value': str(pvalue),
                'optimal sample size': optimal_size,
                'accuracy': mean_accu,
                'estimated accuracy': 100 - estimated_error,
                'standard deviation': sd,
                'total cost': total_cost,
                'estimated cost': estimated_cost,
                'a': value_a,
                'b': value_b,
                'stderr': stderr,
                'lambda size': len(results),
            }

        # Curve selection policy is configurable.
        if configs.curve_selection == 'dynamic':
            selected_curve, results = select_curve_dynamic(
                correlation_data, data, perf_values, parameter_dict,
                results, test_set_indices)
        elif configs.curve_selection == 'static':
            selected_curve = select_curve(correlation_data)
        else:
            selected_curve = select_curve_composite(correlation_data)

        if print_detail is True:
            print()
            print('Detailed learning projections:')
            print('<curve-id> : {<details>}')
            print()
        for keys in correlation_data:
            if keys in selected_curve:
                correlation_data[keys]['selected'] = True
                if print_detail is True:
                    print(str(keys) + "**:" + str(correlation_data[keys]))
            else:
                correlation_data[keys]['selected'] = False
                # NOTE(review): non-selected curves only print when their
                # correlation is BELOW min_corr — looks inverted, but kept
                # as in the original; confirm intent.
                if (print_detail is True
                        and float(correlation_data[keys]['correlation']) < configs.min_corr):
                    print(str(keys) + ":" + str(correlation_data[keys]))
        if print_detail is True:
            print("-----------------------------------------------")
            print()
        corr_list.append(correlation_data)
        if configs.plot is True and configs.sense_curve is True:
            plot.prog_data.append((results, correlation_data))
            plot.curr_system = system_val

    # IMPROVED: p_value was computed identically in both branches of the
    # original conditional; hoist the aggregation out.
    p_value = mean_corr_list(corr_list)
    if configs.plot is True and configs.sense_curve is True:
        plot.plot_now()
    return p_value