Example no. 1
def progressive(system_val):
    global system
    system = system_val    
    if configs.strategy == 'progressive':
        configs.show_actual_lc = False
    
    if configs.plot is True or configs.plot_real_cost is True:
        plot.curr_system = system_val
    data = load_data()
    perf_values = load_perf_values()
    data[data == 'Y'] = 1
    data[data == 'N'] = 0
    data = data.astype(bool)
    repeat = configs.repeat
    if configs.th == 2 or configs.th == 3: 
        total_range = range((details_map[system][1]//10)//configs.th)
    else:
        total_range = range(int((details_map[system][1]//10)/configs.th))
    results = np.empty((len(total_range),repeat))
    data_list = []
    for j in range(repeat):
        for i in total_range:
            np.random.seed(j)
            if configs.fix_test_set is True:
                test_set_indices = np.random.choice(data.shape[0],details_map[system][1] // configs.fix_test_ratio,replace=False)
            curr_size = 10*(i+1)
            if configs.fix_test_set is True:
                train_opt_indices = set(range(data.shape[0])) - set(test_set_indices)
                training_set_indices = np.random.choice(np.array(list(train_opt_indices)),curr_size,replace=False)
            else:
                training_set_indices = np.random.choice(data.shape[0],curr_size,replace=False)
                
            diff_indices = set(range(data.shape[0])) - set(training_set_indices)
            training_set = data[training_set_indices]
            if configs.fix_test_set is not True:
                test_set_indices = np.random.choice(np.array(list(diff_indices)),curr_size,replace=False)
            test_set = data[test_set_indices]
            X = training_set
            y = perf_values[training_set_indices]
            if configs.model == 'cart':
                built_tree = cart(X, y)
                out = predict(built_tree, test_set, perf_values[test_set_indices])
            else:
                clf = SVR(C=1.0, epsilon=0.2)
                clf.fit(X, y)
                out = predict(clf, test_set, perf_values[test_set_indices])
            results[i][j] = calc_accuracy(out,perf_values[test_set_indices])
        if print_detail is True:    
            print('['+system+']' + " iteration :"+str(j+1))
    print()
    out_file = open(os.path.join(base_dir_out,system)+"_out_"+configs.strategy+"_"+str(configs.model),'w')
    out_file.truncate()
    cost_prev = sys.maxsize
    size_prev = 0
    acc_prev = 0
    opt_cost = 0
    cost_list = []
    if configs.show_actual_lc is True:
        local_xdata = []
        local_ydata = []
    opt_found = False
    for i in range(results.shape[0]):
        size = (i+1)*10
        error = mean(results[i])
        out_file.write(str(size)+","+ str(error))
        out_file.write('\n')
        if configs.plot is True or configs.plot_real_cost is True:
            plot.x_data_prog.append(size)
            if configs.show_actual_lc is True:
                local_xdata.append(size)
            if configs.plot_real_cost is False:
                if error > 100:
                    # clamp: plotted accuracy floors at 0 when error exceeds 100
                    plot.y_data_prog.append(0)
                    if configs.show_actual_lc is True:
                        local_ydata.append(0)
                else:
                    plot.y_data_prog.append(100-error)
                    if configs.show_actual_lc is True:
                        local_ydata.append(100-error)
                        
        if configs.calc_prog_opt is True:
            R = configs.r
            S = configs.details_map[system][1]//3    
            cost_curr = cost_eqn(configs.th,size,error,S,R)
            cost_list.append(cost_curr)
            if configs.plot_real_cost is True:
                plot.y_data_prog.append(cost_curr)
            if cost_curr > cost_prev and opt_found is False:
                plot.opt_size = size_prev
                plot.opt_accu = acc_prev
                plot.opt_cost = cost_prev
                opt_cost = cost_prev
                opt_found = True
            else:
                cost_prev = cost_curr
                size_prev = size
                acc_prev = 100-error    
        if configs.show_actual_lc is True:
            data_list.append((local_xdata,local_ydata))        
    out_file.close()
    real_cost = min(cost_list)
    plot.real_min_cost = real_cost
    if configs.print_detail is True:
        print('Accuracy at optimal:',acc_prev)
    if configs.plot is True and configs.strategy == 'progressive':
        plot.plot_now()   
    return data_list,opt_cost,real_cost
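
For readers isolating the sampling logic in progressive, the sketch below reproduces the per-repetition index selection with NumPy only: a seed is fixed, a held-out test set is drawn first (as when configs.fix_test_set is enabled), and a disjoint training set of growing size is drawn from the remaining rows. The row count, test ratio, and the split_indices helper are hypothetical illustrations, not part of the code above.

import numpy as np

def split_indices(n_rows, train_size, seed, fix_test_ratio=5):
    """Sketch of the index selection above: draw a fixed test set first,
    then a disjoint training set of the requested size."""
    np.random.seed(seed)
    test_idx = np.random.choice(n_rows, n_rows // fix_test_ratio, replace=False)
    remaining = np.array(list(set(range(n_rows)) - set(test_idx)))
    train_idx = np.random.choice(remaining, train_size, replace=False)
    return train_idx, test_idx

# Hypothetical usage: 1000 configurations, training sets of size 10, 20, 30.
for step in range(3):
    train_idx, test_idx = split_indices(1000, 10 * (step + 1), seed=0)
    assert not set(train_idx) & set(test_idx)  # disjoint by construction
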
Example no. 2
def sample(system):
    configs.extend_lambda = False
    data_train = load_data(True)
    perf_values_train = load_perf_values(True)
    data_test = load_data(False)
    perf_values_test = load_perf_values(False)
    
    data_train[data_train == 'Y'] = 1
    data_train[data_train == 'N'] = 0
    data_train = data_train.astype(bool)    
    
    data_test[data_test == 'Y'] = 1
    data_test[data_test == 'N'] = 0
    data_test = data_test.astype(bool)
    
    repeat = configs.repeat
    if print_detail is True:
        print('Size of '+str(system)+' '+str(configs.tway)+'-way sample is: '+str(data_train.shape[0]))
    corr_list = []
    
    for s in range(repeat):
        if print_detail is True:
            print('Iteration',s)
        results = dict()
        j = random.randint(1,30*100100)
        if configs.fix_test_set is True:
            test_set_indices = np.random.choice(data_test.shape[0],configs.details_map[system][1] // configs.fix_test_ratio,replace=False)
        for curr_size in range(1, data_train.shape[0] + 1):
            np.random.seed(j)
            training_set_indices = np.random.choice(data_train.shape[0],curr_size,replace=False)
            training_set = data_train[training_set_indices]
            
            if configs.fix_test_set is not True:
                test_set_indices = np.random.choice(data_test.shape[0],curr_size,replace=False)
            test_set = data_test[test_set_indices]
            
            X = training_set
            y = perf_values_train[training_set_indices]
            
            built_tree = base.cart(X, y)
            out = base.predict(built_tree, test_set, perf_values_test[test_set_indices])
            
            if curr_size in results:
                print('Warning: duplicate training-set size '+str(curr_size)+' encountered')
            else:
                accu = base.calc_accuracy(out,perf_values_test[test_set_indices])
                if accu <= 100:
                    results[curr_size] = accu
        result_in_cluster = base.check_result_cluster(results)        
        if configs.add_origin_to_lambda is True and result_in_cluster is True:
            results[0] = 100
        if configs.transform_lambda is True:
            results = base.transform_lambda_set(results)
        if print_detail is True:    
            print('Size of lambda set: '+ str(len(results)))    
        '''
        Transform the axes and calculate the Pearson correlation with
        each candidate learning curve
        '''
        curve_data = base.transform_axes(base.smooth(base.dict_to_array(results)))
        parameter_dict = dict()
        correlation_data = dict()
        ''' Keys are the individual candidate curves for a given system. Values are 2-D arrays:
        x holds the transformed "no. of samples" values and y the transformed accuracy at that sample size. '''
        for keys in curve_data:
            slope, intercept, rvalue, pvalue, stderr = sp.stats.linregress(curve_data[keys][configs.ignore_initial:,0],curve_data[keys][configs.ignore_initial:,1])
            if print_detail is True:
                print(keys,intercept,slope)
            value_a = base.get_intercept(intercept,keys)
            value_b = base.get_slope(slope,keys)
            parameter_dict[keys] = {'a' : value_a, 'b':value_b}
            value_r = configs.r
            value_s = configs.details_map[system][1]/3
            optimal_size = base.get_optimal(value_a,value_b,value_r,value_s,keys)
            estimated_error = 100
            weiss_within_range = True
            if keys == 'weiss' and (abs(value_a) + abs(value_b)) > 100:
                weiss_within_range = False
            if optimal_size <= (data_train.shape[0]+data_test.shape[0])//configs.th and optimal_size > 1 and weiss_within_range is True:
                mean_accu,sd = get_projected_accuracy(optimal_size,data_train,perf_values_train,data_test,perf_values_test,test_set_indices)
                r = configs.r
                th = configs.th
                total_cost = base.cost_eqn(th,optimal_size, 100-float(mean_accu), configs.details_map[system][1] // 3, r)
                estimated_error = base.get_error_from_curve(value_a,value_b,optimal_size,keys)
                estimated_cost = base.cost_eqn(th,optimal_size,estimated_error,configs.details_map[system][1] // 3, r)
            else:
                mean_accu,sd,total_cost,estimated_cost = (None,None,None,None)
            
            correlation_data[keys] = {'correlation' : rvalue,
                                      'p-value' : str(pvalue),
                                      'optimal sample size' :optimal_size,
                                      'accuracy' :mean_accu,
                                      'estimated accuracy': 100 - estimated_error,
                                      'standard deviation' :sd,
                                      'total cost' :total_cost,
                                      'estimated cost' : estimated_cost,
                                      'a' : value_a,
                                      'b' : value_b,
                                      'lambda size' : len(results)}
        selected_curve = base.select_curve(correlation_data)
        
        if print_detail is True:
            print()
            print('Detailed learning projections:')
            print('<curve-id> : {<details>}')
            print()
            
        for keys in correlation_data:
            if keys in selected_curve:
                correlation_data[keys]['selected'] = True
                if print_detail is True:
                    print(str(keys) +"**:"+str(correlation_data[keys]))
            else:
                correlation_data[keys]['selected'] = False
                if print_detail is True:
                    print(str(keys) +":"+str(correlation_data[keys]))
        if print_detail is True:            
            print("-----------------------------------------------")
            print()
        corr_list.append(correlation_data)
        if configs.plot is True and configs.sense_curve is True:
            plot.curr_system = system
            plot.prog_data.append((results,correlation_data))
        
    if configs.plot is True and configs.sense_curve is True:
        plot.plot_now()
    return base.mean_corr_list(corr_list)
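
The curve-fitting step in sample transforms the lambda set's axes and fits a line with sp.stats.linregress, recovering the curve parameters from the slope and intercept. The sketch below shows this for one common family, a power-law error curve fitted on log-log axes; the actual transforms inside base.transform_axes, base.get_intercept, and base.get_slope are not visible in this excerpt, and the lambda-set values are made up.

import numpy as np
from scipy import stats

# Hypothetical lambda set: training-set size -> observed accuracy (%)
results = {10: 62.0, 20: 71.5, 40: 78.0, 80: 83.5, 160: 87.0, 320: 89.5}

sizes = np.array(sorted(results))
error = 100.0 - np.array([results[n] for n in sizes])   # error rate in %

# Fit error = a * n**b by linear regression on log-transformed axes.
slope, intercept, rvalue, pvalue, stderr = stats.linregress(np.log(sizes), np.log(error))
a, b = np.exp(intercept), slope

print('fitted a=%.2f, b=%.3f, Pearson r=%.3f' % (a, b, rvalue))
print('projected error at n=1000:', a * 1000 ** b)
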
Example no. 3
def projective(system_val):
    if print_detail is True:
        print('System-id : '+system_val)
        print('R value : '+str(configs.r))
        print('th value : '+str(configs.th))
    global system
    if configs.plot is not True:
        configs.show_actual_lc = False
    system = system_val
    data = load_data()
    perf_values = load_perf_values()
    data[data == 'Y'] = 1
    data[data == 'N'] = 0
    data = data.astype(bool)    
    repeat = configs.repeat
    corr_list = []
    if configs.plot is True and configs.show_actual_lc is True:
        real_curve_points = progressive(system_val)
        plot.real_curve_pts = real_curve_points[0]
    for s in range(repeat):
        if print_detail is True:
            print('Running iteration :' +str(s))
        if configs.fix_test_set is True:
            test_set_indices = np.random.choice(data.shape[0],details_map[system][1] // configs.fix_test_ratio,replace=False)
        else:
            test_set_indices = []
        results = dict()
        results = build_data_points(results,repeat, data, perf_values, True,None,test_set_indices)
        
        if print_detail is True:
            print('Size of lambda set: '+ str(len(results)))    
        '''
        Transform the axes and calculate the Pearson correlation with
        each candidate learning curve
        '''
        if configs.smooth is True:
            curve_data = transform_axes(smooth(dict_to_array(results)))
        else:
            curve_data = transform_axes(dict_to_array(results))
        parameter_dict = dict()
        correlation_data = dict()
        ''' Keys are the individual candidate curves for a given system. Values are 2-D arrays:
        x holds the transformed "no. of samples" values and y the transformed accuracy at that sample size. '''
        for keys in curve_data:
            
            slope, intercept, rvalue, pvalue, stderr = sp.stats.linregress(curve_data[keys][configs.ignore_initial:,0],curve_data[keys][configs.ignore_initial:,1])
            value_a = get_intercept(intercept,keys)
            value_b = get_slope(slope,keys)
            parameter_dict[keys] = {'a' : value_a, 'b':value_b}
            value_r = configs.r
            value_s = details_map[system][1]/3
            optimal_size = get_optimal(value_a,value_b,value_r,value_s,keys)
            estimated_error = 100
            weiss_within_range = True
            if keys == 'weiss' and (abs(value_a) + abs(value_b)) > 100:
                weiss_within_range = False
            if optimal_size <= data.shape[0]//configs.th and optimal_size > 1 and weiss_within_range is True:
                mean_accu,sd = get_projected_accuracy(optimal_size,repeat,data,perf_values,test_set_indices)
                r = configs.r
                th = configs.th
                total_cost = cost_eqn(th,optimal_size, 100-float(mean_accu), details_map[system][1] // 3, r)
                estimated_error = get_error_from_curve(value_a,value_b,optimal_size,keys)
                estimated_cost = cost_eqn(th,optimal_size,estimated_error,details_map[system][1] // 3, r)
            else:
                mean_accu,sd,total_cost,estimated_cost,optimal_size = (None,None,None,None,None)
            
            correlation_data[keys] = {'correlation' : rvalue,
                                      'p-value' : str(pvalue),
                                      'optimal sample size' :optimal_size,
                                      'accuracy' :mean_accu,
                                      'estimated accuracy': 100 - estimated_error,
                                      'standard deviation' :sd,
                                      'total cost' :total_cost,
                                      'estimated cost' : estimated_cost,
                                      'a' : value_a,
                                      'b' : value_b,
                                      'stderr' : stderr,
                                      'lambda size' : len(results)}
            
        if configs.curve_selection == 'dynamic':
            selected_curve,results = select_curve_dynamic(correlation_data,data,perf_values,parameter_dict,results,test_set_indices)
        elif configs.curve_selection == 'static':
            selected_curve = select_curve(correlation_data)
        else:
            selected_curve = select_curve_composite(correlation_data)
        if print_detail is True:
            print()
            print('Detailed learning projections:')
            print('<curve-id> : {<details>}')
            print()
            
        for keys in correlation_data:
            if keys in selected_curve:
                correlation_data[keys]['selected'] = True
                if print_detail is True:
                    print(str(keys) +"**:"+str(correlation_data[keys]))
            else:
                correlation_data[keys]['selected'] = False
                if print_detail is True and float(correlation_data[keys]['correlation']) < configs.min_corr:
                    print(str(keys) +":"+str(correlation_data[keys]))
        if print_detail is True:            
            print("-----------------------------------------------")
            print()
        corr_list.append(correlation_data)
        if configs.plot is True and configs.sense_curve is True:
            plot.prog_data.append((results,correlation_data))
            plot.curr_system = system_val
            
    p_value = mean_corr_list(corr_list)
    if configs.plot is True and configs.sense_curve is True:
        plot.plot_now()
    return p_value
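
Both progressive and projective pick the training-set size that minimises a cost returned by cost_eqn(th, size, error, S, R), whose definition is not included in this excerpt. The sketch below illustrates the idea with a hypothetical cost model (a per-sample measurement cost plus a penalty proportional to the projected error); it is a stand-in for illustration, not the cost_eqn used above.

import numpy as np

def projected_error(a, b, n):
    """Projected error (%) at training-set size n for a power-law curve."""
    return a * n ** b

def hypothetical_cost(n, a, b, sample_cost=1.0, error_penalty=50.0):
    """Hypothetical stand-in for cost_eqn(): measuring n samples costs
    sample_cost each, and residual error is penalised linearly."""
    return sample_cost * n + error_penalty * projected_error(a, b, n)

# Scan candidate sizes and keep the minimiser, mirroring the cost scan
# over growing training-set sizes in progressive().
a, b = 120.0, -0.45                     # hypothetical fitted curve parameters
sizes = np.arange(10, 1010, 10)
costs = np.array([hypothetical_cost(n, a, b) for n in sizes])
opt_size = int(sizes[costs.argmin()])
print('optimal sample size:', opt_size, 'minimum cost:', costs.min())
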