Example No. 1
import numpy as np
import pandas as pd

# Note: this excerpt starts mid-script; `name`, `comparison_train`, the feature arrays
# (train_features, validate_features, test_features), and the project module
# fit_model_class are defined earlier in the original file.
comparison_validate = np.array(pd.read_csv('PATH/comparison_validate_%s.csv' % (name), header=None))
print(np.sum(comparison_validate))
comparison_test = np.array(pd.read_csv('PATH/comparison_test_%s.csv' % (name), header=None))
print(np.sum(comparison_test))


used_train = np.array(pd.read_csv('PATH/used_train_%s.csv' % (name), header=None))
print(np.sum(used_train))
used_train_coarse = np.array(pd.read_csv('PATH/used_train_coarse_%s.csv' % (name), header=None))
print(np.sum(used_train_coarse))
used_validate = np.array(pd.read_csv('PATH/used_validate_coarse_%s.csv' %(name), header=None))
print(np.sum(used_validate))
used_test = np.array(pd.read_csv('PATH/used_test_coarse_%s.csv' %(name), header=None))
print(np.sum(used_test))

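# Build pairwise-comparison objects for each split. The scaler fitted on the training
# features (train_pairwise_data.StandardScalerObject) is reused below so the coarse,
# validation, and test features are standardized consistently with the training data.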
train_pairwise_data = fit_model_class.pairwise_comparisons(comparison_train, train_features, used_train, {}, standardized = True)
train_coarse_pairwise_data = fit_model_class.pairwise_comparisons(comparison_train, train_features, used_train_coarse, {}, standardized = True, StandardScalarObject = train_pairwise_data.StandardScalerObject)
validate_pairwise_data = fit_model_class.pairwise_comparisons(comparison_validate, validate_features, used_validate, {}, standardized = True, StandardScalarObject = train_pairwise_data.StandardScalerObject)
test_pairwise_data = fit_model_class.pairwise_comparisons(comparison_test, test_features, used_test, {}, standardized = True, StandardScalarObject = train_pairwise_data.StandardScalerObject)

# first number is the l2 regularization strength, second number is the number of hidden nodes
parameters = [(0.001, 400), (0.0001, 50), (0.0001, 200), (0.0001, 400), (0.0001, 600), (0.001, 50), (0.001, 200), (0.001, 600), (0.01, 50), (0.01, 200), (0.01, 400), (0.01, 600)]
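# Note: `idx` is not defined in this excerpt; it is presumably supplied externally,
# e.g. a job-array or command-line index selecting one (c, nodes) grid point.
# A minimal, hypothetical sketch: import sys; idx = int(sys.argv[1])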

c, nodes = parameters[idx]

print('c is', float(c))
print('nodes is', int(nodes))
total_comparisons = np.sum(train_pairwise_data.comparison_data)
total_comparisons = int(total_comparisons)
d, num_items = train_pairwise_data.standardized_features.shape
ct_to_item1 = []
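The reuse of train_pairwise_data.StandardScalerObject above follows the usual scikit-learn pattern: fit a scaler on the training features only, then apply the same transform to every other split. A minimal sketch of that pattern, assuming the attribute is an sklearn.preprocessing.StandardScaler (an assumption about this project's internals) and using hypothetical arrays with items in rows and features in columns:

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
train_features_demo = rng.normal(size=(100, 27))     # hypothetical: 100 items x 27 features
validate_features_demo = rng.normal(size=(40, 27))

scaler = StandardScaler().fit(train_features_demo)   # fit on training data only
train_std = scaler.transform(train_features_demo)
validate_std = scaler.transform(validate_features_demo)  # reuse the training means/scales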
Example No. 2
def threshold_sweep_samples(dim,
                            num_items,
                            num_exp,
                            thresholds,
                            num_samples,
                            rescale=False):
    '''
    Computes the estimation error and the Kendall tau correlation as the threshold varies.
    '''
    threshold_function = 'var'
    threshold_type = 'top'
    results = [[] for i in range(len(thresholds))]
    btl_results = [[] for i in range(len(thresholds))]

    kt_results = [[] for i in range(len(thresholds))]
    btl_kt_results = [[] for i in range(len(thresholds))]
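    # results/kt_results are filled by the first fit below, which uses the true threshold k;
    # btl_results/btl_kt_results come from the second fit with threshold = dim, which is
    # presumably the BTL-style baseline the variable names refer to.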

    for i in range(num_exp):
        embedding_object = embedding_class.embedding(num_items, dim, 0,
                                                     np.sqrt(1 / dim))
        original_U = np.copy(embedding_object.U)
        for idx, k in enumerate(thresholds):
            print('on exp {} and threshold {}'.format(i, k))
            if rescale:
                embedding_object.U = (np.sqrt(dim) / np.sqrt(k)) * original_U
            synthetic_object = synthetic_pair_class.synthetic_pairs(
                embedding_object, {
                    'threshold_function': threshold_function,
                    'threshold_type': threshold_type,
                    'threshold': k,
                    'relative_flag': True
                })
            comparison_data = synthetic_object.get_comparison_matrix(
                num_samples)
            fit_model_instance = fit_model_class.pairwise_comparisons(
                comparison_data, embedding_object.U,
                np.ones((num_items, num_items)), {})
            result = fit_model_instance.fit_threshold(k,
                                                      [1 for i in range(dim)],
                                                      -1, [1000000000],
                                                      threshold_type,
                                                      threshold_function,
                                                      False,
                                                      solver='sag')

            orig_norm = np.linalg.norm(embedding_object.w)
            results[idx].append(
                synthetic_object.get_w_est_error(result[0][1000000000][2][0]) /
                orig_norm)
            kt, my_kt = synthetic_object.get_ranking_error_and_score_rsme(
                result[0][1000000000][2][0])
            kt_results[idx].append(my_kt)

            fit_model_instance = fit_model_class.pairwise_comparisons(
                comparison_data, embedding_object.U,
                np.ones((num_items, num_items)), {})
            result = fit_model_instance.fit_threshold(dim,
                                                      [1 for i in range(dim)],
                                                      -1, [1000000000],
                                                      threshold_type,
                                                      threshold_function,
                                                      False,
                                                      solver='sag')

            orig_norm = np.linalg.norm(embedding_object.w)
            btl_results[idx].append(
                synthetic_object.get_w_est_error(result[0][1000000000][2][0]) /
                orig_norm)
            kt, my_kt = synthetic_object.get_ranking_error_and_score_rsme(
                result[0][1000000000][2][0])
            btl_kt_results[idx].append(my_kt)

    return results, btl_results, kt_results, btl_kt_results
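A hedged usage sketch: the dimension, item count, thresholds, and sample budget below are hypothetical, and the per-threshold outputs are simply averaged over the num_exp repetitions with NumPy.

import numpy as np

thresholds = [1, 2, 4, 8]                        # hypothetical sweep values
results, btl_results, kt_results, btl_kt_results = threshold_sweep_samples(
    dim=8, num_items=100, num_exp=5, thresholds=thresholds, num_samples=50000)

mean_err = [np.mean(r) for r in results]         # relative estimation error per threshold
mean_kt = [np.mean(r) for r in kt_results]       # Kendall tau correlation per threshold
print(dict(zip(thresholds, mean_err)))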
Example No. 3
def sample_sweep(dim, num_items, num_exp, threshold, samples):
    '''
    Computes the estimation error, the Kendall tau correlation, and the pairwise accuracy rates as the number of samples varies.
    '''
    threshold_function = 'var'
    threshold_type = 'top'
    results = [[] for i in range(len(samples))]
    btl_results = [[] for i in range(len(samples))]
    prediction_results = [[] for i in range(len(samples))]

    kt_results = [[] for i in range(len(samples))]
    btl_kt_results = [[] for i in range(len(samples))]
    btl_prediction_results = [[] for i in range(len(samples))]

    embedding_object = embedding_class.embedding(num_items, dim, 0,
                                                 np.sqrt(1 / dim))
    synthetic_object = synthetic_pair_class.synthetic_pairs(
        embedding_object, {
            'threshold_function': threshold_function,
            'threshold_type': threshold_type,
            'threshold': threshold,
            'relative_flag': True
        })

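    # A second embedding with fresh items but the same w (copied just below) is used to
    # evaluate ranking and prediction error on comparisons not seen during fitting.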
    embedding_object_unseen = embedding_class.embedding(
        num_items, dim, 0, np.sqrt(1 / dim))
    embedding_object_unseen.w = np.copy(synthetic_object.w)
    parameters = synthetic_object.get_parameters()
    synthetic_object_unseen = synthetic_pair_class.synthetic_pairs(
        embedding_object_unseen, {
            'threshold_function': threshold_function,
            'threshold_type': threshold_type,
            'threshold': threshold,
            'relative_flag': True
        })
    print('object unseen')
    parameters = synthetic_object_unseen.get_parameters()
    parameters = synthetic_object.get_parameters()
    bound = [[parameters[5] * (1 / np.sqrt(i)) for j in range(10)]
             for i in samples]
    print('samples', parameters[6], parameters[7])

    model_info = {
        'threshold_function': threshold_function,
        'threshold_type': threshold_type,
        'threshold': threshold,
        'relative_flag': True
    }

    for i in range(num_exp):
        for idx, s in enumerate(samples):
            print('on exp {} and samples {}'.format(i, s))

            comparison_data = synthetic_object.get_comparison_matrix(s)
            fit_model_instance = fit_model_class.pairwise_comparisons(
                comparison_data, embedding_object.U,
                np.ones((num_items, num_items)), {})
            result = fit_model_instance.fit_threshold(threshold,
                                                      [1 for i in range(dim)],
                                                      -1, [1000000000],
                                                      threshold_type,
                                                      threshold_function,
                                                      False,
                                                      solver='sag')
            results[idx].append(
                synthetic_object.get_w_est_error(result[0][1000000000][2][0]))
            kt, _ = synthetic_object_unseen.get_ranking_error_and_score_rsme(
                result[0][1000000000][2][0])
            kt_results[idx].append(kt)
            prediction_results[idx].append(
                synthetic_object_unseen.get_prediction_error(
                    result[0][1000000000][2][0], model_info, btl=False))

            fit_model_instance = fit_model_class.pairwise_comparisons(
                comparison_data, embedding_object.U,
                np.ones((num_items, num_items)), {})
            result = fit_model_instance.fit_threshold(dim,
                                                      [1 for i in range(dim)],
                                                      -1, [1000000000],
                                                      threshold_type,
                                                      threshold_function,
                                                      False,
                                                      solver='sag')
            btl_results[idx].append(
                synthetic_object.get_w_est_error(result[0][1000000000][2][0]))
            kt, _ = synthetic_object_unseen.get_ranking_error_and_score_rsme(
                result[0][1000000000][2][0])
            btl_kt_results[idx].append(kt)
            btl_prediction_results[idx].append(
                synthetic_object_unseen.get_prediction_error(
                    result[0][1000000000][2][0], model_info, btl=True))

    return results, btl_results, bound, kt_results, btl_kt_results, prediction_results, btl_prediction_results
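A hedged usage sketch comparing the averaged estimation error against the returned 1/sqrt(m) bound; the sweep values and the matplotlib plotting are illustrative assumptions, not part of the original pipeline.

import numpy as np
import matplotlib.pyplot as plt

samples = [1000, 5000, 10000, 50000]             # hypothetical sample budgets
out = sample_sweep(dim=8, num_items=100, num_exp=5, threshold=4, samples=samples)
results, btl_results, bound = out[0], out[1], out[2]

mean_err = [np.mean(r) for r in results]
mean_bound = [np.mean(b) for b in bound]
plt.loglog(samples, mean_err, marker='o', label='threshold model')
plt.loglog(samples, mean_bound, linestyle='--', label='O(1/sqrt(m)) bound')
plt.xlabel('number of comparisons m')
plt.ylabel('estimation error')
plt.legend()
plt.show()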
Example No. 4
def get_df(name, min_comparisons, confidence = 0):
    usable_features = ['points', 'var_xcoord', 'var_ycoord', 'varcoord_ratio', 'avgline', 'varline', 'boyce', 'lenwid', 'jagged', 'parts', 'hull', 'bbox', 'reock', 'polsby', 'schwartzberg', 'circle_area', 'circle_perim', 'hull_area', 'hull_perim', 'orig_area', 'district_perim', 'corners', 'xvar', 'yvar', 'cornervar_ratio', 'sym_x', 'sym_y']
    df = pd.read_csv('district_data/paired_comparisons.csv')
    features_df = pd.read_csv('district_data/subset_features.csv')

    if name != 'all':
        df = df.loc[df['study'] == name]

    df = df.loc[df['alternate_id_1'] != df['alternate_id_2']]
    districts_in_shiny = set(df.alternate_id_1.unique()).union(set(df.alternate_id_2.unique()))
    districts_in_shiny_item_num_dict = {}
    for idx, district in enumerate(districts_in_shiny):
        districts_in_shiny_item_num_dict[district] = idx
        districts_in_shiny_item_num_dict[idx] = district

    comparisons = np.zeros([len(districts_in_shiny), len(districts_in_shiny)])
    seen_pairs = np.zeros([len(districts_in_shiny), len(districts_in_shiny)])
    features = np.zeros([len(usable_features), len(districts_in_shiny)])

    for index, row in df.iterrows():
        district1 = districts_in_shiny_item_num_dict[row['alternate_id_1']]
        district2 = districts_in_shiny_item_num_dict[row['alternate_id_2']]
        winner = districts_in_shiny_item_num_dict[row['alternate_id_winner']]
        if winner == district1:
            comparisons[district1, district2] += 1
        else:
            comparisons[district2, district1] += 1
        seen_pairs[district1, district2] = 1
        seen_pairs[district2, district1] = 1

    their_compactness_measure = []

    for i in range(len(districts_in_shiny)):
        their_compactness_measure.append(float(features_df.loc[features_df['district'] == districts_in_shiny_item_num_dict[i]]['compactness'].iloc[0]))

    their_ranking = np.argsort(np.array(their_compactness_measure))[::-1]
    print('their ranking', their_ranking)

    for i in range(len(districts_in_shiny)):
        for idx, feat in enumerate(usable_features):
            features[idx, i] = features_df.loc[features_df['district'] == districts_in_shiny_item_num_dict[i]][feat].iloc[0]

    for i in range(len(districts_in_shiny)):
        for j in range(i+1, len(districts_in_shiny)):
            if comparisons[i,j] + comparisons[j,i] < min_comparisons:
                comparisons[i,j] = 0
                comparisons[j,i] = 0
                seen_pairs[i,j] = 0
                seen_pairs[j,i] = 0

    if confidence != 0:
        for i in range(len(districts_in_shiny)):
            for j in range(i+1, len(districts_in_shiny)):
                if comparisons[i,j] + comparisons[j,i] > 0:
                    prob_i_beats_j = comparisons[i,j] / (comparisons[j,i] + comparisons[i,j])
                    if not (prob_i_beats_j > confidence or prob_i_beats_j < 1-confidence): #or comparisons[i,j] + comparisons[j,i] < max_compares:
                        print(i, j, prob_i_beats_j, comparisons[i,j] + comparisons[j,i])
                        comparisons[i,j] = 0
                        comparisons[j,i] = 0
                        seen_pairs[i,j] = 0
                        seen_pairs[j,i] = 0

    pairwise_data = fit_model_class.pairwise_comparisons(comparisons, features, seen_pairs, districts_in_shiny_item_num_dict, standardized = True)
    return pairwise_data, their_ranking, districts_in_shiny_item_num_dict
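A hedged usage sketch; the argument values are hypothetical, and the attributes read from the returned object (comparison_data, standardized_features) are the ones already used in Example No. 1.

pairwise_data, their_ranking, item_dict = get_df('all', min_comparisons=3, confidence=0.9)

d, num_items = pairwise_data.standardized_features.shape
total_votes = int(np.sum(pairwise_data.comparison_data))
print('features x items:', d, num_items, 'usable votes:', total_votes)

# The dict maps in both directions, so item indices can be converted back to district ids.
top_district = item_dict[int(their_ranking[0])]  # district with the largest 'compactness' value
print('top district by their compactness measure:', top_district)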