Example #1
0
def prepare_data():
    """Load the raw election data, clean and transform it, and persist the
    resulting train/validate/test/new-test splits to CSV.

    Side effects: reads 'ElectionsData.csv' and
    'ElectionsData_Pred_Features.csv', writes the original splits, the list
    of selected features, and the fully prepared splits via the save_* helpers.
    """
    df = read_data('ElectionsData.csv')
    test_new = read_data('ElectionsData_Pred_Features.csv', index=ID_COLUMN)

    # Align the new test set's mangled column names (R-style escaping of
    # '%', '(', ')' and '-') with the training data's column names.
    test_new.rename(
        columns={
            'Financial_balance_score_.0.1.': 'Financial_balance_score_(0-1)',
            'X.Of_Household_Income': '%Of_Household_Income',
            'X.Time_invested_in_work': '%Time_invested_in_work',
            'X._satisfaction_financial_policy': '%_satisfaction_financial_policy'
        },
        # BUG FIX: was inplace='True' (a string) — pandas requires a real
        # bool here and raises ValueError on anything else.
        inplace=True
    )

    original_features = df.columns.values

    identify_and_set_feature_type(df)
    identify_and_set_feature_type(test_new)

    train, validate, test = train_validate_test_split(df)

    # Keep an untouched copy of the raw splits before any transformation.
    save_as_csv_original(train, validate, test)

    # Cleaning/transformation pipeline; each step returns new frames.
    train, validate, test, test_new = handle_outliers(train, validate, test, test_new)
    train, validate, test, test_new = handle_imputation(train, validate, test, test_new)
    train, validate, test, test_new = handle_type_modification(train, validate, test, test_new)
    train, validate, test, test_new = handle_scaling(train, validate, test, test_new)

    # Restrict all splits to the chosen feature set.
    train, validate, test, test_new = handle_right_feature_set(train, validate, test, test_new)
    # train, validate, test = handle_feature_selection(train, validate, test, 15)

    save_features_selected(original_features, train.columns.values)
    save_as_csv(train, validate, test, test_new)
Example #2
0
def test_combinations():
    """Score every label-subset split of the new test set.

    For each combination of candidate labels (excluding labels 3, 4 and 7),
    build a binary partition of the new test samples (1 = label in the
    combination) and, when the positive side covers at least 51% of the
    samples, record the Calinski-Harabasz score of that partition.

    Returns:
        list of (combination_tuple, score) pairs for qualifying combinations.
    """
    train, validate, test, test_new = load_prepared_data()
    train_x, train_y = split_label(train)
    labels = sorted(
        list(filter(lambda x: x not in {3, 4, 7}, train_y.unique())))

    test_new_x = read_data('test_new.csv', index=ID_COLUMN)
    test_new_y = read_data(
        'results.csv',
        index=ID_COLUMN)['PredictVote'].astype('category').cat.codes

    result = []

    for r in range(1, min(len(labels) + 1, 11)):
        print(r)
        for c in itertools.combinations(labels, r):
            y = test_new_y.map(lambda x: 1 if x in c else 0)
            counter = Counter(y)
            # BUG FIX: the original built a plain dict of percentages and
            # indexed it with [1], raising KeyError when no sample fell in
            # the combination. Counter returns 0 for missing keys, so the
            # percentage can be computed directly and safely.
            if counter[1] / len(y) * 100.0 < 51:
                continue
            score = calinski_harabaz_score(test_new_x, y)
            print(c, score)
            result.append((c, score))

    return result
Example #3
0
def test_results():
    """Evaluate two data-preparation pipelines against the held-out test set.

    First runs the 'Basic' pipeline (most_basic_preparation over the raw
    original splits), then the 'Advanced' pipeline (the fully prepared
    train/test CSVs), reporting each via test_data_preparation.
    """
    # --- Basic pipeline: raw splits with minimal preparation ---
    raw_train = read_data('train_original.csv')
    raw_validate = read_data('validate_original.csv')
    raw_test = read_data('test_original.csv')
    basic_train, basic_validate, basic_test = most_basic_preparation(
        raw_train, raw_validate, raw_test)
    x_train, y_train = split_label(basic_train)
    x_test, y_test = split_label(basic_test)
    test_data_preparation(x_train, y_train, x_test, y_test, 'Basic')

    # --- Advanced pipeline: fully prepared splits ---
    adv_train = read_data('train.csv')
    adv_test = read_data('test.csv')
    x_train, y_train = split_label(adv_train)
    x_test, y_test = split_label(adv_test)
    test_data_preparation(x_train, y_train, x_test, y_test, 'Advanced')
Example #4
0
def run_k_means_all_data():
    """Cluster the new test set with KMeans for several k values and print,
    per cluster, its dominant labels, relative size, and label distribution.
    """
    ID_COLUMN = 'IdentityCard_Num'
    features = read_data('test_new.csv', index=ID_COLUMN)
    results = read_data('results.csv', index=ID_COLUMN)

    X = features
    # Encode the predicted vote as integer category codes.
    y = results.PredictVote.astype('category').cat.codes

    for n_clusters in [6, 9, 10, 11, 12]:
        print(n_clusters, '=========')
        model = KMeans(n_clusters=n_clusters).fit(X)
        cluster_labels = get_clusters_labels(model, y)
        cluster_sizes = get_clusters_sizes_percent(model)
        cluster_dists = get_clusters_distribution(model, y)

        for cluster_id, labels in cluster_labels.items():
            print('{:>2} {:>6}%'.format(cluster_id, cluster_sizes[cluster_id]), labels)
            print('{:>10}'.format('Percent'), np.array(cluster_dists[cluster_id]))

        print('=========')
Example #5
0
def prepare_data():
    """Load the raw election data, run the preparation pipeline on the
    train/validate/test splits, and persist the results to CSV.

    Side effects: reads 'ElectionsData.csv' (online), writes the original
    splits, the selected-feature list, and the prepared splits.
    """
    df = read_data('ElectionsData.csv', online=True)

    feature_names_before = df.columns.values

    identify_and_set_feature_type(df)

    splits = train_validate_test_split(df)

    # Keep an untouched copy of the raw splits before any transformation.
    save_as_csv_original(*splits)

    # Apply each cleaning/transformation step in order; every step takes
    # and returns the (train, validate, test) triple.
    for step in (handle_outliers, handle_imputation,
                 handle_type_modification, handle_scaling):
        splits = step(*splits)
    splits = handle_feature_selection(*splits, 19)

    train, validate, test = splits
    save_features_selected(feature_names_before, train.columns.values)
    save_as_csv(train, validate, test)
Example #6
0
def load_unprepared_data():
    """Lazily yield the raw train, validate and test DataFrames, in that order."""
    for filename in ('train_original.csv', 'validate_original.csv',
                     'test_original.csv'):
        yield read_data(filename)
Example #7
0
def load_prepared_data():
    """Lazily yield the prepared train, validate and test DataFrames, in that order."""
    for filename in ('train.csv', 'validate.csv', 'test.csv'):
        yield read_data(filename)
Example #8
0
def load_prepared_data():
    """Lazily yield the prepared train, validate and test DataFrames followed
    by the new test set, each indexed by its respective index column.
    """
    sources = (
        ('train.csv', INDEX_COLUMN),
        ('validate.csv', INDEX_COLUMN),
        ('test.csv', INDEX_COLUMN),
        # The new test set uses a different index column than the splits.
        ('test_new.csv', ID_COLUMN),
    )
    for filename, index_column in sources:
        yield read_data(filename, index=index_column)