def prepare_data():
    """Load the election datasets, clean them, and persist the prepared splits.

    Reads the labelled data and the unlabelled prediction features, restores
    the column names mangled on export, runs the preparation pipeline
    (outliers, imputation, type modification, scaling, feature-set selection)
    on all four splits, and saves everything to CSV.
    """
    df = read_data('ElectionsData.csv')
    test_new = read_data('ElectionsData_Pred_Features.csv', index=ID_COLUMN)
    # Restore column names that were mangled (R-style escaping) on export.
    test_new.rename(
        columns={
            'Financial_balance_score_.0.1.': 'Financial_balance_score_(0-1)',
            'X.Of_Household_Income': '%Of_Household_Income',
            'X.Time_invested_in_work': '%Time_invested_in_work',
            'X._satisfaction_financial_policy': '%_satisfaction_financial_policy'
        },
        # BUG FIX: was inplace='True' (a string) — pandas validates this
        # keyword as a bool and raises ValueError for anything else.
        inplace=True
    )
    original_features = df.columns.values
    identify_and_set_feature_type(df)
    identify_and_set_feature_type(test_new)
    train, validate, test = train_validate_test_split(df)
    save_as_csv_original(train, validate, test)
    train, validate, test, test_new = handle_outliers(train, validate, test, test_new)
    train, validate, test, test_new = handle_imputation(train, validate, test, test_new)
    train, validate, test, test_new = handle_type_modification(train, validate, test, test_new)
    train, validate, test, test_new = handle_scaling(train, validate, test, test_new)
    train, validate, test, test_new = handle_right_feature_set(train, validate, test, test_new)
    # train, validate, test = handle_feature_selection(train, validate, test, 15)
    save_features_selected(original_features, train.columns.values)
    save_as_csv(train, validate, test, test_new)
def test_combinations():
    """Score every label combination that covers a majority of predicted votes.

    For combinations of up to 10 labels (labels 3, 4 and 7 excluded), build a
    binary membership vector over the predicted votes; combinations covering
    at least 51% of voters are scored by how well the induced 2-way partition
    separates the feature space (Calinski-Harabasz index).

    Returns:
        list of (combination, score) tuples for the qualifying combinations.
    """
    train, validate, test, test_new = load_prepared_data()
    train_x, train_y = split_label(train)
    labels = sorted(
        list(filter(lambda x: x not in {3, 4, 7}, train_y.unique())))
    test_new_x = read_data('test_new.csv', index=ID_COLUMN)
    test_new_y = read_data(
        'results.csv', index=ID_COLUMN)['PredictVote'].astype('category').cat.codes
    result = []
    for r in range(1, min(len(labels) + 1, 11)):
        print(r)
        for c in itertools.combinations(labels, r):
            y = test_new_y.map(lambda x: 1 if x in c else 0)
            counter = Counter(y)
            # BUG FIX: the original built a full percentage dict and indexed
            # it with [1], raising KeyError whenever no sample belonged to
            # the combination. Counter[1] returns 0 for a missing key, so
            # empty combinations are skipped instead of crashing.
            if counter[1] / len(y) * 100.0 < 51:
                continue
            score = calinski_harabaz_score(test_new_x, y)
            print(c, score)
            result.append((c, score))
    return result
def test_results():
    """Evaluate the data preparation on both pipelines: basic and advanced."""
    # Basic pipeline: start from the raw (original) splits.
    raw_train = read_data('train_original.csv')
    raw_validate = read_data('validate_original.csv')
    raw_test = read_data('test_original.csv')
    raw_train, raw_validate, raw_test = most_basic_preparation(
        raw_train, raw_validate, raw_test)
    basic_train_x, basic_train_y = split_label(raw_train)
    basic_test_x, basic_test_y = split_label(raw_test)
    test_data_preparation(
        basic_train_x, basic_train_y, basic_test_x, basic_test_y, 'Basic')

    # Advanced pipeline: start from the fully prepared splits.
    adv_train = read_data('train.csv')
    adv_test = read_data('test.csv')
    adv_train_x, adv_train_y = split_label(adv_train)
    adv_test_x, adv_test_y = split_label(adv_test)
    test_data_preparation(
        adv_train_x, adv_train_y, adv_test_x, adv_test_y, 'Advanced')
def run_k_means_all_data():
    """Cluster the prediction set with k-means for several cluster counts and
    print each cluster's dominant labels, size percentage and distribution."""
    ID_COLUMN = 'IdentityCard_Num'
    features = read_data('test_new.csv', index=ID_COLUMN)
    targets = read_data('results.csv', index=ID_COLUMN)
    X = features
    y = targets.PredictVote.astype('category').cat.codes
    for k in [6, 9, 10, 11, 12]:
        print(k, '=========')
        model = KMeans(n_clusters=k).fit(X)
        cluster_labels = get_clusters_labels(model, y)
        cluster_sizes = get_clusters_sizes_percent(model)
        cluster_dist = get_clusters_distribution(model, y)
        for cluster_id, labels_in_cluster in cluster_labels.items():
            print('{:>2} {:>6}%'.format(cluster_id, cluster_sizes[cluster_id]),
                  labels_in_cluster)
            print('{:>10}'.format('Percent'), np.array(cluster_dist[cluster_id]))
        print('=========')
def prepare_data():
    """Run the full preparation pipeline on the online election dataset.

    NOTE(review): this redefines prepare_data from earlier in the file, so
    this later definition is the one that takes effect at import time —
    confirm the earlier variant is intentionally dead code.
    """
    df = read_data('ElectionsData.csv', online=True)
    original_features = df.columns.values
    identify_and_set_feature_type(df)
    splits = train_validate_test_split(df)
    save_as_csv_original(*splits)
    # Apply each pipeline stage to the (train, validate, test) triple in turn.
    for stage in (handle_outliers, handle_imputation,
                  handle_type_modification, handle_scaling):
        splits = stage(*splits)
    train, validate, test = handle_feature_selection(*splits, 19)
    save_features_selected(original_features, train.columns.values)
    save_as_csv(train, validate, test)
def load_unprepared_data():
    """Lazily yield the original (unprepared) train, validate and test splits."""
    for filename in ['train_original.csv', 'validate_original.csv',
                     'test_original.csv']:
        yield read_data(filename)
def load_prepared_data():
    """Lazily yield the prepared train, validate and test splits.

    NOTE(review): a later definition in this file reuses this name and
    shadows this one — confirm which variant is intended to be live.
    """
    for filename in ['train.csv', 'validate.csv', 'test.csv']:
        yield read_data(filename)
def load_prepared_data():
    """Yield the prepared train, validate and test splits, then the
    unlabelled prediction set (indexed by its identity-card column)."""
    prepared_files = ('train.csv', 'validate.csv', 'test.csv')
    yield from (read_data(name, index=INDEX_COLUMN) for name in prepared_files)
    yield read_data('test_new.csv', index=ID_COLUMN)