def red_wine_quality():
    """Cluster the red-wine-quality data with k-means and report the outcome.

    Reads ``data/red-wine-quality/full.csv``, bins ``quality`` with the edges
    ``[2, 5.5, 8]`` into 'bad'/'good', label-encodes the result, holds out
    20% of the rows (``random_state=1``) and scales both splits with
    ``scale_data``.  A 2-cluster KMeans is fitted on the training split; the
    fit's process time (ms) and the accuracy score on the test split are
    printed, then ``plot_clusters`` is called for the test split.

    NOTE(review): the predicted cluster ids are scored directly against the
    encoded labels; nothing here aligns cluster ids with labels -- confirm
    this is the intended metric.
    """
    wine = pd.read_csv('data/red-wine-quality/full.csv')

    # Collapse the numeric quality score into two classes, then encode them
    # as integers.
    binned_quality = pd.cut(
        wine['quality'],
        bins=[2, 5.5, 8],
        labels=['bad', 'good'],
    )
    wine['quality'] = LabelEncoder().fit_transform(binned_quality)

    features = wine.drop(columns='quality').values
    target = wine['quality']

    # Hold out a test split; the training labels are never used.
    train_x, test_x, _, test_y = train_test_split(
        features, target, test_size=0.2, random_state=1)
    train_x, test_x = scale_data(train_x, test_x)

    # Only the model fit is timed.
    fit_started = time.process_time()
    n_clusters = 2
    model = KMeans(n_clusters=n_clusters).fit(train_x)
    elapsed = time.process_time() - fit_started
    print('RWQ [k-means] time (ms):', elapsed * 1000)

    predicted = model.predict(test_x)
    score = accuracy_score(predicted, test_y)
    print("k-means clustering accuracy score: ", score)

    plot_clusters(n_clusters, test_x, model, 'clustering/k-means/red-wine')
def fish_market():
    """Cluster the fish-market data with k-means and report the outcome.

    Reads ``data/fish-market/full.csv``, encodes every ``Species`` value as
    its position in the ``value_counts()`` ordering, holds out 25% of the
    rows (``random_state=1``) and scales both splits with ``scale_data``.
    A 3-cluster KMeans is fitted on the training split; the fit's process
    time (ms) and the accuracy score on the test split are printed, then
    ``plot_clusters`` is called for the test split.

    NOTE(review): the predicted cluster ids are scored directly against the
    species codes; nothing here aligns cluster ids with those codes --
    confirm this is the intended metric.
    """
    fish = pd.read_csv('data/fish-market/full.csv')

    # Species names in value_counts() order; a row's label is the position
    # of its species in this list.
    species_order = fish['Species'].value_counts().index.tolist()
    labels = [species_order.index(name) for name in fish['Species']]
    features = fish.drop(columns='Species').values

    # Hold out a test split; the training labels are never used.
    train_x, test_x, _, test_y = train_test_split(
        features, labels, test_size=0.25, random_state=1)
    train_x, test_x = scale_data(train_x, test_x)

    # Only the model fit is timed.
    fit_started = time.process_time()
    n_clusters = 3
    model = KMeans(n_clusters=n_clusters).fit(train_x)
    elapsed = time.process_time() - fit_started
    print('Fish market [k-means] time (ms):', elapsed * 1000)

    predicted = model.predict(test_x)
    score = accuracy_score(predicted, test_y)
    print("k-means clustering accuracy score: ", score)

    plot_clusters(n_clusters, test_x, model, 'clustering/k-means/fish-market')
def fish_market():
    """Cluster the fish-market data with a Gaussian mixture (EM) and report.

    Reads ``data/fish-market/full.csv``, encodes every ``Species`` value as
    its position in the ``value_counts()`` ordering, holds out 25% of the
    rows (``random_state=1``) and scales both splits with ``scale_data``.
    A 3-component GaussianMixture is fitted on the training split; the fit's
    process time (ms) and the accuracy score on the test split are printed,
    then ``plot_clusters`` is called for the test split.

    NOTE(review): the predicted component ids are scored directly against
    the species codes; nothing here aligns component ids with those codes --
    confirm this is the intended metric.
    """
    fish = pd.read_csv('data/fish-market/full.csv')

    # Species names in value_counts() order; a row's label is the position
    # of its species in this list.
    species_order = fish['Species'].value_counts().index.tolist()
    labels = [species_order.index(name) for name in fish['Species']]
    features = fish.drop(columns='Species').values

    # Hold out a test split; the training labels are never used.
    train_x, test_x, _, test_y = train_test_split(
        features, labels, test_size=0.25, random_state=1)
    train_x, test_x = scale_data(train_x, test_x)

    # Only the model fit is timed.
    fit_started = time.process_time()
    n_components = 3
    mixture = GaussianMixture(n_components=n_components).fit(train_x)
    elapsed = time.process_time() - fit_started
    print('Fish market [EM] time (ms):', elapsed * 1000)

    predicted = mixture.predict(test_x)
    score = accuracy_score(predicted, test_y)
    print("EM clustering accuracy score: ", score)

    plot_clusters(n_components, test_x, mixture, 'clustering/em/fish-market')
def red_wine_quality(method='km'):
    """Run PCA followed by clustering on the red-wine-quality data.

    Reads ``data/red-wine-quality/full.csv``, bins ``quality`` with the edges
    ``[2, 5.5, 8]`` into 'bad'/'good', label-encodes it, projects the
    features with PCA, holds out 20% of the rows (``random_state=1``),
    scales both splits with ``scale_data`` and fits a 2-cluster model on the
    training split.  Prints the process time (ms) and the accuracy score on
    the test split, then calls ``plot_clusters`` for the test split.

    Args:
        method: ``'km'`` for PCA (4 components) + KMeans, or ``'em'`` for
            PCA (3 components) + GaussianMixture.  Any other value prints
            ``Invalid method: <value>`` and returns without reading data.

    NOTE(review): the predicted cluster ids are scored directly against the
    encoded labels; nothing here aligns cluster ids with labels -- confirm
    this is the intended metric.
    """
    # Everything that differs between the two pipelines is chosen here; the
    # steps below are shared.
    if method == 'km':
        n_pca_components = 4
        time_label = 'RWQ [PCA + k-means] time (ms):'
        score_label = "k-means clustering accuracy score: "
        plot_path = 'dimen-reduction/pca/red-wine-k-means'
    elif method == 'em':
        n_pca_components = 3
        time_label = 'RWQ [PCA + EM] time (ms):'
        score_label = "EM clustering accuracy score: "
        plot_path = 'dimen-reduction/pca/red-wine-em'
    else:
        print('Invalid method: {}'.format(method))
        return

    # Read in Red Wine Quality
    red_wine_data = pd.read_csv('data/red-wine-quality/full.csv')

    # Label encode/transform
    red_wine_data['quality'] = pd.cut(
        red_wine_data['quality'], bins=[2, 5.5, 8], labels=['bad', 'good'])
    red_wine_data['quality'] = LabelEncoder().fit_transform(
        red_wine_data['quality'])

    # Group x, y
    x = red_wine_data.drop('quality', axis=1).values
    y = red_wine_data['quality']

    create_start = time.process_time()

    # PCA
    x = PCA(n_components=n_pca_components).fit_transform(x)

    # The split/scale step is timed separately so it can be subtracted from
    # the reported run time (PCA + model fit only).
    test_train_start = time.process_time()
    x_train, x_test, _, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)
    test_train_time = time.process_time() - test_train_start

    # Clustering model
    k = 2
    if method == 'km':
        model = KMeans(n_clusters=k)
    else:
        model = GaussianMixture(n_components=k)
    model.fit(x_train)

    run_time = time.process_time() - create_start - test_train_time
    print(time_label, run_time * 1000)

    score = accuracy_score(y_test, model.predict(x_test))
    print(score_label, score)

    # Plot
    plot_clusters(k, x_test, model, plot_path)
def fish_market(method='km'):
    """Run PCA followed by clustering on the fish-market data.

    Reads ``data/fish-market/full.csv``, encodes every ``Species`` value as
    its position in the ``value_counts()`` ordering, projects the features
    with PCA, holds out 25% of the rows (``random_state=1``), scales both
    splits with ``scale_data`` and fits a 3-cluster model on the training
    split.  Prints the process time (ms) and the accuracy score on the test
    split, then calls ``plot_clusters`` for the test split.

    Args:
        method: ``'km'`` for PCA (6 components) + KMeans, or ``'em'`` for
            PCA (3 components) + GaussianMixture.  Any other value prints
            ``Invalid method: <value>`` and returns without reading data.

    NOTE(review): the predicted cluster ids are scored directly against the
    species codes; nothing here aligns cluster ids with those codes --
    confirm this is the intended metric.
    """
    # Everything that differs between the two pipelines is chosen here; the
    # steps below are shared.
    if method == 'km':
        n_pca_components = 6
        time_label = 'Fish market [PCA + k-means] time (ms):'
        score_label = "k-means clustering accuracy score: "
        plot_path = 'dimen-reduction/pca/fish-market-k-means'
    elif method == 'em':
        n_pca_components = 3
        # The EM run reports its timing under its own label.
        time_label = 'Fish market [PCA + EM] time (ms):'
        score_label = "EM clustering accuracy score: "
        plot_path = 'dimen-reduction/pca/fish-market-em'
    else:
        print('Invalid method: {}'.format(method))
        return

    # Read in Fish market
    fish_data = pd.read_csv('data/fish-market/full.csv')
    species = fish_data['Species'].value_counts().index.tolist()

    # Group x, y: a row's label is the position of its species in `species`.
    x = fish_data.drop('Species', axis=1).values
    y = [species.index(fish) for fish in fish_data['Species']]

    create_start = time.process_time()

    # PCA
    x = PCA(n_components=n_pca_components).fit_transform(x)

    # The split/scale step is timed separately so it can be subtracted from
    # the reported run time (PCA + model fit only).
    test_train_start = time.process_time()
    x_train, x_test, _, y_test = train_test_split(
        x, y, test_size=0.25, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)
    test_train_time = time.process_time() - test_train_start

    # Clustering model
    k = 3
    if method == 'km':
        model = KMeans(n_clusters=k)
    else:
        model = GaussianMixture(n_components=k)
    model.fit(x_train)

    run_time = time.process_time() - create_start - test_train_time
    print(time_label, run_time * 1000)

    score = accuracy_score(y_test, model.predict(x_test))
    print(score_label, score)

    # Plot
    plot_clusters(k, x_test, model, plot_path)