Esempio n. 1
0
def red_wine_quality():
    """Cluster the red-wine-quality data with k-means and report the results.

    Reads ``data/red-wine-quality/full.csv``, bins the ``quality`` column
    into ``'bad'`` (2, 5.5] and ``'good'`` (5.5, 8], label-encodes it, holds
    out 20 % of the rows (``random_state=1``), passes both splits through
    ``scale_data`` and fits a 2-cluster ``KMeans`` on the training split.

    Prints the CPU time (``time.process_time``) spent constructing and
    fitting the model, in milliseconds, then the accuracy score of the
    predicted cluster indices against the encoded labels of the test split,
    and finally hands the test split and model to ``plot_clusters``.

    Returns:
        None.
    """
    # Read in Red Wine Quality
    red_wine_data = pd.read_csv('data/red-wine-quality/full.csv')

    # Collapse the numeric quality score into two classes, then encode the
    # class names as integers.
    red_wine_data['quality'] = pd.cut(red_wine_data['quality'],
                                      bins=[2, 5.5, 8],
                                      labels=['bad', 'good'])
    red_wine_data['quality'] = LabelEncoder().fit_transform(
        red_wine_data['quality'])

    # Test-train split (the training labels are not needed for clustering)
    x = red_wine_data.drop('quality', axis=1).values
    y = red_wine_data['quality']
    x_train, x_test, _, y_test = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    # Only model construction + fitting is timed.
    create_start = time.process_time()

    # K-Means
    k = 2
    kmeans = KMeans(n_clusters=k).fit(x_train)

    run_time = time.process_time() - create_start
    print('RWQ [k-means] time (ms):', run_time * 1000)

    # accuracy_score is documented as (y_true, y_pred).
    # NOTE(review): the predictions are raw cluster indices compared as-is
    # to the encoded labels; no cluster-to-class matching is done here.
    km_as = accuracy_score(y_test, kmeans.predict(x_test))
    print("k-means clustering accuracy score: ", km_as)

    # Plot
    plot_clusters(k, x_test, kmeans, 'clustering/k-means/red-wine')
Esempio n. 2
0
def fish_market():
    """Cluster the fish-market data with k-means and report the results.

    Reads ``data/fish-market/full.csv``, encodes each row's ``Species`` as
    its position in the list returned by ``value_counts()`` (most frequent
    species first), holds out 25 % of the rows (``random_state=1``), passes
    both splits through ``scale_data`` and fits a 3-cluster ``KMeans`` on
    the training split.

    Prints the CPU time (``time.process_time``) spent constructing and
    fitting the model, in milliseconds, then the accuracy score of the
    predicted cluster indices against the encoded species of the test
    split, and finally hands the test split and model to ``plot_clusters``.

    Returns:
        None.
    """
    # Read in Fish market
    fish_data = pd.read_csv('data/fish-market/full.csv')
    species = fish_data['Species'].value_counts().index.tolist()

    # Test-train split (the training labels are not needed for clustering)
    x = fish_data.drop('Species', axis=1).values
    y = [species.index(fish) for fish in fish_data['Species']]
    x_train, x_test, _, y_test = train_test_split(x,
                                                  y,
                                                  test_size=0.25,
                                                  random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    # Only model construction + fitting is timed.
    create_start = time.process_time()

    # K-Means
    k = 3
    kmeans = KMeans(n_clusters=k).fit(x_train)

    run_time = time.process_time() - create_start
    print('Fish market [k-means] time (ms):', run_time * 1000)

    # accuracy_score is documented as (y_true, y_pred).
    # NOTE(review): the predictions are raw cluster indices compared as-is
    # to the species codes; no cluster-to-class matching is done here.
    km_as = accuracy_score(y_test, kmeans.predict(x_test))
    print("k-means clustering accuracy score: ", km_as)

    # Plot
    plot_clusters(k, x_test, kmeans, 'clustering/k-means/fish-market')
Esempio n. 3
0
def fish_market():
    """Cluster the fish-market data with a Gaussian mixture (EM).

    Reads ``data/fish-market/full.csv``, encodes each row's ``Species`` as
    its position in the list returned by ``value_counts()`` (most frequent
    species first), holds out 25 % of the rows (``random_state=1``), passes
    both splits through ``scale_data`` and fits a 3-component
    ``GaussianMixture`` on the training split.

    Prints the CPU time (``time.process_time``) spent constructing and
    fitting the model, in milliseconds, then the accuracy score of the
    predicted component indices against the encoded species of the test
    split, and finally hands the test split and model to ``plot_clusters``.

    Returns:
        None.
    """
    # Read in Fish market
    fish_data = pd.read_csv('data/fish-market/full.csv')
    species = fish_data['Species'].value_counts().index.tolist()

    # Test-train split (the training labels are not needed for clustering)
    x = fish_data.drop('Species', axis=1).values
    y = [species.index(fish) for fish in fish_data['Species']]
    x_train, x_test, _, y_test = train_test_split(x,
                                                  y,
                                                  test_size=0.25,
                                                  random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    # Only model construction + fitting is timed.
    create_start = time.process_time()

    # EM
    k = 3
    em = GaussianMixture(n_components=k).fit(x_train)

    run_time = time.process_time() - create_start
    print('Fish market [EM] time (ms):', run_time * 1000)

    # accuracy_score is documented as (y_true, y_pred).
    # NOTE(review): the predictions are raw component indices compared
    # as-is to the species codes; no component-to-class matching is done.
    em_as = accuracy_score(y_test, em.predict(x_test))
    print("EM clustering accuracy score: ", em_as)

    # Plot
    plot_clusters(k, x_test, em, 'clustering/em/fish-market')
Esempio n. 4
0
def red_wine_quality(method='km'):
    """Reduce the red-wine-quality data with PCA, then cluster it.

    Reads ``data/red-wine-quality/full.csv``, bins ``quality`` into
    ``'bad'`` (2, 5.5] and ``'good'`` (5.5, 8] and label-encodes it. The
    feature matrix is projected with PCA (fitted on all rows, before the
    split), 20 % of the rows are held out (``random_state=1``), both splits
    go through ``scale_data`` and a 2-cluster model is fitted on the
    training split.

    Prints the CPU time (``time.process_time``) of PCA plus model fitting
    in milliseconds -- the split/scale step is measured separately and
    subtracted -- then the accuracy score of the predicted cluster indices
    against the encoded labels of the test split, and finally hands the
    test split and model to ``plot_clusters``.

    Args:
        method: ``'km'`` for 4 PCA components + ``KMeans``, or ``'em'`` for
            3 PCA components + ``GaussianMixture``. Any other value prints
            ``Invalid method: <value>`` and does nothing else.

    Returns:
        None.
    """
    # Resolve the per-method settings first so an invalid method is
    # rejected before any file is read.
    if method == 'km':
        n_components = 4
        algo_name = 'k-means'
        plot_path = 'dimen-reduction/pca/red-wine-k-means'
    elif method == 'em':
        n_components = 3
        algo_name = 'EM'
        plot_path = 'dimen-reduction/pca/red-wine-em'
    else:
        print('Invalid method: {}'.format(method))
        return

    # Read in Red Wine Quality
    red_wine_data = pd.read_csv('data/red-wine-quality/full.csv')

    # Collapse the numeric quality score into two classes, then encode the
    # class names as integers.
    red_wine_data['quality'] = pd.cut(red_wine_data['quality'],
                                      bins=[2, 5.5, 8],
                                      labels=['bad', 'good'])
    red_wine_data['quality'] = LabelEncoder().fit_transform(
        red_wine_data['quality'])

    # Group x, y
    x = red_wine_data.drop('quality', axis=1).values
    y = red_wine_data['quality']

    create_start = time.process_time()

    # PCA
    x = PCA(n_components=n_components).fit_transform(x)

    # The split/scale step sits between the two timed stages; time it on
    # its own so it can be subtracted from the reported figure.
    test_train_start = time.process_time()
    x_train, x_test, _, y_test = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=1)
    x_train, x_test = scale_data(x_train, x_test)
    test_train_time = time.process_time() - test_train_start

    # Clustering
    k = 2
    if method == 'km':
        model = KMeans(n_clusters=k).fit(x_train)
    else:
        model = GaussianMixture(n_components=k).fit(x_train)

    run_time = time.process_time() - create_start - test_train_time
    print('RWQ [PCA + {}] time (ms):'.format(algo_name), run_time * 1000)

    # accuracy_score is documented as (y_true, y_pred).
    # NOTE(review): the predictions are raw cluster indices compared as-is
    # to the encoded labels; no cluster-to-class matching is done here.
    score = accuracy_score(y_test, model.predict(x_test))
    print('{} clustering accuracy score: '.format(algo_name), score)

    # Plot
    plot_clusters(k, x_test, model, plot_path)
Esempio n. 5
0
def fish_market(method='km'):
    """Reduce the fish-market data with PCA, then cluster it.

    Reads ``data/fish-market/full.csv`` and encodes each row's ``Species``
    as its position in the list returned by ``value_counts()`` (most
    frequent species first). The feature matrix is projected with PCA
    (fitted on all rows, before the split), 25 % of the rows are held out
    (``random_state=1``), both splits go through ``scale_data`` and a
    3-cluster model is fitted on the training split.

    Prints the CPU time (``time.process_time``) of PCA plus model fitting
    in milliseconds -- the split/scale step is measured separately and
    subtracted -- then the accuracy score of the predicted cluster indices
    against the encoded species of the test split, and finally hands the
    test split and model to ``plot_clusters``.

    Args:
        method: ``'km'`` for 6 PCA components + ``KMeans``, or ``'em'`` for
            3 PCA components + ``GaussianMixture``. Any other value prints
            ``Invalid method: <value>`` and does nothing else.

    Returns:
        None.
    """
    # Resolve the per-method settings first so an invalid method is
    # rejected before any file is read.
    if method == 'km':
        n_components = 6
        algo_name = 'k-means'
        plot_path = 'dimen-reduction/pca/fish-market-k-means'
    elif method == 'em':
        n_components = 3
        # The EM timing line used to be printed under the k-means label.
        algo_name = 'EM'
        plot_path = 'dimen-reduction/pca/fish-market-em'
    else:
        print('Invalid method: {}'.format(method))
        return

    # Read in Fish market
    fish_data = pd.read_csv('data/fish-market/full.csv')
    species = fish_data['Species'].value_counts().index.tolist()

    # Group x, y
    x = fish_data.drop('Species', axis=1).values
    y = [species.index(fish) for fish in fish_data['Species']]

    create_start = time.process_time()

    # PCA
    x = PCA(n_components=n_components).fit_transform(x)

    # The split/scale step sits between the two timed stages; time it on
    # its own so it can be subtracted from the reported figure.
    test_train_start = time.process_time()
    x_train, x_test, _, y_test = train_test_split(x,
                                                  y,
                                                  test_size=0.25,
                                                  random_state=1)
    x_train, x_test = scale_data(x_train, x_test)
    test_train_time = time.process_time() - test_train_start

    # Clustering
    k = 3
    if method == 'km':
        model = KMeans(n_clusters=k).fit(x_train)
    else:
        model = GaussianMixture(n_components=k).fit(x_train)

    run_time = time.process_time() - create_start - test_train_time
    print('Fish market [PCA + {}] time (ms):'.format(algo_name),
          run_time * 1000)

    # accuracy_score is documented as (y_true, y_pred).
    # NOTE(review): the predictions are raw cluster indices compared as-is
    # to the species codes; no cluster-to-class matching is done here.
    score = accuracy_score(y_test, model.predict(x_test))
    print('{} clustering accuracy score: '.format(algo_name), score)

    # Plot
    plot_clusters(k, x_test, model, plot_path)