Example #1
def ica_experiment(X, name, dims, max_iter=5000, tol=1e-04):
    """Run ICA on the specified dataset and save mean kurtosis and
    reconstruction-loss results as a CSV file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.
        max_iter (int): Maximum number of FastICA iterations.
        tol (float): FastICA convergence tolerance.

    """
    ica = FastICA(random_state=0, max_iter=max_iter, tol=tol)
    kurt = []
    loss = []

    X = StandardScaler().fit_transform(X)
    for dim in dims:
        print(dim)  # progress indicator
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        # kurtosistest returns a z-score per component; averaging gives a
        # rough aggregate measure of non-Gaussianity
        kurt.append(kurtosistest(tmp).statistic.mean())
        # reconstruction error of the low-dimensional projection
        proj = ica.inverse_transform(tmp)
        loss.append(((X - proj)**2).mean())

    res = pd.DataFrame({"kurtosis": kurt, "loss": loss})

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
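For readers trying this snippet outside its original repo, here is a minimal, self-contained sketch of the same kurtosis-scree loop; get_abspath is a project-specific helper, so this version writes to a plain local path, and the Laplace data is purely illustrative:

import numpy as np
import pandas as pd
from scipy.stats import kurtosistest
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = StandardScaler().fit_transform(rng.laplace(size=(500, 10)))

rows = []
for dim in (2, 4, 6, 8):
    ica = FastICA(n_components=dim, random_state=0, max_iter=5000, tol=1e-4)
    S = ica.fit_transform(X)              # estimated sources, shape (n, dim)
    z = kurtosistest(S).statistic.mean()  # mean kurtosis z-score across components
    mse = ((X - ica.inverse_transform(S)) ** 2).mean()
    rows.append({'n': dim, 'kurtosis': z, 'loss': mse})

pd.DataFrame(rows).set_index('n').to_csv('ica_kurtosis_sketch.csv')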
Example #2
    def perform(self):
        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/ICA.py
        self.log("Performing {}".format(self.experiment_name()))

        # %% Data for 1
        ica = FastICA(random_state=self._details.seed)
        kurt = {}
        for dim in self._dims:
            ica.set_params(n_components=dim)
            tmp = ica.fit_transform(self._details.ds.training_x)
            tmp = pd.DataFrame(tmp)
            tmp = tmp.kurt(axis=0)
            kurt[dim] = tmp.abs().mean()

        kurt = pd.Series(kurt)
        kurt.to_csv(self._out.format('{}_scree.csv'.format(self._details.ds_name)))

        # %% Data for 2
        grid = {'ica__n_components': self._dims, 'NN__alpha': self._nn_reg, 'NN__hidden_layer_sizes': self._nn_arch}
        ica = FastICA(random_state=self._details.seed)
        mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed)
        pipe = Pipeline([('ica', ica), ('NN', mlp)], memory=experiments.pipeline_memory)
        gs, final_estimator = self.gs_with_best_estimator(pipe, grid)
        self.log("Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(self._out.format('{}_dim_red.csv'.format(self._details.ds_name)))
        self.log("Done")
Example #3
def run_ica_2(X, dataset):
    model = FastICA(random_state=0)

    result_df = pd.DataFrame()

    k_max = min(X.shape[1], 120)
    for i in range(2, k_max + 1):
        model.set_params(n_components=i)
        # kurtosis of each ICA component
        ica_data = pd.DataFrame(model.fit_transform(X)).kurt(axis=0)
        result_df.loc[i, 'mean_kurtosis'] = ica_data.abs().mean()

    plt.clf()
    plt.title('ICA_Mean_Kurtosis_Per_K')
    plt.xlabel('K')
    plt.ylabel('Mean')
    plt.grid()

    plt.bar(range(2, result_df.shape[0] + 2),
            result_df['mean_kurtosis'],
            align='center',
            label='mean kurtosis')

    LOGGER.info('ica max kurtosis on {}: k={}'.format(
        dataset,
        result_df.idxmax(axis=0)['mean_kurtosis']))

    plt.savefig('plots/' + 'ica_kurt_' + dataset + '.png')
Example #4
def ica_experiment(X, name, dims):
    """Run ICA on the specified dataset and save mean kurtosis results as a
    CSV file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.

    """
    ica = FastICA(random_state=0, max_iter=5000)
    kurt = {}

    for dim in dims:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        df = pd.DataFrame(tmp)
        df = df.kurt(axis=0)
        kurt[dim] = df.abs().mean()

    res = pd.DataFrame.from_dict(kurt, orient='index')
    res.rename(columns={0: 'kurtosis'}, inplace=True)

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
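Usage might look like the following; the data path and the dims grid here are illustrative assumptions, not part of the original snippet:

import numpy as np

X = np.loadtxt('data/winequality.csv', delimiter=',')  # hypothetical path
ica_experiment(X, name='winequality', dims=[2, 4, 6, 8, 10])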
Example #5
def run_credit_ICA(creditX, creditY):
    dims_digits = list(range(1, 24))
    print('Part 2B & 4B - Starting ICA for dataset...credit')
    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dims_digits:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(creditX)
        tmp = pd.DataFrame(tmp)
        tmp2 = tmp.kurt(axis=0)
        kurt[dim] = tmp2.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv('./P2_Dimensionality_Reduction/Credit_ICA_kurtosis.csv')

    # Run Neural Networks
    # Transform X data
    sc = StandardScaler()
    creditX_tr = sc.fit_transform(creditX)

    nn_results = run_NN(dims_digits, ica, creditX_tr, creditY)
    nn_results.to_csv('./P4_Neural_Networks_Reduced/Credit_ICA_nn_results.csv')
Example #6
def main():
    decomp1 = FastICA(random_state=10)
    decomp2 = FastICA(random_state=10)
    for r_dim, c_dim in zip(r_dims, c_dims):
        decomp1.set_params(n_components=r_dim)
        decomp2.set_params(n_components=c_dim)
        run_dim_alg(r_X, r_y, 'reviews', decomp1, r_dim, OUT)
        run_dim_alg(c_X, c_y, 'cancer', decomp2, c_dim, OUT)
Example #7
def run_ica(X, dname, dims):
    ica = FastICA(random_state=5, max_iter=5000)
    kurt = {}
    for dim in dims:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv(out + '{}_ica.csv'.format(dname))
Example #8
def get_gnnl_ica():
    best_ica_components = X_train_gnnl.shape[1]
    ica = FastICA(random_state=42, max_iter=500)
    print("Running ICA for {} components".format(best_ica_components))
    ica.set_params(n_components=best_ica_components)
    ica.fit(X_train_gnnl)
    X_train_gnnl_ica = ica.transform(X_train_gnnl)
    X_test_gnnl_ica = ica.transform(X_test_gnnl)
    X_train_gnnl_ica_df = pd.DataFrame(X_train_gnnl_ica)
    ica_kurt = X_train_gnnl_ica_df.kurt(axis=0)
    # keep only components with very heavy tails (excess kurtosis > 200)
    keep = ica_kurt > 200
    return X_train_gnnl_ica[:, keep], X_test_gnnl_ica[:, keep]
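The final line keeps only the components whose excess kurtosis exceeds 200, i.e. the most heavily tailed ones. A small sketch of that boolean-mask pattern on made-up data:

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
S = rng.standard_t(df=3, size=(200, 5))   # heavy-tailed columns
kurt = pd.DataFrame(S).kurt(axis=0)       # excess kurtosis per column
keep = (kurt > 1.0).values                # threshold is illustrative
print(S[:, keep].shape)                   # only high-kurtosis columns survive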
Example #9
    def perform(self):
        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/ICA.py
        self.log("Performing {}".format(self.experiment_name()))

        # %% Data for 1
        ica = FastICA(random_state=self._details.seed)
        kurt = {}
        for dim in self._dims:
            ica.set_params(n_components=dim)
            # project the training data onto `dim` independent components
            tmp = ica.fit_transform(self._details.ds.training_x)
            tmp = pd.DataFrame(tmp)
            # excess kurtosis of each of the `dim` components
            tmp = tmp.kurt(axis=0)
            # mean absolute kurtosis across components: ICA seeks maximally
            # non-Gaussian directions, and kurtosis far from zero is a simple
            # proxy for non-Gaussianity
            kurt[dim] = tmp.abs().mean()

        kurt = pd.Series(kurt)
        # {dim: mean |kurtosis|} --> e.g. './output/ICA/<ds_name>_scree.csv';
        # downstream plotting reads this CSV and favors the dim with the
        # highest mean kurtosis
        kurt.to_csv(
            self._out.format('{}_scree.csv'.format(self._details.ds_name)))

        # %% Data for 2
        # learn an NN on the transformed features; the grid search jointly
        # picks the number of components and the NN hyperparameters
        grid = {
            'ica__n_components': self._dims,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        ica = FastICA(random_state=self._details.seed)
        mlp = MLPClassifier(
            activation='relu',
            max_iter=2000,
            early_stopping=True,
            random_state=self._details.seed)
        pipe = Pipeline([('ica', ica), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, final_estimator = self.gs_with_best_estimator(pipe, grid)
        self.log("Grid search complete")

        # full cross-validation results for every grid point
        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_dim_red.csv'.format(self._details.ds_name)))
        self.log("Done")
Example #10
def performICA(X, title):
    ica = FastICA(random_state=11, whiten=True)
    kurt = {}
    dims = list(range(2, 14))
    for d in dims:
        ica.set_params(n_components=d)
        tmp = ica.fit_transform(X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[d] = tmp.abs().mean()
    kurt = pd.Series(kurt)
    kurt.plot()
    plt.xlabel("K")
    plt.title("ICA on " + title)
    plt.show()
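Why mean absolute kurtosis serves as the scree statistic throughout these snippets: pandas' .kurt() reports excess kurtosis, roughly 0 for a Gaussian and clearly positive for heavy-tailed sources, so a high mean |kurtosis| suggests ICA recovered strongly non-Gaussian components. A quick check on synthetic data:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
print(pd.Series(rng.normal(size=100_000)).kurt())   # ~0.0 (Gaussian)
print(pd.Series(rng.laplace(size=100_000)).kurt())  # ~3.0 (heavy-tailed)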
Example #11
def ica(X, problem):
    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    ica = FastICA(random_state=5)
    if 'Blood' in problem:
        dims = range(2, len(X[0]))
    kurt = {}
    for dim in dims:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv(out + problem + 'ICA.csv')
Example #12
def ICA_experiment(X, y, title, folder=""):
    n_components_range = list(np.arange(2, X.shape[1], 1))
    ica = ICA(random_state=200)
    kurtosis_scores = []

    for n in n_components_range:
        ica.set_params(n_components=n)
        ica_score = ica.fit_transform(X)
        ica_score = pd.DataFrame(ica_score)
        ica_score = ica_score.kurt(axis=0)
        kurtosis_scores.append(ica_score.abs().mean())

    plt.figure()
    plt.title("ICA Kurtosis: " + title)
    plt.xlabel("Independent Components")
    plt.ylabel("Avg Kurtosis Across IC")
    plt.plot(n_components_range, kurtosis_scores)
    plt.savefig(folder + '/ICA.png')
    plt.close()
Example #13
def run_ICA(X, title):
    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    ica = ICA(random_state=5)
    kurt = []

    for dim in dims:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt.append(tmp.abs().mean())

    plt.figure()
    plt.title("ICA Kurtosis: " + title)
    plt.xlabel("Independent Components")
    plt.ylabel("Avg Kurtosis Across IC")
    plt.plot(dims, kurt, 'b-')
    plt.grid(False)
    plt.show()
Example #14
def run_adult_ICA(adultX, adultY):
    dims_digits = list(range(1, 15))
    print('Part 2B & 4B - Starting ICA for dataset...adult')
    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dims_digits:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(adultX)
        tmp = pd.DataFrame(tmp)
        tmp2 = tmp.kurt(axis=0)
        # mean absolute kurtosis across all components
        kurt[dim] = tmp2.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv('./P2_Dimensionality_Reduction/Adult_ICA_kurtosis.csv')

    # Transform X data
    sc = StandardScaler()
    adultX_tr = sc.fit_transform(adultX)

    nn_results = run_NN(dims_digits, ica, adultX_tr, adultY)
    nn_results.to_csv('./P4_Neural_Networks_Reduced/Adult_ICA_nn_results.csv')
Example #15
def run_ICA(X, y, plot_path):

    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    # max_iter=10 keeps runs fast, but FastICA will stop before converging
    ica = ICA(random_state=1, max_iter=10)
    kurt = []

    for dim in dims:
        print(dim)
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt.append(tmp.abs().mean())

    plt.figure()
    plt.title("ICA Kurtosis")
    plt.xlabel("Independent Components")
    plt.ylabel("Avg Kurtosis Across IC")
    plt.plot(dims, kurt, 'b-')
    plt.grid(False)
    plt.savefig(plot_path + '/ICA_DR')
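With max_iter=10, FastICA will almost always stop before converging and emit a ConvergenceWarning. A short sketch, on assumed synthetic data, of surfacing that warning deliberately rather than ignoring it:

import warnings

import numpy as np
from sklearn.decomposition import FastICA
from sklearn.exceptions import ConvergenceWarning

X = np.random.default_rng(1).laplace(size=(300, 12))
with warnings.catch_warnings():
    warnings.simplefilter('error', ConvergenceWarning)
    try:
        FastICA(n_components=5, random_state=1, max_iter=10).fit(X)
    except ConvergenceWarning:
        print('stopped before convergence; raise max_iter or loosen tol')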
Example #16
def part2():
    ica = FastICA(random_state=5, max_iter=1000, tol=0.75)  # very loose tol so runs finish quickly
    kurt = {}
    for dim in range(1, 31):
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(cancer_x)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv(out + 'cancer part 2.csv')

    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dims_big:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(housing_x)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv(out + 'housing part 2.csv')
Example #17
def run_ICA(X, y, title):

    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    ica = ICA(random_state=randomSeed, whiten=True)
    kurt = []

    for dim in dims:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt.append(tmp.abs().mean())

    plt.figure()
    plt.title("ICA Kurtosis: " + title)
    plt.xlabel("Independent Components")
    plt.ylabel("Avg Kurtosis Across IC")
    plt.plot(dims, kurt, 'b-')
    plt.grid(False)
    d = plotsdir + "/" + title
    if not os.path.exists(d):
        os.makedirs(d)
    plt.savefig(d + "/ICA Kurtosis.png")
Example #18
from helpers.dim_reduction import run_dim_alg, get_data
from helpers.constants import ICA_DIMS

r_dims = c_dims = ICA_DIMS

OUT = '{}/../../OUTPUT/ICA'.format(dir_path)
BASE = '{}/../../OUTPUT/BASE'.format(dir_path)

r, c = get_data(BASE)
r_X, r_y = r
c_X, c_y = c

ica = FastICA(random_state=5)
kurt = {}
for dim in r_dims:
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(r_X)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()

kurt = pd.Series(kurt)
kurt.to_csv('{}/reviews kurtosis.csv'.format(OUT))

ica = FastICA(random_state=5)
kurt = {}
for dim in c_dims:
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(c_X)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()

kurt = pd.Series(kurt)
kurt.to_csv('{}/cancer kurtosis.csv'.format(OUT))
Example #19
#%% Baseline scores
datasets = {}
datasets['Titanic'] = {'X_train': XT_train.copy(), 'y_train': yT_train.copy(),
                       'X_test': XT_test.copy(), 'y_test': yT_test.copy()}
datasets['Wilt'] = {'X_train': XW_train.copy(), 'y_train': yW_train.copy(),
                    'X_test': XW_test.copy(), 'y_test': yW_test.copy()}

clusters = [2, 3, 4, 5, 6, 8, 10, 12, 15, 20, 25, 30, 35, 40, 50]
scores = hlp.explore_clustering(datasets, clusters)

#%% Part 2 ICA

# ICA for Titanic
icaT = FastICA(random_state=54)
dims = [2,3,4,5,6,7,8,9,10]
kurtT = {}
for dim in dims:
    icaT.set_params(n_components=dim)
    trans = icaT.fit_transform(XT)
    proj = icaT.inverse_transform(trans)
    tmp = pd.DataFrame(trans)
    tmp = tmp.kurt(axis=0)
    rec_err = ((XT - proj)**2).mean()
    kurtT[dim] = (round(tmp.abs().mean(), 3), round(tmp.abs().min(), 3),
                  round(rec_err, 3), tmp)
kurtT = pd.Series(kurtT)
kurtT  # examine average and minimum kurtosis per dim
kurtT[7]
kurtT[4]

# sanity check: pandas .kurt() reports *excess* kurtosis, so a Gaussian
# sample should come out near 0
pd.DataFrame(np.random.normal(155, 72, 100000)).kurt(axis=0)

icaW = FastICA(random_state=54)
Example #20
def ulICA(X, y, random_seed, filename, verbose=False):
    n_cols = len(X.columns)

    n_com = range(1, n_cols + 1)
    ica = FastICA(random_state=random_seed)

    kurt_scores = []

    for n in n_com:
        ica.set_params(n_components=n)
        icaX = ica.fit_transform(X)
        icaX = pd.DataFrame(icaX)
        icaX = icaX.kurt(axis=0)
        kurt_scores.append(icaX.abs().mean())

    if verbose:
        print(kurt_scores)
    plt.figure(0)
    plt.xlabel("# of Components", fontsize=16)
    plt.ylabel("Average Kurtosis", fontsize=16)
    plt.title(filename + ' ICA', fontsize=16)
    plt.plot(n_com, kurt_scores, 'b-')
    plt.xticks(range(1, n_cols + 1), fontsize=16)
    plt.yticks(fontsize=16)
    plt.grid(linestyle='-', linewidth=1, axis="x")
    plt.savefig("Images\\" + filename + " ICA Kurtosis")
    plt.show()
    plt.close()

    re = defaultdict(dict)

    for i, n in product(range(50), n_com):
        # ICA reconstruction error; vary the seed so the 50 repeats differ
        ica_n = FastICA(random_state=random_seed + i, n_components=n,
                        max_iter=1000)
        X_Reduced = ica_n.fit_transform(X)
        Recon_X = ica_n.inverse_transform(X_Reduced)
        MSE_RE = metrics.mean_squared_error(X, Recon_X)
        re[n][i] = MSE_RE

    rec = pd.DataFrame(re).T
    re_mean = rec.mean(axis=1).tolist()
    re_std = rec.std(axis=1).tolist()
    lower_axis = []
    upper_axis = []

    zip_object = zip(re_mean, re_std)
    for list1_i, list2_i in zip_object:
        lower_axis.append(list1_i - list2_i)
        upper_axis.append(list1_i + list2_i)

    if verbose:
        print('ICA RE')
        print(re_mean)
        print(re_std)
    fig, ax1 = plt.subplots()
    ax1.plot(n_com, re_mean, 'b-')
    ax1.fill_between(n_com, lower_axis, upper_axis, alpha=0.2)
    ax1.set_xlabel('# of Components', fontsize=16)
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Mean Reconstruction Error', color='b', fontsize=16)
    ax1.tick_params('y', colors='b', labelsize=16)
    ax1.tick_params('x', labelsize=16)
    plt.grid(False)
    plt.title(filename + " ICA Mean Reconstruction Error", fontsize=16)
    fig.tight_layout()
    plt.show()
Example #21
def main():
    out = './BASES/'
    np.random.seed(0)
    character = pd.read_hdf('./BASES/datasets.hdf', 'character')
    character_X = character.drop('Class', axis=1).copy().values
    character_Y = character['Class'].copy().values

    madelon = pd.read_hdf('./BASES/datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', axis=1).copy().values
    madelon_Y = madelon['Class'].copy().values

    madelon_X = StandardScaler().fit_transform(madelon_X)
    character_X = StandardScaler().fit_transform(character_X)

    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dim_red = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims_red_s = [2, 4, 6, 8, 10, 12, 14, 16]

    # Data for 1
    ################################
    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dims_red_s:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(character_X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv(out + 'character_scree.csv')
    ################################
    ica = FastICA(random_state=5)
    kurt = {}
    for dim in dim_red:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(madelon_X)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv(out + 'madelon_scree.csv')
    raise  # intentional early stop; remove to run parts 2 and 3 below

    # Data for 2
    ##############################

    grid = {'ica__n_components': dims_red_s, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    ica = FastICA(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('ica', ica), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_dim_red.csv')
    ##############################
    grid = {'ica__n_components': dim_red, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    ica = FastICA(random_state=5)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('ica', ica), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon_dim_red.csv')

    # Data for 3
    ###############################
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 16
    ica = FastICA(n_components=dim, random_state=10)
    character_X2 = ica.fit_transform(character_X)
    character_2 = pd.DataFrame(np.hstack((character_X2, np.atleast_2d(character_Y).T)))
    cols = list(range(character_2.shape[1]))
    cols[-1] = 'Class'
    character_2.columns = cols
    character_2.to_hdf(out + 'datasets.hdf', 'character', complib='blosc', complevel=9)

    #################################
    dim = 45
    ica = FastICA(n_components=dim, random_state=10)
    madelon_X2 = ica.fit_transform(madelon_X)
    madelon_2 = pd.DataFrame(np.hstack((madelon_X2, np.atleast_2d(madelon_Y).T)))
    cols = list(range(madelon_2.shape[1]))
    cols[-1] = 'Class'
    madelon_2.columns = cols
    madelon_2.to_hdf(out + 'datasets.hdf', 'madelon', complib='blosc', complevel=9)
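nn_reg and nn_arch are defined elsewhere in the original project. A compact, runnable sketch of the same Pipeline + GridSearchCV pattern, with small stand-in grids (the values are assumptions, not the project's):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import FastICA
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=300, n_features=20, random_state=5)
pipe = Pipeline([('ica', FastICA(random_state=5, max_iter=1000)),
                 ('NN', MLPClassifier(max_iter=2000, early_stopping=True,
                                      random_state=5))])
grid = {'ica__n_components': [2, 5, 10],
        'NN__alpha': [1e-4, 1e-2],                 # stand-in for nn_reg
        'NN__hidden_layer_sizes': [(10,), (20,)]}  # stand-in for nn_arch
gs = GridSearchCV(pipe, grid, cv=3).fit(X, y)
print(gs.best_params_)  # the '<step>__<param>' keys address pipeline steps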
Example #22
def clustering_ica(cluster_range, ICA_component_, dataset, dir):
    df = dataset.data
    x = (df.iloc[:, 0:-1])
    y = (df.iloc[:, -1])
    y = y.astype('int')
    x = StandardScaler().fit_transform(x)
    global _ica, x_ica, _dataset_ica
    NN_ICA_accuracy = defaultdict(dict)
    kmeans_accuracy_ICA = defaultdict(dict)
    kmeans_time_ICA = defaultdict(dict)
    em_accuracy_ICA = defaultdict(dict)
    em_time_ICA = defaultdict(dict)
    _data_ICA = FastICA(random_state=0)
    kurt = {}
    for dim in ICA_component_:
        _data_ICA.set_params(n_components=dim)
        tmp = _data_ICA.fit_transform(dataset.x)
        tmp = pd.DataFrame(tmp)
        tmp = tmp.kurt(axis=0)
        kurt[dim] = tmp.abs().mean()

    kurt = pd.Series(kurt)
    kurt.to_csv(dir + '{}_ica_scree.csv'.format(dataset.dataset_name))
    common_utils.plot_dim_red_scores(
        dir + '{}_ica_scree.csv'.format(dataset.dataset_name),
        dir,
        dataset.dataset_name,
        "ICA",
        multiple_runs=False,
        xlabel='Number of Components',
        ylabel=None)

    _data_ICA_data = _data_ICA.fit_transform(x)
    _data_ICA_df = pd.DataFrame(data=_data_ICA_data)
    _data_ICA_kurtosis = _data_ICA_df.kurt()
    print(_data_ICA_kurtosis)
    for ICA_comp in ICA_component_:

        _data_ICA = FastICA(n_components=ICA_comp, random_state=0)
        _data_ICA_data = _data_ICA.fit_transform(x)
        _data_ICA_df = pd.DataFrame(data=_data_ICA_data)

        _ica = FastICA(n_components=ICA_comp, random_state=0)
        x_ica = _ica.fit_transform(x)

        _dataset_ica = dataset
        _dataset_ica.x = x_ica
        _dataset_ica.y = y

        for cluster in cluster_range:
            # Kmeans
            start = datetime.now()
            myk_mean_ICA_prediction = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(_data_ICA_df)
            kmeans_accuracy_for_k = common_utils.get_cluster_accuracy(
                y, myk_mean_ICA_prediction)
            end = datetime.now()

            kmeans_accuracy_ICA[ICA_comp][cluster] = kmeans_accuracy_for_k
            kmeans_time_ICA[ICA_comp][cluster] = (end - start).total_seconds()

            # EM
            start = datetime.now()
            em_pca_prediction_y = GaussianMixture(
                n_components=cluster).fit(_data_ICA_df).predict(_data_ICA_df)
            em_pca_accuracy_for_k = common_utils.get_cluster_accuracy(
                y, em_pca_prediction_y)
            end = datetime.now()

            em_accuracy_ICA[ICA_comp][cluster] = em_pca_accuracy_for_k
            em_time_ICA[ICA_comp][cluster] = (end - start).total_seconds()

        NN_ICA_accuracy[ICA_comp] = nn_experiment(_dataset_ica)
    common_utils.plot_feature_transformation_time(
        kmeans_time_ICA, "k-means ICA clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_ICA, "k-means ICA clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(em_time_ICA,
                                                  "EM ICA clusters vs time",
                                                  dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_ICA, "EM ICA clusters vs accuracy", dir)
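common_utils.get_cluster_accuracy is a project-specific helper; a plausible majority-vote implementation (an assumption, not the project's actual code) maps each cluster to its most frequent true label before scoring:

import numpy as np
from scipy.stats import mode

def cluster_accuracy(y_true, clusters):
    # assumed stand-in for common_utils.get_cluster_accuracy
    y_mapped = np.zeros_like(y_true)
    for c in np.unique(clusters):
        mask = clusters == c
        y_mapped[mask] = mode(y_true[mask], keepdims=False).mode
    return float((y_mapped == y_true).mean())

print(cluster_accuracy(np.array([0, 0, 1, 1]), np.array([5, 5, 7, 7])))  # 1.0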
Example #23
        file_2.write(";")
        file_2.write("%1.9f" % pca_var_2[i])
    file_2.write("\n")

    file_2.write("PCA_singular_2")
    for i in range(0, len(pca_sing_2)):
        file_2.write(";")
        file_2.write("%1.9f" % pca_sing_2[i])
    file_2.write("\n")

    ############################## ICA ##############################

    ica = FastICA(random_state=5)
    error_rate_1 = np.zeros(np.shape(data1_X)[1])
    for i in range(0, np.shape(data1_X)[1]):
        ica.set_params(n_components=i + 1)
        DT1 = tree.DecisionTreeClassifier(criterion='gini',
                                          min_samples_leaf=0.005)
        # despite the name, this holds training accuracy (fraction correct),
        # measured on the same data the tree was fit on
        error_rate_1[i] = sum(
            DT1.fit(ica.fit_transform(data1_X), data1_Y).predict(
                ica.fit_transform(data1_X)) == data1_Y) * 1.0 / n1
        print(i + 1)
    i1 = np.argmax(error_rate_1) + 1
    ica.set_params(n_components=i1)
    temp1 = ica.fit_transform(data1_X)
    temp1 = pd.DataFrame(temp1)
    kurt1 = temp1.kurt(axis=0)

    error_rate_2 = np.zeros(np.shape(data2_X)[1])
    for i in range(0, np.shape(data2_X)[1]):
        ica.set_params(n_components=i + 1)
Example #24
class assignment4:
    def __init__(self):
        # data processing
        self.dataSetPath = './data_set/'
        self.dataSetName = ""
        self.csv_delimiter = ','
        self.data = None
        self.allFeatures = []
        self.allTarget = []

        # not used
        self.XTrain = None
        self.XTest = None
        self.YTrain = None
        self.YTest = None

        # k-mean clustering
        self.kNum = range(1, 21)
        self.kmean = None
        self.kmeanRD = None
        # expectation maximization
        self.em = None
        self.emRD = None
        # PCA
        self.pca = None
        self.pcaDims = range(1, 21)

        # ICA
        self.icaDims = range(1, 21)
        self.ica = None

        # RP
        self.rp = None
        self.rpDims = range(1, 21)

        # TSVD
        self.tsvd = None
        self.tsvdDims = range(1, 10)

    def read_data_voice(self, dataName):
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=self.csv_delimiter)
            self.data = list(reader)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.data)))
        print('Number of attributes: {}'.format(len(self.data[0]) - 1))

    def read_data_haptX(self, dataName):
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)

        print(len(self.data))
        for elim in self.data:
            feature = []
            for i in elim:
                feature.append(i)
            self.allFeatures.append(feature)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allFeatures)))
        print('Number of attributes: {}'.format(len(self.allFeatures[0])))

    def read_data_haptY(self, dataName):
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)
        for elim in self.data:
            self.allTarget.append(elim)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allTarget)))
        print('Number of attributes: {}'.format(len(self.allTarget[0])))

        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.allTarget = self.allTarget.ravel()

    def split_data_to_train_test(self, testSize=0.3):
        # in case the data set are very different in format
        sample_len = len(self.data[0])
        for elem in self.data:
            feature = elem[0:sample_len - 1]
            feature_vector = []
            for f in feature:
                feature_vector.append(float(f))
            self.allFeatures.append(feature_vector)
            if elem[-1] == '0':
                val = 0
            else:
                val = 1
            self.allTarget.append((float(val)))
        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.XTrain, self.XTest, self.YTrain, self.YTest = train_test_split(
            self.allFeatures,
            self.allTarget,
            test_size=testSize,
            random_state=42)
        print(
            'Total X train data -> {}%'.format(
                int((len(self.XTrain) / len(self.data)) * 100)), 'Size:',
            len(self.XTrain))
        print(
            'Total X test data -> {}%'.format(
                int((len(self.XTest) / len(self.data)) * 100)), 'Size:',
            len(self.XTest))
        print(
            'Total Y train data -> {}%'.format(
                int((len(self.YTrain) / len(self.data)) * 100)), 'Size:',
            len(self.YTrain))
        print(
            'Total Y test data -> {}%'.format(
                int((len(self.YTest) / len(self.data)) * 100)), 'Size',
            len(self.YTest))

    def get_max_idx(self, values):
        # index of the largest element (equivalent to np.argmax)
        return int(np.argmax(values))

    def pairwiseDistCorr(self, X1, X2):
        assert X1.shape[0] == X2.shape[0]

        d1 = pairwise_distances(X1)
        d2 = pairwise_distances(X2)
        return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]

    def k_mean_cluster(self):
        print("-" * 50)
        print('{}: K-mean clustering'.format(self.dataSetName))

        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.kmean = KMeans(random_state=5, max_iter=1000)
        for i in self.kNum:
            self.kmean.set_params(n_clusters=i)
            self.kmean.fit(dataX)
            # note: k-means labels are arbitrary cluster ids, so accuracy
            # against the targets is only a rough proxy for cluster quality
            scores.append(sm.accuracy_score(self.allTarget,
                                            self.kmean.labels_))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, self.kmean.labels_))
        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:", confusionMatrix[bestScoreIdx])

        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('K-mean Cluster ({})'.format(self.dataSetName))

        plt.style.use('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_KMEAN.png'.format(self.dataSetName))
        print("-" * 50)

    def k_mean_cluster_reduced(self, n_clusters, reduced_data, name):
        print("-" * 50)
        print('{}: K-mean clustering {}'.format(self.dataSetName, name))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.kmeanRD = KMeans(n_clusters=n_clusters,
                              random_state=5,
                              max_iter=1000)
        self.kmeanRD.fit(reduced_data)

        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, self.kmeanRD.labels_)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, self.kmeanRD.labels_))

        print("-" * 50)

    def expectation_maximization_reduced(self, n_components, reduced_data,
                                         name):
        print("-" * 50)
        print('{}: Expectation maximization {}'.format(self.dataSetName, name))

        self.emRD = GaussianMixture(n_components=n_components, random_state=5)
        self.emRD.fit(reduced_data)
        y_predict = self.emRD.predict(reduced_data)

        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, y_predict)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, y_predict))
        print("-" * 50)

    def expectation_maximization(self):
        print("-" * 50)
        print('{}: Expectation maximization'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.em = GaussianMixture(random_state=5)
        for i in self.kNum:
            self.em.set_params(n_components=i)
            self.em.fit(dataX)
            y_predict = self.em.predict(dataX)
            scores.append(sm.accuracy_score(self.allTarget, y_predict))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, y_predict))

        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:")
        print(confusionMatrix[bestScoreIdx])

        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('Expectation Maximum Cluster ({})'.format(self.dataSetName))

        plt.style.use('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_EM.png'.format(self.dataSetName))
        print("-" * 50)

    def PCA(self):
        print("-" * 50)
        print('{}: Principal component analysis '.format(self.dataSetName))

        dataX = StandardScaler().fit_transform(self.allFeatures)

        self.pca = PCA(random_state=5)
        grid = {'pca__n_components': self.pcaDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('pca', self.pca), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)

        print("Best number PCA components:", search.best_params_)

        self.pca.fit(dataX)
        var = np.cumsum(
            np.round(self.pca.explained_variance_ratio_, decimals=3) * 100)

        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.style.use('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_VA.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.ylim([0, 1])
        plt.style.use('seaborn-whitegrid')
        plt.plot(self.pcaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_GS.png'.format(self.dataSetName))

        print("-" * 50)

    def ICA(self):
        print("-" * 50)
        print('{}: Independent component analysis '.format(self.dataSetName))

        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.ica = FastICA(random_state=5, max_iter=6000)
        # kurtosis
        kurt = []
        for dim in self.icaDims:
            self.ica.set_params(n_components=dim)
            tmp = self.ica.fit_transform(dataX)
            tmp = pd.DataFrame(tmp)
            tmp = tmp.kurt(axis=0)
            kurt.append(tmp.abs().mean())

        # grid search
        grid = {'ica__n_components': self.icaDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('ica', self.ica), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number ICA components:", search.best_params_)

        plt.figure()
        plt.ylabel('Kurtosis')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.use('seaborn-whitegrid')
        plt.plot(kurt)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_kurtosis.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.use('seaborn-whitegrid')
        plt.plot(self.icaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_ICA_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def RP(self):
        print("-" * 50)
        print('{}: Random Projection'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        disCorr = []
        self.rp = SparseRandomProjection(random_state=5)
        for dim in self.rpDims:
            self.rp.set_params(n_components=dim)
            disCorr.append(
                self.pairwiseDistCorr(self.rp.fit_transform(dataX), dataX))
        print(disCorr)

        # grid search
        grid = {'rp__n_components': self.rpDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('rp', self.rp), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number RP components:", search.best_params_)

        plt.figure()
        plt.ylabel('Distance')
        plt.xlabel('# of Features')
        plt.title('RP Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.use('seaborn-whitegrid')
        plt.plot(disCorr)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_distance.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('RP Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.use('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_RP_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def TSVD(self):
        print("-" * 50)
        print('{}: TruncatedSVD'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.tsvd = TruncatedSVD(random_state=5)

        # grid search
        grid = {'tsvd__n_components': self.tsvdDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('tsvd', self.tsvd), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number TSVD components:", search.best_params_)

        self.tsvd.fit(dataX)
        var = np.cumsum(
            np.round(self.tsvd.explained_variance_ratio_, decimals=3) * 100)

        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.use('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSD_VA.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.use('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSVD_GS.png'.format(self.dataSetName))
        print("-" * 50)
Example #25
    kurt1_test = np.zeros(np.shape(data1_X_test)[1])

    DT1 = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=5,
                                      max_depth=None)
    # DT1 = neighbors.KNeighborsClassifier(n_neighbors=5, algorithm='auto')
    # DT1 = svm.SVC(C=0.418, kernel='rbf', max_iter=-1)
    # despite the "error_rate" names, these are accuracies (fraction correct)
    error_rate_train_DT_1 = sum(
            DT1.fit(data1_X_train, data1_y_train).predict(data1_X_train) == data1_y_train) * 1.0 / data1_y_train.shape[0]
    print("error_rate_train_DT_1", error_rate_train_DT_1)
    error_rate_test_DT_1 = sum(
            DT1.fit(data1_X_train, data1_y_train).predict(data1_X_test) == data1_y_test) * 1.0 / data1_y_test.shape[0]
    print("error_rate_test_DT_1", error_rate_test_DT_1)

    for i in range(0, np.shape(data1_X_train)[1]):
        print(i)
        start_time = time.time()
        ica.set_params(n_components=i + 1)
        # data1_X_train holds the observations; the fit learns the unmixing
        data1_X_train_ica = ica.fit_transform(data1_X_train)
        # project the test set with the mixing learned on the training set
        data1_X_test_ica = ica.transform(data1_X_test)

        error_rate_train_1[i] = sum(
            DT1.fit(data1_X_train_ica, data1_y_train).predict(data1_X_train_ica) == data1_y_train) * 1.0 / data1_y_train.shape[0]
        print("error_rate_train_1[%d] = %f" % (i, error_rate_train_1[i]))
        error_rate_test_1[i] = sum(
            DT1.fit(data1_X_train_ica, data1_y_train).predict(data1_X_test_ica) == data1_y_test) * 1.0 / data1_y_test.shape[0]
        print("error_rate_test_1[%d] = %f" % (i, error_rate_test_1[i]))
        print("time consumed:", time.time() - start_time)

    file_2.write("ICA_error_rate_train_1")
Example #26
adultY = getAdultY()
adultX = getAdultX()
adultX, adultTestX = adultX.iloc[:6000, :], adultX.iloc[6000:, :]
adultY, adultTestY = adultY[:6000, ], adultY[6000:, ]

dims1 = range(1, 8)
dims2 = range(1, 16)
#raise
#%% data for 1
svm = SVC(kernel="linear", random_state=0, C=6)
ica = FastICA(random_state=5)
kurt = {}
acc = {}
for dim in dims1:
    ica.set_params(n_components=dim, max_iter=500, tol=0.1)
    tmp = ica.fit_transform(ecoliX)
    svm.fit(tmp, ecoliY)
    testX = ica.transform(ecoliTestX)
    acc[dim] = accuracy_score(ecoliTestY, svm.predict(testX))
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()

kurt = pd.Series(kurt)
kurt.to_csv(out + 'ecoli scree.csv')
acc = pd.Series(acc)
acc.to_csv(out + 'ecoli svm validate.csv')
tmp.to_csv(out + 'ecoli kurtosis.csv')  # per-component kurtosis of the last dim only

dt = DecisionTreeClassifier(random_state=0)