Example #1
def test_fastica_simple(add_noise, seed):
    # Test the FastICA algorithm on very simple data.
    rng = np.random.RandomState(seed)
    # scipy.stats uses the global RNG:
    n_samples = 1000
    # Generate two sources:
    s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1
    s2 = stats.t.rvs(1, size=n_samples)
    s = np.c_[s1, s2].T
    center_and_norm(s)
    s1, s2 = s

    # Mixing angle
    phi = 0.6
    mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi),
                                                    -np.cos(phi)]])
    m = np.dot(mixing, s)

    if add_noise:
        m += 0.1 * rng.randn(2, 1000)

    center_and_norm(m)

    # function as fun arg
    def g_test(x):
        return x**3, (3 * x**2).mean(axis=-1)

    algos = ['parallel', 'deflation']
    nls = ['logcosh', 'exp', 'cube', g_test]
    whitening = [True, False]
    for algo, nl, whiten in itertools.product(algos, nls, whitening):
        if whiten:
            k_, mixing_, s_ = fastica(m.T,
                                      fun=nl,
                                      algorithm=algo,
                                      random_state=rng)
            with pytest.raises(ValueError):
                fastica(m.T, fun=np.tanh, algorithm=algo)
        else:
            pca = PCA(n_components=2, whiten=True, random_state=rng)
            X = pca.fit_transform(m.T)
            k_, mixing_, s_ = fastica(X,
                                      fun=nl,
                                      algorithm=algo,
                                      whiten=False,
                                      random_state=rng)
            with pytest.raises(ValueError):
                fastica(X, fun=np.tanh, algorithm=algo)
        s_ = s_.T
        # Check that the mixing model described in the docstring holds:
        if whiten:
            assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m))

        center_and_norm(s_)
        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        if not add_noise:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2)
        else:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1)

    # Test FastICA class
    _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=seed)
    ica = FastICA(fun=nl, algorithm=algo, random_state=seed)
    sources = ica.fit_transform(m.T)
    assert ica.components_.shape == (2, 2)
    assert sources.shape == (1000, 2)

    assert_array_almost_equal(sources_fun, sources)
    assert_array_almost_equal(sources, ica.transform(m.T))

    assert ica.mixing_.shape == (2, 2)

    for fn in [np.tanh, "exp(-.5(x^2))"]:
        ica = FastICA(fun=fn, algorithm=algo)
        with pytest.raises(ValueError):
            ica.fit(m.T)

    with pytest.raises(TypeError):
        FastICA(fun=range(10)).fit(m.T)
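A quick aside (not part of the test above): the relationship the test asserts, namely that with whitening the returned sources satisfy S.T ≈ W @ K @ X_centered.T, can be checked directly on toy data with the functional fastica interface; the sources and mixing matrix below are arbitrary.

import numpy as np
from sklearn.decomposition import fastica

rng = np.random.RandomState(0)
S_true = rng.laplace(size=(1000, 2))               # two independent non-Gaussian sources
X = S_true @ np.array([[1.0, 0.5], [0.3, 1.0]])    # mixed observations, shape (n_samples, 2)

K, W, S = fastica(X, random_state=0)               # K: whitening, W: unmixing, S: sources
S_rebuilt = (W @ K @ (X - X.mean(axis=0)).T).T     # rebuild the sources from the returned matrices
print(np.allclose(S, S_rebuilt))                   # True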
Example #2
############################################################
# Add decomposition feature
############################################################
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results = tsvd.fit_transform(train_test_p)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_results = pca.fit_transform(train_test_p)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica_results = ica.fit_transform(train_test_p)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(train_test_p)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results = srp.fit_transform(train_test_p)

# save columns list before adding the decomposition components
usable_columns = train_test_p.columns

print(train_test_p.shape)
print(tsvd_results.shape, type(tsvd_results))
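A plausible continuation (assumed, not shown in this excerpt): append each decomposition's components back onto train_test_p as new feature columns, which is why the pre-existing column list is saved above.

for i in range(1, n_comp + 1):
    train_test_p['tsvd_' + str(i)] = tsvd_results[:, i - 1]
    train_test_p['pca_' + str(i)] = pca_results[:, i - 1]
    train_test_p['ica_' + str(i)] = ica_results[:, i - 1]
    train_test_p['grp_' + str(i)] = grp_results[:, i - 1]
    train_test_p['srp_' + str(i)] = srp_results[:, i - 1]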
Example #3

clusters = [2, 5, 10, 15, 20, 25, 30, 35]
dim = [2, 4, 6, 8, 9]

km = KM(random_state=42)
gmm = GMM(random_state=42)

Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for i in dim:
    reduced_X = FastICA(n_components=i,
                        random_state=42).fit_transform(X_scaled)
    k = 10
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(reduced_X)
    gmm.fit(reduced_X)
    Score['km'].append(km.score(reduced_X))
    Score['gmm'].append(gmm.score(reduced_X))
    S_homog['km'].append(
        metrics.homogeneity_score(labels, km.predict(reduced_X)))
    S_homog['gmm'].append(
        metrics.homogeneity_score(labels, gmm.predict(reduced_X)))
    S_adjMI['km'].append(
        metrics.adjusted_mutual_info_score(labels, km.predict(reduced_X)))
    S_adjMI['gmm'].append(
        metrics.adjusted_mutual_info_score(labels, gmm.predict(reduced_X)))
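Note that adjMI and S_vm are declared above but never filled in this excerpt; an assumed completion, mirroring the homogeneity calls inside the same loop, would collect V-measure as well:

    S_vm['km'].append(metrics.v_measure_score(labels, km.predict(reduced_X)))
    S_vm['gmm'].append(metrics.v_measure_score(labels, gmm.predict(reduced_X)))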
Example #4
ADA_ICA_Valid = []
ADA_ICA_Valid_STD = []
ADA_ICA_Test = []

NN_PCA_Valid = []
NN_PCA_Valid_STD = []
NN_PCA_Test = []

NN_ICA_Valid = []
NN_ICA_Valid_STD = []
NN_ICA_Test = []

for comp in Comp_space:
    pca = PCA(n_components=comp, whiten=False)
    ica = FastICA(n_components=comp, whiten=True, max_iter=1000000)

    print("PCA Fit...")
    pca.fit(Data.features_train[:, 5:])
    print("ICA Fit...")
    ica.fit(Data.features_train[:, 5:])

    X_train_pca = pca.transform(Data.features_train[:, 5:])
    X_test_pca = pca.transform(Data.features_test[:, 5:])

    train_pca = np.hstack((X_train_pca, Data.features_train[:, 0:5]))
    test_pca = np.hstack((X_test_pca, Data.features_test[:, 0:5]))

    X_train_ica = ica.transform(Data.features_train[:, 5:])
    X_test_ica = ica.transform(Data.features_test[:, 5:])
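The ICA branch stops before the concatenation step performed for PCA above; a natural continuation inside the same loop (an assumption, mirroring the PCA lines) would be:

    train_ica = np.hstack((X_train_ica, Data.features_train[:, 0:5]))
    test_ica = np.hstack((X_test_ica, Data.features_test[:, 0:5]))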
Example #5
        warnings.warn('nilearn must be installed to run CanICA')

canica_dmn = nibabel.load(join(path, 'canica.nii.gz')).get_data()[..., 4]

### Melodic ICA ############################################################
# To have MELODIC results, please use my melodic branch of nilearn

melodic_dmn = nibabel.load(join(path, 'melodic.nii.gz')).get_data()[..., 3]

### FastICA ##################################################################

# Concatenate all the subjects
if not exists(join(path, 'ica.nii.gz')):
    from sklearn.decomposition import FastICA
    X = np.vstack(X)
    ica = FastICA(n_components=n_components, random_state=2)
    t0 = time.time()
    ica.fit(X)
    print('FastICA: %f' % (time.time() - t0))
    ica_components = masking.unmask(ica.components_, mask_img)
    nibabel.save(nibabel.Nifti1Image(ica_components, mask_img.get_affine()),
                 join(path, 'ica.nii.gz'))

ica_dmn = -nibabel.load(join(path, 'ica.nii.gz')).get_data()[..., 1]

### Plots ####################################################################

# Flip the sign to harmonize maps
ica_dmn = -ica_dmn
canica_dmn = -canica_dmn
Example #6
def dimensionality_ICA(instruction, dataset, target="", y=""):

    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        data, y, target, full_pipeline = initial_preprocesser(data,
                                                              instruction,
                                                              True,
                                                              0.2, [],
                                                              0.2,
                                                              random_state=49)

        X_train = data['train']
        X_test = data['test']

        y_train = y['train']
        y_test = y['test']

    ica = FastICA(n_components=len(X_train.columns))
    X_train_mod = ica.fit_transform(X_train)
    X_test_mod = ica.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train)
    acc = []
    sets = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    frame = pd.concat(
        [pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)])
    frame[target] = np.r_[y_train, y_test]
    sets.append(frame)

    for i in range(2, len(X_train.columns)):
        ica = FastICA(n_components=i)
        X_train_mod = ica.fit_transform(X_train)
        X_test_mod = ica.transform(X_test)

        frame = pd.concat(
            [pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)])
        frame[target] = np.r_[y_train, y_test]
        sets.append(frame)

        clf_mod = tree.DecisionTreeClassifier()
        clf_mod.fit(X_train_mod, y_train)

        acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    del i

    data_modified = sets[acc.index(max(acc))]
    score = max(acc)

    return data_modified, score, ((len(X_train.columns) + 1) -
                                  len(data_modified.columns))
Example #7
from keras.models import Sequential
from keras.layers import Dense
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import FastICA


ica = FastICA(n_components=5)
base = pd.read_csv('cardio_train.csv')
caracteristicas = ['age', 'gender', 'height','weight','ap_hi','ap_lo','cholesterol','gluc','smoke','alco','active']
age =[]
for i in base.age.values:
    i = i//365
    age.append(i)
base['age'] = age

x = base[caracteristicas].values
y = base.cardio.values
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.3, random_state=1)

x_treino = ica.fit_transform(x_treino)
x_teste = ica.transform(x_teste)

model = Sequential()
model.add(Dense(5, input_dim=5, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
# Compile the model. We use the logarithmic loss function and the Adam optimizer.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_treino, y_treino, epochs=100, batch_size=32,verbose=1)

loss_and_metrics = model.evaluate(x_teste, y_teste, batch_size=128)
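A small follow-up (assumed, not part of the excerpt) to report the held-out metrics returned by evaluate():

print('test loss: %.4f, test accuracy: %.4f' % tuple(loss_and_metrics))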
Example #8
def fast_ica_transform(fitting_inputs_scaled):
    ica = FastICA()
    ica.fit(fitting_inputs_scaled)

    return ica.transform(fitting_inputs_scaled)
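An equivalent one-step variant (an aside, not from the example above): FastICA.fit_transform fits the model and returns the unmixed sources in a single call.

def fast_ica_fit_transform(fitting_inputs_scaled):
    ica = FastICA()
    return ica.fit_transform(fitting_inputs_scaled)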
Example #9
train_inv_pca = np.zeros((n_images,INPUT_SIZE*INPUT_SIZE,3))  # variable inverting pca features for visualization
x_train_ica = np.zeros((n_images,NCOMPONENTS_ICA,3))  # variable containing ica features
train_inv_ica = np.zeros((n_images,INPUT_SIZE*INPUT_SIZE,3))  # variable inverting ica features for visualization
x_train_canny = np.zeros((n_images,INPUT_SIZE,INPUT_SIZE,3))  # variable containing edges detected by canny edge detector
x_train_cornerharris = np.zeros((n_images,INPUT_SIZE,INPUT_SIZE,3))
for i in range(CHANNELS):
    if USE_PCA:
        pca = PCA(n_components=NCOMPONENTS_PCA)
        x_tr = np.reshape(x_train[:, :, :, i], (n_images, INPUT_SIZE*INPUT_SIZE))
        x_train_pca[:, :, i] = pca.fit_transform(x_tr)
        print(pca.explained_variance_ratio_)
        inv_pca = pca.inverse_transform(x_train_pca[:, :, i])
        train_inv_pca[:,:,i] = inv_pca

    if USE_ICA:
        ica = FastICA(n_components=NCOMPONENTS_ICA)
        x_tr = np.reshape(x_train[:, :, :, i], (n_images, INPUT_SIZE * INPUT_SIZE))
        x_train_ica[:, :, i] = ica.fit_transform(x_tr)
        # note: FastICA does not expose explained_variance_ratio_
        inv_ica = ica.inverse_transform(x_train_ica[:, :, i])
        train_inv_ica[:, :, i] = inv_ica


    if USE_GENSEL:
        # Genetic feature selection: #very slow
        clsfr = linear_model.LogisticRegression()
        gs = GeneticSelectionCV(clsfr)
        print(np.shape(x_tr))
        print(np.shape(y_train))
        print(np.array(np.argmax(y_train, axis=1)))
        gs_features = gs.fit(x_tr, np.array(np.argmax(y_train, axis=1)))
Example #10
def main():
    async_dict = {
        "dbl": get_binary_target_basic_dbl_trigger_data,
        "dbl_deriv_valid": get_binary_target_deriv_valid_dbl_trigger_data,
        "dbl_deriv_valid_retro":
        get_binary_target_retrospective_dbl_trigger_data,
        "fa": get_binary_target_fa_patient_data,
        "bs": get_binary_target_bs_patient_data,
        "bs_deriv_valid": get_binary_target_deriv_valid_bs_patient_data,
        "co": get_binary_target_cough_data,
        "su": get_binary_target_suction_data,
        "multi": get_multi_target_derivation_cohort_data,
        "multi_retro": get_multi_target_retrospective_data,
        "multi_retro_first_dbl_bs":
        get_multi_target_retrospective_data_first_dbl_as_bs,
        "multi_retro_fdb_deriv_val":
        get_multi_retrospective_data_fdb_deriv_val_cohort,
        "bs_dbl": get_bs_dbl_non_retrospective_data,
        "bs_dbl_retro": get_bs_dbl_retrospective_data,
        "bs_dbl_retro_fdb_deriv_val":
        get_bs_dbl_retrospective_first_dbl_as_bs_deriv_valid,
        "bs_dbl_retro_fdn_deriv_val":
        get_bs_dbl_retrospective_first_dbl_as_norm_deriv_valid,
        "bs_dbl_retro_fdb_deriv_cohort":
        get_bs_dbl_retrospective_data_first_dbl_as_bs,
        "bs_dbl_retro_fdb_val_cohort":
        get_bs_dbl_retrospective_data_first_dbl_as_bs_val_cohort,
        "multi_binary_retro": get_multi_class_binary_label_retrospective_data,
    }
    feature_dict = {
        "slope": get_slopes_of_pressure_curve_df,
        "integ": get_integrated_pressure_curve_df,
        "v1": get_v1,
        "v2": get_v2,
        "settings": get_vent_settings_features,
        "derived": get_derived_metadata_features,
        "metadata": get_settings_and_derived_features,
        "v2_and_metadata": get_v2_and_metadata,
        "fa_heuristic": get_fa_heuristic,
        "dbl_heuristic": get_dbl_trigger_heuristic_features,
        "dbl_all": get_dbl_trigger_heuristic_and_metadata_features,
        "dbl_chi2": get_dbl_chi2,
        "dbl_retro": get_dbl_trigger_retrospective_plus_metadata_features,
        "dbl_retro_chi2": get_dbl_retro_chi2,
        "dbl_no_retro_curated": get_dbl_curated,
        "bs_heuristic": get_bs_heuristic,
        "bs_all": get_bs_all,
        "bs_chi2": get_bs_chi2,
        "bs_curated": get_bs_curated,
        "co_heuristic": get_co_heuristic,
        "co_all": get_co_all,
        "co_curated": get_co_curated_no_tvi,
        "co_curated_with_tvi": get_co_curated_with_tvi,
        "su_heuristic": get_suction_heuristic,
        "su_all": get_suction_all,
        "su_curated": get_suction_curated,
        "retro_fused_plus_metadata": get_retro_plus_metadata,
        "greg_selection": get_all_greg_selected_features,
        "retro_non_noisy": get_retro_non_noisy,
        "retro_prev_plus_metadata": get_retro_prev_plus_metadata,
        "retro_prev_prev_plus_metadata": get_retro_prev_prev_plus_metadata,
        "retro_stripped_expert_plus_chi2": get_retro_stripped_expert_plus_chi2,
        "retro_stripped_expert_plus_chi2_2":
        get_retro_stripped_expert_plus_chi2_2,
        "retro_stripped_expert_plus_chi2_3":
        get_retro_stripped_expert_plus_chi2_3,
        "retro_stripped_expert_plus_chi2_4":
        get_retro_stripped_expert_plus_chi2_4,
        # this is currently highest performer
        "retro_stripped_expert_plus_chi2_5":
        get_retro_stripped_expert_plus_chi2_5,
        "retro_stripped_expert_plus_chi2_6":
        get_retro_stripped_expert_plus_chi2_6,
        "retro_stripped_expert_plus_chi2_7":
        get_retro_stripped_expert_plus_chi2_7,
        "retro_stripped_expert_plus_instrr":
        get_retro_stripped_expert_plus_instrr,
        "retro_stripped_expert_plus_instrr_prev":
        get_retro_stripped_expert_plus_instrr,
        "retro_stripped_low_prec": get_retro_stripped_lower_prec,
        "retro_stripped_high_prec": get_retro_stripped_higher_prec,
    }
    parser = build_parser(async_dict, feature_dict)
    args = parser.parse_args()
    additional_error_handling(args)
    feature_func = feature_dict.get(args.feature_type)
    gold_stnd_func = async_dict.get(args.async_type)

    x, y, extra_info = get_x_y(feature_func, args.bins, args.pickle_file,
                               args.new_pickling_file, gold_stnd_func,
                               args.new_csv_file)

    generator = args.split_func(x, y, args)
    results = []
    for x_train, x_test, y_train, y_test in generator:
        if args.only_patient:
            if args.only_patient not in x_test.patient.unique():
                continue
            else:
                x_test = x_test[x_test.patient == args.only_patient]
        try:
            del x_train['patient']
        except KeyError:
            pass
        try:
            del x_test['patient']
        except KeyError:
            pass

        x_train = perform_space_replacement(x_train)
        if len(x_test) != 0:
            x_test = perform_space_replacement(x_test)
        if args.selected_features:
            x_train = x_train[args.selected_features]
            x_test = x_test[args.selected_features]

        # I guess I only wanted to winsorize two vars? Maybe I wanted to reduce
        # side effects
        winsorizor = Winsorizor(args.winsorize)
        x_train = winsorizor.fit_transform(
            x_train, ['tve:tvi-ratio', 'tve:tvi-ratio-prev'])
        x_test = winsorizor.transform(x_test)

        scaler = ScalerWrapper(args.scaler, None, x_train, x_test)
        classifier = Classifier(args, scaler)
        x_train = scaler.train_transform()
        x_test = scaler.test_transform()
        x_train, x_test = perform_pca(x_train, y_train, x_test, y_test,
                                      args.pca)
        if args.lda:
            lda = LinearDiscriminantAnalysis(solver='svd')
            cols = x_train.columns
            train_index, test_index = x_train.index, x_test.index
            lda.fit(x_train.values, y_train.values)
            x_train = DataFrame(lda.transform(x_train), index=train_index)
            x_test = DataFrame(lda.transform(x_test), index=test_index)

        if args.ica:
            fast_ica = FastICA(n_components=args.ica,
                               whiten=True,
                               random_state=True)
            train_index, test_index = x_train.index, x_test.index
            fast_ica.fit(x_train, y_train)
            x_train = DataFrame(fast_ica.transform(x_train), index=train_index)
            x_test = DataFrame(fast_ica.transform(x_test), index=test_index)

        if args.tsne:
            tsne = TSNE(n_components=args.tsne)
            train_index, test_index = x_train.index, x_test.index
            tsne.fit(x_train, y_train)
            x_train = tsne.fit_transform(x_train)
            import IPython
            IPython.embed()

        # This was the best thing I could do with my given architecture.
        if (args.run_chi2 or args.chi2_pruning) and args.pca:
            raise ValueError(
                "It doesn't make sense to run chi2 when using PCA!")
        if args.run_chi2:
            run_and_print_chi2(x_train, y_train)
        if args.chi2_pruning:
            patients = map(lambda x: x[1], x_test.index.str.split("-"))
            unique = set(patients)
            patient = "-".join(unique)
            x_train, x_test = perform_chi2_feature_pruning(
                x_train, x_test, y_train, args.chi2_pruning,
                args.write_chi2_results, args.pickle_file, patient)

        if not args.with_smote and (args.rfecv or args.l1_selector
                                    or args.grid_search):
            patient = list(map(lambda x: x[1], x_train.index.str.split("-")))
            x_train['patient'] = patient
            x_train.sort_values(by=['patient'], inplace=True)
            x_train = x_train.drop("patient", axis=1)

        if args.rfecv:
            x_train, x_test = classifier.backwards_feature_elimination(
                x_train, y_train, x_test)
        if args.l1_selector:
            x_train, x_test = classifier.l1_selection(x_train, y_train, x_test)

        if len(y_train) != 0:
            get_fa_elements(y_train, "training")
            get_fa_elements(y_test, "testing")
        else:
            get_fa_elements(y_test, "all")

        if len(x_train) != 0 and args.grid_search:
            classifier.grid_search(x_train, y_train)
        elif len(x_train) != 0 and args.cross_validate:
            classifier.cross_validate(x_train, y_train)
        elif len(x_train) != 0:
            classifier.fit(x_train, y_train)

        if args.train_on_all:
            classifier.write_to_file(args.model_file)
            scaler.to_pickle()
            winsorizor.to_pickle()
            return
        else:
            run_results = classifier.analyze_and_print_results(
                x_test, y_test, y_train, extra_info)
            results.append(run_results)

    if args.write_results:
        write_results(results, args)
Example #11
def f_extract(X_train,
              X_test,
              y_train,
              y_test,
              method='26PCA',
              feature_limit=26):
    def str_split_num(s):
        tail = s.lstrip('0123456789')  # use rstrip if num is last part of str
        head = s[0:-len(tail)]  # negative to count from last char
        return int(head), tail

    if method[0].isdigit():
        n_comps, method = str_split_num(method)
        print("Feature extraction using", method)

    if method == 'PCA':
        reducer = PCA(n_components=n_comps, whiten=True,
                      random_state=rand).fit(X_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)

    if method == 'LDA':
        reducer = LinearDiscriminantAnalysis(n_components=n_comps).fit(
            X_train, y_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)

    if method == 'ICA':
        reducer = FastICA(n_components=n_comps, whiten=True,
                          random_state=rand).fit(X_train, y_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)

    if method == 'LLE':  # too slow
        reducer = LocallyLinearEmbedding(n_components=n_comps,
                                         random_state=rand,
                                         n_jobs=threads).fit(X_train, y_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)

    if method == 'TSNE':  # note: t-SNE has no transform() for unseen data
        reducer = TSNE(n_components=n_comps,
                       learning_rate=1000,
                       metric='euclidean',
                       random_state=rand,
                       n_jobs=threads)
        X_train = reducer.fit_transform(X_train)
        # t-SNE cannot project the held-out set, so X_test is returned unchanged

    if method == 'UMAP':  # too slow ...angular_rp_forest=True,
        y_train, y_test = encode_labels(y_train, y_test)
        reducer = UMAP(n_components=n_comps,
                       n_neighbors=15,
                       metric='correlation',
                       random_state=rand,
                       min_dist=0.0,
                       angular_rp_forest=True,
                       n_epochs=15).fit(X_train, y_train)
        X_train = reducer.transform(X_train)
        X_test = reducer.transform(X_test)

    return X_train, X_test
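A hypothetical usage sketch for f_extract (rand and threads are assumed module-level globals, as the function itself assumes, and numpy is assumed imported as np; the data is random noise purely to exercise the code path):

import numpy as np

rand, threads = 42, 1
X_tr, X_te = np.random.randn(200, 40), np.random.randn(50, 40)
y_tr, y_te = np.random.randint(0, 3, 200), np.random.randint(0, 3, 50)
X_tr_red, X_te_red = f_extract(X_tr, X_te, y_tr, y_te, method='10ICA')
print(X_tr_red.shape, X_te_red.shape)   # (200, 10) (50, 10)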
Example #12
def run_ica(
    ica_input: np.ndarray,
    iters: int,
    seed: int,
    verbose: bool = False
) -> Union[Tuple[np.ndarray, np.ndarray, bool], Tuple[np.ndarray, np.ndarray,
                                                      bool, bool]]:
    """
    ica_input -- an MxN numpy array; in the context of the decrosstalk
    problem, N=the number of timesteps; M=2 (first row is signal;
    second is crosstalk)

    iters -- an int; the number of iterative loops to try to go through
    to get the off-diagonal elements of the mixing matrix < 0.3

    seed -- an int; the seed of the random number generator that will
    be fed to sklearn.decomposition.FastICA

    verbose -- if True, also returns a flag indicating whether or not
    ICA output signals had to be swapped (just used for testing)

    Returns
    -------
    ica_output -- an MxN numpy array; ica_output[0,:] is the unmixed signal,
    ica_output[1,:] is the unmixed crosstalk (in the context of the decrosstalk
    problem)

    mixing -- the mixing matrix that gets from ica_input to ica_output
    np.dot(mixing, ica_output) will restore ica_input

    roi_demixed -- a boolean indicating whether or not the iteration to get
    the off-diagonal elements of the mixing matrix < 0.3 actually worked
    """

    # Whiten observations
    #
    # NOTE: we whiten the data by hand and then call
    # FastICA() with whiten=False to avoid running
    # afoul of this bug in sklearn
    #
    # https://github.com/scikit-learn/scikit-learn/issues/17162
    #
    # after this issue is resolved in sklearn, we can
    # revisit the possibility of using sklearn.FastICA's
    # internal whitening

    Ow, W, m = whiten_data(ica_input.transpose())
    alpha = 1
    beta = 1
    it = 0
    roi_demixed = False
    rng = np.random.RandomState(seed)

    while not roi_demixed and it <= iters:
        if alpha > 0.3 or beta > 0.3 or alpha < 0 or beta < 0:
            # Unmixing
            ica = FastICA(whiten=False, max_iter=10000, random_state=rng)
            ica.fit(Ow)  # Reconstruct sources
            mixing_raw = ica.mixing_

            # correcting for scale and offset:

            # applying inverse of whitening matrix
            M_hat = np.dot(np.linalg.inv(W), mixing_raw)

            # computing scaling matrix
            scale = np.dot(np.linalg.inv(M_hat), np.array([1, 1]))

            # applying scaling matrix
            mixing = M_hat * scale

        else:
            roi_demixed = True

        alpha = mixing[0, 1]
        beta = mixing[1, 0]
        it += 1

    # recovering outputs using new mixing matrix
    Sos = np.dot(np.linalg.inv(mixing), ica_input)

    # fixing source assignment ambiguity
    (ica_output, swapped) = fix_source_assignment(ica_input, Sos)

    if swapped:
        new_mixing = np.zeros((2, 2), dtype=float)
        new_mixing[:, 1] = mixing[:, 0]
        new_mixing[:, 0] = mixing[:, 1]
        mixing = new_mixing

    if verbose:
        return ica_output, mixing, roi_demixed, swapped

    return ica_output, mixing, roi_demixed
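A standalone sketch of the manual-whitening workaround described in the NOTE above; the whiten_data below is a hypothetical numpy stand-in (the project's own helper is not shown here), and the observation matrix is synthetic.

import numpy as np
from sklearn.decomposition import FastICA

def whiten_data(obs):
    # obs: (n_samples, n_features); returns whitened data, whitening matrix, mean
    m = obs.mean(axis=0)
    centered = obs - m
    cov = np.cov(centered, rowvar=False)
    eigval, eigvec = np.linalg.eigh(cov)
    W = eigvec @ np.diag(1.0 / np.sqrt(eigval)) @ eigvec.T   # ZCA whitening matrix
    return centered @ W.T, W, m

rng = np.random.RandomState(0)
obs = rng.randn(1000, 2) @ np.array([[1.0, 0.4], [0.2, 1.0]])
Ow, W, m = whiten_data(obs)
ica = FastICA(whiten=False, max_iter=10000, random_state=rng)
ica.fit(Ow)                      # unmix the pre-whitened observations
print(ica.mixing_.shape)         # (2, 2)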
Example #13
    if RP:
        from sklearn.random_projection import GaussianRandomProjection
        model = GaussianRandomProjection(n_components=num_components)

    if FA:
        from sklearn.cluster import FeatureAgglomeration
        model = FeatureAgglomeration(n_clusters=num_components)

    if PCA:
        from sklearn.decomposition import PCA
        model = PCA(n_components=num_components)

    if ICA:
        from sklearn.decomposition import FastICA
        model = FastICA(n_components=num_components)

else:
    # IRIS
    from sklearn.datasets import load_iris
    iris = load_iris()
    data = scale(iris.data)
    # n_samples, n_features = data.shape
    # n_digits = len(np.unique(iris.target))
    labels = iris.target

    num_components = 3

    if RP:
        from sklearn.random_projection import GaussianRandomProjection
        model = GaussianRandomProjection(n_components=num_components)
Example #14
# NMF
for i in range(10):
    nmf = NMF()
    nmf.fit(digit_mat_array[i] + 1)
    plt.figure()
    for j in range(9):
        plt.subplot(3, 3, j + 1)
        #        plt.gray()
        plt.imshow(nmf.components_[j].reshape(16, 16))
    plt.title("nmf_digit_%s_components" % i, y=-0.5)
    filename = "nmf_digit_%s_components.png" % i
    plt.savefig(filename)

# ICA
for i in range(10):
    ica = FastICA()
    ica.fit(digit_mat_array[i])
    plt.figure()
    for j in range(9):
        plt.subplot(3, 3, j + 1)
        #        plt.gray()
        plt.imshow(ica.components_[j].reshape(16, 16))
    plt.title("ica_digit_%s_components" % i, y=-0.5)
    filename = "ica_digit_%s_components.png" % i
    plt.savefig(filename)

factor_array = [10, 20, 50, 250]
for factor in factor_array:
    nmf = NMF(n_components=factor)
    nmf.fit(digit_mat_array[3] + 1)
    plt.figure()
Example #15
    def reduction_ica(self):
        ica = FastICA(n_components=len(self.columns) - 2, random_state=0)
        x_reduced = ica.fit_transform(self.X_train)
        print("ICA: {}".format(x_reduced.shape))
Example #16
    execfile(home + '/research_code/graicar/load_MEG_data.py')
elif len(sys.argv) > 1:
    subj = sys.argv[1]
    freq_band = None
    res_dir = home + '/data/results/graicar/fmri/'
    execfile(home + '/research_code/graicar/load_fMRI_data.py')
else:
    # subj = 'JOAOCEOG'
    # freq_band = '8-13'
    # res_dir = home + '/data/results/graicar/meg/'
    # execfile(home + '/research_code/graicar/load_MEG_data.py')
    subj = 'subj1'
    freq_band = None
    res_dir = home + '/data/results/graicar/fmri/'
    execfile(home + '/research_code/graicar/load_fMRI_data.py')

nreals = 60
ncomps = 30
rng = np.random.RandomState()

for i in range(nreals):
    print 'Realization %d of %d' % (i + 1, nreals)
    ica = FastICA(n_components=ncomps, random_state=rng)
    # return the first dimension as the number of ICs
    ICs = ica.fit_transform(data).T
    if freq_band is not None:
        fname = res_dir + subj + '_' + freq_band + '_R%02d.npz' % i
    else:
        fname = res_dir + subj + '_R%02d.npz' % i
    np.savez(fname, ICs=ICs)
Example #17
    print('classifiers config:')
    for k, reg in reg_scikit.items():
        print('{0}={1}'.format(k, reg.get_params()), flush=True)

    #four known clusters
    clust = MiniBatchKMeans(n_clusters=n_clust,
                            max_iter=1000,
                            init_size=n_clust * 10)
    #decompositions
    tfs = {}
    tfs['svd'] = TruncatedSVD(n_components=nb_comp['svd'],
                              random_state=seed_tf)
    tfs['pca'] = PCA(n_components=nb_comp['pca'], random_state=seed_tf)
    tfs['ica'] = FastICA(n_components=nb_comp['ica'],
                         max_iter=250,
                         random_state=seed_tf)
    tfs['grp'] = GaussianRandomProjection(n_components=nb_comp['grp'],
                                          eps=0.1,
                                          random_state=seed_tf)
    tfs['srp'] = SparseRandomProjection(n_components=nb_comp['srp'],
                                        dense_output=True,
                                        random_state=seed_tf)
    tfs['nmf'] = NMF(n_components=nb_comp['nmf'],
                     shuffle=True,
                     init='random',
                     random_state=seed_tf)
    #embedding
    trees, depth, leafs = 25, 8, 32  #2 ** 8 = 256
    embed = RandomTreesEmbedding(n_estimators=trees,
                                 max_depth=depth,
Example #18
digitsY = digits['Class'].copy().values

abalone = pd.read_hdf('./BASE/datasets.hdf', 'abalone')
abaloneX = abalone.drop('Class', axis=1).copy().values
abaloneY = abalone['Class'].copy().values

abaloneX = StandardScaler().fit_transform(abaloneX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
abalone_dims = range(1, 9)
#raise
#%% data for 1

ica = FastICA(random_state=5)
kurt = {}
for dim in abalone_dims:
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(abaloneX)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()

kurt = pd.Series(kurt)
kurt.to_csv(out + 'abalone scree.csv')

ica = FastICA(random_state=5)
kurt = {}
for dim in dims:
    ica.set_params(n_components=dim)
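The second loop is cut off above; an assumed completion, mirroring the abalone loop, would finish the kurtosis scree for the digits data (the output filename is an assumption):

    tmp = ica.fit_transform(digitsX)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()

kurt = pd.Series(kurt)
kurt.to_csv(out + 'digits scree.csv')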
Example #19
def tmi_run_ica(img_data_trunc, num_comp, masking_array, affine_array, variance_threshold = 0.9, timeplot = False, timeplot_name=None, filetype='nii.gz', outname='ica.nii.gz'):
	ica = FastICA(n_components=int(num_comp),max_iter=1000, tol=0.00001)
	S_ = ica.fit_transform(img_data_trunc).T
	components = ica.components_.T
	#scaling
	fitcomps = np.copy(S_)
	fitcomps = zscaler(fitcomps)
	img_data_trunc =  np.copy(fitcomps.T) # ram shouldn't be an issue here...
	np.savetxt("ICA_fit.csv", zscaler(components), fmt='%10.8f', delimiter=',')

	# variance explained.
	explained_total_var = np.zeros((int(num_comp)))
	explained_var_ratio = np.zeros((int(num_comp)))
	# total variance
	back_projection = ica.inverse_transform(S_.T)
	total_var = back_projection.var()
	for i in range(int(num_comp)):
		tempcomps = np.copy(S_)
		tempcomps[i,:] = 0
		temp_back_proj = ica.inverse_transform(tempcomps.T)
		temp_var = temp_back_proj.var()
		explained_var_ratio[i] = total_var - temp_var
		explained_total_var[i] = (total_var - temp_var) / total_var
		print("ICA # %d; Percent of Total Variance %1.3f" % ((i+1), explained_total_var[i]*100))
	explained_var_ratio = explained_var_ratio / explained_var_ratio.sum()

	sum_total_variance_explained = explained_total_var.sum()
	print("Total variance explained by all components = %1.3f" % sum_total_variance_explained)
	print("Re-ordering components")
	sort_mask = (-1*explained_total_var).argsort()
	if sum_total_variance_explained > variance_threshold:
		#sort data
		sort_mask = (-1*explained_total_var).argsort()
		np.savetxt("ICA_total_var.csv",explained_total_var[sort_mask], fmt='%1.5f', delimiter=',')
		np.savetxt("ICA_explained_var_ratio.csv",explained_var_ratio[sort_mask], fmt='%1.5f', delimiter=',')
		img_data_trunc=img_data_trunc[:,sort_mask]
		if filetype=='nii.gz':
			savenifti_v2(img_data_trunc, masking_array[0], outname, affine_array[0])
		else:
			pointer = 0
			position_array = [0]
			for i in range(len(masking_array)):
				pointer += len(masking_array[i][masking_array[i]==True])
				position_array.append(pointer)
			del pointer
			for i in range(len(masking_array)):
				start = position_array[i]
				end = position_array[i+1]
				savemgh_v2(img_data_trunc[start:end], masking_array[i], "%d_%s" % (i,outname), affine_array[i])

		# save outputs and ica functions for potential ica removal
		if os.path.exists('ICA_temp'):
			print('ICA_temp directory exists')
			exit()
		else:
			os.makedirs('ICA_temp')
		np.save('ICA_temp/signals.npy',S_)
		pickle.dump( ica, open( "ICA_temp/icasave.p", "wb" ) )


	return ica, sort_mask,  sum_total_variance_explained
Example #20
    def _fastica(self):
        f = FastICA(random_state=self.random_state)
        return f.fit(self.Z.T).components_
Example #21
        print(f"Fold #{fold}")
        print("TRAIN:", index_subjects[train_index], "TEST:", index_subjects[test_index])
        # load training and testing data
        print('Load training data... (view {})'.format(view))
        train_data = np.concatenate([load_data(sub, view) for sub in index_subjects[train_index]])
        print("Shape of the training data:", train_data.shape)
        print('Load testdata... (view {})'.format(view))
        test_data = np.concatenate([load_data(sub, view) for sub in index_subjects[test_index]])
        print("Shape of the test data:", test_data.shape)
        # Data normalization to range [0, 1]
        # print("Data normalization to range [0, 1]")
        scaler = MinMaxScaler()
        normalized_train_data = scaler.fit_transform(train_data)
        normalized_test_data = scaler.transform(test_data)
        # initialize ica
        ica = FastICA(n_components=dim)

        # fit ica on training set
        ica.fit(normalized_train_data)

        # Apply the mapping (transform) to both the training set and the test set
        X_train_ica = ica.transform(normalized_train_data)
        X_test_ica = ica.transform(normalized_test_data)
        print("Original shape:   ", normalized_train_data.shape)
        print("Transformed shape:", X_train_ica.shape)

        # Reconstruction of training data
        print("Reconstruction of training data... ")
        X_train_new = ica.inverse_transform(X_train_ica)
        print("Reconstructed matrix shape:", X_train_new.shape)
        mse = mean_squared_error(normalized_train_data, X_train_new)
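A small follow-up (assumed) to report the reconstruction error computed above:

        print("ICA reconstruction MSE on the training data:", mse)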
Example #22
def detectHeartRate(fps, video, sequence):
    def visualize(x, y, path):
        global IMAGE_IDX
        plt.figure(figsize=(15, 12))

        plt.tick_params(labelsize=23)
        plt.plot(x, y[0], color='red', linewidth=2, linestyle='-')
        plt.plot(x, y[1], color='green', linewidth=2, linestyle='-')
        plt.plot(x, y[2], color='blue', linewidth=2, linestyle='-')

        plt.savefig(path + "/signal" + str(IMAGE_IDX) + '.jpg')
        plt.close()
        IMAGE_IDX += 1

    frame_num = len(sequence)
    print("Frame number with human face:", frame_num)

    psd_path = "./results/psd/" + video[:video.index('/')]
    folder = os.path.exists(psd_path)
    if not folder:
        os.makedirs(psd_path)
    '''Normalize the RGB value'''
    sequence = np.array(sequence)
    x = [i for i in range(len(sequence))]
    sequence = signal.detrend(sequence, axis=0)
    visualize(x, sequence.T, psd_path)

    mean = np.mean(sequence, axis=0)
    std = np.std(sequence, axis=0)
    sequence = (sequence - mean) / std
    ''' 
    Apply ICA to clear the RGB signals
    input shape:sequenceLength * 3
    output shape: sequenceLength * 3 
    '''
    predictions = []
    n_window_frames = fps * WINDOW_LENGTH
    print("Frame number of sliding window: ", n_window_frames)
    print("Number of sliding windows:", len(sequence) / fps - WINDOW_LENGTH)
    for start_idx in range(0, len(sequence) - n_window_frames, fps):
        window = sequence[start_idx:start_idx +
                          min(n_window_frames, len(sequence))]
        x = [i for i in range(min(n_window_frames, len(sequence)))]
        visualize(x, window.T, psd_path)
        '''Apply ICA method'''
        # print("ICA input shape:", window.shape)
        ica = FastICA(max_iter=2000, tol=0.1)
        transformed = ica.fit_transform(window)
        # print("output shape after ICA transformation:", transformed.shape)
        visualize(x, transformed.T, psd_path)
        '''Apply FFT method and PSD method'''
        powerSpec = np.abs(np.fft.fft(transformed, axis=0))**2
        maxPwrSrc = np.max(powerSpec, axis=1)
        freqs = np.fft.fftfreq(len(transformed), 1.0 / fps)
        '''Filter the HR signals using the frequency band'''
        valid_idx = np.where((freqs >= MIN_HR_BPM / 60)
                             & (freqs <= MAX_HR_BMP / 60))
        valid_pwr = maxPwrSrc[valid_idx]
        valid_freqs = freqs[valid_idx]

        visualize(valid_freqs, powerSpec[valid_idx].T, psd_path)
        visualize(valid_freqs, valid_pwr.T, psd_path)
        '''Predict the heart rate'''
        max_pwr_idx = np.argmax(valid_pwr)
        predictions.append(valid_freqs[max_pwr_idx] * 60.0)
    return predictions
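Hypothetical usage (the names below are assumptions): sequence would be a per-frame list of mean RGB values from the detected face region, sampled at the camera frame rate.

# predictions = detectHeartRate(fps=30, video='subject01/clip.avi', sequence=rgb_means)
# print('estimated heart rate per window (BPM):', predictions)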
Example #23
print(pca_model)

sparse_pca = SparsePCA(n_components=50)
sparse_pca_model = sparse_pca.fit(sparse_pca_data)
sparse_pca_X_new = sparse_pca.fit_transform(X)
joblib.dump(sparse_pca_model, 'sparse_pca_model.pkl')
joblib.dump(sparse_pca_X_new, 'sparse_pca_X_new.pkl')
print(sparse_pca_model)

kernel_pca = KernelPCA(n_components=50)
kernel_pca_model = kernel_pca.fit(kernel_pca_data)
kernel_X_new = kernel_pca.fit_transform(X)
joblib.dump(kernel_pca_model, 'kernel_pca_model.pkl')
joblib.dump(kernel_X_new, 'kernel_X_new.pkl')

fast_ica = FastICA(n_components=None)
fast_ica_start = time.time()
fast_ica_model = fast_ica.fit(fast_ica_data)
fast_ica_end = time.time()
print('fast_ica fit time', fast_ica_end - fast_ica_start)
fast_ica_X_new = fast_ica.transform(X)
joblib.dump(fast_ica_model, 'fast_ica_model.pkl')
joblib.dump(fast_ica_X_new, 'fast_ica_X_new.pkl')
print(fast_ica_model)
'''
nmf = NMF(n_components=None)
nmf_start = time.time()
#nmf_model = nmf.fit(nmf_data)
nmf_X_new = nmf.fit_transform(X)
nmf_end = time.time()
print 'nmf fit time', nmf_end - nmf_start
Example #24
from sklearn.model_selection import train_test_split

# Load the iris data
iris = datasets.load_iris()
data = iris['data']
print(data.shape)

# Draw a scatter plot
fig = plt.figure()
plt.plot(data[:, 2], data[:, 3], 'k.')
plt.xlabel("petalLength")
plt.ylabel("petalWidth")
plt.show()

# Run ICA
ICA = FastICA(n_components=2, random_state=0)  # build 2 basis components
X_transformed = ICA.fit_transform(data)

# Visualize with colors separated by class (original features)
features = iris.data[:, [0, 2]]
plt.scatter(*features.T,
            c=[['orange', 'green', 'blue'][x] for x in iris.target])
plt.show()

# Visualize after the transformation
fig = plt.figure()
features = X_transformed[:, [0, 1]]
plt.scatter(*features.T,
            c=[['orange', 'green', 'blue'][x] for x in iris.target])
plt.show()
print('--------------------------------')
Example #25
# Before modeling, perform some dimensionality reduction

# PCA
remaining_comp = 15 # number of dimensions the data is reduced to
pca = PCA(n_components=remaining_comp, random_state=420)
pca_train = pca.fit_transform(x_train)
pca_test = pca.transform(x_test)

# tSVD
tsvd = TruncatedSVD(n_components=remaining_comp, random_state=420)
tsvd_train = tsvd.fit_transform(x_train)
tsvd_test = tsvd.transform(x_test)

# ICA
ica = FastICA(n_components=remaining_comp, random_state=420)
ica_train = ica.fit_transform(x_train)
ica_test = ica.transform(x_test)

# GRP
grp = GaussianRandomProjection(n_components=remaining_comp, eps=0.1, random_state=420)
grp_train = grp.fit_transform(x_train)
grp_test = grp.transform(x_test)

# SRP
srp = SparseRandomProjection(n_components=remaining_comp, dense_output=True, random_state=420)
srp_train = srp.fit_transform(x_train)
srp_test = srp.transform(x_test)

# NMF
nmf = NMF(n_components=remaining_comp, init='nndsvdar', random_state=420)
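The excerpt stops after constructing the NMF object; an assumed continuation mirroring the other reducers (and requiring the inputs to be non-negative) would be:

nmf_train = nmf.fit_transform(x_train)
nmf_test = nmf.transform(x_test)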
Example #26
    )
    logging.info('Step 1 - .. Done')

    X_player, y_player = df[stg.PLAYER_FEATURES], df[stg.PLAYER_TARGET]

    logging.info('Step 1 - Impute missing values with median ..')
    dump(X_player.median().to_dict(),
         join(stg.MODELS_DIR, stg.PLAYER_FEATURES_MEDIAN_FILENAME))
    dump(X_player.median().to_dict(),
         join(stg.SUBMISSION_DIR, stg.PLAYER_FEATURES_MEDIAN_FILENAME))
    X_player.fillna(X_player.median(), inplace=True)
    logging.info('Step 1 - .. Done')

    logging.info('Step 1 - Fit and save pipeline to predict players..')
    player_pipeline = make_pipeline(
        make_union(FastICA(tol=0.85), FunctionTransformer(copy)),
        ExtraTreesClassifier(n_estimators=75,
                             max_depth=18,
                             bootstrap=False,
                             criterion="gini",
                             max_features=0.1,
                             min_samples_leaf=1,
                             min_samples_split=2))
    player_pipeline_light = make_pipeline(
        make_union(FastICA(tol=0.85), FunctionTransformer(copy)),
        ExtraTreesClassifier(n_estimators=75,
                             max_depth=17,
                             bootstrap=False,
                             criterion="gini",
                             max_features=0.1,
                             min_samples_leaf=1,
Example #27
def project(vector,
            shape,
            roimask=None,
            n_components=None,
            svd_multiplier=5,
            calc_residuals=True):
    '''
    Apply an ica decomposition to the first axis of the input vector.  If a roimask is provided, the flattened roimask will be used to crop the vector before decomposition.

    If n_components is not set, an adaptive svd threshold is used (see approximate_svd_linearity_transition), with the hyperparameter svd_multiplier.

    Residuals lost in the ICA projection are captured if calc_residuals == True.  This represents the signal lost by ICA compression.

    Arguments:
        vector: 
            The (x*y, t) vector to be spatially ICA projected
        shape:
            The shape of the original movie (t,x,y)
        roimask:
            The roimask to crop the vectorized movie (x,y)
        n_components:
            Manually request a set number of ICA components
        svd_multiplier:
            The hyperparameter for svd adaptive thresholding
        calc_residuals:
            Whether to calculate spatial and temporal residuals of projection compression.

    Returns:
        components: A dictionary containing all the results, metadata, and information regarding the filter applied.

            mean: 
                the original video mean
            roimask: 
                the mask applied to the video before decomposing
            shape: 
                the original shape of the movie array
            eig_mix: 
                the ICA mixing matrix
            timecourses: 
                the ICA component time series
            eig_vec: 
                the eigenvectors
            n_components:
                the number of components in eig_vec (reduced to only have 25% of total components as noise)
            project_meta:
                The metadata for the ica projection
            expmeta:
                All metadata created for this class
            lag1: 
                the lag-1 autocorrelation
            noise_components: 
                a vector (n components long) to store binary representation of which components were detected as noise 
            cutoff: 
                the signal-noise cutoff value

        if the n_components was automatically set, the following additional keys are also returned in components

            svd_cutoff: 
                the number of components originally decomposed
            lag1_full: 
                the lag-1 autocorrelation of the full set of components decomposed before cropping to only 25% noise components
            svd_multiplier: 
                the svd multiplier value used to determine cutoff
    '''
    print('\nCalculating Eigenspace\n-----------------------')
    assert (vector.ndim == 2), (
        'vector was not a two-dimensional np array.'
        'If input is a movie, be sure to convert shape to (xy, t)')

    if roimask is not None:
        print('Using roimask to crop video')
        assert roimask.size == vector.shape[0], \
        'Vector was not the same size as the cropped mask'

        print('Original size:', vector.shape)
        maskind = np.where(roimask.flat == 1)
        vector = vector[maskind]
        print('Reduced size:', vector.shape)

    mean = np.mean(vector, 0).flatten()
    vector = vector - mean

    components = {}
    components['mean'] = mean
    components['roimask'] = roimask
    components['shape'] = shape

    if svd_multiplier is None:
        svd_multiplier = 5

    if vector.dtype == np.float16:
        vector = vector.astype('float32', copy=False)

    if n_components is None:
        print('Calculating ICA (with n_component SVD estimator)...')

        t0 = timer()
        try:
            u, ev, _ = linalg.svd(vector, full_matrices=False)
        except ValueError:
            # LAPACK error if matrices are too big
            u, ev, _ = linalg.svd(vector,
                                  full_matrices=False,
                                  lapack_driver='gesvd')

        components['svd_eigval'] = ev

        # Starting point for the decomposition: svd_multiplier times the approximate
        # point where the tail of the eigenvalue spectrum transitions to linearity
        cross_1 = approximate_svd_linearity_transition(ev)
        n_components = cross_1 * svd_multiplier

        components['increased_cutoff'] = 0

        while True:
            print('\nCalculating ICA with', n_components, 'components...')

            w_init = u[:n_components, :n_components].astype('float64')
            ica = FastICA(n_components=n_components,
                          random_state=1000,
                          w_init=w_init)

            eig_vec = ica.fit_transform(vector)
            eig_mix = ica.mixing_

            noise, cutoff = sort_noise(eig_mix.T)

            p_signal = (1 - noise.sum() / noise.size) * 100

            if noise.size == shape[0]:  # all components are being used
                break
            elif p_signal < 75:
                print('ICA components were under 75% signal ({0}% signal).'\
                    .format(p_signal))
                break
            elif n_components >= shape[0]:
                print('ICA components were over 75% signal ({0}% signal).'\
                    .format(p_signal))
                print('However, number of components is maxed out.')
                print('Using this decomposition...')
                break
            else:
                print('ICA components were over 75% signal ({0}% signal).'\
                    .format(p_signal))
                print('Recalculating with more components...')
                n_components += n_components // 2
                components['increased_cutoff'] += 1

                if n_components > shape[0]:
                    print('\nComponents maxed out!')
                    print('\tAttempted:', n_components)
                    n_components = shape[0]
                    print('\tReduced to:', shape[0])

        components['lag1_full'] = lag_n_autocorr(eig_mix.T, 1)
        components['svd_multiplier'] = svd_multiplier

        print('Cropping excess noise components')
        components['svd_cutoff'] = n_components
        reduced_n_components = int((noise.size - noise.sum()) * 1.25)

        print('reduced_n_components:', reduced_n_components)

        if reduced_n_components < n_components:
            print('Cropping', n_components, 'to', reduced_n_components)

            ev_sort = np.argsort(eig_mix.std(axis=0))
            eig_vec = eig_vec[:, ev_sort][:, ::-1]
            eig_mix = eig_mix[:, ev_sort][:, ::-1]
            noise = noise[ev_sort][::-1]

            eig_vec = eig_vec[:, :reduced_n_components]
            eig_mix = eig_mix[:, :reduced_n_components]
            n_components = reduced_n_components
            noise = noise[:reduced_n_components]

            components['lag1_full'] = components['lag1_full'][ev_sort][::-1]
        else:
            print('Less than 75% signal.  Not cropping excess noise.')

        components['noise_components'] = noise
        components['cutoff'] = cutoff
        t = timer() - t0
        print('Independent Component Analysis took: {0} sec'.format(t))

    else:
        print('Calculating ICA (' + str(n_components) + ' components)...')

        t0 = timer()
        ica = FastICA(n_components=n_components, random_state=1000)

        try:
            eig_vec = ica.fit_transform(vector)  # Eigenbrains
        except ValueError:
            print('Calculation exceeded float32 maximum.')
            print('Trying again with float64 vector...')
            #value error if any value exceeds float32 maximum.
            #overcome this by converting to float64
            eig_vec = ica.fit_transform(vector.astype('float64'))

        t = timer() - t0
        print('Independent Component Analysis took: {0} sec'.format(t))
        eig_mix = ica.mixing_

        # sort components by their eig val influence (approximated by timecourse standard deviation)
        ev_sort = np.argsort(eig_mix.std(axis=0))
        eig_vec = eig_vec[:, ev_sort][:, ::-1]
        eig_mix = eig_mix[:, ev_sort][:, ::-1]

        noise, cutoff = sort_noise(eig_mix.T)
        components['noise_components'] = noise
        components['cutoff'] = cutoff

    print('components shape:', eig_vec.shape)

    components['eig_mix'] = eig_mix
    components['timecourses'] = eig_mix.T

    n_components = eig_vec.shape[1]
    components['eig_vec'] = eig_vec
    components['n_components'] = n_components
    components['lag1'] = lag_n_autocorr(components['timecourses'], 1)

    if calc_residuals:
        try:
            vector = vector.astype('float64')
            rebuilt = rebuild(components,
                              artifact_components='none',
                              vector=True).T

            rebuilt -= rebuilt.mean(axis=0)
            vector -= vector.mean(axis=0)

            residuals = np.abs(vector - rebuilt)

            residuals_temporal = residuals.mean(axis=0)

            if roimask is not None:
                residuals_spatial = np.zeros(roimask.shape)
                residuals_spatial.flat[maskind] = residuals.mean(axis=1)
            else:
                residuals_spatial = np.reshape(residuals.mean(axis=1),
                                               (shape[1], shape[2]))

            components['residuals_spatial'] = residuals_spatial
            components['residuals_temporal'] = residuals_temporal

        except Exception as e:
            print('Residual Calculation Failed!!')
            print('\t', e)

    # Save filter metadata information about how and when movie was filtered in dictionary
    project_meta = {}
    project_meta['time_elapsed'] = t
    project_meta['date'] = \
        datetime.now().strftime('%Y%m%d')[2:]
    fmt = '%Y-%m-%dT%H:%M:%SZ'
    project_meta['tstmp'] = \
        datetime.now().strftime(fmt)
    project_meta['n_components'] = n_components
    components['project_meta'] = project_meta

    print('\n')
    return components
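A hypothetical usage sketch for project(), assuming its helper functions (approximate_svd_linearity_transition, sort_noise, lag_n_autocorr, rebuild) are defined in the same module; the movie is random noise purely to illustrate the expected shapes.

import numpy as np

movie = np.random.randn(100, 16, 16).astype('float32')     # (t, x, y)
vector = movie.reshape(movie.shape[0], -1).T                # (x*y, t)
components = project(vector, shape=movie.shape,
                     n_components=10, calc_residuals=False)
print(components['timecourses'].shape)                      # (10, 100): one time series per IC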
Example #28
# plt.show()
# Nope, actually health isn't really trending up at all recently.

# How about some nice ICA with your data
#FastICA(n_components=n_components, whiten=True),
# Compute ICA
# ica = FastICA(n_components=3)
# S_ = ica.fit_transform(X)  # Reconstruct signals

###############################################
## Number of components to use in ICA        ##
###############################################
ncomp = 13
###############################################

ica = FastICA(n_components=ncomp, whiten=True)
ica.fit(bigdf)
#icafittrans = ica.fit_transform(bigdf)
icafittrans = ica.transform(bigdf)
print(icafittrans.shape)
icafittrans = pd.DataFrame(icafittrans)
icafittrans.index = dfDateIndex

A_ = ica.mixing_  # Get estimated mixing matrix
# nrows: number of search terms
# ncols: number of components (that we chose)
# so each column is a list of that component's contribution to each search term
# so if we sort a column, that will give us the top search terms for that component!
# but let's make it a data frame and label everything correctly for convenience
A_ = pd.DataFrame(A_)
# set the row names
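A sketch of the idea in the comments above, assuming the search terms are the columns of bigdf: label the mixing matrix rows and sort one component's column to get its top search terms.

A_.index = bigdf.columns
component = 0
top_terms = A_[component].abs().sort_values(ascending=False).head(10)
print(top_terms)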
Example #29
    if l.strip() == "":
        continue
    sp = l.split(',')
    print(sp)
    nums[i, 0] = idxs[sp[0].strip()]
    # TODO i,1
    nums[i, 2:] = [int(s) for s in sp[2:]]

print(nums)

print("PCA...")
import matplotlib.pyplot as plt
# from sklearn.decomposition import PCA
# pca = PCA(n_components=3)
from sklearn.decomposition import FastICA
pca = FastICA(n_components=3)

pca.fit(nums[:, 2:])
# print(pca.explained_variance_ratio_)
# print(pca.singular_values_)

import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

import matplotlib.cm as cm
colors = cm.rainbow(np.linspace(0, 1, len(names)))
for i, xyz in enumerate(pca.transform(nums[:, 2:])):
    ax.scatter(xyz[0], xyz[1], xyz[2], color=colors[int(nums[i][0])])

ax.set_xlabel('X Label')
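An assumed continuation that finishes the 3D scatter plot:

ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.show()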
Example #30
    def run(self, df, train, test, nbite, reg, alpha=0.001, verbose=True):
        """
        data: pandas dataframe
        nbite: number of iterations
        reg: regularization parameter
        alpha: learning rate
        """

        # item-mean baseline
        #baseline = tf.constant(np.tile(np.array(df.mean(axis=0)),(df.shape[1],1)))
        shape = df.shape

        # constant: the full matrix R to reconstruct
        R = tf.constant(df.values)
        # TensorFlow mask variables
        mask_tf_train = tf.Variable(train)
        mask_tf_test = tf.Variable(test)

        # TensorFlow variables

        if self.init == "random":
            # U and I initialized from a normal distribution and normalized by dividing by k
            U = tf.Variable(np.abs(
                np.random.normal(scale=1. / self.k,
                                 size=(shape[0], self.k)).astype(np.float64)),
                            name="U")
            I = tf.Variable(np.abs(
                np.random.normal(scale=1. / self.k,
                                 size=(self.k, shape[1])).astype(np.float64)),
                            name="I")
        if self.init == "ica":
            matrix = dok_matrix(df.shape, dtype=np.float64)

            for i in range(df.shape[0]):
                for j in range(df.shape[1]):
                    if not np.isnan(df.values[i, j]):
                        matrix[i, j] = df.values[i, j]

            ica = FastICA(n_components=self.k)

            U = tf.Variable(np.abs(ica.fit_transform(matrix.toarray())),
                            name="U")
            I = tf.Variable(np.abs(ica.components_), name="I")

        if self.init == "pca":
            matrix = dok_matrix(df.shape, dtype=np.float64)

            for i in range(df.shape[0]):
                for j in range(df.shape[1]):
                    if not np.isnan(df.values[i, j]):
                        matrix[i, j] = df.values[i, j]

            pca = PCA(n_components=self.k)

            U = tf.Variable(np.abs(pca.fit_transform(matrix.toarray())),
                            name="U")
            I = tf.Variable(np.abs(pca.components_), name="I")

        R_pred = tf.matmul(U, I)  #embeddings

        # beta: regularization parameter
        beta = tf.constant(reg, dtype=tf.float64, name="beta")
        # L1 regularization
        regularizer = beta * (tf.reduce_sum(U) + tf.reduce_sum(I))

        # NMF cost: matrix norm of R - R_pred
        cost = tf.reduce_sum(
            tf.square(
                tf.boolean_mask(R, mask_tf_train) -
                tf.boolean_mask(R_pred, mask_tf_train)))
        cost += regularizer

        # non-negativity constraints on U and I
        clip_U = U.assign(tf.maximum(tf.zeros_like(U), U))
        clip_I = I.assign(tf.maximum(tf.zeros_like(I), I))
        clip = tf.group(clip_U, clip_I)

        # training MSE
        mse_train = tf.reduce_mean(tf.square(
            tf.boolean_mask(R_pred, mask_tf_train) -
            tf.boolean_mask(R, mask_tf_train)),
                                   name="mse_train")
        mse_test = tf.reduce_mean(tf.square(
            tf.boolean_mask(R_pred, mask_tf_test) -
            tf.boolean_mask(R, mask_tf_test)),
                                  name="mse_test")

        # baseline test MSE
        #baselineMSE = tf.reduce_mean(tf.square(tf.boolean_mask(baseline, mask_tf_test) - tf.boolean_mask(R, mask_tf_test)))

        global_step = tf.Variable(0, trainable=False)

        if self.solver == "adam":
            optimizer = tf.train.AdamOptimizer(alpha).minimize(
                cost, global_step=global_step)
        elif self.solver == "sgd":
            optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(
                cost, global_step=global_step)
        else:
            # RMSProp optimizer
            optimizer = tf.train.RMSPropOptimizer(alpha).minimize(
                cost, global_step=global_step)

        costs = []
        mses_train = []
        mses_test = []

        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        for i in range(nbite):
            sess.run(optimizer)
            sess.run(clip)
            if i % 100 == 0:
                prout = sess.run(cost)
                lol = sess.run(mse_train)
                mdr = sess.run(mse_test)
                if verbose:
                    print("cost: %f" % prout)
                    print("mse train: %f" % lol)
                    print("mse test: %f" % mdr)
                    print("***************")
                costs.append((i, prout))
                mses_train.append((i, lol))
                mses_test.append((i, mdr))

        learnt_U = sess.run(U)
        learnt_I = sess.run(I)
        #msebaseline = sess.run(baselineMSE)
        #if verbose:
        #print("baseline: ", msebaseline)
        sess.close()

        return learnt_U, learnt_I, {
            "mse_train": mses_train,
            "mse_test": mses_test,
            "cost": costs
        }