Example #1
    def backward(self, X, **params):
        try:
            self.lossy = params["lossy"]
        except KeyError:
            pass

        try:
            self.n_components = params["n_components"]
        except KeyError:
            pass

        pca = PCA(n_components=self.n_components)
        tiles = []
        for x in X:
            component, cov, mean = x
            component0, component1, component2 = dcps_array_3d(component)
            cov0, cov1, cov2 = dcps_array_3d(cov)
            mean0, mean1, mean2 = mean[0], mean[1], mean[2]
            pca.components_ = cov0[:self.n_components, :self.n_components]
            pca.mean_ = mean0[:self.n_components]
            channel0 = pca.inverse_transform(component0[:, :self.n_components])
            pca.components_ = cov1[:self.n_components, :self.n_components]
            pca.mean_ = mean1[:self.n_components]
            channel1 = pca.inverse_transform(component1[:, :self.n_components])
            pca.components_ = cov2[:self.n_components, :self.n_components]
            pca.mean_ = mean2[:self.n_components]
            channel2 = pca.inverse_transform(component2[:, :self.n_components])

            tiles.append(cat_arrays_2d([channel0, channel1, channel2]))

        return tiles
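Note: this pattern works because scikit-learn's PCA reads only its fitted attributes (components_, mean_, and, when whiten=True, explained_variance_) inside transform and inverse_transform, so those attributes can be assigned by hand instead of being produced by fit. A minimal, self-contained sketch of that round trip (the shapes and random data are illustrative, not taken from the example above):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
data = rng.normal(size=(100, 8))            # hypothetical data matrix

fitted = PCA(n_components=3).fit(data)
scores = fitted.transform(data)             # (100, 3)

# Rebuild a PCA object from stored arrays, without calling fit().
restored = PCA(n_components=3)
restored.components_ = fitted.components_   # (3, 8)
restored.mean_ = fitted.mean_               # (8,)

reconstructed = restored.inverse_transform(scores)                   # (100, 8)
print(np.allclose(reconstructed, fitted.inverse_transform(scores)))  # True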
Example #2
def naive_bayes(images, size, labels, color_labels):

    mean = numpy.mean(images)
    std = numpy.std(images)

    scaled = preprocessing.scale(images, with_mean=True, with_std=True)

    pca = PCA(4)
    pca.fit(scaled)

    one_two = pca.components_[:2]
    three_four = pca.components_[2:4]

    lab = numpy.copy(labels)

    for i, elem in enumerate(labels):
        if elem == 'dog':
            labels[i] = 0
        if elem == 'house':
            labels[i] = 1
        if elem == 'guitar':
            labels[i] = 2
        if elem == 'person':
            labels[i] = 3

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        scaled, labels, train_size=0.7, random_state=1)

    model = nb.GaussianNB()
    model.fit(X_train, y_train)
    score_original = model.score(X_test, y_test)
    print(f"Score with without pca decomposition is: {score_original}")

    pca.components_ = one_two
    X_train_transf = pca.transform(X_train)
    X_test_transf = pca.transform(X_test)
    model.fit(X_train_transf, y_train)
    score_first = model.score(X_test_transf, y_test)
    print(f"Score with first two pcs is: {score_first}")
    title = "Decision boundaries for NB with first two pca components"

    plot_boundaries(X_train_transf, y_train, X_test_transf, y_test, model,
                    title, lab)

    pca.components_ = three_four
    X_train_transf = pca.transform(X_train)
    X_test_transf = pca.transform(X_test)
    model.fit(X_train_transf, y_train)
    score_second = model.score(X_test_transf, y_test)
    print(f"Score with third and fourth pcs is: {score_second}")
    title = "Decision boundaries for NB with 3rd and 4th pca components"

    plot_boundaries(X_train_transf, y_train, X_test_transf, y_test, model,
                    title, lab)
    return
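Several of these examples (this one, #3, #16, #17 and #22) rely on the same trick: fit PCA once, then overwrite components_ with a row slice so that transform projects onto only the chosen principal components while mean_ stays valid. A short, hedged illustration on synthetic data (not the image data used above):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 10))

pca = PCA().fit(X)                      # keeps all 10 components

pca.components_ = pca.components_[2:4]  # keep only the 3rd and 4th PCs
X_34 = pca.transform(X)                 # scores on PC3 and PC4
print(X_34.shape)                       # (200, 2)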
Example #3
def showImageWithPcaLast(n_pca, input_matrix, nImg, show):
    #PCA last n computation
    pca_last_n = PCA()
    scaled = scaler.fit_transform(input_matrix)
    pca_last_n = pca_last_n.fit(scaled)
    components = pca_last_n.components_[-n_pca:]
    pca_last_n.components_ = components
    x_last_n = pca_last_n.transform(scaled)
    x_inv_last_n = pca_last_n.inverse_transform(x_last_n)
    x_inv_last_n = scaler.inverse_transform(x_inv_last_n)

    #print variance ratio
    val = np.sum(pca_last_n.explained_variance_ratio_[-n_pca:])
    print("Variance coverage for the last " + str(n_pca) + ": " +
          str(val))  #variance covered by these components

    if (show):
        fig_last_n = plt.figure()
        fig_last_n.add_subplot(1, 2, 1)
        plt.imshow(np.reshape(input_matrix[nImg, :] / 255.0, (227, 227, 3)))
        fig_last_n.add_subplot(1, 2, 2)
        plt.imshow(np.reshape(x_inv_last_n[nImg, :] / 255.0, (227, 227, 3)))
        plt.show()
        #plt.clf()
    else:
        return x_inv_last_n
Example #4
    def test_init(self, df_norm, n_components):
        from flotilla.compute.decomposition import DataFramePCA

        test_pca = DataFramePCA(df_norm, n_components=n_components)

        true_pca = PCA(n_components=n_components)
        true_pca.fit(df_norm.values)
        pc_names = ['pc_{}'.format(i + 1) for i in
                    range(true_pca.components_.shape[0])]
        true_pca.components_ = pd.DataFrame(true_pca.components_,
                                            index=pc_names,
                                            columns=df_norm.columns)
        true_pca.explained_variance_ = pd.Series(
            true_pca.explained_variance_, index=pc_names)
        true_pca.explained_variance_ratio_ = pd.Series(
            true_pca.explained_variance_ratio_, index=pc_names)
        true_pca.reduced_space = true_pca.transform(df_norm.values)
        true_pca.reduced_space = pd.DataFrame(true_pca.reduced_space,
                                              index=df_norm.index,
                                              columns=pc_names)

        npt.assert_array_equal(test_pca.X, df_norm.values)
        pdt.assert_frame_equal(test_pca.components_,
                               true_pca.components_)
        pdt.assert_series_equal(test_pca.explained_variance_,
                                true_pca.explained_variance_)
        pdt.assert_series_equal(test_pca.explained_variance_ratio_,
                                true_pca.explained_variance_ratio_)
        pdt.assert_frame_equal(test_pca.reduced_space,
                               true_pca.reduced_space)
Example #5
	def load(self, filename='pca.nc'):
		"""
		Read sklearn PCA parameters from a netcdf file
		"""

		infile = netCDF4.Dataset(filename, 'r')

		self.locations = [json.loads(string) for string in list(infile.variables['location'])]
		self.pcas = []

		id = 0
		for location in self.locations:
			n_components = infile.variables['n_components'][id]
			components = infile.variables['components'][id]
			mean = infile.variables['means'][id]
			explained_variance_ratio = infile.variables['explained_variance_ratio'][id]
			noise_variance = infile.variables['noise_variance'][id]

			pca = PCA(n_components=n_components)
			pca.components_ = components
			pca.mean_ = mean
			pca.explained_variance_ratio_ = explained_variance_ratio
			pca.noise_variance_ = noise_variance

			self.pcas.append(pca)

			id += 1
Example #6
def principalComponent(X,
                       n_components,
                       isPrint=False,
                       isShow=False,
                       n_show=None):
    pca = PCA(n_components=n_components)
    pca.fit(X)
    X_pca = pca.transform(X)
    if isPrint:
        print("Форма исходного массива: {}".format(str(X.shape)))
        print("Форма массива после сокращения размерности: {}".format(
            str(X_pca.shape)))
        print("форма главных компонент: {}".format(pca.components_.shape))
        print("компоненты PCA:\n{}".format(pca.components_))
    if isShow:
        if n_show is not None:
            pca.components_ = pca.components_[0:n_show[0], 0:n_show[1]]
        plt.matshow(pca.components_, cmap='seismic')
        # plt.yticks([0, 1], ["First component", "Second component"])
        plt.colorbar()
        # plt.xticks(range(len(X.columns)),
        #            X.columns, rotation=85, ha='left')
        plt.xlabel("Feature")
        plt.ylabel("Principal components")
        plt.show()
    return X_pca
Example #7
def E(A, B, n_components=20):
    pca_A = PCA(n_components)
    pca_A.fit(np.array(A))

    pca_B = PCA(n_components)
    pca_B.fit(np.array(B))

    # This approach tries to use the eigenvectors and eigenvalues from B
    pca_A.components_ = pca_B.components_
    pca_A.explained_variance_ = pca_B.explained_variance_
    transformed = pca_A.transform(A)
    inverse_transformed = pca_A.inverse_transform(transformed)

    #This approach tries to use pca_B with mean_A
    # pca_B.mean_=pca_A.mean_
    # transformed = pca_B.transform(A)
    # inverse_transformed = pca_B.inverse_transform(transformed)

    error_vector = np.array(A) - np.array(inverse_transformed)
    N = len(error_vector)
    squares = []

    for i in range(0, len(error_vector)):
        squares.append(list(map(lambda x: x * x, error_vector[i])))
    sums = list(map(lambda x: sum(x), squares))
    error = sum(sums) / N
    return error
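The loop at the end of E computes the mean over samples of the per-sample sum of squared reconstruction errors. A vectorized equivalent (same quantity, assuming A and inverse_transformed are 2-D arrays of the same shape):

import numpy as np

def reconstruction_mse(A, A_reconstructed):
    # Mean over samples of the per-sample sum of squared errors.
    error = np.asarray(A) - np.asarray(A_reconstructed)
    return np.sum(error ** 2, axis=1).mean()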
Example #8
    def _initialize_with_pca(self,
                             datas,
                             inputs=None,
                             masks=None,
                             tags=None,
                             num_iters=20):
        for data in datas:
            assert data.shape[1] == self.N

        N_offsets = np.cumsum(self.N_vec)[:-1]
        pcas = []

        split_datas = list(
            zip(*[np.split(data, N_offsets, axis=1) for data in datas]))
        split_masks = list(
            zip(*[np.split(mask, N_offsets, axis=1) for mask in masks]))
        assert len(split_masks) == len(split_datas) == self.P

        for em, dps, mps in zip(self.emissions_models, split_datas,
                                split_masks):
            pcas.append(em._initialize_with_pca(dps, inputs, mps, tags))

        # Combine the PCA objects
        from sklearn.decomposition import PCA
        pca = PCA(self.D)
        pca.components_ = block_diag(*[p.components_ for p in pcas])
        pca.mean_ = np.concatenate([p.mean_ for p in pcas])
        # Not super pleased with this, but it should work...
        pca.noise_variance_ = np.concatenate(
            [p.noise_variance_ * np.ones(n) for p, n in zip(pcas, self.N_vec)])
        return pca
Example #9
 def from_tuple(tuple):
     # Create PCA object
     components, explained_variance, mean, whiten = tuple
     pca = PCA(whiten=whiten)
     pca.components_ = components
     pca.explained_variance_ = explained_variance
     pca.mean_ = mean
     return pca
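from_tuple restores a PCA from four stored values; the matching serializer is not shown here. A plausible counterpart (an assumption, since the original to_tuple is not part of this snippet) simply reads back the attributes that from_tuple re-assigns:

def to_tuple(pca):
    # Hypothetical inverse of from_tuple: capture the attributes it restores.
    return (pca.components_, pca.explained_variance_, pca.mean_, pca.whiten)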
Example #10
def get_components(data):
    pca = PCA(n_components=COMPONENTS)
    pca.fit(data)
    new_components = np.array([np.dot(component, ortho_rotation(pca.components_)) for component in pca.components_])
    pca.components_ = new_components
    print(pca.components_, pca.explained_variance_ratio_)
    transformed = pca.transform(data)
    df_transformed = pd.DataFrame(data=transformed,  index=data.index)
    return df_transformed
Example #11
def applyPCA(sampleData, mean, components):
	pca = PCA(n_components=components.shape[0])
	pca.components_ = components
	pca.mean_ = mean

	transform = pca.transform(np.array([sampleData]))

	reconstructed = fast_dot(transform, pca.components_) + pca.mean_
	reconstructed = reconstructed[0]
	return sampleData / reconstructed
Example #12
def project_pc(sample_data, ref_file, ap):
    pca = PCA(n_components=ref_file['pca_components{}'.format(ap)].shape[0])
    pca.components_ = ref_file['pca_components{}'.format(ap)]
    pca.mean_ = ref_file['pca_mean{}'.format(ap)]

    transform = pca.transform(np.array([sample_data]))

    reconstructed = np.dot(transform, pca.components_) + pca.mean_
    reconstructed = reconstructed[0]
    return sample_data / reconstructed
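The manual reconstruction used here and in the applyPCA examples, np.dot(transform, pca.components_) + pca.mean_, is what PCA.inverse_transform computes when whiten=False, so the two forms are interchangeable (fast_dot was an older scikit-learn helper with the same behaviour as np.dot). A quick check under that assumption, on synthetic data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(2)
X = rng.normal(size=(50, 6))
pca = PCA(n_components=3).fit(X)

scores = pca.transform(X)
manual = np.dot(scores, pca.components_) + pca.mean_
print(np.allclose(manual, pca.inverse_transform(scores)))  # True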
Example #13
def applyPCA(sampleData, mean, components):
    pca = PCA(n_components=components.shape[0])
    pca.components_ = components
    pca.mean_ = mean

    transform = pca.transform(np.array([sampleData]))

    reconstructed = fast_dot(transform, pca.components_) + pca.mean_
    reconstructed = reconstructed[0]
    return sampleData / reconstructed
Example #14
    def test_init(self, df_norm, n_components):
        from flotilla.compute.decomposition import DataFramePCA

        test_pca = DataFramePCA(df_norm, n_components=n_components)

        true_pca = PCA(n_components=n_components)
        true_pca.fit(df_norm.values)
        pc_names = ['pc_{}'.format(i+1) for i in
                    range(true_pca.components_.shape[0])]
        true_pca.components_ = pd.DataFrame(true_pca.components_,
                                            index=pc_names,
                                            columns=df_norm.columns)
        true_pca.explained_variance_ = pd.Series(
            true_pca.explained_variance_, index=pc_names)
        true_pca.explained_variance_ratio_ = pd.Series(
            true_pca.explained_variance_ratio_, index=pc_names)
        true_pca.reduced_space = true_pca.transform(df_norm.values)
        true_pca.reduced_space = pd.DataFrame(true_pca.reduced_space,
                                              index=df_norm.index,
                                              columns=pc_names)

        npt.assert_array_equal(test_pca.X, df_norm.values)
        pdt.assert_frame_equal(test_pca.components_,
                               true_pca.components_)
        pdt.assert_series_equal(test_pca.explained_variance_,
                               true_pca.explained_variance_)
        pdt.assert_series_equal(test_pca.explained_variance_ratio_,
                               true_pca.explained_variance_ratio_)
        pdt.assert_frame_equal(test_pca.reduced_space,
                               true_pca.reduced_space)
        
        
# class TestDataFrameNMF():
#     def test_init(self, df_nonneg, n_components, RANDOM_STATE):
#         from flotilla.compute.decomposition import DataFrameNMF
#
#         test_nmf = DataFrameNMF(df_nonneg, n_components=n_components,
#                                 random_state=RANDOM_STATE)
#
#         true_nmf = NMF(n_components=n_components, random_state=RANDOM_STATE)
#         true_nmf.reduced_space = true_nmf.fit_transform(df_nonneg.values)
#         pc_names = ['pc_{}'.format(i + 1) for i in
#                     range(true_nmf.components_.shape[0])]
#         true_nmf.reduced_space = pd.DataFrame(true_nmf.reduced_space,
#                                               index=df_nonneg.index,
#                                               columns=pc_names)
#         true_nmf.components_ = pd.DataFrame(true_nmf.components_,
#                                             index=pc_names,
#                                             columns=df_nonneg.columns)
#
#         npt.assert_almost_equal(test_nmf.X, df_nonneg.values, decimal=4)
#         pdt.assert_frame_equal(test_nmf.components_,
#                                true_nmf.components_)
#         pdt.assert_frame_equal(test_nmf.reduced_space,
#                                true_nmf.reduced_space)
Example #15
def cal_pca(point_cloud,is_show=False,desired_num_of_feature=3,title="pca demo"):
    pca = PCA(n_components=desired_num_of_feature)
    pca.fit(point_cloud)
    print("*"*30)
    print("z 向量 %f ,%f ,%f" % (pca.components_[2,0],pca.components_[2,1],pca.components_[2,2]))
    if(np.inner(pca.components_[2,:],[0,0,1])>0):
        print("pca_z向量與z方向同向,需要對x軸旋轉180度")
        pca.components_[2,:]=-pca.components_[2,:]
        # r = R.from_euler('x',180, degrees=True)
        # r_b_o=R.from_dcm(pca.components_.T)
        # r3=r_b_o*r
        # pca.components_=r3.as_dcm().T
    # Cross product check: the cross product of two of the axes should reproduce the third; compare with the PCA axes to confirm the frame is correct
    x_axis_matrix=np.outer(pca.components_[1,:],pca.components_[2,:])
    x_axis=np.asarray([x_axis_matrix[1,2]-x_axis_matrix[2,1],x_axis_matrix[2,0]-x_axis_matrix[0,2],x_axis_matrix[0,1]-x_axis_matrix[1,0]])
    print("*"*30)
    print("外積計算的x軸為:")
    print(x_axis)

    # Check that pca_x points the same way as the x computed from cross(y, z)
    if(np.allclose(pca.components_[0,:],x_axis)):
        print("pca_x matches the x axis from cross(y, z)")
    else:
        # Opposite direction: replace the (less important) x axis
        print("x direction is wrong; replacing it with the correct axis")
        pca.components_[0,:]=x_axis
    if(np.inner(pca.components_[0,:],[1,0,0])<0):
        # We want the gripper to face forward so the end effector does not need to rotate as much
        print("pca_x points opposite to +x; rotate 180 degrees about the z axis")
        r = R.from_euler('z',180, degrees=True)
        # r_b_o=R.from_dcm(pca.components_.T)
        r3=np.dot(pca.components_.transpose(),r.as_dcm().astype(int))
        pca.components_=r3.transpose()
    if is_show:
        fig = plt.figure(figsize=(3,3))
        ax = fig.add_subplot(111, projection='3d')
        # ax.set_xlabel('X')
        # ax.set_ylabel('Y')
        # ax.set_zlabel('Z')
        # ax.set_xlim3d(-1, 1)
        # ax.set_ylim3d(-1,1)
        # ax.set_zlim3d(-1,1)
        plt.title(title)
        ax.scatter(point_cloud[:,0], point_cloud[:,1], point_cloud[:,2], c='y',s=1)
        xm,ym,zm=get_centroid_from_pc(point_cloud)
        ax.scatter(xm, ym, zm, c='r',s=10)   
        discount=1
        print("*"*30)
        for length, vector in zip(pca.explained_variance_, pca.components_):
            ax.quiver(xm,ym,zm,vector[0],vector[1],vector[2], length=discount)
            discount/=3
        plt.tight_layout()
        plt.show()
    return pca.components_,pca.explained_variance_
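cal_pca recovers the x axis from the outer product of the y and z component vectors, reading the cross product out of that matrix element by element. For 3-vectors, np.cross gives the same result directly; a small check of that equivalence (the axis values here are illustrative):

import numpy as np

y_axis = np.array([0.0, 1.0, 0.0])
z_axis = np.array([0.0, 0.0, 1.0])

# Element-wise form used in cal_pca
m = np.outer(y_axis, z_axis)
x_from_outer = np.array([m[1, 2] - m[2, 1], m[2, 0] - m[0, 2], m[0, 1] - m[1, 0]])

# Same result with np.cross
print(np.allclose(x_from_outer, np.cross(y_axis, z_axis)))  # True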
Example #16
def showColorClasses(input_matrix):
    cvec = []
    #fig = plt.figure(figsize = (6,6))
    plt.subplots_adjust(hspace=0.4, wspace=1)

    scaled = scaler.fit_transform(input_matrix)
    x_t = PCA(2).fit_transform(scaled)
    cvec = [label_color[labels[k]] for k in y]
    #plt.subplot(1,3,1)
    plt.scatter(x_t[:, 0], x_t[:, 1], c=cvec, s=4)
    plt.title("Scatter plot using PC(1,2)")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()

    pca_tot = PCA()
    pca_tot = pca_tot.fit(x)
    components3_4 = pca_tot.components_[3:5]
    components10_11 = pca_tot.components_[10:12]

    pca_tot.components_ = components3_4
    x_3_4 = pca_tot.transform(x)
    #plt.subplot(1,3,2)
    plt.scatter(x_3_4[:, 0], x_3_4[:, 1], c=cvec, s=4)
    plt.title("Scatter plot using PC(3,4)")
    plt.xlabel("PC3")
    plt.ylabel("PC4")
    plt.show()

    pca_tot.components_ = components10_11
    x_10_11 = pca_tot.transform(x)
    #plt.subplot(1,3,3)
    plt.scatter(x_10_11[:, 0], x_10_11[:, 1], c=cvec, s=4)
    plt.title("Scatter plot using PC(10,11)")
    plt.xlabel("PC10")
    plt.ylabel("PC11")
    plt.show()
Example #17
def manual_fit(data, from_pc, to_pc=None):
    if to_pc is None:
        # performs PCA with all the PCs
        pca = PCA()
        #fit the PCA on the data (computes the PCs on the data)
        pca.fit(data)
        to_pc = len(pca.components_)
    else:
        to_pc += 1
        pca = PCA(to_pc)
        pca.fit(data)
    # extracts the last N PCs (last N rows; the corresponding columns are extracted automatically)
    pca.components_ = pca.components_[from_pc:to_pc]
    return pca
Example #18
def extract_pca_component(img, mask):
    # extract ROI
    data = apply_mask(img, image.index_img(mask, 0))
    # normalized data
    scaler = StandardScaler()
    normalized = scaler.fit_transform(data)
    # pca
    pca = PCA(n_components=1)
    pca.fit(normalized)
    # force eigenvalue to be positive
    if np.all(pca.components_ < 0):
        pca.components_ = -1 * pca.components_
    projected = pca.transform(data)
    # variance
    var_projected = np.sum(np.var(projected, axis=0))
    var_original = np.sum(np.var(data, axis=0))
    return projected, var_projected / var_original
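The sign flip in extract_pca_component is possible because the sign of a principal component is arbitrary: negating components_ negates the projected scores but leaves the explained variance unchanged. A brief sketch of that property (synthetic data, illustrative only):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(4)
X = rng.normal(size=(80, 5))

pca = PCA(n_components=1).fit(X)

flipped = PCA(n_components=1)
flipped.components_ = -pca.components_
flipped.mean_ = pca.mean_

# Projections change sign only; the variance along the component is the same.
print(np.allclose(flipped.transform(X), -pca.transform(X)))  # True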
Example #19
def run_sample_weighting_and_pca_transformation(
		training_taskset,
		validation_taskset,
		testing_taskset,
		should_standardise,
		pca_transform,
		training_event_stats):

	training_responses = training_taskset[2]
	validation_responses = validation_taskset[2]
	testing_responses = testing_taskset[2]

	kde_params = {}
	kde_params["normalised"] = should_standardise
	kde_params["normalisation_statistic"] = training_event_stats[-1][1]

	kde_params["values"] = training_responses
	kde, _, _ = run_kernel_density_estimation(**kde_params)

	kde_params["kde"] = kde
	training_sample_weights = compute_sample_weights_from_kde(**kde_params)

	kde_params["values"] = validation_responses
	validation_sample_weights = compute_sample_weights_from_kde(**kde_params)

	kde_params["values"] = testing_responses
	testing_sample_weights = compute_sample_weights_from_kde(**kde_params)

	num_input_events = len(training_taskset[1][0])

	if pca_transform == True:
		from sklearn.decomposition import PCA
		pca = PCA(n_components=num_input_events)
		pca.fit_transform(training_taskset[1])
	else:
		pca = pca_struct()
		pca.components_ = np.identity(num_input_events)

	run_pca_transformation(training_taskset, pca)
	run_pca_transformation(validation_taskset, pca)
	run_pca_transformation(testing_taskset, pca)

	return training_taskset, validation_taskset, testing_taskset, training_sample_weights, validation_sample_weights, testing_sample_weights, pca
Example #20
	def fit(self, predictors, locations, **kwargs):

		self.locations = locations
		self.pcas = []
		self.n = predictors['n']

		for location in locations:
			raw = extract_n_by_n(predictors, location, **kwargs)
			
			#pca = PCA(n_components='mle', whiten=True)
			#pca = PCA(n_components=0.95, whiten=True)
			pca = PCA(n_components=2)
			
			pca = pca.fit(raw)
			components = pca.components_
			pca.components_ = components
			
			self.pcas.append(pca.fit(raw))

			print "pca: ", location, pca.n_components_, pca.explained_variance_ratio_
Example #21
    def fit(self, X, y=None):
        max_possible_comp = min(X.shape)
        self.max_n_components = int(min([self.max_n_components, max_possible_comp]))
        pca = PCA(n_components=self.max_n_components)
        pca.fit(X)
        exp_var_rat = pca.explained_variance_ratio_
        sum_exp_var_rat = cumsum(exp_var_rat)

        n_comp_to_retain = 0
        for n_comp_to_retain in range(self.min_n_components, self.max_n_components + 1):
            if sum_exp_var_rat[n_comp_to_retain - 1] >= self.target_variance:
                break

        pca.components_ = pca.components_[:n_comp_to_retain, :]

        # Note: pca not needed for the functioning of the class, but keeping around for debug reasons
        self._pca = pca

        self.scalings_ = pca.components_.T
        return self
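The loop above keeps the smallest number of leading components whose cumulative explained-variance ratio reaches target_variance and then truncates components_. scikit-learn can also perform that selection itself when n_components is a float in (0, 1); a minimal sketch of that alternative (the exact component count can differ by one at the boundary, since PCA keeps enough components to exceed the requested fraction):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(3)
X = rng.normal(size=(300, 20))

pca = PCA(n_components=0.95, svd_solver='full').fit(X)   # keep ~95% of the variance
print(pca.n_components_, np.cumsum(pca.explained_variance_ratio_)[-1])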
Example #22
def naiveBayesClassifier(input_matrix, classes, firstPC=0, lastPC=0):
    scaled = scaler.fit_transform(input_matrix)
    if firstPC == 0 and lastPC == 0:
        x_train, x_test, y_train, y_test = train_test_split(scaled,
                                                            classes,
                                                            test_size=0.1)
    else:
        #Select only PCA in range
        pca_tot = PCA()
        pca_tot = pca_tot.fit(scaled)
        twoComponents = pca_tot.components_[firstPC:lastPC]
        pca_tot.components_ = twoComponents
        x = pca_tot.transform(scaled)
        #Train and use the model
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            classes,
                                                            test_size=0.1)

    clf = GaussianNB()
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    accuracy = accuracy_score(y_test, prediction)
    print('Accuracy score: ' + str(accuracy))
Example #23
def cal_pca_for_pose_data_generator(point_cloud,desired_num_of_feature=3):
    pca = PCA(n_components=desired_num_of_feature)
    pca.fit(point_cloud)

    # print("z 向量 %f ,%f ,%f" % (pca.components_[2,0],pca.components_[2,1],pca.components_[2,2]))
    if(np.inner(pca.components_[2,:],[0,0,1])>0):
        # print("pca_z向量與z方向同向,需要對x軸旋轉180度")
        pca.components_[2,:]=-pca.components_[2,:]
        # r = R.from_euler('x',180, degrees=True)
        # r_b_o=R.from_dcm(pca.components_.T)
        # r3=r_b_o*r
        # pca.components_=r3.as_dcm().T
    # Cross product check: the cross product of two of the axes should reproduce the third; compare with the PCA axes to confirm the frame is correct
    x_axis_matrix=np.outer(pca.components_[1,:],pca.components_[2,:])
    x_axis=np.asarray([x_axis_matrix[1,2]-x_axis_matrix[2,1],x_axis_matrix[2,0]-x_axis_matrix[0,2],x_axis_matrix[0,1]-x_axis_matrix[1,0]])
    # print("*"*30)
    # print("外積計算的x軸為:")
    # print(x_axis)

    # Check that pca_x points the same way as the x computed from cross(y, z)
    if(np.allclose(pca.components_[0,:],x_axis)):
        # print("pca_x matches the x axis from cross(y, z)")
        pass
    else:
        # Opposite direction: replace the (less important) x axis
        # print("x direction is wrong; replacing it with the correct axis")
        pca.components_[0,:]=x_axis
    if(np.inner(pca.components_[0,:],[1,0,0])<0):
        # We want the gripper to face forward so the end effector does not need to rotate as much
        # print("pca_x points opposite to +x; rotate 180 degrees about the z axis")
        r = R.from_euler('z',180, degrees=True)
        # r_b_o=R.from_dcm(pca.components_.T)
        r3=np.dot(pca.components_.transpose(),r.as_dcm().astype(int))
        pca.components_=r3.transpose()   

    return pca.components_,pca.explained_variance_
Example #24
    def apply_cv(epochs):
        count = 1
        confusion_matrixes = []
        confusion_matrixes_percent = []
        predicted = ''
        test_label = ''
        firstIterCV = True
        probabilities = np.array([[]], ndmin=2)
        predictions = np.array([])
        best_threshold = []
        cv_probabilities = []
        cv_probabilities_label = []
        for train, test in cv:
            ## Train Data processing ##
            train_data = epochs._data[train]
            train_label = label[train]

            # Online simulation flag
            if FILTER_METHOD == 'WINDOWED':  # epochs should have one epoch only
                train_bp = mne.filter.band_pass_filter(
                    train_data,
                    sfreq,
                    Fp1=2,
                    Fp2=h_freq,
                    copy=True,
                    filter_length=None,
                    method='fft',
                    iir_params=None)  # bandpass on one epoch
            if FILTER_METHOD == 'NC' or FILTER_METHOD == 'LFILT':
                train_bp = train_data
            train_bp = train_bp[:, :, paddingIdx:paddingIdx +
                                (int((tmax - tmin) * sfreq))]

            for trial in range(train_bp.shape[0]):
                for ch in range(train_bp.shape[1]):
                    train_bp[trial, ch, :] = train_bp[trial, ch, :] - np.mean(
                        train_bp[trial, ch, :])

            # plt.figure()
            # plt.plot(train_bp[7,:].T)
            # plt.savefig(str(FILTER_METHOD)+'.png')
            # Normalization
            (train_normalized, trainShiftFactor,
             trainScaleFactor) = normalizeAcrossEpoch(train_bp, 'MinMax')

            # Downsampling
            train_downsampling = train_normalized[:, :, ::decim_factor]

            # Merge (reshape) channel and time for the PCA
            train_reshaped = train_downsampling.reshape(
                train_downsampling.shape[0], -1)

            # PCA initialisation
            if APPLY_PCA is False:
                pca = None
                train_pcaed = train_reshaped
            else:
                pca = PCA(0.95)
                pca.fit(train_reshaped)
                pca.components_ = -pca.components_  # inversion of vector to be consistent with Inaki's code
                train_pcaed = pca.transform(train_reshaped)

            # PCA
            #			train_pcaed = train_reshaped

            ## Test data processing ##
            test_data = epochs._data[test]
            test_label = label[test]

            # Compute_feature does the same steps as for train, but requires a computed PCA (that we got from train)
            # (bandpass, norm, ds, and merge channel and time)
            test_pcaed = compute_features(test_data,
                                          sfreq,
                                          l_freq,
                                          h_freq,
                                          decim_factor,
                                          trainShiftFactor,
                                          trainScaleFactor,
                                          pca,
                                          FILTER_METHOD,
                                          tmin,
                                          tmax,
                                          paddingIdx,
                                          iir_params=dict(a=a, b=b))
            #			test_pcaed = compute_features(test_data,sfreq,l_freq,h_freq,decim_factor,trainShiftFactor,trainScaleFactor,pca=None)

            ## Test ##
            train_x = train_pcaed
            test_x = test_pcaed

            # Classifier init
            #			RF = dict(trees=100, maxdepth=None)
            #			cls = RandomForestClassifier(n_estimators=RF['trees'], max_features='auto', max_depth=RF['maxdepth'], n_jobs=n_jobs)
            # cls = RandomForestClassifier(n_estimators=RF['trees'], max_features='auto', max_depth=RF['maxdepth'], class_weight="balanced", n_jobs=n_jobs)
            # cls = LDA(solver='eigen')
            #			cls = QDA(reg_param=0.3) # regularized LDA

            #			cls.fit( train_x, train_label )
            # Y_pred= cls.predict( test_x )
            # prediction = Y_pred

            # Fitting
            cls = rLDA(regcoeff)
            cls.fit(train_x, train_label)

            predicted = cls.predict(test_x)
            probs = cls.predict_proba(test_x)
            prediction = np.array(predicted)

            if useLeaveOneOut is True:
                if firstIterCV is True:
                    probabilities = np.append(probabilities, probs, axis=1)
                    firstIterCV = False
                    predictions = np.append(predictions, prediction)
                else:
                    probabilities = np.append(probabilities, probs, axis=0)
                    predictions = np.append(predictions, prediction)
            else:
                predictions = np.append(predictions, prediction)
                probabilities = np.append(probabilities, probs)

            # Performance
            if useLeaveOneOut is not True:
                cm = np.array(confusion_matrix(test_label, prediction))
                cm_normalized = cm.astype('float') / cm.sum(axis=1)[:,
                                                                    np.newaxis]
                confusion_matrixes.append(cm)
                confusion_matrixes_percent.append(cm_normalized)
                avg_confusion_matrixes = np.mean(confusion_matrixes_percent,
                                                 axis=0)

            print('CV #' + str(count))
            print('Prediction: ' + str(prediction))
            print('    Actual: ' + str(test_label))

            # Append probs to the global list
            probs_np = np.array(probs)
            cv_probabilities.append(probs_np[:, 0])
            cv_probabilities_label.append(test_label)

            #			if useLeaveOneOut is not True:
            #				print('Confusion matrix')
            #				print(cm)
            #				print('Confusion matrix (normalized)')
            #				print(cm_normalized)
            #				print('---')
            #				print('True positive rate: '+str(cm_normalized[0][0]))
            #				print('True negative rate: '+str(cm_normalized[1][1]))
            print('===================')

            ## One CV done, go to the next one
            count += 1

        best_threshold = None
        cv_prob_linear = np.ravel(cv_probabilities)
        cv_prob_label_np = np.array(cv_probabilities_label)
        cv_prob_label_linear = np.ravel(cv_prob_label_np)
        threshold_list = np.linspace(0, 1, 100)

        biglist_fpr = []
        biglist_tpr = []
        biglist_thresh = []
        biglist_cms = []

        for thresh in threshold_list:
            biglist_pred = [
                4 if x < thresh else 3 for x in cv_prob_linear
            ]  # list comprehension to quickly go through the list.
            biglist_cm = confusion_matrix(cv_prob_label_linear, biglist_pred)
            biglist_cm_norm = biglist_cm.astype('float') / biglist_cm.sum(
                axis=1)[:, np.newaxis]
            biglist_cms.append(biglist_cm_norm)
            biglist_tpr.append(biglist_cm_norm[0][0])
            biglist_fpr.append(biglist_cm_norm[1][0])
            biglist_thresh.append(thresh)
        biglist_auc = auc(biglist_fpr, biglist_tpr)

        # Make a subset of data where FPR < MAX_FPR
        idx_below_maxfpr = np.where(np.array(biglist_fpr) < MAX_FPR)
        fpr_below_maxfpr = np.array(biglist_fpr)[idx_below_maxfpr[0]]
        tpr_below_maxfpr = np.array(biglist_tpr)[idx_below_maxfpr[0]]

        # Look for the best (max value) FPR in that subset
        best_tpr_below_maxfpr = np.max(tpr_below_maxfpr)
        best_tpr_below_maxfpr_idx = np.array(
            np.where(
                biglist_tpr == best_tpr_below_maxfpr)).ravel()  # get its idx

        # Get the associated TPRs
        best_tpr_below_maxfpr_associated_fpr = np.array(
            biglist_fpr)[best_tpr_below_maxfpr_idx]
        # Get the best (min value) in that subset
        best_associated_fpr = np.min(best_tpr_below_maxfpr_associated_fpr)
        # ... get its idx
        best_associated_fpr_idx = np.array(
            np.where(biglist_fpr == best_associated_fpr)).ravel()

        # The best idx is the one that is on both set
        best_idx = best_tpr_below_maxfpr_idx[np.in1d(best_tpr_below_maxfpr_idx,
                                                     best_associated_fpr_idx)]

        plt.plot(biglist_fpr, biglist_tpr)
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        best_threshold = threshold_list[best_idx]
        print('#################################')
        print('Best threshold:' + str(best_threshold))
        print('Gives a TPR of ' + str(best_tpr_below_maxfpr))
        print('And a FPR of ' + str(best_associated_fpr))
        print('CM')
        print(biglist_cms[best_idx[0]])

        return (biglist_auc, best_threshold)
Example #25
def main(filename,
         xtrains_percent=0.8,
         maxfeature=1,
         fit_ylabel=False,
         nn_estimator=100,
         sepaLabel=True,
         treeLabel=False,
         seed=42,
         pcaLabel=False,
         n_comp=2,
         sepa2=False,
         time_label=False,
         stream=False,
         sfl=False):
    mugen = float("inf")
    all_start = time.time()
    rng = np.random.RandomState(seed)

    # http and smtp are loaded a different way
    if filename == '/home/anegawa/Dropbox/http.mat' or filename == '/home/anegawa/Dropbox/smtp.mat':
        mat = {}
        f = h5py.File(filename)
        for k, v in f.items():
            mat[k] = np.array(v)
        X = mat['X'].T
        y2 = mat['y'][0]
        y3 = []
        for i in range(len(y2)):
            y3.append(int(y2[i]))
        y = np.reshape(y3, [len(y3), 1])

    else:
        mat = scipy.io.loadmat(filename)
        X = mat['X']
        y = mat['y']

    rate = xtrains_percent
    max_feat = int(maxfeature)
    if max_feat == 3:
        max_feat = X.shape[1]

    if treeLabel:
        anegawa = 0
    else:
        print('X_train\'s rate : ' + str(rate))
        print('max_features : ' + str(max_feat))
        print('fit_ylabel : ' + str(fit_ylabel))
        print('nn_estimator : ' + str(nn_estimator))
        print('sepaLabel : ' + str(sepaLabel))

    clf = IsolationForest(random_state=rng)
    clf.n_estimators = nn_estimator
    clf.verbose = 0
    clf.max_features = max_feat

    if (str(filename) == '/home/anegawa/Dropbox/shuttle.mat'):
        clf.contamination = 0.07

    elif (str(filename) == '/home/anegawa/Dropbox/http.mat'):
        clf.contamination = 0.004

    elif (str(filename) == '/home/anegawa/Dropbox/pima.mat'):
        clf.contamination = 0.35

    elif (str(filename) == '/home/anegawa/Dropbox/mammography.mat'):
        clf.contamination = 0.02

    elif (str(filename) == '/home/anegawa/Dropbox/cover.mat'):
        clf.contamination = 0.009

    elif (str(filename) == '/home/anegawa/Dropbox/breastw.mat'):
        clf.contamination = 0.35

    elif (str(filename) == '/home/anegawa/Dropbox/arrhythmia.mat'):
        clf.contamination = 0.15

    elif (str(filename) == '/home/anegawa/Dropbox/ionosphere.mat'):
        clf.contamination = 0.36

    elif (str(filename) == '/home/anegawa/Dropbox/satellite.mat'):
        clf.contamination = 0.32

    elif (str(filename) == '/home/anegawa/Dropbox/annthyroid.mat'):
        clf.contamination = 0.07

    elif (str(filename) == '/home/anegawa/Dropbox/smtp.mat'):
        clf.contamination = 0.03 / 100

    else:
        print('cannot find it.')
        # Generate train data
        a = rng.randn(400, 2)
        X = 0.3 * a
        X_train = np.r_[X + 2, X - 2]
        # X_train = np.ones([400, 2])

        # Generate some regular novel observations
        X = 0.3 * rng.randn(400, 2)
        X_test = np.r_[X + 2, X - 2]
        # X_test = np.ones([400, 2])

        # Generate some abnormal novel observations
        X_outliers = np.random.exponential(1. / 0.001, size=[20, 2])
        # X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
        # X_outliers = np.zeros([20, 2])
        X_test = np.r_[X_test, X_outliers]
        X_train_correct = np.ones(X_train.shape)

    hoge = 1 / (1 - rate)
    cross_count = int(np.ceil(hoge))
    if cross_count > hoge:
        cross_count = cross_count - 1

    sum_auc = 0
    sum_accuracy = 0

    pca_fit_time = 0
    pca_transform_train_time = 0
    pca_transform_test_time = 0
    test_time = 0
    fit_time = 0
    sum_train_time = 0

    # for count in range(cross_count):

    if sepaLabel == True:  # separated
        # data cut
        X_anomaly = []
        X_normal = []
        for i in range(len(X)):
            if y[i] == 1:
                X_anomaly.append(X[i])
            else:
                X_normal.append(X[i])

        cutter_anomaly = int(np.ceil(len(X_anomaly) * rate))
        cutter_normal = int(np.ceil(len(X_normal) * rate))

        for count in range(cross_count):
            part_anomaly = int(np.ceil(cutter_anomaly * count))
            part_normal = int(np.ceil(cutter_normal * count))
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []

            for i, k in zip(range(len(X_anomaly)),
                            range(part_anomaly,
                                  part_anomaly + len(X_anomaly))):
                while k >= len(X_anomaly):
                    k = k - len(X_anomaly)

                if i < cutter_anomaly:
                    X_train.append(X_anomaly[k])
                    X_train_correct.append(-1)

                else:
                    X_test.append(X_anomaly[k])
                    X_test_correct.append(-1)

            for i, k in zip(range(len(X_normal)),
                            range(part_normal, part_normal + len(X_normal))):
                while k >= len(X_normal):
                    k = k - len(X_normal)

                if i < cutter_normal:
                    X_train.append(X_normal[k])
                    X_train_correct.append(1)
                else:
                    X_test.append(X_normal[k])
                    X_test_correct.append(1)

            if sfl:
                X_train_set = []
                X_test_set = []
                for i in range(len(X_train)):
                    buf = []
                    buf.append(X_train[i])
                    buf.append(X_train_correct[i])
                    X_train_set.append(buf)
                for i in range(len(X_test)):
                    buf = []
                    buf.append(X_test[i])
                    buf.append(X_test_correct[i])
                    X_test_set.append(buf)

                random.shuffle(X_train_set)
                random.shuffle(X_test_set)

                X_train = []
                X_test = []
                X_train_correct = []
                X_test_correct = []
                for i in range(len(X_train_set)):
                    X_train.append(X_train_set[i][0])
                    X_train_correct.append(X_train_set[i][1])
                for i in range(len(X_test_set)):
                    X_test.append(X_test_set[i][0])
                    X_test_correct.append(X_test_set[i][1])

    else:  # mixed
        cutter = len(X) * rate  # test start this index at the first time
        for count in range(cross_count):
            part = int(np.ceil(cutter * count))
            # while part >= len(X):
            #     part = part - len(X)
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []

            for i, k in zip(range(len(X)), range(part, part + len(X))):
                while k >= len(X):
                    k = k - len(X)

                if i < len(X) * rate:
                    X_train.append(X[k])
                    X_train_correct.append(y[k])

                else:
                    X_test.append(X[k])
                    X_test_correct.append(y[k])

            for q in range(len(X_train_correct)):
                j = X_train_correct[q]
                if (j == 1):
                    X_train_correct[q] = -1
                else:
                    X_train_correct[q] = 1

            for w in range(len(X_test_correct)):
                j = X_test_correct[w]
                if (j == 1):
                    X_test_correct[w] = -1
                else:
                    X_test_correct[w] = 1

        # finished cutting data

        if pcaLabel:
            pca_fit_start = time.time()
            pca = PCA(copy=True,
                      iterated_power='auto',
                      n_components=n_comp,
                      random_state=None,
                      svd_solver='auto',
                      tol=0.0,
                      whiten=False)
            pca2 = PCA(copy=True,
                       iterated_power='auto',
                       random_state=None,
                       svd_solver='auto',
                       tol=0.0,
                       whiten=False)

            if sepa2:
                # if False:
                print("こっち入ってるけどええんか!?")
                pca2.fit(X_train_normal)
                component = pca2.components_
                component2 = np.sort(pca2.components_)
                if n_comp < len(component2):
                    pca2.components_ = component2[0:n_comp]
                    # print(pca2.components_.shape)
                X_train = pca2.transform(X_train)
                X_test = pca2.transform(X_test)

            else:
                pca.fit(X_train)
                pca_fit_finish = time.time()

                pca_transform_train_start = time.time()
                X_train = pca.transform(X_train)
                pca_transform_train_finish = time.time()

                # a = X_test[0]
                # X_test = pca.transform(a)

                # if not stream:
                #     pca_transform_test_start = time.time()
                #     X_test = pca.transform(X_test) #stream version
                #     pca_transform_test_finish = time.time()
                #     pca_transform_test_time += (pca_transform_test_finish - pca_transform_test_start)
            clf.max_features = n_comp
            pca_fit_time += (pca_fit_finish - pca_fit_start)
            pca_transform_train_time += (pca_transform_train_finish -
                                         pca_transform_train_start)

        fit_start = time.time()
        if fit_ylabel:
            clf.fit(X_train, X_train_correct, sample_weight=None)
        else:
            clf.fit(X_train, y=None, sample_weight=None)
        fit_finish = time.time()
        fit_time += (fit_finish - fit_start)

        # if pcaLabel and stream:
        if stream:
            sum_score_auc = []
            sum_score_acc = []

            # print(X_test[0:1])
            for i in range(len(X_test)):
                if pcaLabel:
                    pca_transform_test_start = time.time()
                    a = [X_test[i]]
                    X_test_pca = pca.transform(a)
                    pca_transform_test_finish = time.time()
                    pca_transform_test_time += (pca_transform_test_finish -
                                                pca_transform_test_start)

                else:
                    X_test_pca = [X_test[i]]

                test_start = time.time()
                y_pred_test, a_score = clf.predict(X_test_pca)
                test_finish = time.time()
                test_time += (test_finish - test_start)

                sum_score_auc.append(a_score)
                sum_score_acc.append(y_pred_test)
            a_score = sum_score_auc
            y_pred_test = sum_score_acc

        else:
            if pcaLabel:
                pca_transform_test_start = time.time()
                X_test = pca.transform(X_test)  # stream version
                pca_transform_test_finish = time.time()
                pca_transform_test_time += (pca_transform_test_finish -
                                            pca_transform_test_start)

            test_start = time.time()
            y_pred_test, a_score = clf.predict(X_test)
            test_finish = time.time()
            test_time += (test_finish - test_start)
        # a_score = clf.decision_function(X_test)

        acc = calc_accuracy(X_test_correct, y_pred_test, treeLabel)
        AUC = calc_AUC(X_test_correct, a_score, treeLabel)
        sum_auc += AUC
        sum_accuracy = acc

    # return AUC

    # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # # plot the line, the samples, and the nearest vectors to the plane
    # xx, yy = np.meshgrid(np.linspace(-200, 200, 1000), np.linspace(-200, 200, 1000))
    # # clf.max_features = 2
    # # print(yy.ravel())
    # # Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    #
    # # Z = Z.reshape(xx.shape)
    #
    # plt.figure(figsize=(100, 200))
    # plt.suptitle("satellite")
    # # plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
    #
    # X_train = np.array(X_train)
    # X_test = np.array(X_test)
    #
    # lim = True
    # x = (-200, 200)
    # y = (-200, 300)
    #
    # for i,j in zip(range(2), [True, False]):
    #     small = j  # True means smallest
    #
    #     plt.subplot(2, 2, i+1)
    #     if small:
    #         plt.title("smallest")
    #     else:
    #         plt.title("largest")
    #
    #     if small:
    #         # b1 = plt.scatter(X_train[:, X_train.shape[1]-1], X_train[:, X_train.shape[1]-2], c='white', s=20, edgecolor='k')
    #         b2 = plt.scatter(X_test[:, X_test.shape[1]-1], X_test[:, X_test.shape[1]-2], c='green', s=20, edgecolor='k')
    #     else:
    #         # b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
    #         b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
    #     # c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k')
    #     plt.axis('tight')
    #     if lim:
    #         plt.xlim(x)
    #         plt.ylim(y)
    #     # plt.legend([b1, b2],
    #     #            ["training observations",
    #     #             "testing observations"],
    #     #            loc="upper left")
    #     plt.legend([b2],["testing observations"],
    #                loc="upper left")
    #     # plt.legend([b1], ["training observations"],
    #     #            loc="upper left")
    #
    #
    #
    #     plt.subplot(2, 2, i+3)
    #     if small:
    #         b1 = plt.scatter(X_train[:, X_train.shape[1]-1], X_train[:, X_train.shape[1]-2], c='white', s=20, edgecolor='k')
    #         # b2 = plt.scatter(X_test[:, X_test.shape[1] - 1], X_test[:, X_test.shape[1] - 2], c='green', s=20, edgecolor='k')
    #     else:
    #         b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
    #         # b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
    #     # c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k')
    #     plt.axis('tight')
    #     if lim:
    #         plt.xlim(x)
    #         plt.ylim(y)
    #     # plt.legend([b1, b2],
    #     #            ["training observations",
    #     #             "testing observations"],
    #     #            loc="upper left")
    #     # plt.legend([b2], ["testing observations"],
    #     #            loc="upper left")
    #     plt.legend([b1], ["training observations"],
    #                loc="upper left")
    # plt.show()
    # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    auc2 = sum_auc / cross_count
    # print(sum_accuracy)
    acc2 = sum_accuracy / cross_count

    # calc time
    all_finish = time.time()
    all_time = all_finish - all_start
    pca_fit_time = pca_fit_time / cross_count
    pca_transform_train_time = pca_transform_train_time / cross_count
    pca_transform_test_time = pca_transform_test_time / cross_count
    test_time = test_time / cross_count
    fit_time = fit_time / cross_count
    sum_train_time = fit_time + pca_fit_time + pca_transform_train_time
    sum_test_time = pca_transform_test_time + test_time
    # print("sum_train_time : " + str(sum_train_time))
    # print("pca_transform_train_time : " + str(pca_transform_train_time))
    # print("pca_fit_time : " + str(pca_fit_time))
    # print("test_time : " + str(test_time))
    # print("fit_time : " + str(fit_time))
    # print("all_time : " + str(all_time))
    # return

    if time_label:
        return all_time, pca_fit_time + pca_transform_train_time, fit_time, pca_transform_test_time, test_time, sum_train_time, sum_test_time
    elif treeLabel:
        if math.isnan(auc2):
            majikayo = True
        return auc2
    else:
        return auc2, acc2
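The long if/elif chain near the top of main sets IsolationForest's contamination per dataset file. A hedged sketch of the same mapping written as a dictionary lookup (values copied from the chain above; unknown files would still fall through to the synthetic-data branch):

CONTAMINATION = {
    '/home/anegawa/Dropbox/shuttle.mat': 0.07,
    '/home/anegawa/Dropbox/http.mat': 0.004,
    '/home/anegawa/Dropbox/pima.mat': 0.35,
    '/home/anegawa/Dropbox/mammography.mat': 0.02,
    '/home/anegawa/Dropbox/cover.mat': 0.009,
    '/home/anegawa/Dropbox/breastw.mat': 0.35,
    '/home/anegawa/Dropbox/arrhythmia.mat': 0.15,
    '/home/anegawa/Dropbox/ionosphere.mat': 0.36,
    '/home/anegawa/Dropbox/satellite.mat': 0.32,
    '/home/anegawa/Dropbox/annthyroid.mat': 0.07,
    '/home/anegawa/Dropbox/smtp.mat': 0.03 / 100,
}

if str(filename) in CONTAMINATION:
    clf.contamination = CONTAMINATION[str(filename)]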
Example #26
#MAKE DF from BIOM table
col = ['sample']
for i in list(range(8)):
        col.append('OTU'+str(i)) #list of column names

table= pd.read_csv('{}/{}'.format(cwd,inputfile),delim_whitespace=True, header= None)
table.columns= col # name columns
table= table.set_index('sample')    # index by sample names
null_data = table[table.isnull().any(axis=1)]
print '**Warning: the following lines are missing data. All nulls will be filled with zeros'
print null_data
table= table.fillna(0)   #replace any missing values with zeros
table.to_csv(r'pd_df.csv')
print 'pandas dataframe saved as: pd_df.csv'
##REMOVING PC1
pca = PCA(n_components=8) #keeps 8 components? was 100 in original script-not sure why
#X = pca.fit_transform(table.apply(np.log(table))) #fit df into model 
X= pca.fit_transform(table)
#print X[1]
Y = X[:,1:] #Y = every score column except the first (drop the PC1 scores)
#print Y[1] 	#just showing that the first value is removed
untrans = pca.inverse_transform(X)  #get the original data matrix back (all components retained)
pca.components_ = pca.components_[1:] #remove the first PCA vector 
trans = pca.inverse_transform(Y)
print 'new pca vectors saved as: transformed_pca.txt'
with open('transformed_pca.txt' , 'w') as f:
	count = 0
	while count < len(trans):
		f.write(' '.join(str(val) for val in trans[count]) + '\n')
		count += 1
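Example #26 removes the first principal component in two equivalent steps: dropping the first column of the transformed scores (Y = X[:, 1:]) and dropping the first row of components_ before calling inverse_transform. A compact, hedged sketch of the same idea on synthetic data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(5)
data = rng.normal(size=(60, 8))

pca = PCA(n_components=8)
scores = pca.fit_transform(data)

# Drop PC1: remove its score column and its component row, then reconstruct.
pca.components_ = pca.components_[1:]
without_pc1 = pca.inverse_transform(scores[:, 1:])
print(without_pc1.shape)   # (60, 8): back in the original space, PC1 variance removed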
Example #27
def query(query_list,
          feature_list,
          out_dir,
          top=200,
          pca_thresh=0.9,
          out_dim=None,
          pca_file='-1',
          qe_fn=None,
          mask_pred=False,
          euclidean_dist=False,
          rmac=False,
          mac=False,
          aml=False):
    """Query by list."""
    print(Notify.INFO, 'Read feature', Notify.ENDC)
    print(Notify.INFO, 'Use R-MAC: ', rmac, Notify.ENDC)
    num_regions, db_feat, image_names = read_feature(feature_list,
                                                     euclidean_dist, rmac, mac)

    # below codes are for predicted mask visualization.
    itv = 10
    while mask_pred:
        idx0 = randint(itv, len(feature_list) - 1 - itv)
        idx1 = randint(idx0 - itv, idx0 + itv)
        print(Notify.INFO, 'Pair idx', (idx0, idx1), Notify.ENDC)
        # FIXME: adapt the image ext.
        image_path0 = feature_list[idx0].replace('npy', 'JPG')
        image_path1 = feature_list[idx1].replace('npy', 'JPG')
        # some images end with '.jpg'
        if not os.path.exists(image_path0):
            image_path0 = feature_list[idx0].replace('npy', 'jpg')
        if not os.path.exists(image_path1):
            image_path1 = feature_list[idx1].replace('npy', 'jpg')
        rv0 = db_feat[idx0 * num_regions:(idx0 + 1) * num_regions]
        rv1 = db_feat[idx1 * num_regions:(idx1 + 1) * num_regions]
        mask_prediction(rv0, rv1, image_path0, image_path1)

    print(Notify.INFO, '# Feature', len(feature_list), Notify.ENDC)
    print(Notify.INFO, '# Dim', db_feat.shape[-1], Notify.ENDC)
    print(Notify.INFO, '# Regional vector', num_regions, Notify.ENDC)
    # perform PCA whitening.
    use_pca = (pca_thresh is not None or out_dim is not None
               or pca_file != '-1') and len(image_names) > out_dim

    if pca_file != '-1':
        pca_data = np.load(pca_file).item()
        pca = PCA(whiten=True, copy=True, random_state=0)
        pca.mean_ = pca_data['mean']
        pca.components_ = pca_data['eigvec']
        pca.explained_variance_ = pca_data['eigval']
    else:
        pca = None

    if use_pca:
        db_trans_feat, pca = whitening(db_feat,
                                       num_regions,
                                       pca_thresh,
                                       out_dim,
                                       pca=pca)
        print(Notify.INFO, 'PCA-ed feature dim', db_trans_feat.shape[1],
              Notify.ENDC)
    else:
        print(Notify.WARNING, 'No whitening', Notify.ENDC)
        db_trans_feat = db_feat

    if query_list is not None:
        query_num_regions, query_feat, query_names = read_feature(
            query_list, euclidean_dist, rmac, mac)
        assert (num_regions == query_num_regions)
        if use_pca:
            query_trans_feat, _ = whitening(query_feat, num_regions, pca=pca)
        else:
            query_trans_feat = query_feat
        query_num = len(query_list)
    else:
        query_trans_feat = db_trans_feat
        query_num = len(feature_list)

    # output path name
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    match_index_file = os.path.join(out_dir, 'match_pairs')

    print(Notify.INFO, 'Compute nn distance', Notify.ENDC)
    start = time.time()
    query_result = match_gpu(query_trans_feat,
                             db_trans_feat,
                             num_regions,
                             top,
                             euclidean_dist=euclidean_dist,
                             aml=aml)
    end = time.time()
    print(Notify.INFO, 'Time cost in matching', end - start, 's', Notify.ENDC)

    if qe_fn is not None:
        for _ in range(ARGS.et):
            print(Notify.INFO, 'Expand queries and re-match', Notify.ENDC)
            qe_feature = qe_fn(query_trans_feat, db_trans_feat, query_result,
                               num_regions)
            query_result = match_gpu(qe_feature,
                                     db_trans_feat,
                                     num_regions,
                                     top,
                                     aml=aml)

    content = []
    aps = []
    for i in range(query_num):
        inds = query_result[i][0]
        dists = query_result[i][1]
        content.extend([
            ' '.join([str(i),
                      str(inds[j]),
                      str(dists[j] / num_regions)]) for j in range(len(inds))
        ])
    write_list(content, match_index_file)
    return 0
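# ---------------------------------------------------------------------------
# A standalone sketch of the `pca_file` branch above: fit a whitening PCA
# once, dump its parameters with numpy, and later rebuild an equivalent PCA
# without refitting. The feature matrix and file name are hypothetical.
import numpy as np
from sklearn.decomposition import PCA

db_feat = np.random.RandomState(0).rand(500, 128)

pca = PCA(n_components=64, whiten=True, random_state=0)
pca.fit(db_feat)
np.save('pca_params.npy', {'mean': pca.mean_,
                           'eigvec': pca.components_,
                           'eigval': pca.explained_variance_})

# Later, e.g. in another process:
pca_data = np.load('pca_params.npy', allow_pickle=True).item()
restored = PCA(whiten=True)
restored.mean_ = pca_data['mean']
restored.components_ = pca_data['eigvec']
restored.explained_variance_ = pca_data['eigval']

whitened = restored.transform(db_feat)   # matches pca.transform(db_feat)
# ---------------------------------------------------------------------------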
def run_to_task(task_to):

    import general_utils
    from general_utils import RuntimeDeterminedEnviromentVars
    import models.architectures as architectures
    from data.load_ops import resize_rescale_image
    from data.load_ops import rescale_image
    import utils
    from data.task_data_loading import load_and_specify_preprocessors_for_representation_extraction
    from data.task_data_loading import load_and_specify_preprocessors_for_input_depends_on_target
    import lib.data.load_ops as load_ops
    tf.logging.set_verbosity(tf.logging.ERROR)

    args = parser.parse_args()

    cfg, is_transfer, task, config_name = generate_cfg(args.config, args.vid,
                                                       args)
    if task == 'class_places' or task == 'class_1000':
        synset = get_synset(task)
    if task == 'jigsaw':
        cfg['preprocess_fn'] = load_and_specify_preprocessors_for_input_depends_on_target

    print("Doing {task}".format(task=task))
    general_utils = importlib.reload(general_utils)
    tf.reset_default_graph()
    training_runners = {
        'sess': tf.InteractiveSession(),
        'coord': tf.train.Coordinator()
    }

    ############## Start dataloading workers ##############
    if is_transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    ############## Set Up Inputs ##############
    # tf.logging.set_verbosity( tf.logging.INFO )
    inputs = setup_input_fn(cfg, is_training=False, use_filename_queue=False)
    RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
    RuntimeDeterminedEnviromentVars.populate_registered_variables()
    start_time = time.time()

    ############## Set Up Model ##############
    model = utils.setup_model(inputs, cfg, is_training=IN_TRAIN_MODE)
    m = model['model']
    model['saver_op'].restore(training_runners['sess'], cfg['model_path'])

    data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
        inputs, cfg, is_training=False, use_filename_queue=False)
    prefetch_threads = threading.Thread(target=data_prefetch_init_fn,
                                        args=(training_runners['sess'],
                                              training_runners['coord']))

    prefetch_threads.start()
    list_of_fname = np.load(
        '/home/ubuntu/task-taxonomy-331b/assets/aws_data/video{}_fname.npy'.
        format(args.vid))
    import errno

    try:
        os.mkdir('/home/ubuntu/{}'.format(task))
        os.mkdir('/home/ubuntu/{}/vid1'.format(task))
        os.mkdir('/home/ubuntu/{}/vid2'.format(task))
        os.mkdir('/home/ubuntu/{}/vid3'.format(task))
        os.mkdir('/home/ubuntu/{}/vid4'.format(task))
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    curr_comp = np.zeros((3, 64))
    curr_fit_img = np.zeros((256, 256, 3))
    embeddings = []
    curr_vp = []
    curr_layout = []

    ############## Run First Batch ##############
    def rescale_l_for_display(batch, rescale=True):
        '''
        Prepares network output for display by optionally rescaling from [-1,1],
        and by setting some pixels to the min/max of 0/1. This prevents matplotlib
        from rescaling the images. 
        '''
        if rescale:
            display_batch = [
                rescale_image(im.copy(),
                              new_scale=[0, 100],
                              current_scale=[-1, 1]) for im in batch
            ]
        else:
            display_batch = batch.copy()
        for im in display_batch:
            im[0, 0,
               0] = 1.0  # Adjust some values so that matplotlib doesn't rescale
            im[0, 1, 0] = 0.0  # Now adjust the min
        return display_batch

    for step_num in range(inputs['max_steps'] - 1):
        #for step_num in range(20):
        #if step_num > 0 and step_num % 20 == 0:
        print(step_num)
        if is_transfer:
            (input_batch, target_batch, data_idx,
             predicted) = training_runners['sess'].run([
                 m.input_images, m.target_images, model['data_idxs'],
                 m.decoder.decoder_output
             ])
        else:
            (input_batch, target_batch, data_idx,
             predicted) = training_runners['sess'].run([
                 m.input_images, m.targets, model['data_idxs'],
                 m.decoder_output
             ])

        if task == 'segment2d' or task == 'segment25d':
            from sklearn.decomposition import PCA
            x = np.zeros((32, 256, 256, 3), dtype='float')
            k_embed = 8
            for i in range(predicted.shape[0]):
                embedding_flattened = np.squeeze(predicted[i]).reshape(
                    (-1, 64))
                embeddings.append(embedding_flattened)
                if len(embeddings) > k_embed:
                    embeddings.pop(0)
                pca = PCA(n_components=3)
                pca.fit(np.vstack(embeddings))
                min_order = None
                min_dist = float('inf')
                copy_of_comp = np.copy(pca.components_)
                for order in itertools.permutations([0, 1, 2]):
                    #reordered = pca.components_[list(order), :]
                    #dist = np.linalg.norm(curr_comp-reordered)
                    pca.components_ = copy_of_comp[order, :]
                    lower_dim = pca.transform(embedding_flattened).reshape(
                        (256, 256, -1))
                    lower_dim = (lower_dim - lower_dim.min()) / (
                        lower_dim.max() - lower_dim.min())
                    dist = np.linalg.norm(lower_dim - curr_fit_img)
                    if dist < min_dist:
                        min_order = order
                        min_dist = dist
                pca.components_ = copy_of_comp[min_order, :]
                lower_dim = pca.transform(embedding_flattened).reshape(
                    (256, 256, -1))
                lower_dim = (lower_dim - lower_dim.min()) / (lower_dim.max() -
                                                             lower_dim.min())
                curr_fit_img = np.copy(lower_dim)
                x[i] = lower_dim
            predicted = x
        if task == 'curvature':
            std = [31.922, 21.658]
            mean = [123.572, 120.1]
            predicted = (predicted * std) + mean
            predicted[:, 0, 0, :] = 0.
            predicted[:, 1, 0, :] = 1.
            predicted = np.squeeze(
                np.clip(predicted.astype(int) / 255., 0., 1.)[:, :, :, 0])

        if task == 'colorization':
            maxs = np.amax(predicted, axis=-1)
            softmax = np.exp(predicted - np.expand_dims(maxs, axis=-1))
            sums = np.sum(softmax, axis=-1)
            softmax = softmax / np.expand_dims(sums, -1)

            kernel = np.load(
                '/home/ubuntu/task-taxonomy-331b/lib/data/pts_in_hull.npy')
            gen_target_no_temp = np.dot(softmax, kernel)

            images_resized = np.zeros([0, 256, 256, 2], dtype=np.float32)
            for image in range(gen_target_no_temp.shape[0]):
                temp = scipy.ndimage.zoom(np.squeeze(
                    gen_target_no_temp[image]), (4, 4, 1),
                                          mode='nearest')
                images_resized = np.append(images_resized,
                                           np.expand_dims(temp, axis=0),
                                           axis=0)
            inp_rescale = rescale_l_for_display(input_batch)
            output_lab_no_temp = np.concatenate((inp_rescale, images_resized),
                                                axis=3).astype(np.float64)

            for i in range(input_batch.shape[0]):
                output_lab_no_temp[i, :, :, :] = skimage.color.lab2rgb(
                    output_lab_no_temp[i, :, :, :])
            predicted = output_lab_no_temp

        just_rescale = [
            'autoencoder', 'denoise', 'edge2d', 'edge3d', 'keypoint2d',
            'keypoint3d', 'reshade', 'rgb2sfnorm', 'impainting_whole'
        ]

        if task in just_rescale:
            predicted = (predicted + 1.) / 2.
            predicted = np.clip(predicted, 0., 1.)
            predicted[:, 0, 0, :] = 0.
            predicted[:, 1, 0, :] = 1.

        just_clip = ['rgb2depth', 'rgb2mist']
        if task in just_clip:
            predicted = np.exp(predicted * np.log(2.0**16.0)) - 1.0
            predicted = np.log(predicted) / 11.09
            predicted = (predicted - 0.64) / 0.18
            predicted = (predicted + 1.) / 2
            predicted[:, 0, 0, :] = 0.
            predicted[:, 1, 0, :] = 1.

        if task == 'segmentsemantic_rb':
            label = np.argmax(predicted, axis=-1)
            COLORS = ('white', 'red', 'blue', 'yellow', 'magenta', 'green',
                      'indigo', 'darkorange', 'cyan', 'pink', 'yellowgreen',
                      'black', 'darkgreen', 'brown', 'gray', 'purple',
                      'darkviolet')
            rgb = (input_batch + 1.) / 2.
            preds = [
                color.label2rgb(np.squeeze(x),
                                np.squeeze(y),
                                colors=COLORS,
                                kind='overlay')[np.newaxis, :, :, :]
                for x, y in zip(label, rgb)
            ]
            predicted = np.vstack(preds)

        if task in ['class_1000', 'class_places']:
            for file_idx, predict_output in zip(data_idx, predicted):
                to_store_name = list_of_fname[file_idx].decode(
                    'utf-8').replace('video', task)
                to_store_name = os.path.join('/home/ubuntu', to_store_name)
                sorted_pred = np.argsort(predict_output)[::-1]
                top_5_pred = [synset[sorted_pred[i]] for i in range(5)]
                to_print_pred = "Top 5 prediction: \n {}\n {}\n {}\n {} \n {}".format(
                    *top_5_pred)
                img = Image.new('RGBA', (400, 200), (255, 255, 255))
                d = ImageDraw.Draw(img)
                fnt = ImageFont.truetype(
                    '/usr/share/fonts/truetype/dejavu/DejaVuSerifCondensed.ttf',
                    25)
                d.text((20, 5), to_print_pred, fill=(255, 0, 0), font=fnt)
                img.save(to_store_name, 'PNG')
        elif task == 'vanishing_point_well_defined':
            counter = 0
            for file_idx, predict_output in zip(data_idx, predicted):
                to_store_name = list_of_fname[file_idx].decode(
                    'utf-8').replace('video', task)
                to_store_name = os.path.join('/home/ubuntu', to_store_name)
                curr_vp.append(
                    plot_vanishing_point_smoothed(
                        predict_output, (input_batch[counter] + 1.) / 2.,
                        to_store_name, curr_vp))
                if len(curr_vp) > 5:
                    curr_vp.pop(0)
                counter += 1
                #scipy.misc.toimage(result, cmin=0.0, cmax=1.0).save(to_store_name)
        elif task == 'room_layout':
            mean = np.array([
                0.006072743318127848, 0.010272365569691076, -3.135909774145468,
                1.5603802322235532, 5.6228218371102496e-05,
                -1.5669352793761442, 5.622875878174759, 4.082800262277375,
                2.7713941642895956
            ])
            std = np.array([
                0.8669452525283652, 0.687915294956501, 2.080513632043758,
                0.19627420479282623, 0.014680602791251812, 0.4183827359302299,
                3.991778013006544, 2.703495278378409, 1.2269185938626304
            ])
            predicted = predicted * std + mean
            counter = 0
            for file_idx, predict_output in zip(data_idx, predicted):
                to_store_name = list_of_fname[file_idx].decode(
                    'utf-8').replace('video', task)
                to_store_name = os.path.join('/home/ubuntu', to_store_name)
                plot_room_layout(predict_output,
                                 (input_batch[counter] + 1.) / 2.,
                                 to_store_name,
                                 curr_layout,
                                 cube_only=True)
                curr_layout.append(predict_output)
                if len(curr_layout) > 5:
                    curr_layout.pop(0)
                #scipy.misc.toimage(result, cmin=0.0, cmax=1.0).save(to_store_name)
                counter += 1
        elif task == 'segmentsemantic_rb':
            for file_idx, predict_output in zip(data_idx, predicted):
                to_store_name = list_of_fname[file_idx].decode(
                    'utf-8').replace('video', task)
                to_store_name = os.path.join('/home/ubuntu', to_store_name)
                process_semseg_frame(predict_output, to_store_name)
        elif task == 'jigsaw':
            predicted = np.argmax(predicted, axis=1)
            counter = 0
            for file_idx, predict_output in zip(data_idx, predicted):
                to_store_name = list_of_fname[file_idx].decode(
                    'utf-8').replace('video', task)
                to_store_name = os.path.join('/home/ubuntu', to_store_name)
                perm = cfg['target_dict'][predict_output]
                show_jigsaw((input_batch[counter] + 1.) / 2., perm,
                            to_store_name)
                counter += 1
        else:
            for file_idx, predict_output in zip(data_idx, predicted):
                to_store_name = list_of_fname[file_idx].decode(
                    'utf-8').replace('video', task)
                to_store_name = os.path.join('/home/ubuntu', to_store_name)
                scipy.misc.toimage(np.squeeze(predict_output),
                                   cmin=0.0,
                                   cmax=1.0).save(to_store_name)

    # subprocess.call('tar -czvf /home/ubuntu/{c}_{vid_id}.tar.gz /home/ubuntu/{t}/vid{vid_id}'.format(
    # c=config_name, t=task, vid_id=args.vid), shell=True)
    # subprocess.call('ffmpeg -r 29.97 -f image2 -s 256x256 -i /home/ubuntu/{t}/vid{vid_id}/0{vid_id}0%04d.png -vcodec libx264 -crf 15  {c}_{vid_id}.mp4'.format(
    # c=config_name, t=task, vid_id=args.vid), shell=True)
    subprocess.call(
        'ffmpeg -r 29.97 -f image2 -s 256x256 -i /home/ubuntu/{t}/vid{vid_id}/0{vid_id}0%04d.png -ss 00:01:54 -t 00:00:40 -c:v libvpx-vp9 -crf 10 -b:v 128k {c}_{vid_id}.webm'
        .format(c=config_name, t=task, vid_id=args.vid),
        shell=True)
    # subprocess.call('ffmpeg -r 29.97 -f image2 -s 256x256 -i /home/ubuntu/{t}/vid{vid_id}/0{vid_id}0%04d.png -vcodec libx264 -crf 15  -pix_fmt yuv420p {c}_{vid_id}.mp4'.format(
    # c=config_name, t=task, vid_id=args.vid), shell=True)
    subprocess.call(
        'sudo mkdir -p /home/ubuntu/s3/video_new/{t}'.format(t=task),
        shell=True)
    #subprocess.call('sudo mkdir -p /home/ubuntu/s3/video_new_all/{t}'.format(t=task), shell=True)
    #     subprocess.call('aws s3 cp /home/ubuntu/{c}_{vid_id}.tar.gz s3://task-preprocessing-512-oregon/video_new_all/{t}/'.format(
    # c=config_name, t=task, vid_id=args.vid), shell=True)
    subprocess.call(
        'aws s3 cp {c}_{vid_id}.webm s3://task-preprocessing-512-oregon/video_new/{t}/'
        .format(c=config_name, t=task, vid_id=args.vid),
        shell=True)

    # subprocess.call('aws s3 cp /home/ubuntu/{c}_{vid_id}.tar.gz s3://taskonomy-unpacked-oregon/video_tar_all/{t}/'.format(
    # c=config_name, t=task, vid_id=args.vid), shell=True)
    # subprocess.call('aws s3 cp {c}_{vid_id}.mp4 s3://taskonomy-unpacked-oregon/video_all/{t}/'.format(
    #     c=config_name, t=task, vid_id=args.vid), shell=True)

    ############## Clean Up ##############
    training_runners['coord'].request_stop()
    training_runners['coord'].join()
    print("Done: {}".format(config_name))

    ############## Reset graph and paths ##############
    tf.reset_default_graph()
    training_runners['sess'].close()

    return
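# ---------------------------------------------------------------------------
# A standalone sketch (with hypothetical random embeddings) of the trick used
# in the segment2d/segment25d branch above: when projecting a per-frame
# embedding to 3 channels with PCA, try every permutation of the 3 components
# and keep the one whose projection is closest to the previous frame, so the
# colorization stays temporally stable.
import itertools
import numpy as np
from sklearn.decomposition import PCA

def stable_pca_rgb(embedding, prev_rgb):
    """Project an (N, D) embedding to 3 channels, choosing the component
    order whose normalized projection is closest to the previous frame."""
    pca = PCA(n_components=3).fit(embedding)
    base = np.copy(pca.components_)
    best, best_dist = None, float('inf')
    for order in itertools.permutations(range(3)):
        pca.components_ = base[list(order)]
        rgb = pca.transform(embedding)
        rgb = (rgb - rgb.min()) / (rgb.max() - rgb.min())
        dist = np.linalg.norm(rgb - prev_rgb)
        if dist < best_dist:
            best, best_dist = rgb, dist
    return best

rng = np.random.RandomState(0)
frame0 = rng.rand(64 * 64, 16)                   # hypothetical embeddings
frame1 = frame0 + 0.01 * rng.rand(64 * 64, 16)   # a slightly changed frame
rgb0 = stable_pca_rgb(frame0, np.zeros((64 * 64, 3)))
rgb1 = stable_pca_rgb(frame1, rgb0)              # close to rgb0 by design
# ---------------------------------------------------------------------------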
Exemple #29
0
def makePCA(fn, validExcept, rows, ftype, fcols, n_cols, isTrain, target,
            exceptCols, comp, exva, mean, exceptTargetForPCA, useLog,
            logConstant):
    print('')
    print('+=======================+')
    print('|  Function : makePCA   |')
    print('+=======================+')

    # get dataFrame
    (dataSetDF, targetCol) = _MDF.makeDataFrame(fn, validExcept, rows, ftype,
                                                fcols, isTrain, target,
                                                exceptCols, useLog,
                                                logConstant)
    DFtoFindPCA = dataSetDF  # dataFrame used to fit the PCA

    # remove target column when exceptTargetForPCA is True
    if exceptTargetForPCA == True:
        newDataSetDF = dataSetDF.drop([target], axis='columns')

        # print newDataSetDF
        print('\n<<< [8] newDataSetDF.columns >>>')
        print(newDataSetDF.columns)
        print('\n<<< [9] newDataSetDF >>>')
        print(newDataSetDF)

        DFtoFindPCA = newDataSetDF

    # display correlation
    # https://seaborn.pydata.org/generated/seaborn.clustermap.html
    df = DFtoFindPCA.corr()  # get correlation
    seab.clustermap(df, annot=True, cmap='RdYlBu_r', vmin=-1, vmax=1)
    plt.show()

    # to standard normal distribution
    scaled = StandardScaler().fit_transform(DFtoFindPCA)

    # PCA
    # https://medium.com/@john_analyst/pca-%EC%B0%A8%EC%9B%90-%EC%B6%95%EC%86%8C-%EB%9E%80-3339aed5afa1
    initializePCA = False  # whether a new PCA is fitted here

    if str(comp) == 'None' or str(exva) == 'None' or str(
            mean) == 'None':  # fit a new PCA if no components were provided
        pca = PCA(n_components=n_cols)
        pca.fit(scaled)

        # get components and explained variances of PCA
        comp = pca.components_
        exva = pca.explained_variance_
        mean = pca.mean_

        initializePCA = True

    # https://machinelearningmastery.com/calculate-principal-component-analysis-scratch-python/
    # print pca.components_ and pca.explained_variance_
    print('\n<<< [10] pca.components_ >>>\n' + str(comp))
    print('\n<<< [11] pca.explained_variance_ >>>\n' + str(exva))
    print('\n<<< [12] pca.mean_ >>>\n' + str(mean))

    # create PCA using comp and exva
    if initializePCA == False:
        pca = PCA(n_components=n_cols)
        pca.components_ = comp
        pca.explained_variance_ = exva
        pca.mean_ = mean

    # apply PCA to the data
    scaledPCA = pca.transform(scaled)

    print('\n<<< [13] scaledPCA.shape >>>\n' + str(scaledPCA.shape))
    print('\n<<< [14] scaledPCA.data.shape >>>\n' + str(scaledPCA.data.shape))

    print('\n<<< [15] scaledPCA >>>')
    print(scaledPCA)

    # for training data
    # (ID : ~original train data) -> (ID : ~except for validation data)
    if isTrain == True:
        print('\n<<< [15-1] dataSetDF[target] before >>>')
        print(dataSetDF[target])

        # dataFrame -> list -> dataFrame
        targetList = list(dataSetDF[target])
        targetListCopy = []
        for i in range(len(targetList)):
            targetListCopy.append(targetList[i])
        targetDF = pd.DataFrame(targetListCopy)

        print('\n<<< [15-2] dataSetDF[target] after : targetDF >>>')
        print(targetDF)

    # name each column for PCA transformed data
    pca_cols = []
    for i in range(n_cols):
        pca_cols.append('pca' + str(i))
    df_pca = pd.DataFrame(scaledPCA, columns=pca_cols)
    if isTrain == True: df_pca['target'] = targetDF

    print('\n<<< [16] df_pca >>>')
    print(df_pca)

    df_pcaCorr = df_pca.corr()
    seab.clustermap(df_pcaCorr, annot=True, cmap='RdYlBu_r', vmin=-1, vmax=1)
    plt.show()

    # for test data, return the PCA results immediately
    if isTrain == False:
        print('')
        print('+=======================+')
        print('|    Exit : makePCA     |')
        print('+=======================+')

        return (df_pca, comp, exva, mean, targetCol)

    # print data as 2d or 3d space (run only on training data)
    _PD.printDataAsSpace(n_cols, df_pca, '(PCA) training data')

    print('')
    print('+=======================+')
    print('|    Exit : makePCA     |')
    print('+=======================+')

    return (df_pca, comp, exva, mean, targetCol)
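# ---------------------------------------------------------------------------
# A standalone sketch of how makePCA reuses comp/exva/mean: fit a PCA on the
# training data once, then transplant its parameters into a fresh PCA so the
# test data gets exactly the same projection. The data here is hypothetical.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
train, test = rng.rand(200, 10), rng.rand(50, 10)

scaler = StandardScaler().fit(train)
train_s, test_s = scaler.transform(train), scaler.transform(test)

pca_train = PCA(n_components=4).fit(train_s)
comp, exva, mean = (pca_train.components_,
                    pca_train.explained_variance_,
                    pca_train.mean_)

pca_test = PCA(n_components=4)          # rebuilt from the stored parameters
pca_test.components_ = comp
pca_test.explained_variance_ = exva
pca_test.mean_ = mean

assert np.allclose(pca_train.transform(test_s), pca_test.transform(test_s))
# ---------------------------------------------------------------------------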
Exemple #30
0
def main(filename,
         xtrains_percent=0.8,
         maxfeature=3,
         fit_ylabel=False,
         nn_estimator=100,
         sepaLabel=True,
         treeLabel=False,
         seed=42,
         pcaLabel=False,
         n_comp=2,
         sepa2=False,
         time_label=False,
         stream=False,
         sfl=False):
    inf = float("inf")
    all_start = time.time()
    rng = np.random.RandomState(seed)

    # http and smtp are loaded via a different method (HDF5 .mat files)
    if filename == '/home/anegawa/Dropbox/http.mat' or filename == '/home/anegawa/Dropbox/smtp.mat':
        mat = {}
        f = h5py.File(filename)
        for k, v in f.items():
            mat[k] = np.array(v)
        X = mat['X'].T
        y2 = mat['y'][0]
        y3 = []
        for i in range(len(y2)):
            y3.append(int(y2[i]))
        y = np.reshape(y3, [len(y3), 1])
    else:
        mat = scipy.io.loadmat(filename)
        X = mat['X']
        y = mat['y']

    rate = xtrains_percent
    max_feat = int(maxfeature)
    if max_feat == 3:
        max_feat = X.shape[1]

    if not treeLabel:
        print('X_train\'s rate : ' + str(rate))
        print('max_features : ' + str(max_feat))
        print('fit_ylabel : ' + str(fit_ylabel))
        print('nn_estimator : ' + str(nn_estimator))
        print('sepaLabel : ' + str(sepaLabel))

    clf = IsolationForest(random_state=rng)
    clf.n_estimators = nn_estimator
    clf.verbose = 0
    clf.max_features = max_feat

    if (str(filename) == '/home/anegawa/Dropbox/shuttle.mat'):
        clf.contamination = 0.07

    elif (str(filename) == '/home/anegawa/Dropbox/http.mat'):
        clf.contamination = 0.004

    elif (str(filename) == '/home/anegawa/Dropbox/pima.mat'):
        clf.contamination = 0.35

    elif (str(filename) == '/home/anegawa/Dropbox/mammography.mat'):
        clf.contamination = 0.02

    elif (str(filename) == '/home/anegawa/Dropbox/cover.mat'):
        clf.contamination = 0.009

    elif (str(filename) == '/home/anegawa/Dropbox/breastw.mat'):
        clf.contamination = 0.35

    elif (str(filename) == '/home/anegawa/Dropbox/arrhythmia.mat'):
        clf.contamination = 0.15

    elif (str(filename) == '/home/anegawa/Dropbox/ionosphere.mat'):
        clf.contamination = 0.36

    elif (str(filename) == '/home/anegawa/Dropbox/satellite.mat'):
        clf.contamination = 0.32

    elif (str(filename) == '/home/anegawa/Dropbox/annthyroid.mat'):
        clf.contamination = 0.07

    elif (str(filename) == '/home/anegawa/Dropbox/smtp.mat'):
        clf.contamination = 0.03 / 100

    else:
        raise Exception("error! cannot file it.")

    # how many cross-validation folds to run (e.g. 5 for an 8:2 split)
    # there is probably a cleaner way to compute this
    hoge = 1 / (1 - rate)
    cross_count = int(np.ceil(hoge))
    if cross_count > hoge:
        cross_count = cross_count - 1

    # running totals of AUC and accuracy over the cross_count folds
    sum_auc = 0
    sum_accuracy = 0

    pca_fit_time = 0
    pca_transform_train_time = 0
    pca_transform_test_time = 0
    test_time = 0
    fit_time = 0

    if sepaLabel == True:  # separated
        # data cut
        X_anomaly = []
        X_normal = []
        for i in range(len(X)):
            if y[i] == 1:
                X_anomaly.append(X[i])
            else:
                X_normal.append(X[i])

        cutter_anomaly = int(np.ceil(len(X_anomaly) * rate))
        cutter_normal = int(np.ceil(len(X_normal) * rate))

    for count in range(cross_count):
        if sepaLabel:
            part_anomaly = int(np.ceil(cutter_anomaly * count))
            part_normal = int(np.ceil(cutter_normal * count))
            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []

            for i, k in zip(range(len(X_anomaly)),
                            range(part_anomaly,
                                  part_anomaly + len(X_anomaly))):
                while k >= len(X_anomaly):
                    k = k - len(X_anomaly)

                if i < cutter_anomaly:
                    X_train.append(X_anomaly[k])
                    X_train_correct.append(-1)
                else:
                    X_test.append(X_anomaly[k])
                    X_test_correct.append(-1)

            for i, k in zip(range(len(X_normal)),
                            range(part_normal, part_normal + len(X_normal))):
                while k >= len(X_normal):
                    k = k - len(X_normal)

                if i < cutter_normal:
                    X_train.append(X_normal[k])
                    X_train_correct.append(1)
                else:
                    X_test.append(X_normal[k])
                    X_test_correct.append(1)

            # whether to shuffle the train/test sets
            if sfl:
                X_train_set = []
                X_test_set = []
                for i in range(len(X_train)):
                    buf = []
                    buf.append(X_train[i])
                    buf.append(X_train_correct[i])
                    X_train_set.append(buf)
                for i in range(len(X_test)):
                    buf = []
                    buf.append(X_test[i])
                    buf.append(X_test_correct[i])
                    X_test_set.append(buf)

                random.shuffle(X_train_set)
                random.shuffle(X_test_set)

                X_train = []
                X_test = []
                X_train_correct = []
                X_test_correct = []
                for i in range(len(X_train_set)):
                    X_train.append(X_train_set[i][0])
                    X_train_correct.append(X_train_set[i][1])
                for i in range(len(X_test_set)):
                    X_test.append(X_test_set[i][0])
                    X_test_correct.append(X_test_set[i][1])

        else:  # mixed
            cutter = len(X) * rate
            part = int(np.ceil(cutter * count))

            X_train = []
            X_train_correct = []
            X_test = []
            X_test_correct = []

            for i, k in zip(range(len(X)), range(part, part + len(X))):
                while k >= len(X):
                    k = k - len(X)

                if i < len(X) * rate:
                    X_train.append(X[k])
                    X_train_correct.append(y[k])
                else:
                    X_test.append(X[k])
                    X_test_correct.append(y[k])

            for q in range(len(X_train_correct)):
                j = X_train_correct[q]
                if (j == 1):
                    X_train_correct[q] = -1
                else:
                    X_train_correct[q] = 1

            for w in range(len(X_test_correct)):
                j = X_test_correct[w]
                if (j == 1):
                    X_test_correct[w] = -1
                else:
                    X_test_correct[w] = 1

        # done: finished splitting the data

        if pcaLabel:
            if sepa2:
                # if False:
                pca2 = PCA(copy=True,
                           iterated_power='auto',
                           random_state=None,
                           svd_solver='auto',
                           tol=0.0,
                           whiten=False)
                pca2.fit(X_train)
                component = pca2.components_
                component2 = np.sort(pca2.components_)
                if n_comp < len(component2):
                    pca2.components_ = component2[0:n_comp]
                X_train = pca2.transform(X_train)
                X_test = pca2.transform(X_test)

            else:
                pca_fit_start = time.time()
                pca = PCA(copy=True,
                          iterated_power='auto',
                          n_components=n_comp,
                          random_state=None,
                          svd_solver='auto',
                          tol=0.0,
                          whiten=False)
                pca.fit(X_train)
                pca_fit_finish = time.time()

                pca_transform_train_start = time.time()
                X_train = pca.transform(X_train)
                pca_transform_train_finish = time.time()

            clf.max_features = n_comp
            pca_fit_time += (pca_fit_finish - pca_fit_start)
            pca_transform_train_time += (pca_transform_train_finish -
                                         pca_transform_train_start)

        fit_start = time.time()
        # fit_ylabel is fixed to False
        if fit_ylabel:
            clf.fit(X_train, X_train_correct, sample_weight=None)
        else:
            clf.fit(X_train, y=None, sample_weight=None)
        fit_finish = time.time()
        fit_time += (fit_finish - fit_start)

        if stream:
            sum_score_auc = []
            sum_score_acc = []

            for i in range(len(X_test)):
                if pcaLabel:
                    pca_transform_test_start = time.time()
                    a = [X_test[i]]
                    X_test_pca = pca.transform(a)
                    pca_transform_test_finish = time.time()
                    pca_transform_test_time += (pca_transform_test_finish -
                                                pca_transform_test_start)

                else:
                    X_test_pca = [X_test[i]]

                test_start = time.time()
                y_pred_test, a_score = clf.predict(X_test_pca)
                test_finish = time.time()
                test_time += (test_finish - test_start)

                sum_score_auc.append(a_score)
                sum_score_acc.append(y_pred_test)
            a_score = sum_score_auc
            y_pred_test = sum_score_acc

        else:  # batch
            if pcaLabel:
                pca_transform_test_start = time.time()
                X_test = pca.transform(X_test)  # batch: transform the whole test set at once
                pca_transform_test_finish = time.time()
                pca_transform_test_time += (pca_transform_test_finish -
                                            pca_transform_test_start)

            test_start = time.time()
            y_pred_test, a_score = clf.predict(X_test)
            # a_score = clf.decision_function(X_test)
            test_finish = time.time()
            test_time += (test_finish - test_start)

        acc = calc_accuracy(X_test_correct, y_pred_test, treeLabel)
        AUC = calc_AUC(X_test_correct, a_score, treeLabel)
        sum_auc += AUC
        sum_accuracy += acc

    # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # # plot the line, the samples, and the nearest vectors to the plane
    #
    # X_train = np.array(X_train)
    # X_test = np.array(X_test)
    #
    # lim = True
    # x = (-200, 200)
    # y = (-200, 300)
    #
    # for i,j in zip(range(2), [True, False]):
    #     small = j  # True selects the "smallest" components
    #
    #     plt.subplot(2, 2, i+1)
    #     if small:
    #         plt.title("smallest")
    #     else:
    #         plt.title("largest")
    #
    #     if small:
    #         # b1 = plt.scatter(X_train[:, X_train.shape[1]-1], X_train[:, X_train.shape[1]-2], c='white', s=20, edgecolor='k')
    #         b2 = plt.scatter(X_test[:, X_test.shape[1]-1], X_test[:, X_test.shape[1]-2], c='green', s=20, edgecolor='k')
    #     else:
    #         # b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
    #         b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
    #     # c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k')
    #     plt.axis('tight')
    #     if lim:
    #         plt.xlim(x)
    #         plt.ylim(y)
    #     # plt.legend([b1, b2],
    #     #            ["training observations",
    #     #             "testing observations"],
    #     #            loc="upper left")
    #     plt.legend([b2],["testing observations"],
    #                loc="upper left")
    #     # plt.legend([b1], ["training observations"],
    #     #            loc="upper left")
    #
    #
    #
    #     plt.subplot(2, 2, i+3)
    #     if small:
    #         b1 = plt.scatter(X_train[:, X_train.shape[1]-1], X_train[:, X_train.shape[1]-2], c='white', s=20, edgecolor='k')
    #         # b2 = plt.scatter(X_test[:, X_test.shape[1] - 1], X_test[:, X_test.shape[1] - 2], c='green', s=20, edgecolor='k')
    #     else:
    #         b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
    #         # b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
    #     # c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k')
    #     plt.axis('tight')
    #     if lim:
    #         plt.xlim(x)
    #         plt.ylim(y)
    #     # plt.legend([b1, b2],
    #     #            ["training observations",
    #     #             "testing observations"],
    #     #            loc="upper left")
    #     # plt.legend([b2], ["testing observations"],
    #     #            loc="upper left")
    #     plt.legend([b1], ["training observations"],
    #                loc="upper left")
    # plt.show()
    # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    auc2 = sum_auc / cross_count
    acc2 = sum_accuracy / cross_count

    # calc time
    all_finish = time.time()
    all_time = all_finish - all_start
    pca_fit_time = pca_fit_time / cross_count
    pca_transform_train_time = pca_transform_train_time / cross_count
    pca_transform_test_time = pca_transform_test_time / cross_count
    test_time = test_time / cross_count
    fit_time = fit_time / cross_count
    sum_train_time = fit_time + pca_fit_time + pca_transform_train_time
    sum_test_time = pca_transform_test_time + test_time
    # print("sum_train_time : " + str(sum_train_time))
    # print("pca_transform_train_time : " + str(pca_transform_train_time))
    # print("pca_fit_time : " + str(pca_fit_time))
    # print("test_time : " + str(test_time))
    # print("fit_time : " + str(fit_time))
    # print("all_time : " + str(all_time))

    if time_label:
        return all_time, pca_fit_time + pca_transform_train_time, fit_time, pca_transform_test_time, test_time, sum_train_time, sum_test_time
    elif treeLabel:
        if math.isnan(auc2):
            raise Exception("error! auc is NaN!.")
        return auc2
    else:
        return auc2, acc2
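# ---------------------------------------------------------------------------
# A minimal sketch of the PCA + IsolationForest evaluation above, on
# hypothetical data. Note that stock sklearn IsolationForest.predict returns
# only labels; the example above appears to rely on a modified predict, so
# decision_function is used here to obtain anomaly scores for the AUC.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, roc_auc_score

rng = np.random.RandomState(42)
X_train, X_test = rng.rand(400, 20), rng.rand(100, 20)
y_test = rng.choice([1, -1], size=100, p=[0.9, 0.1])   # 1 = normal, -1 = anomaly

pca = PCA(n_components=2).fit(X_train)
X_train_p, X_test_p = pca.transform(X_train), pca.transform(X_test)

clf = IsolationForest(n_estimators=100, random_state=rng)
clf.fit(X_train_p)

y_pred = clf.predict(X_test_p)                # +1 normal, -1 anomaly
scores = clf.decision_function(X_test_p)      # higher means more normal
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test == -1, -scores)    # rank anomalies highest
print(acc, auc)
# ---------------------------------------------------------------------------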
Exemple #31
0
            train_data_downsampled = train_data_normalized[:, :, ::
                                                           decim_factor]
            test_data_downsampled = test_data_normalized[:, :, ::decim_factor]

            train_x = train_data_downsampled.reshape(
                train_data_downsampled.shape[0],
                -1)  # put the last dimension into the preceding one
            test_x = test_data_downsampled.reshape(
                test_data_downsampled.shape[0],
                -1)  # put the last dimension into the preceding one

            # next: apply PCA
            if True:
                pca = PCA(0.95)
                pca.fit(train_x)
                pca.components_ = -pca.components_  # flip component signs (the sign of each PC is arbitrary)
                train_x = pca.transform(train_x)
                test_x = pca.transform(test_x)

            # oversampling the least present sample
            if False:
                idx_offset = balance_idx(train_label)
                oversampled_train_label = np.append(train_label,
                                                    train_label[idx_offset])
                oversampled_train_x = np.concatenate(
                    (train_x, train_x[idx_offset]), 0)
                train_label = oversampled_train_label
                train_x = oversampled_train_x

            cls.fit(train_x, np.unique(train_label))
            # cls.fit( oversampled_train_x, oversampled_train_label )
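# ---------------------------------------------------------------------------
# A standalone sketch of why the `pca.components_ = -pca.components_` line
# above is harmless: PCA component signs are arbitrary, so flipping them only
# negates the projected coordinates and leaves reconstructions unchanged.
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).rand(100, 6)

pca = PCA(0.95).fit(X)               # keep components explaining 95% of variance
Z = pca.transform(X)
recon = pca.inverse_transform(Z)

pca.components_ = -pca.components_   # flip the sign of every component
Z_flipped = pca.transform(X)
recon_flipped = pca.inverse_transform(Z_flipped)

assert np.allclose(Z_flipped, -Z)            # coordinates simply negate
assert np.allclose(recon_flipped, recon)     # reconstructions are identical
# ---------------------------------------------------------------------------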
Exemple #32
0
    def run(self) -> None:
        """
        Run method of the module. The position and contrast of a planet is measured by injecting
        negative copies of the PSF template and applying a simplex method (Nelder-Mead) for
        minimization of a figure of merit at the planet location.

        Returns
        -------
        NoneType
            None
        """

        for item in self.m_res_out_port:
            item.del_all_data()
            item.del_all_attributes()

        for item in self.m_flux_pos_port:
            item.del_all_data()
            item.del_all_attributes()

        parang = self.m_image_in_port.get_attribute('PARANG')
        pixscale = self.m_image_in_port.get_attribute('PIXSCALE')

        aperture = (self.m_position[1], self.m_position[0], self.m_aperture/pixscale)

        self.m_sigma /= pixscale

        if self.m_cent_size is not None:
            self.m_cent_size /= pixscale

        if self.m_edge_size is not None:
            self.m_edge_size /= pixscale

        psf = self.m_psf_in_port.get_all()
        images = self.m_image_in_port.get_all()

        if psf.shape[0] != 1 and psf.shape[0] != images.shape[0]:
            raise ValueError('The number of frames in psf_in_tag does not match with the number '
                             'of frames in image_in_tag. The DerotateAndStackModule can be '
                             'used to average the PSF frames (without derotating) before applying '
                             'the SimplexMinimizationModule.')

        center = center_subpixel(psf)

        if self.m_reference_in_port is not None and self.m_merit != 'poisson':
            raise NotImplementedError('The reference_in_tag can only be used in combination with '
                                      'the \'poisson\' figure of merit.')

        def _objective(arg, count, n_components, sklearn_pca):
            pos_y = arg[0]
            pos_x = arg[1]
            mag = arg[2]

            sep_ang = cartesian_to_polar(center, pos_y, pos_x)

            fake = fake_planet(images=images,
                               psf=psf,
                               parang=parang,
                               position=(sep_ang[0], sep_ang[1]),
                               magnitude=mag,
                               psf_scaling=self.m_psf_scaling)

            mask = create_mask(fake.shape[-2:], (self.m_cent_size, self.m_edge_size))

            if self.m_reference_in_port is None:
                im_res_rot, im_res_derot = pca_psf_subtraction(images=fake*mask,
                                                               angles=-1.*parang+self.m_extra_rot,
                                                               pca_number=n_components,
                                                               pca_sklearn=sklearn_pca,
                                                               im_shape=None,
                                                               indices=None)

            else:
                im_reshape = np.reshape(fake*mask, (im_shape[0], im_shape[1]*im_shape[2]))

                im_res_rot, im_res_derot = pca_psf_subtraction(images=im_reshape,
                                                               angles=-1.*parang+self.m_extra_rot,
                                                               pca_number=n_components,
                                                               pca_sklearn=sklearn_pca,
                                                               im_shape=im_shape,
                                                               indices=None)

            res_stack = combine_residuals(method=self.m_residuals,
                                          res_rot=im_res_derot,
                                          residuals=im_res_rot,
                                          angles=parang)

            self.m_res_out_port[count].append(res_stack, data_dim=3)

            chi_square = merit_function(residuals=res_stack[0, ],
                                        merit=self.m_merit,
                                        aperture=aperture,
                                        sigma=self.m_sigma)

            position = rotate_coordinates(center, (pos_y, pos_x), -self.m_extra_rot)

            res = np.asarray([position[1],
                              position[0],
                              sep_ang[0]*pixscale,
                              (sep_ang[1]-self.m_extra_rot) % 360.,
                              mag,
                              chi_square])

            self.m_flux_pos_port[count].append(res, data_dim=2)

            sys.stdout.write('\rSimplex minimization... ')
            sys.stdout.write(f'{n_components} PC - chi^2 = {chi_square:.8E}')
            sys.stdout.flush()

            return chi_square

        pos_init = rotate_coordinates(center,
                                      (self.m_position[1], self.m_position[0]),  # (y, x)
                                      self.m_extra_rot)

        for i, n_components in enumerate(self.m_pca_number):
            sys.stdout.write(f'\rSimplex minimization... {n_components} PC ')
            sys.stdout.flush()

            if self.m_reference_in_port is None:
                sklearn_pca = None

            else:
                ref_data = self.m_reference_in_port.get_all()

                im_shape = images.shape
                ref_shape = ref_data.shape

                if ref_shape[1:] != im_shape[1:]:
                    raise ValueError('The image size of the science data and the reference data '
                                     'should be identical.')

                # reshape reference data and select the unmasked pixels
                ref_reshape = ref_data.reshape(ref_shape[0], ref_shape[1]*ref_shape[2])

                mean_ref = np.mean(ref_reshape, axis=0)
                ref_reshape -= mean_ref

                # create the PCA basis
                sklearn_pca = PCA(n_components=n_components, svd_solver='arpack')
                sklearn_pca.fit(ref_reshape)

                # add mean of reference array as 1st PC and orthogonalize it to the PCA basis
                mean_ref_reshape = mean_ref.reshape((1, mean_ref.shape[0]))

                q_ortho, _ = np.linalg.qr(np.vstack((mean_ref_reshape,
                                                     sklearn_pca.components_[:-1, ])).T)

                sklearn_pca.components_ = q_ortho.T

            minimize(fun=_objective,
                     x0=[pos_init[0], pos_init[1], self.m_magnitude],
                     args=(i, n_components, sklearn_pca),
                     method='Nelder-Mead',
                     tol=None,
                     options={'xatol': self.m_tolerance, 'fatol': float('inf')})

        sys.stdout.write(' [DONE]\n')
        sys.stdout.flush()

        history = f'merit = {self.m_merit}'

        for item in self.m_flux_pos_port:
            item.copy_attributes(self.m_image_in_port)
            item.add_history('SimplexMinimizationModule', history)

        for item in self.m_res_out_port:
            item.copy_attributes(self.m_image_in_port)
            item.add_history('SimplexMinimizationModule', history)

        self.m_res_out_port[0].close_port()
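# ---------------------------------------------------------------------------
# A standalone sketch (on hypothetical reference frames) of the basis
# construction in the reference branch above: the mean reference frame is
# prepended to the PCA components, the last PC is dropped to keep the size
# fixed, and the stack is orthonormalized with a QR decomposition.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
ref = rng.rand(50, 400)              # 50 flattened reference frames

mean_ref = ref.mean(axis=0)
ref_centered = ref - mean_ref

n_components = 10
pca = PCA(n_components=n_components, svd_solver='arpack')
pca.fit(ref_centered)

stacked = np.vstack((mean_ref[np.newaxis, :], pca.components_[:-1]))
q_ortho, _ = np.linalg.qr(stacked.T)
pca.components_ = q_ortho.T          # orthonormal basis whose first vector
                                     # spans the mean reference frame

assert np.allclose(pca.components_ @ pca.components_.T, np.eye(n_components))
# ---------------------------------------------------------------------------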
def run_to_task(task_to):

    import general_utils
    from   general_utils import RuntimeDeterminedEnviromentVars
    import models.architectures as architectures
    from   data.load_ops import resize_rescale_image
    import utils
    from   data.task_data_loading import load_and_specify_preprocessors_for_representation_extraction
    import lib.data.load_ops as load_ops
    import pdb
    global synset
    synset_1000 = [" ".join(i.split(" ")[1:]) for i in synset]
    select = np.asarray([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,
        1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,
        1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,
        0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  1.,
        0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.])

    with open('/home/ubuntu/task-taxonomy-331b/lib/data/places_class_names.txt', 'r') as fp:
        synset_places = [x.rstrip()[4:-1] for x,y in zip(fp.readlines(), select) if y == 1.]

    
    tf.logging.set_verbosity(tf.logging.ERROR)
   
    args = parser.parse_args()
    if args.task != 'NONE':
        args.idx = list_of_tasks.index(args.task)
    for idx, task in enumerate(list_of_tasks):
        if idx != args.idx and args.idx != -1:
            continue
        if task == 'class_places':
            synset = synset_places
        elif task == 'class_1000':
            synset = synset_1000
        print("Doing {task}".format(task=task))
        general_utils = importlib.reload(general_utils)
        tf.reset_default_graph()
        training_runners = { 'sess': tf.InteractiveSession(), 'coord': tf.train.Coordinator() }

        # task = '{f}__{t}__{hs}'.format(f=task_from, t=task_to, hs=args.hs)
        CONFIG_DIR = '/home/ubuntu/task-taxonomy-331b/experiments/final/{TASK}'.format(TASK=task)

        ############## Load Configs ##############
        cfg = utils.load_config( CONFIG_DIR, nopause=True )
        RuntimeDeterminedEnviromentVars.register_dict( cfg )
        split_file = os.path.join('/home/ubuntu/task-taxonomy-331b/assets/aws_data/', 'video2_info.pkl')
        cfg['train_filenames'] = split_file
        cfg['val_filenames'] = split_file
        cfg['test_filenames'] = split_file 

        cfg['num_epochs'] = 2
        cfg['randomize'] = False
        root_dir = cfg['root_dir']
        cfg['num_read_threads'] = 1
        print(cfg['log_root'])
        cfg['model_path'] = os.path.join(
                cfg['log_root'],
                task,
                'model.permanent-ckpt'
            )

        print( cfg['model_path'])
        if cfg['model_path'] is None:
            continue
        cfg['dataset_dir'] = '/home/ubuntu'
        cfg['preprocess_fn'] = load_and_specify_preprocessors_for_representation_extraction
        ############## Set Up Inputs ##############
        # tf.logging.set_verbosity( tf.logging.INFO )
        inputs = utils.setup_input( cfg, is_training=ON_TEST_SET, use_filename_queue=False ) # is_training determines whether to use train/validation
        RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        RuntimeDeterminedEnviromentVars.populate_registered_variables()
        start_time = time.time()
        # utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

        ############## Set Up Model ##############
        model = utils.setup_model( inputs, cfg, is_training=IN_TRAIN_MODE )
        m = model[ 'model' ]
        model[ 'saver_op' ].restore( training_runners[ 'sess' ], cfg[ 'model_path' ] )

        ############## Start dataloading workers ##############
        data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn( 
            inputs, cfg, is_training=ON_TEST_SET, use_filename_queue=False )

        prefetch_threads = threading.Thread(
            target=data_prefetch_init_fn,
            args=( training_runners[ 'sess' ], training_runners[ 'coord' ] ))
        prefetch_threads.start()
       
        list_of_fname = np.load('/home/ubuntu/task-taxonomy-331b/assets/aws_data/video2_fname.npy')
        import errno

        try:
            os.mkdir('/home/ubuntu/{}'.format(task))
            os.mkdir('/home/ubuntu/{}/vid1'.format(task))
            os.mkdir('/home/ubuntu/{}/vid2'.format(task))
            os.mkdir('/home/ubuntu/{}/vid3'.format(task))
            os.mkdir('/home/ubuntu/{}/vid4'.format(task))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        curr_comp = np.zeros((3,64))
        curr_fit_img = np.zeros((256,256,3))
        embeddings = []
        ############## Run First Batch ##############

        for step_num in range(inputs['max_steps'] - 1):
        #for step_num in range(1):
            #if step_num > 0 and step_num % 20 == 0:
            print(step_num)
            if not hasattr(m, 'masks'):
                ( 
                    input_batch, target_batch, 
                    data_idx, 
                    predicted, loss,
                ) = training_runners['sess'].run( [ 
                    m.input_images, m.targets,
                    model[ 'data_idxs' ], 
                    m.decoder_output, m.total_loss] )
                mask_batch = 1.
            else:
                ( 
                    input_batch, target_batch, mask_batch,
                    data_idx, 
                    predicted, loss,
                ) = training_runners['sess'].run( [ 
                    m.input_images, m.targets, m.masks,
                    model[ 'data_idxs' ], 
                    m.decoder_output, m.total_loss] )

            if task == 'segment2d' or task == 'segment25d':
                from sklearn.decomposition import PCA
                # Project the 64-d per-pixel embeddings down to 3 PCA channels so they
                # can be saved as RGB images; the components are permuted each frame so
                # that the colors stay consistent with the previous frame.
                x = np.zeros((32, 256, 256, 3), dtype='float')
                k_embed = 8
                for i in range(predicted.shape[0]):
                    embedding_flattened = np.squeeze(predicted[i]).reshape((-1, 64))
                    # Sliding window of the last k_embed frames, so the PCA basis drifts slowly.
                    embeddings.append(embedding_flattened)
                    if len(embeddings) > k_embed:
                        embeddings.pop(0)
                    pca = PCA(n_components=3)
                    pca.fit(np.vstack(embeddings))
                    # Try every ordering of the 3 components and keep the one whose
                    # projection is closest to the previous frame's image.
                    min_order = None
                    min_dist = float('inf')
                    copy_of_comp = np.copy(pca.components_)
                    for order in itertools.permutations([0, 1, 2]):
                        pca.components_ = copy_of_comp[list(order), :]
                        lower_dim = pca.transform(embedding_flattened).reshape((256, 256, -1))
                        lower_dim = (lower_dim - lower_dim.min()) / (lower_dim.max() - lower_dim.min())
                        dist = np.linalg.norm(lower_dim - curr_fit_img)
                        if dist < min_dist:
                            min_order = list(order)
                            min_dist = dist
                    pca.components_ = copy_of_comp[min_order, :]
                    lower_dim = pca.transform(embedding_flattened).reshape((256, 256, -1))
                    lower_dim = (lower_dim - lower_dim.min()) / (lower_dim.max() - lower_dim.min())
                    curr_fit_img = np.copy(lower_dim)
                    x[i] = lower_dim
                predicted = x
            if task == 'curvature':
                std = [31.922, 21.658]
                mean = [123.572, 120.1]
                predicted = (predicted * std) + mean
                predicted[:,0,0,:] = 0.
                predicted[:,1,0,:] = 1.
                predicted = np.squeeze(np.clip(predicted.astype(int) / 255., 0., 1. )[:,:,:,0])

            just_rescale = ['autoencoder', 'denoise', 'edge2d', 
                            'edge3d', 'keypoint2d', 'keypoint3d',
                            'reshade', 'rgb2sfnorm']
            if task in just_rescale:
                predicted = (predicted + 1.) / 2.
                predicted = np.clip(predicted, 0., 1.)
                predicted[:,0,0,:] = 0.
                predicted[:,1,0,:] = 1.


            just_clip = ['rgb2depth', 'rgb2mist']
            if task in just_clip:
                predicted[:,0,0,:] = 0.
                predicted[:,1,0,:] = 1.

            if task == 'segmentsemantic_rb':
                label = np.argmax(predicted, axis=-1)
                COLORS = ('white','red', 'blue', 'yellow', 'magenta', 
                        'green', 'indigo', 'darkorange', 'cyan', 'pink', 
                        'yellowgreen', 'black', 'darkgreen', 'brown', 'gray',
                        'purple', 'darkviolet')
                rgb = (input_batch + 1.) / 2.
                preds = [color.label2rgb(np.squeeze(x), np.squeeze(y), colors=COLORS, kind='overlay')[np.newaxis,:,:,:] for x,y in zip(label, rgb)]
                predicted = np.vstack(preds) 

            if task in ['class_1000', 'class_places']:
                for file_idx, predict_output in zip(data_idx, predicted):
                    to_store_name = list_of_fname[file_idx].decode('utf-8').replace('video', task)
                    to_store_name = os.path.join('/home/ubuntu', to_store_name)
                    sorted_pred = np.argsort(predict_output)[::-1]
                    top_5_pred = [synset[sorted_pred[i]] for i in range(5)]
                    to_print_pred = "Top 5 prediction: \n {}\n {}\n {}\n {} \n {}".format(*top_5_pred)
                    img = Image.new('RGBA', (400, 200), (255, 255, 255))
                    d = ImageDraw.Draw(img)
                    fnt = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSerifCondensed.ttf', 25)
                    d.text((20, 5), to_print_pred, fill=(255, 0, 0), font=fnt)
                    img.save(to_store_name, 'PNG')
            else:
                for file_idx, predict_output in zip(data_idx, predicted):
                    to_store_name = list_of_fname[file_idx].decode('utf-8').replace('video', task)
                    to_store_name = os.path.join('/home/ubuntu', to_store_name)
                    scipy.misc.toimage(np.squeeze(predict_output), cmin=0.0, cmax=1.0).save(to_store_name)

        subprocess.call('tar -czvf /home/ubuntu/{t}.tar.gz /home/ubuntu/{t}'.format(t=task), shell=True)
        subprocess.call('aws s3 cp /home/ubuntu/{t}.tar.gz s3://task-preprocessing-512-oregon/video2/'.format(t=task), shell=True)
        subprocess.call('ffmpeg -r 29.97 -f image2 -s 256x256 -i /home/ubuntu/{t}/vid2/020%04d.png -vcodec libx264 -crf 15  -pix_fmt yuv420p {t}_2.mp4'.format(t=task), shell=True)
        subprocess.call('aws s3 cp {t}_2.mp4 s3://task-preprocessing-512-oregon/video2/'.format(t=task), shell=True)

                

        ############## Clean Up ##############
        training_runners[ 'coord' ].request_stop()
        training_runners[ 'coord' ].join()
        
        # if os.path.isfile(pickle_dir): 
        #     with open(pickle_dir, 'rb') as fp:
        #         all_outputs = pickle.load(fp)
                
        ############## Store to dict ##############
        
        print("Done: {}".format(task))
        # os.system("sudo cp {d} /home/ubuntu/s3/model_log".format(d=pickle_dir))

        ############## Reset graph and paths ##############            
        tf.reset_default_graph()
        training_runners['sess'].close()

    return
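# A minimal, self-contained sketch of the recoloring idea used in the segment2d branch
# above: since the order of principal components is arbitrary from one PCA fit to the
# next, each frame's embeddings are re-projected under every permutation of the 3
# components and the permutation closest to the previous frame's image is kept, so the
# pseudo-RGB colors stay stable over time. The helper name `colorize_frames`, the frame
# size, and the embedding dimension below are illustrative assumptions, not taken from
# the original code.
import itertools
import numpy as np
from sklearn.decomposition import PCA

def colorize_frames(frames, height, width, history=8):
    """Map per-frame (height*width, dim) embeddings to pseudo-RGB images, permuting
    the 3 principal components so colors stay consistent across consecutive frames."""
    window, prev_img, images = [], None, []
    for emb in frames:
        window.append(emb)                       # sliding window of recent frames
        if len(window) > history:
            window.pop(0)
        pca = PCA(n_components=3)
        pca.fit(np.vstack(window))
        base = np.copy(pca.components_)
        best_img, best_dist = None, np.inf
        for order in itertools.permutations(range(3)):
            pca.components_ = base[list(order), :]   # reorder the components in place
            img = pca.transform(emb)
            img = (img - img.min()) / (img.max() - img.min() + 1e-8)
            dist = 0.0 if prev_img is None else np.linalg.norm(img - prev_img)
            if dist < best_dist:
                best_img, best_dist = img, dist
        prev_img = best_img
        images.append(best_img.reshape(height, width, 3))
    return images

# Toy usage: four 16x16 frames with 64-d embeddings per pixel.
toy_frames = [np.random.rand(16 * 16, 64) for _ in range(4)]
toy_rgb = colorize_frames(toy_frames, 16, 16)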
Exemple #34
0
def grid_search_approach(technique,
                         n,
                         clf,
                         parameters,
                         X,
                         y,
                         test_size,
                         random_state,
                         cv=7,
                         iid=False,
                         n_jobs=-1,
                         sss_flag=False,
                         type_classifier=None):
    '''Performs a grid search over a given classifier or pipeline and a dictionary of hyper-parameters,
    repeating the search for each requested number of retained principal components.

    Params:
    -------
        - technique: string of the form "<technique name>,<preprocessing method>", used only for logging.

        - n: number or list of numbers of principal components to retain at each grid-search attempt.

        - clf: scikit-learn estimator or Pipeline object, made up of all the operations to be performed in a given order.

        - parameters: dict mapping hyper-parameter names to the lists of values to try.

        - X, y: feature matrix and target labels.

        - test_size, random_state: arguments forwarded to the train/test split.

        - cv: integer, default=7, number of cross-validation folds; the reported score is the mean over the folds.

        - iid: boolean, default=False, whether the input data should be treated as independent and
               identically distributed samples.

        - n_jobs: integer, default=-1, number of CPU cores the grid search may use on the machine
                  where the training script is launched (-1 uses all available cores).

        - sss_flag: boolean, default=False, if True the train/test split (and the cross-validation) is stratified.

        - type_classifier: label forwarded to evaluate_best_current_model_ for the final evaluation.
    '''
    # === Splitting dataset into training and test sets, respectively, both features and labels === #
    if sss_flag is False:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
    else:
        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=test_size,
                                     random_state=random_state)
        sss.get_n_splits(X, y)

        for train_index, test_index in sss.split(X, y):
            # print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

        # sss = StratifiedShuffleSplit(n_splits=cv, test_size=test_size * 2, random_state=random_state)
        # cv = sss.split(X_train, y_train)

    # === Performing only once for all Principal Component === #
    n_components = X.shape[1]
    pca = PCA(n_components=n_components)

    pca = pca.fit(X_train)
    backup_pcs_ = copy.copy(pca.components_)
    # print(f"Shape principal componets: {backup_pcs_.shape}")

    print(
        f'==== GRID SEARCH METHOD APPLIED ON: {technique.split(",")[0]} Technique ===='
    )
    print(
        f'==== PREPROCESSING METHOD: {technique.split(",")[1]} Technique ====')
    for pos, n_components in enumerate(n):
        print('\n', "*" * 20, sep='')
        print(f"Grid Search attempt no. : {pos+1}")
        tmp_cv = cv

        # === Preparing the feature space by zeroing out the non-retained principal components === #
        n_discarded = len(pca.components_[n_components:])
        pca.components_[n_components:] = [[0] * X.shape[1]] * n_discarded

        X_train_pca = pca.transform(X_train)

        if sss_flag is True:
            sss = StratifiedShuffleSplit(n_splits=cv,
                                         test_size=test_size * 0.5,
                                         random_state=random_state)
            cv = sss.split(X_train, y_train)

        # === Performing training phase === #
        gs_clf = GridSearchCV(clf,
                              parameters,
                              cv=cv,
                              iid=iid,
                              n_jobs=n_jobs)
        gs_clf = gs_clf.fit(X_train_pca, y_train)

        # === Evaluating performances === #
        predicted = gs_clf.predict(pca.transform(X_test))

        print("--- Classification Report ---")
        print(
            metrics.classification_report(
                y_test, predicted, target_names=['negative', 'positive']))

        print("--- Confusion Matrix ---")
        print(metrics.confusion_matrix(y_test, predicted))
        print(f"Accuracy: {np.mean(predicted == y_test)}")

        print(f"Best Score: {gs_clf.best_score_}")

        print("--- Best Params ---")
        print(f"n_components: {n_components}")
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

        try:
            evaluate_best_current_model_(X, y, pca, gs_clf, test_size,
                                         random_state, type_classifier)
        except Exception as err:
            print(err)

        # === Restoring all principal components for the next attempt === #
        pca.components_ = copy.copy(backup_pcs_)
        cv = tmp_cv
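# A minimal sketch of the component-zeroing trick used in grid_search_approach above:
# instead of refitting PCA for each number of retained components, the trailing rows of
# pca.components_ are set to zero before transform, then restored from a backup. The
# synthetic data below is purely illustrative.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 10)

pca_demo = PCA(n_components=X_demo.shape[1]).fit(X_demo)
backup_components = np.copy(pca_demo.components_)

keep = 3
pca_demo.components_[keep:] = 0.0             # zero out the discarded components
zeroed = pca_demo.transform(X_demo)           # trailing coordinates become exactly 0

pca_demo.components_ = backup_components[:keep]    # equivalent: truncate the basis
truncated = pca_demo.transform(X_demo)

assert np.allclose(zeroed[:, :keep], truncated)
assert np.allclose(zeroed[:, keep:], 0.0)

pca_demo.components_ = np.copy(backup_components)  # restore, as the original code does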
Exemple #35
0
# find correspondence between the skirt and the SMPL body
import pickle
import os.path as osp
import numpy as np
import cv2
from utils.rotation import get_Apose
from global_var import ROOT
from smpl_torch import SMPLNP, TorchSMPL4Garment
from sklearn.decomposition import PCA
from utils.renderer import Renderer

# load template skirt
style_model = np.load(osp.join(ROOT, 'skirt_female', 'style_model.npz'))
pca = PCA(n_components=4)
pca.components_ = style_model['pca_w']
pca.mean_ = style_model['mean']
skirt_v = pca.inverse_transform(np.zeros([1, 4])).reshape([-1, 3])

# move the skirt to the right position
with open(osp.join(ROOT, 'garment_class_info.pkl'), 'rb') as f:
    garment_meta = pickle.load(f)
skirt_f = garment_meta['skirt']['f']
vert_indices = garment_meta['pant']['vert_indices']
up_bnd_inds = np.load(osp.join(ROOT, 'skirt_upper_boundary.npy'))
pant_up_bnd_inds = np.load(osp.join(ROOT, 'pant_upper_boundary.npy'))
waist_body_inds = vert_indices[pant_up_bnd_inds]

smpl = SMPLNP(gender='female')
apose = get_Apose()
body_v, _ = smpl(np.zeros([300]), apose, None, None)
trans = np.mean(body_v[waist_body_inds], 0, keepdims=True) - np.mean(
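# The snippet above is cut off mid-line, but its core pattern (rebuilding a PCA decoder
# purely from stored components_ and mean_ arrays and calling inverse_transform) can be
# sketched in isolation. The shapes and random arrays below are illustrative stand-ins
# for style_model['pca_w'] and style_model['mean'], not the real data.
import numpy as np
from sklearn.decomposition import PCA

n_verts, n_comp = 1000, 4
rng = np.random.RandomState(0)
pca_w_demo = rng.randn(n_comp, n_verts * 3) * 0.01   # stand-in for style_model['pca_w']
mean_demo = rng.randn(n_verts * 3)                    # stand-in for style_model['mean']

style_pca = PCA(n_components=n_comp)
style_pca.components_ = pca_w_demo
style_pca.mean_ = mean_demo

# A zero style code decodes to the mean garment shape; nonzero codes add style variation.
mean_shape = style_pca.inverse_transform(np.zeros([1, n_comp])).reshape([-1, 3])
styled_shape = style_pca.inverse_transform(np.array([[1.0, 0.0, 0.0, 0.0]])).reshape([-1, 3])
assert np.allclose(mean_shape.ravel(), mean_demo)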
Exemple #36
0
def compute_features(signals, dataset_type, sfreq, l_freq, h_freq, decim_factor, shiftFactor, scaleFactor, pca, tmin,
                     tmax, tlow, thigh, filter_method, verbose=False):
    '''
    Compute the features fed into the classifier, for training or testing purposes.
    It performs filtering, cropping, DC removal, normalization, downsampling, and finally PCA.

    Parameters
    ----------
    signals : 3D numpy array
        Signals to be processed. Dimension is (trial x channel x sample)
    dataset_type: string
        Either 'train' or 'test'
    sfreq: float
        Sampling frequency of the signals, in Hz
    l_freq, h_freq: float
        Cut-off frequencies for bandpass filtering
    decim_factor: int
        Decimation factor for downsampling (e.g. 4 -> keeps one sample out of every 4)
    shiftFactor, scaleFactor: 2D array or None
        Normalization factors. Dimension is (channel x sample)
    pca: fitted PCA object or None
        Only used when dataset_type is 'test'; for 'train' a new PCA is fitted
    tmin, tmax, tlow, thigh: float
        Timing parameters, in seconds, relative to onset ( -> t=0)
                  tlow    tmin     tmax     thigh
        ---0-------|-------|--------|---------|-------
         onset             <------->
          cue               feature
                            window
                <-------------------------->
                            total window
    filter_method: string
        Either 'WINDOWED' or something else ('NC', 'LFILT')
    verbose : bool
        Verbosity level
    '''
    if filter_method == 'WINDOWED':
        signals_bp = mne.filter.band_pass_filter(signals, sfreq, l_freq, h_freq, method='fft', copy=True,
                                                 iir_params=None)
        if verbose:
            print('Compute Features: window based filtering')
    else:  # == if FILTER_METHOD = 'NC' or FILTER_METHOD = 'LFILT'
        signals_bp = signals
        if verbose:
            print('Compute Features: No filtering')

    tlow_idx = int(sfreq * tlow)
    thigh_idx = int(sfreq * thigh)
    signals_bp = signals_bp[:, :, tlow_idx:thigh_idx]

    # Crop the padding area for bp
    paddingBefore_idx = int(round(sfreq * (tmin - tlow)))
    paddingAfter_idx = int(round(sfreq * (thigh - tmax)))

    tmin_idx = int(sfreq * tmin)
    tmax_idx = int(sfreq * tmax)

    duration_idxs = tmax_idx - tmin_idx
    # signals_bp= signals_bp[:,:,paddingIdx:(signals_bp.shape[2]-paddingIdx)]
    signals_bp = signals_bp[:, :, paddingBefore_idx:paddingBefore_idx + duration_idxs]
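    # Illustrative index arithmetic (values assumed, not from the original code): with
    # sfreq=512, tlow=0.5, tmin=1.0, tmax=2.0, thigh=2.5, the crop above keeps samples
    # [256:1280] of the epoch, and this slice then keeps samples [256:768] of that
    # window, i.e. exactly the [tmin, tmax) feature window.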
    if verbose:
        print('Compute Features: Crop the padding area for BP')

    # Remove DC offset due to filtering
    for trial in range(signals_bp.shape[0]):
        for ch in range(signals_bp.shape[1]):
            signals_bp[trial, ch, :] = signals_bp[trial, ch, :] - np.mean(signals_bp[trial, ch, :])
    if verbose:
        print('Compute Features:Removed DC offset')

    # Normalization
    if dataset_type == 'train':
        (signals_normalized, trainShiftFactor, trainScaleFactor) = normalizeAcrossEpoch(signals_bp, 'MinMax')
    elif dataset_type == 'test':
        # TODO: make sure the shift and scale factors actually exist
        signals_normalized = (signals_bp - shiftFactor) / scaleFactor
        trainShiftFactor = shiftFactor
        trainScaleFactor = scaleFactor
        if verbose:
            print('Compute Features: Normalized according to given shift and scale factor')
    # Downsample
    signals_downsampling = signals_normalized[:, :, ::decim_factor]
    if verbose:
        print('Compute Features: Downsampled')

    # Merge channel and time dimension
    signals_reshaped = signals_downsampling.reshape(signals_downsampling.shape[0], -1)

    if dataset_type == 'train':
        pca = PCA(0.95)
        pca.fit(signals_reshaped)
        pca.components_ = -pca.components_  # flip the sign of the components to stay consistent with Inaki's code
        signals_pcaed = pca.transform(signals_reshaped)

    elif dataset_type == 'test':
        # PCA switch
        if pca is not None:
            signals_pcaed = pca.transform(signals_reshaped)
            if verbose:
                print('Compute Features: PCA according to given PCA factor')
        else:
            signals_pcaed = signals_reshaped

    return (signals_pcaed, pca, trainShiftFactor, trainScaleFactor)
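# A minimal sketch of the train/test PCA pattern used in compute_features above: fit
# PCA(0.95) on the training features, flip the component signs, and reuse the same
# fitted object on the test features. The synthetic arrays and the helper name below
# are illustrative assumptions, not part of the original pipeline.
import numpy as np
from sklearn.decomposition import PCA

def pca_features(data, pca=None):
    """data: (trials, channels*samples) array, already normalized and downsampled."""
    if pca is None:                              # training path: fit a new PCA
        pca = PCA(0.95)
        pca.fit(data)
        pca.components_ = -pca.components_       # sign flip, as in the original code
    return pca.transform(data), pca

rng = np.random.RandomState(0)
train_data = rng.randn(80, 200)
test_data = rng.randn(20, 200)

train_feats, fitted_pca = pca_features(train_data)
test_feats, _ = pca_features(test_data, pca=fitted_pca)
assert train_feats.shape[1] == test_feats.shape[1]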