def test_label_encoder():
    """Check LabelEncoder.transform and inverse_transform on integer labels."""
    raw_labels = [0, 1, 4, 4, 5, -1, -1]
    encoded_labels = [1, 2, 3, 3, 4, 0, 0]

    encoder = LabelEncoder()
    encoder.fit([1, 1, 4, 5, -1, 0])

    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(encoder.transform(raw_labels), encoded_labels)
    assert_array_equal(encoder.inverse_transform(encoded_labels), raw_labels)

    # 6 was not among the fitted labels, so encoding it must fail.
    assert_raises(ValueError, encoder.transform, [0, 6])
def test_label_encoder_negative_ints():
    """Check that a negative integer label round-trips through LabelEncoder."""
    raw_labels = [0, 1, 4, 4, 5, -1, -1]
    encoded_labels = [1, 2, 3, 3, 4, 0, 0]

    encoder = LabelEncoder()
    encoder.fit([1, 1, 4, 5, -1, 0])

    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(encoder.transform(raw_labels), encoded_labels)
    assert_array_equal(encoder.inverse_transform(encoded_labels), raw_labels)

    # 6 was not among the fitted labels, so encoding it must fail.
    assert_raises(ValueError, encoder.transform, [0, 6])
def test_label_encoder_empty_array():
    """Check that a fitted LabelEncoder maps empty input to empty output."""
    encoder = LabelEncoder()
    encoder.fit(np.array(["1", "2", "1", "2", "2"]))

    # Empty input to transform yields an empty result.
    encoded = encoder.transform([])
    assert_array_equal(np.array([]), encoded)

    # The same holds for the inverse direction.
    decoded = encoder.inverse_transform([])
    assert_array_equal(np.array([]), decoded)
def test_label_encoder_negative_ints():
    """Check that a negative integer label round-trips through LabelEncoder."""
    originals = [0, 1, 4, 4, 5, -1, -1]
    codes = [1, 2, 3, 3, 4, 0, 0]

    label_encoder = LabelEncoder()
    label_encoder.fit([1, 1, 4, 5, -1, 0])

    assert_array_equal(label_encoder.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(label_encoder.transform(originals), codes)
    assert_array_equal(label_encoder.inverse_transform(codes), originals)

    # 6 is absent from the fitted labels, so transform must raise.
    assert_raises(ValueError, label_encoder.transform, [0, 6])
def test_label_encoder_empty_array(values):
    """Check that an encoder fitted on `values` maps empty input to empty output.

    `values` is supplied by the caller (presumably a pytest parametrization
    that is not visible here).
    """
    encoder = LabelEncoder()
    encoder.fit(values)

    # Empty input to transform yields an empty result.
    encoded = encoder.transform([])
    assert_array_equal(np.array([]), encoded)

    # The same holds for the inverse direction.
    decoded = encoder.inverse_transform([])
    assert_array_equal(np.array([]), decoded)
def test_label_encoder_string_labels():
    """Check LabelEncoder.transform and inverse_transform on string labels."""
    cities = ["tokyo", "tokyo", "paris"]
    city_codes = [2, 2, 1]

    encoder = LabelEncoder()
    encoder.fit(["paris", "paris", "tokyo", "amsterdam"])

    assert_array_equal(encoder.classes_, ["amsterdam", "paris", "tokyo"])
    assert_array_equal(encoder.transform(cities), city_codes)
    assert_array_equal(encoder.inverse_transform(city_codes), cities)

    # "london" was not among the fitted labels, so encoding it must fail.
    assert_raises(ValueError, encoder.transform, ["london"])
def test_label_encoder_errors():
    """Check that LabelEncoder raises ValueError on invalid calls."""
    # An encoder that was never fitted must reject both directions.
    unfitted = LabelEncoder()
    for method in (unfitted.transform, unfitted.inverse_transform):
        assert_raises(ValueError, method, [])

    # After fitting, inverse_transform must reject an unseen value.
    fitted = LabelEncoder()
    fitted.fit([1, 2, 3, 1, -1])
    assert_raises(ValueError, fitted.inverse_transform, [-1])
def test_label_encoder_errors():
    """Check that LabelEncoder raises ValueError on invalid calls."""
    # Neither direction may be used before fit() has been called.
    fresh_encoder = LabelEncoder()
    assert_raises(ValueError, fresh_encoder.transform, [])
    assert_raises(ValueError, fresh_encoder.inverse_transform, [])

    # Once fitted, decoding an unseen value must fail.
    fitted_encoder = LabelEncoder()
    fitted_encoder.fit([1, 2, 3, 1, -1])
    unseen = [-1]
    assert_raises(ValueError, fitted_encoder.inverse_transform, unseen)
def test_label_encoder():
    """Check LabelEncoder.transform and inverse_transform on integer labels."""
    originals = [0, 1, 4, 4, 5, -1, -1]
    codes = [1, 2, 3, 3, 4, 0, 0]

    label_encoder = LabelEncoder()
    label_encoder.fit([1, 1, 4, 5, -1, 0])

    assert_array_equal(label_encoder.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(label_encoder.transform(originals), codes)
    assert_array_equal(label_encoder.inverse_transform(codes), originals)

    # 6 is absent from the fitted labels, so transform must raise.
    assert_raises(ValueError, label_encoder.transform, [0, 6])
def test_label_encoder_string_labels():
    """Check LabelEncoder.transform and inverse_transform on string labels."""
    names = ["tokyo", "tokyo", "paris"]
    codes = [2, 2, 1]

    label_encoder = LabelEncoder()
    label_encoder.fit(["paris", "paris", "tokyo", "amsterdam"])

    assert_array_equal(label_encoder.classes_, ["amsterdam", "paris", "tokyo"])
    assert_array_equal(label_encoder.transform(names), codes)
    assert_array_equal(label_encoder.inverse_transform(codes), names)

    # "london" is absent from the fitted labels, so transform must raise.
    assert_raises(ValueError, label_encoder.transform, ["london"])
def test_label_encoder_errors():
    """Check LabelEncoder's ValueErrors, including the unseen-label message."""
    # An encoder that was never fitted must reject both directions.
    unfitted = LabelEncoder()
    for method in (unfitted.transform, unfitted.inverse_transform):
        assert_raises(ValueError, method, [])

    # After fitting, decoding unseen values must fail with a specific message.
    fitted = LabelEncoder()
    fitted.fit([1, 2, 3, -1, 1])
    expected_message = "contains previously unseen labels"
    for unseen in ([-2], [-2, -3, -4]):
        assert_raise_message(ValueError, expected_message,
                             fitted.inverse_transform, unseen)
def preprocess(data):
    """Fill missing values and label-encode object columns of `data` in place.

    For every column of the frame:

    * ``object`` dtype: missing values are replaced by the placeholder
      ``"Não mensurado"`` and the column is replaced by the integer codes
      produced by a ``LabelEncoder`` fitted on that column;
    * ``float`` / ``int`` dtype: missing values are replaced by ``0``;
    * any other dtype: the column is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    """
    for column in data:
        dtype = data.dtypes[column]
        if dtype == object:
            # Assign the filled column back instead of calling
            # ``data[column].fillna(..., inplace=True)``: the chained in-place
            # form operates on a temporary selection and does not update
            # ``data`` when pandas Copy-on-Write is active.
            data[column] = data[column].fillna("Não mensurado")
            encoder = LabelEncoder()
            encoder.fit(data[column].tolist())
            data[column] = encoder.transform(data[column])
        elif dtype == float or dtype == int:
            data[column] = data[column].fillna(0)
    return data
def test_label_encoder():
    """Check LabelEncoder.transform / inverse_transform and input-shape errors."""
    raw_labels = [0, 1, 4, 4, 5, -1, -1]
    encoded_labels = [1, 2, 3, 3, 4, 0, 0]

    encoder = LabelEncoder()
    encoder.fit([1, 1, 4, 5, -1, 0])

    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(encoder.transform(raw_labels), encoded_labels)
    assert_array_equal(encoder.inverse_transform(encoded_labels), raw_labels)

    # 6 was not among the fitted labels, so encoding it must fail.
    assert_raises(ValueError, encoder.transform, [0, 6])

    # Refit on strings: a bare string (not a sequence of labels) is rejected.
    encoder.fit(["apple", "orange"])
    expected_message = "bad input shape"
    assert_raise_message(ValueError, expected_message,
                         encoder.transform, "apple")
def test_label_encoder_errors():
    """Check LabelEncoder's ValueErrors, including the unseen-label message."""
    # Neither direction may be used before fit() has been called.
    fresh_encoder = LabelEncoder()
    assert_raises(ValueError, fresh_encoder.transform, [])
    assert_raises(ValueError, fresh_encoder.inverse_transform, [])

    # Once fitted, decoding unseen values must fail with a specific message.
    fitted_encoder = LabelEncoder()
    fitted_encoder.fit([1, 2, 3, -1, 1])
    unseen_message = "contains previously unseen labels"
    assert_raise_message(ValueError, unseen_message,
                         fitted_encoder.inverse_transform, [-2])
    assert_raise_message(ValueError, unseen_message,
                         fitted_encoder.inverse_transform, [-2, -3, -4])
def test_label_encoder(values, classes, unknown):
    """Check LabelEncoder's transform, fit_transform and inverse_transform.

    The arguments are supplied by the caller (presumably a pytest
    parametrization that is not visible here). `values` must encode to
    ``[1, 0, 2, 0, 2]`` and `unknown` must hold labels absent from `values`.
    """
    expected_codes = [1, 0, 2, 0, 2]

    encoder = LabelEncoder()
    encoder.fit(values)
    assert_array_equal(encoder.classes_, classes)
    assert_array_equal(encoder.transform(values), expected_codes)
    assert_array_equal(encoder.inverse_transform(expected_codes), values)

    # fit_transform on a fresh encoder must agree with fit + transform.
    encoder = LabelEncoder()
    codes = encoder.fit_transform(values)
    assert_array_equal(codes, expected_codes)

    with pytest.raises(ValueError, match="unseen labels"):
        encoder.transform(unknown)
def test_label_encoder():
    """Check LabelEncoder.transform / inverse_transform and input-shape errors."""
    originals = [0, 1, 4, 4, 5, -1, -1]
    codes = [1, 2, 3, 3, 4, 0, 0]

    label_encoder = LabelEncoder()
    label_encoder.fit([1, 1, 4, 5, -1, 0])

    assert_array_equal(label_encoder.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(label_encoder.transform(originals), codes)
    assert_array_equal(label_encoder.inverse_transform(codes), originals)

    # 6 is absent from the fitted labels, so transform must raise.
    assert_raises(ValueError, label_encoder.transform, [0, 6])

    # Refit on strings: a bare string (not a sequence of labels) is rejected.
    label_encoder.fit(["apple", "orange"])
    shape_message = "bad input shape"
    assert_raise_message(ValueError, shape_message,
                         label_encoder.transform, "apple")
def test_label_encoder(values, classes, unknown):
    """Check LabelEncoder's transform, fit_transform and inverse_transform.

    The arguments are supplied by the caller (presumably a pytest
    parametrization that is not visible here). `values` must encode to
    ``[1, 0, 2, 0, 2]`` and `unknown` must hold labels absent from `values`.
    """
    codes = [1, 0, 2, 0, 2]

    fitted = LabelEncoder()
    fitted.fit(values)
    assert_array_equal(fitted.classes_, classes)
    assert_array_equal(fitted.transform(values), codes)
    assert_array_equal(fitted.inverse_transform(codes), values)

    # A second encoder, fitted and applied in one call, must agree.
    one_shot = LabelEncoder()
    assert_array_equal(one_shot.fit_transform(values), codes)

    with pytest.raises(ValueError, match="unseen labels"):
        one_shot.transform(unknown)
class LabelEncoderImpl:
    """Thin wrapper that delegates ``fit`` and ``transform`` to ``SKLModel``.

    The wrapped model is built with no hyperparameters.
    """

    def __init__(self):
        # No configurable hyperparameters; kept as a dict so it can be
        # forwarded to the wrapped constructor as keyword arguments.
        self._hyperparams = {}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model on ``X`` (and ``y`` when given); return self."""
        # Only forward ``y`` when the caller actually supplied one.
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Return the wrapped model's transform of ``X``."""
        return self._wrapped_model.transform(X)
def _conform_targets(targets):
    """
    Re-encode targets as consecutive integer codes starting at 0.

    Parameters
    ----------
    targets : array (n_targets, )

    Returns
    -------
    targets_conformed : array (n_targets, )
        integer codes assigned by the label encoder (one code per distinct
        target value, starting at 0)
    label_encoder : LabelEncoder
        fit on targets, used to invert back using
        label_encoder.inverse_transform
    """
    label_encoder = LabelEncoder().fit(targets)
    targets_conformed = label_encoder.transform(targets)
    return targets_conformed, label_encoder
def test_label_encoder_errors():
    """Check that LabelEncoder raises ValueError on invalid calls."""
    # An encoder that was never fitted must reject both directions.
    le = LabelEncoder()
    with pytest.raises(ValueError):
        le.transform([])
    with pytest.raises(ValueError):
        le.inverse_transform([])

    # Fail on unseen labels
    le = LabelEncoder()
    le.fit([1, 2, 3, -1, 1])
    msg = "contains previously unseen labels"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2])
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2, -3, -4])

    # Fail on inverse_transform("").
    # `match` is a regular expression, so the parentheses must be escaped;
    # unescaped, "()" is an empty group and the literal "()" in the message
    # would never be checked.
    msg = r"bad input shape \(\)"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform("")
def main():
    """Run the PCA experiment on one ARFF dataset and print the results.

    Steps, in order:

    1. Load the datasets, pick one, split off the last column as ground-truth
       labels, preprocess the features and integer-encode the labels.
    2. Compute the covariance matrix and its eigen-decomposition, sorted by
       decreasing eigenvalue, and pick the number of components ``k`` for the
       requested proportion of variance.
    3. Project the data with the hand-written PCA, sklearn ``PCA`` and
       sklearn ``IncrementalPCA``; reconstruct the data from each projection
       and print the relative reconstruction error.
    4. Run ``tester_kmeans`` on the original data and on each projection.
    5. Optionally plot (both switches are hard-coded to ``False``).
    """
    print('\033[1m' + 'Loading all the datasets...' + '\033[0m')
    arffs_dic = obtain_arffs('./datasets/')

    # Extract a specific database
    dataset_name = 'breast-w'  # possible datasets ('hypothyroid', 'breast-w', 'waveform')
    dat1 = arffs_dic[dataset_name]
    df1 = pd.DataFrame(dat1[0])  # original data in pandas dataframe

    # The last column holds the labels; split it off from the features.
    label_column = df1.columns[len(df1.columns) - 1]
    groundtruth_labels = df1[label_column].values  # original labels in a numpy array
    # `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
    df1 = df1.drop(label_column, axis=1)
    if dataset_name == 'hypothyroid':
        # This column only contains NaNs so does not add any value to the clustering
        df1 = df1.drop('TBG', axis=1)
    data1 = df1.values  # original data in a numpy array without labels

    load = Preprocess()
    data_x = load.preprocess_method(data1)
    data_x = data_x.astype(np.float64)

    le = LabelEncoder()
    le.fit(np.unique(groundtruth_labels))
    groundtruth_labels = le.transform(groundtruth_labels)
    num_clusters = len(np.unique(groundtruth_labels))  # Number of different labels

    # ---- Compute covariance and eigenvectors
    original_mean = np.mean(data_x, axis=0)
    cov_m = compute_covariance(data_x, original_mean)
    eig_vals, eig_vect = np.linalg.eig(cov_m)
    # Sort eigenpairs by decreasing eigenvalue and keep only the real parts.
    idxsort = eig_vals.argsort()[::-1]
    eig_vals = eig_vals[idxsort].real
    eig_vect = eig_vect[:, idxsort].real

    # ---- Decide the number of features we want to keep
    prop_variance = 0.9
    k = proportion_of_variance(eig_vals, prop_variance)
    print('\nThe value of K selected to obtain a proportion of variance = ' +
          str(prop_variance) + ' is: ' + str(k) + '\n')
    eig_vect_red = eig_vect[:, :k]  # Eigenvectors are in columns

    # ---- Reduce dimensionality of the data
    # A1) Using our implementation of PCA
    transf_data_x = np.dot(eig_vect_red.T, (data_x - original_mean).T).T

    # B1) Using the PCA implementation of sklearn
    pca = PCA(n_components=k)
    transf_data_x_sklearn = pca.fit_transform(data_x)

    # C1) Using the incremental PCA implementation of sklearn
    incrementalpca = IncrementalPCA(n_components=k)
    transf_data_x_sklearn2 = incrementalpca.fit_transform(data_x)

    # ---- Reconstruct data
    # A2) Reconstruct data with our method
    reconstruct_data_x = np.dot(eig_vect_red, transf_data_x.T)
    reconstruct_data_x = reconstruct_data_x.T + original_mean

    # B2) Reconstruct data with PCA sklearn
    reconstruct_data_x1 = np.dot(pca.components_.T, transf_data_x_sklearn.T)
    reconstruct_data_x1 = reconstruct_data_x1.T + original_mean

    # C2) Reconstruct data with incremental PCA sklearn
    reconstruct_data_x2 = np.dot(incrementalpca.components_.T,
                                 transf_data_x_sklearn2.T)
    reconstruct_data_x2 = reconstruct_data_x2.T + original_mean

    # ---- Error between original data and reconstructed data
    # The same relative L1 error is reported for each reconstruction.
    reconstructions = (
        (reconstruct_data_x, 'our implementation of PCA'),
        (reconstruct_data_x1, 'pca.fit_transform of Sklearn'),
        (reconstruct_data_x2, 'incrementalpca.fit_transform of Sklearn'),
    )
    for reconstructed, method in reconstructions:
        error = reconstructed - data_x
        total_error = (np.sum(abs(error)) / np.sum(abs(data_x))) * 100
        print(
            'The relative error after reconstructing the original matrix with K = '
            + str(k) + ' is ' + '\033[1m' + '\033[94m' +
            str(round(total_error, 2)) + '%' + '\033[0m' +
            ' [using ' + method + ']')

    # ---- Kmeans with dimensionality reduction
    separator = '\n---------------------------------------------------------------------------------------------------------'
    print(separator)
    print('K-MEANS APPLIED TO THE ORIGINAL DATA')
    tester_kmeans(data_x, groundtruth_labels)

    print(separator)
    print(
        'K-MEANS APPLIED TO THE TRANSFORMED DATA USING OUR IMPLEMENTATION OF PCA'
    )
    # These labels are reused by the last 3D plot below.
    labels = tester_kmeans(transf_data_x, groundtruth_labels)

    print(separator)
    print(
        'K-MEANS APPLIED TO THE TRANSFORMED DATA USING pca.fit_transform OF SKLEARN'
    )
    tester_kmeans(transf_data_x_sklearn, groundtruth_labels)

    print(separator)
    print(
        'K-MEANS APPLIED TO THE TRANSFORMED DATA USING incrementalpca.fit_transform OF SKLEARN'
    )
    tester_kmeans(transf_data_x_sklearn2, groundtruth_labels)
    print(separator)

    # ---- Scatter plots
    # NOTE(review): the nesting of the plotting section below was
    # reconstructed; everything is assumed to be guarded by `ploting_boolean`.
    # Confirm against the intended layout before enabling the switches.
    ploting_boolean = False
    plot_scatters = False  # only change to True for a database with not too many features (like breast-w)

    if ploting_boolean:
        # Plot eigenvector
        plt.plot(eig_vals, 'ro-', linewidth=2, markersize=6)
        plt.title('Magnitude of the eigenvalues')
        plt.show()

        if plot_scatters:
            # Plottings: scatter plots
            # Original data with groundtruth labels
            ploting_v(data_x, num_clusters, groundtruth_labels,
                      'original data with groundtruth labels')

            # Transformed data with our implementation of PCA and with groundtruth labels
            ploting_v(transf_data_x, num_clusters, groundtruth_labels,
                      'transformed data (our PCA) with groundtruth '
                      'labels')

            # Transformed data with pca.fit_transform and with groundtruth labels
            ploting_v(
                transf_data_x_sklearn, num_clusters, groundtruth_labels,
                'transformed data (Sklearn PCA v1) '
                'with groundtruth labels')

            # Transformed data with incrementalpca.fit_transform and with groundtruth labels
            ploting_v(
                transf_data_x_sklearn2, num_clusters, groundtruth_labels,
                'transformed data (Sklearn PCA v2) '
                'with groundtruth labels')

        # ---- 3D plots
        # Plottings: 3D plots
        # Original data without labels
        ploting_v3d(data_x, 1, np.zeros(len(groundtruth_labels)),
                    'original data without labels')

        # Original data with groundtruth labels
        ploting_v3d(data_x, num_clusters, groundtruth_labels,
                    'original data with groundtruth labels')

        # Reconstructed data without labels
        ploting_v3d(reconstruct_data_x, 1, np.zeros(len(groundtruth_labels)),
                    'reconstructed data without labels')

        # Transformed data with our implementation of PCA and without labels
        ploting_v3d(transf_data_x, 1, np.zeros(len(groundtruth_labels)),
                    'transformed data without labels')

        # Transformed data with our implementation of PCA and with groundtruth_labels
        ploting_v3d(transf_data_x, num_clusters, groundtruth_labels,
                    'transformed data with groundtruth labels')

        # Transformed data with our implementation of PCA and with the labels obtained with our K-means
        ploting_v3d(transf_data_x, num_clusters, labels,
                    'transformed data with labels from our K-means')

        # Plot of the correlation matrix of the dataset
        plot_corr_matrix(data_x, legend=False)
def test_label_encoder_str_bad_shape(dtype):
    """Check that transform rejects a bare string instead of a label sequence.

    `dtype` is supplied by the caller (presumably a pytest parametrization
    that is not visible here) and sets the dtype of the fitted label array.
    """
    fitted_labels = np.array(["apple", "orange"], dtype=dtype)
    encoder = LabelEncoder()
    encoder.fit(fitted_labels)

    expected_message = "bad input shape"
    assert_raise_message(ValueError, expected_message,
                         encoder.transform, "apple")
def test_label_encoder_str_bad_shape(dtype):
    """Check that transform rejects a bare string instead of a label sequence.

    `dtype` is supplied by the caller (presumably a pytest parametrization
    that is not visible here) and sets the dtype of the fitted label array.
    """
    fitted_labels = np.array(["apple", "orange"], dtype=dtype)
    encoder = LabelEncoder()
    encoder.fit(fitted_labels)

    expected_message = "bad input shape"
    with pytest.raises(ValueError, match=expected_message):
        encoder.transform("apple")
def r_precision(S:np.ndarray, y:np.ndarray, metric:str='distance',
                average:str='weighted', return_y_pred:int=0,
                verbose:int=0, n_jobs:int=1) -> dict:
    """ Calculate R-Precision (recall at R-th position).

    Parameters
    ----------
    S : ndarray or CSR matrix
        Distance (similarity) matrix

    y : ndarray
        Target (ground truth) labels

    metric : 'distance' or 'similarity', optional, default: 'distance'
        Define, whether `S` is a distance or similarity matrix.
        NOTE: only sparse similarity matrices are implemented, so the
        default value currently raises ``NotImplementedError``.

    average : 'weighted', 'macro' or None, optional, default: 'weighted'
        Ignored. Weighted and macro precisions are returned.

    return_y_pred : int, optional, default: 0
        If > 0, return the labels of the `return_y_pred` nearest neighbors

    verbose : int, optional, default: 0
        Increasing level of output.

    n_jobs : int, optional, default: 1
        Number of parallel processes to use.

    Returns
    -------
    r_precision : dictionary with following keys:
        macro : float
            Macro R-Precision.
        weighted : float
            Weighted R-Precision.
        per_item : ndarray
            R-Precision at the object.
        relevant_items : ndarray
            Relevant items per class.
        y_true : ndarray
            Target labels (req. for weighting).
        y_pred : ndarray
            Labels of some k-nearest neighbors

    Raises
    ------
    NotImplementedError
        If `metric` is not 'similarity' or `S` is not sparse.
    """
    io.check_distance_matrix_shape(S)
    io.check_distance_matrix_shape_fits_labels(S, y)
    io.check_valid_metric_parameter(metric)
    log = ConsoleLogging()
    n, _ = S.shape
    S_is_sparse = issparse(S)
    if metric != 'similarity' or not S_is_sparse:
        raise NotImplementedError("Only sparse similarity matrices so far.")

    # Map labels to 0..n(labels)-1
    le = LabelEncoder()
    # Add int.min for misclassifications
    # NOTE(review): relies on the NaN -> int cast producing a sentinel value;
    # the result of that cast is platform-dependent.
    incorr_orig = np.array([np.nan]).astype(int)
    le.fit(np.append(y, incorr_orig))
    y = le.transform(y)
    incorrect = le.transform(incorr_orig)
    # Number of relevant items, i.e. number of each label
    relevant_items = np.bincount(y) - 1  # one less for self class
    # R-Precision for each item
    # `np.float` was removed in NumPy 1.24; the builtin is the same dtype.
    r_prec = np.zeros(n, dtype=float)

    # Classify each point in test set
    if verbose:
        log.message("Creating shared memory data.")
    n_random_pred = mp.Value(ctypes.c_int)
    n_random_pred.value = 0
    if verbose and log:
        log.message("Spawning processes for prediction.")
    y_pred = np.zeros((n, return_y_pred), dtype=float)
    kwargs = {'y_pred' : return_y_pred,
              'incorrect' : incorrect}
    with mp.Pool(processes=n_jobs, initializer=_load_shared_csr,
                 initargs=(S, y, n_random_pred, relevant_items)) as pool:
        for i, r in enumerate(
            pool.imap(
                func=partial(_r_prec_worker, **kwargs),
                iterable=range(n),
                chunksize=int(1e2))):
            if verbose and ((i+1)%int(1e7 / 10**verbose) == 0 or i == n-1):
                log.message("Classification: {} of {} on {}.".format(
                            i+1, n, mp.current_process().name), flush=True)
            try:
                r_prec[i] = r[0]
                y_pred[i, :] = r[1]
            except (TypeError, IndexError, ValueError):
                # The worker result is not an indexable
                # (precision, neighbor labels) pair; treat it as the bare
                # precision value.
                r_prec[i] = r
    # Leaving the `with` block terminates the pool; wait for the workers.
    pool.join()

    if verbose and log:
        log.message("Retrieving nearest neighbors.")
    # Work-around for new scikit-learn requirement of 1D arrays for LabelEncoder
    y_pred = np.asarray([le.inverse_transform(col)
                         for col in y_pred.T.astype(int)]).T
    if verbose and log:
        log.message("Finishing.")

    if n_random_pred.value:
        log.warning(("{} queries were classified randomly, because all "
                     "distances were non-finite numbers or there were no other "
                     "objects in the same class.").format(n_random_pred.value))
    return_dict = {'macro' : r_prec.mean(),
                   'weighted' : np.average(r_prec, weights=relevant_items[y]),
                   'per_item' : r_prec,
                   'relevant_items' : relevant_items,
                   'y_true' : y,
                   'y_pred' : y_pred}
    return return_dict
def test_label_encoder_str_bad_shape(dtype):
    """Check that transform rejects a bare string instead of a label sequence.

    `dtype` is supplied by the caller (presumably a pytest parametrization
    that is not visible here) and sets the dtype of the fitted label array.
    """
    label_encoder = LabelEncoder()
    label_encoder.fit(np.array(["apple", "orange"], dtype=dtype))

    # A single string is not a 1-D sequence of labels.
    shape_message = "bad input shape"
    bare_string = "apple"
    assert_raise_message(ValueError, shape_message,
                         label_encoder.transform, bare_string)
def main():
    """Cross-validate an SVM on one ARFF dataset and print the accuracy.

    Steps, in order:

    1. Load the datasets and pick one.
    2. Use fold 0 (train + test concatenated) as the reference data and build
       a row -> index lookup from which ``trn_tst_idxs`` derives the
       train/test indices of every fold.
    3. Read the SVM parameters from the keyboard.
    4. Split off the last column as ground-truth labels, integer-encode them
       and preprocess the features.
    5. Train and evaluate ``SVM_Algorithm`` on each fold, then print the mean
       and standard deviation of the accuracies and the running time.
    """
    print('\033[1m' + 'Loading all the datasets...' + '\033[0m')
    arffs_dic = obtain_arffs('./datasetsSelected/')

    # Extract a specific database
    dataset_name = 'sick'  # sick # nursery
    dataset = arffs_dic[dataset_name]

    # ---- Compute indices for each fold
    # Use folder 0 of that particular dataset to find indices of train and test for each fold
    ref_data = np.concatenate((dataset[0][0], dataset[0][1]), axis=0)
    df_aux = pd.DataFrame(ref_data)
    df_aux = df_aux.fillna('nonna').values
    # Key each row by its string form so rows can be looked up by content.
    ref_data_dic = {str(row): i for i, row in enumerate(df_aux)}

    trn_tst_dic = trn_tst_idxs(ref_data_dic, dataset)

    # ---- Reading parameters from keyboard
    C, kernel, decision_function = read_keyboard()

    # ---- Preprocess
    df1 = pd.DataFrame(ref_data)
    # The last column holds the labels; split it off from the features.
    label_column = df1.columns[len(df1.columns) - 1]
    groundtruth_labels = df1[label_column].values  # original labels in a numpy array
    # `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
    df1 = df1.drop(label_column, axis=1)
    if dataset_name == 'sick':
        # This column only contains NaNs so does not add any value to the clustering
        df1 = df1.drop('TBG', axis=1)
    data1 = df1.values  # original data in a numpy array without labels
    load = Preprocess()

    # ---- Encode groundtruth labels
    le = LabelEncoder()
    le.fit(np.unique(groundtruth_labels))
    groundtruth_labels = le.transform(groundtruth_labels)

    data_x = load.preprocess_method(data1)

    # ---- Supervised classifier
    # Compute accuracy for each fold
    accuracies = []
    start_time = time.time()
    for fold_number, (trn_idxs, tst_idxs) in enumerate(trn_tst_dic.values(),
                                                       start=1):
        print('Computing accuracy for fold number ' + str(fold_number))
        trn_data = data_x[trn_idxs]
        trn_labels = groundtruth_labels[trn_idxs]
        tst_data = data_x[tst_idxs]
        tst_labels = groundtruth_labels[tst_idxs]

        svecm = SVM_Algorithm(C, kernel, decision_function)
        acc = svecm.algorithm(trn_data, trn_labels, tst_data, tst_labels)
        accuracies.append(acc)

    mean_accuracies = str(round(np.mean(accuracies), 4))
    std_accuracies = str(round(np.std(accuracies), 3))
    print('\n\033[1m' +
          'The mean accuracy of classification in the test set is: ' +
          mean_accuracies + ' ± ' + std_accuracies + '\033[0m')
    print('\033[1mRunning time for the 10 folds: %s seconds\033[0m' %
          round(time.time() - start_time, 4))
def preprocess_classes(classes):
    """Return `classes` encoded by a LabelEncoder fitted on `classes` itself."""
    fitted_encoder = LabelEncoder().fit(classes)
    return fitted_encoder.transform(classes)