def subsample(self, index_locations):
    """
    Build a new Samples object that holds only the rows at the given positions.

    :param index_locations: list, tuple, numpy array or integer (Python or
                            numpy integer) of row positions in self.x / self.y
    :returns: Samples object with x, y, x_name, y_name, index, nsamp and
              nfeat filled in
    :raises TypeError: if index_locations is of any other type
    """
    allowed_types = (list, tuple, np.ndarray, int, np.integer)
    if not isinstance(index_locations, allowed_types):
        raise TypeError(
            "subsample() method works for list, tuple, numpy array or integer data types only"
        )

    # Suppress warnings raised by the empty constructor without overwriting
    # the caller's global warning filters (catch_warnings restores them).
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        outsamp = Samples()

    outsamp.x_name = self.x_name
    outsamp.y_name = self.y_name

    # A scalar becomes a one-element array so that x stays two-dimensional.
    loc = np.atleast_1d(np.asarray(index_locations))
    if loc.size == 0:
        # An empty list converts to a float array, which cannot index.
        loc = loc.astype(int)

    outsamp.x = self.x[loc, :]
    outsamp.y = self.y[loc]

    outsamp.nsamp = outsamp.x.shape[0]
    outsamp.index = np.arange(0, outsamp.nsamp)
    outsamp.nfeat = outsamp.x.shape[1]

    return outsamp
def euc_dist(vec1, vec2):
    """
    Euclidean distance between two vectors.

    Both inputs are converted to numpy arrays, subtracted element-wise and
    the norm of the difference is returned.

    :param vec1: first vector (any array-like)
    :param vec2: second vector (any array-like)
    :return: scalar distance
    """
    first = np.array(vec1)
    second = np.array(vec2)
    offset = first - second
    return np.linalg.norm(offset)
def difference(self, transpose=False):
    """
    Subtract the scene center (self.center) from every row of self.matrix.

    :param transpose: if True, return the transposed difference matrix
    :return: matrix (numpy.ndarray) of row-wise differences from the center
    """
    # Broadcasting subtracts the center from each row in one native
    # operation; unlike a per-row apply it also works for a zero-row matrix.
    diff_matrix = np.array(self.matrix) - self.center

    if transpose:
        return diff_matrix.T
    return diff_matrix
def select(self, index_list):
    """
    Build a new Samples object from the rows listed in index_list.

    Lists and tuples are de-duplicated through a set before use, so the row
    order of the result follows the set's iteration order; numpy arrays are
    used as given.

    :param index_list: list, tuple or numpy array of row positions
    :return: Samples object with x, y, names, index, nsamp, nfeat and, for
             numeric data, xmin/xmax/ymin/ymax filled in
    :raises ValueError: if index_list is empty or contains a position
                        outside 0 .. nsamp - 1
    """
    if isinstance(index_list, (list, tuple)):
        # set() accepts both lists and tuples (tuples have no .copy()).
        index_list = np.array(list(set(index_list)))

    if np.size(index_list) == 0:
        raise ValueError("Index list is empty")

    # Valid positions are 0 .. nsamp - 1, hence '>=' on the upper bound.
    if (np.max(index_list) >= self.nsamp) or (np.min(index_list) < 0):
        raise ValueError(
            "Index list out of bounds with {} min and/or {} max".format(
                str(np.min(index_list)), str(np.max(index_list))))

    # Suppress warnings raised by the empty constructor without overwriting
    # the caller's global warning filters.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        samp = Samples()

    samp.x_name = self.x_name
    samp.y_name = self.y_name

    samp.x = self.x[index_list, :]
    samp.y = self.y[index_list]

    samp.nsamp = samp.x.shape[0]
    samp.nfeat = samp.x.shape[1]
    samp.index = np.arange(0, samp.nsamp)

    # 'initial' takes part in the reduction itself, so a minimum must start
    # from the upper limit and a maximum from the lower limit; otherwise the
    # limit itself would always be the result.
    if np.issubdtype(samp.x.dtype, np.number):
        samp.xmin = samp.x.min(0, initial=self.max_allow_x)
        samp.xmax = samp.x.max(0, initial=-self.max_allow_x)

    if np.issubdtype(samp.y.dtype, np.number):
        samp.ymin = samp.y.min(initial=self.max_allow_y)
        samp.ymax = samp.y.max(initial=-self.max_allow_y)

    return samp
def mat_dist(vec1, mat1):
    """
    Euclidean distance between a vector and every row vector of a matrix.

    :param vec1: vector (array-like with one value per matrix column)
    :param mat1: matrix (2-D array-like, one vector per row)
    :return: numpy array with one distance per row of mat1
    """
    rows = np.asarray(mat1)
    target = np.asarray(vec1)

    # One vectorized norm over axis 1 replaces a Python-level call per row
    # and also returns an empty result for a matrix without rows.
    return np.linalg.norm(rows - target, axis=1)
def add_column(self, column_name=None, column_data=None, column_order=None):
    """
    Add a column to the samples matrix, or re-order the existing columns.

    With column_data, the new column is appended to the right of self.x and
    its name to self.x_name; the columns are then arranged by column_order.
    Without column_data, self.x and self.x_name are only re-ordered
    according to column_order.

    :param column_name: Name of column to be added (default:
                        'Column_<n>' where n is the new column count)
    :param column_data: List of column values to be added (one per sample)
    :param column_order: List of column positions giving the final column
                         arrangement (e.g. [2, 0, 1] moves the appended third
                         column to the front). When adding a column, an order
                         that is missing or of the wrong length is ignored
                         with a warning.
    :return: None (the object is modified in place)
    :raises RuntimeError: if neither column_data nor column_order is given
    """
    if column_data is None:
        if column_order is None:
            raise RuntimeError('No argument for add operation')

        # Re-order only; the number of columns does not change.
        self.x = self.x[:, np.array(column_order)]
        self.x_name = [self.x_name[i] for i in column_order]
        return

    new_column = np.array(column_data)
    self.x = np.hstack((self.x, new_column[:, np.newaxis]))

    if column_name is None:
        column_name = 'Column_{}'.format(str(len(self.x_name) + 1))

    # Build a new list instead of appending in place: the name list may be
    # shared with other objects that were handed the same list reference.
    names = list(self.x_name) + [column_name]

    if column_order is None or len(column_order) != self.x.shape[1]:
        warnings.warn('Inconsistent or missing order - ignored')
        column_order = list(range(0, self.x.shape[1]))

    self.x = self.x[:, np.array(column_order)]
    self.x_name = [names[i] for i in column_order]

    self.columns = list(range(0, self.x.shape[1]))
    self.nvar = len(self.columns)
    self.nfeat = self.x.shape[1]
def cluster_center(self, method='median'):
    """
    Determine the cluster center of the sample matrix and store it in
    self.center.

    :param method: Type of reducer to use. Options: 'mean', 'median',
                   'percentile_xx' where xx is 1-99
    :return: None; self.center is set to a 1-D vector with one value per
             column/dimension
    :raises ValueError: if self.matrix is None or the reducer is not
                        recognized
    """
    if self.matrix is None:
        raise ValueError("Sample matrix not found")

    if method == 'median':
        center = np.median(self.matrix, axis=0)
    elif method == 'mean':
        center = np.mean(self.matrix, axis=0)
    elif 'percentile' in method:
        # 'percentile_25' -> '_25' -> '25'
        perc = int(method.replace('percentile', '')[1:])
        center = np.percentile(self.matrix, perc, axis=0)
    else:
        raise ValueError("Invalid or no reducer")

    # Flatten instead of taking element [0]: for a plain ndarray the
    # column-wise reduction is already 1-D, so [0] would keep only the first
    # column's value; ravel() yields the full vector for ndarray and matrix
    # inputs alike.
    self.center = np.asarray(center).ravel()
def sample_matrix(self):
    """
    Convert the sample dictionaries in self.samples into self.matrix.

    Each row of the matrix is one sample and each column one of the names in
    self.names; every value is passed through Handler.string_to_type.

    :return: None; self.matrix is set to a numpy array with columns as
             dimensions and rows as individual samples
    :raises ValueError: if self.samples holds fewer than two samples
    """
    sample_count = len(self.samples)
    name_count = len(self.names)

    if sample_count <= 1:
        raise ValueError('Not enough samples to make a matrix object')

    # NOTE(review): rows are iterated up to self.nsamp, not the local count
    # taken from len(self.samples) - presumably the two agree; confirm.
    rows = []
    for row_idx in range(0, self.nsamp):
        converted = [
            Handler.string_to_type(self.samples[row_idx][self.names[col_idx]])
            for col_idx in range(0, name_count)
        ]
        rows.append(converted)

    self.matrix = np.array(rows)
def select_inverse(self, index_list):
    """
    Select all samples except those on the index list.

    The complement of index_list within self.index is computed and passed to
    self.select().

    :param index_list: list, tuple or numpy array of row positions to leave
                       out; an empty list leaves nothing out
    :return: Samples object (whatever self.select() returns)
    :raises ValueError: if index_list contains a position outside
                        0 .. nsamp - 1
    """
    if isinstance(index_list, (list, tuple)):
        # set() accepts both lists and tuples (tuples have no .copy()).
        index_list = np.array(list(set(index_list)))

    excluded = np.asarray(index_list)

    # Valid positions are 0 .. nsamp - 1, hence '>=' on the upper bound.
    # The check is skipped for an empty list, which has no min/max.
    if excluded.size > 0 and \
            ((np.max(excluded) >= self.nsamp) or (np.min(excluded) < 0)):
        raise ValueError(
            "Index list out of bounds with {} min and/or {} max".format(
                str(np.min(excluded)), str(np.max(excluded))))

    reverse_indices = self.index[~np.isin(self.index, excluded)]
    return self.select(reverse_indices)
def covariance(self, inverse=False):
    """
    Covariance matrix of self.matrix, where rows are samples and columns
    are dimensions.

    :param inverse: if True, return the inverse of the covariance matrix,
                    computed from its singular value decomposition
    :return: Covariance or inverse covariance matrix (numpy.ndarray);
             None if the inversion raises a ValueError
    """
    cov_mat = np.cov(self.matrix, rowvar=False)

    if not inverse:
        return np.array(cov_mat)

    # cov = U * diag(S) * Vh  =>  inverse = Vh^T * diag(S)^-1 * U^T
    left, singular_values, right_h = np.linalg.svd(cov_mat)
    try:
        inverted_diag = np.linalg.inv(np.diag(singular_values))
        return np.dot(np.dot(right_h.T, inverted_diag), left.T)
    except ValueError:
        return None
def select_features(self, name_list=None):
    """
    Return a Samples instance restricted to a selection of feature names.

    :param name_list: List of feature names to make a new Samples() instance
                      from; None (the default) selects every feature in
                      self.x_name
    :returns: Samples instance holding the selected columns of self.x, the
              unchanged labels and weights, and the same csv_file reference
    :raises ValueError: if a requested name is not in self.x_name
    """
    # Iterating the None default would fail; treat it as "all features".
    if name_list is None:
        name_list = self.x_name
    name_list = list(name_list)

    missing = [name for name in name_list if name not in self.x_name]
    if missing:
        raise ValueError("Feature name(s) not found: " +
                         ', '.join(str(name) for name in missing))

    indx_list = [self.x_name.index(name) for name in name_list]

    samp = Samples(label_colname=self.y_name,
                   x=self.x[:, np.array(indx_list)],
                   y=self.y,
                   x_name=name_list,
                   y_name=self.y_name,
                   weights=self.weights,
                   weights_colname=self.weights_colname,
                   use_band_dict=self.use_band_dict,
                   max_allow_x=self.max_allow_x,
                   max_allow_y=self.max_allow_y)

    samp.csv_file = self.csv_file

    return samp
def __init__(self,
             csv_file=None,
             label_colname=None,
             x=None,
             y=None,
             x_name=None,
             y_name=None,
             weights=None,
             weights_colname=None,
             use_band_dict=None,
             max_allow_x=1e13,
             max_allow_y=1e13,
             line_limit=None,
             remove_null=True,
             **kwargs):
    """
    Initialize the samples either from a csv file or from in-memory arrays.

    :param csv_file: csv file that contains the features (training or
                     validation samples)
    :param label_colname: column in csv file that contains the feature label
                          (output value)
    :param x: 2d array containing features (samples) without the label
    :param y: 1d array of feature labels (same order as x)
    :param x_name: 1d array of feature names (bands). Can be used to select
                   which columns to read from csv file.
    :param y_name: name of label
    :param weights: sample weights, stored as given
    :param weights_colname: name of the feature column to use as weights when
                            weights is None; the column is moved out of x and
                            x_name (requires csv_file)
    :param use_band_dict: mapping that is applied to y_name after reading a
                          labelled csv file
    :param max_allow_x: Maximum allowed values of x
    :param max_allow_y: Maximum allowed value of y
    :param line_limit: passed on to the csv reader
    :param remove_null: if True, rows that contain a null marker or NaN in
                        any column are dropped when reading the csv file
    :param kwargs: optional 'columns' (column indices, stored as array) and
                   'ids' (sample ids, stored as given)
    :raises ValueError: if label_colname or weights_colname is not found, or
                        weights_colname is given without csv_file
    """
    null_markers = (None, '', ' ', 'null', 'NULL', '<null>', '<NULL>')

    def _as_array(values):
        # ndarray and None pass through; any other iterable becomes an array
        if type(values).__name__ in ('ndarray', 'NoneType'):
            return values
        return np.array(list(values))

    def _is_null(value):
        # floats are null only when NaN; everything else by marker lookup
        if isinstance(value, (float, np.floating)):
            return bool(np.isnan(value))
        return value in null_markers

    def _drop_null_rows(rows):
        # keep only the row dictionaries without any null value
        if not remove_null:
            return rows
        return [row for row in rows
                if not any(_is_null(value) for value in row.values())]

    self.csv_file = csv_file
    self.label_colname = label_colname

    self.x = _as_array(x)
    self.x_name = x_name

    self.y = _as_array(y)
    self.y_name = y_name

    self.weights = weights
    self.weights_colname = weights_colname
    self.use_band_dict = use_band_dict

    self.index = None
    self.nfeat = None

    self.xmin = None
    self.xmax = None

    self.ymin = None
    self.ymax = None

    self.y_hist = None
    self.y_bin_edges = None

    self.x_hist = None
    self.x_bin_edges = None

    self.max_allow_x = max_allow_x
    self.max_allow_y = max_allow_y

    # label name and csv file are provided
    if (label_colname is not None) and (csv_file is not None):

        # presumably a list of one dictionary per csv row - TODO confirm
        temp = Handler(filename=csv_file).read_from_csv(
            return_dicts=True, line_limit=line_limit)
        header = list(temp[0])

        if label_colname not in header:
            raise ValueError("Label name mismatch.\nAvailable names: " +
                             ', '.join(header))

        feat_names = header.copy()
        _ = feat_names.pop(header.index(label_colname))

        # restrict to the requested feature names, keeping file order
        if self.x_name is not None and type(self.x_name) in (list, tuple):
            self.x_name = [
                elem for elem in feat_names if elem in self.x_name
            ]
        else:
            self.x_name = feat_names

        clean_list = _drop_null_rows(temp)

        # Read only the selected columns so that x and x_name stay aligned.
        self.x = np.array(
            [[samp_dict[feat_name] for feat_name in self.x_name]
             for samp_dict in clean_list])
        self.y = np.array(
            [samp_dict[label_colname] for samp_dict in clean_list])
        self.y_name = label_colname

        # if band name dictionary is provided
        # NOTE(review): y_name is the label string here, so this iterates
        # its characters - looks like x_name may have been intended; confirm.
        if use_band_dict is not None:
            self.y_name = [use_band_dict[b] for b in self.y_name]

    # csv file without a label column
    elif (label_colname is None) and (csv_file is not None):

        temp = Handler(filename=csv_file).read_from_csv(
            return_dicts=True, line_limit=line_limit)

        clean_list = _drop_null_rows(temp)

        feat_names = list(clean_list[0].keys())
        if self.x_name is not None and type(self.x_name) in (list, tuple):
            self.x_name = [
                elem for elem in feat_names if elem in self.x_name
            ]
        else:
            self.x_name = feat_names

        self.x = np.array(
            [[samp_dict[feat_name] for feat_name in self.x_name]
             for samp_dict in clean_list])

    else:
        warnings.warn(
            "Samples class initiated without data file and/or label",
            category=RuntimeWarning,
            stacklevel=1)

    # fall back to generated names when none (or unusable ones) are given
    if self.x is not None and self.y is not None:
        if self.y_name is None:
            self.y_name = 'y'
        if (self.x_name is None) or \
                (type(self.x_name) not in (list, tuple)) or \
                (len(self.x_name) != self.x.shape[1]):
            self.x_name = ['x{}'.format(str(i + 1))
                           for i in range(self.x.shape[1])]

    # take the weights from a feature column when no weights are passed
    if weights is None and weights_colname is not None:
        if csv_file is None:
            raise ValueError("No csv_file specified for weights")
        if weights_colname not in self.x_name:
            raise ValueError("Weight column name mismatch")

        loc = list(self.x_name).index(weights_colname)
        self.weights = self.x[:, loc]
        self.x = np.delete(self.x, loc, 1)
        # drop the name as well so that x_name keeps matching x's columns
        self.x_name = [name for i, name in enumerate(self.x_name)
                       if i != loc]

    # optional keywords: columns containing data and IDs of samples
    columns = kwargs.get('columns')
    if type(columns).__name__ in ('ndarray', 'NoneType'):
        self.columns = columns
    else:
        self.columns = np.array(list(columns))
    self.ids = kwargs.get('ids')

    if self.x is not None:
        if self.columns is None:
            self.columns = np.arange(0, self.x.shape[1])

        self.nsamp = self.x.shape[0]
        self.nvar = self.x.shape[1]
        self.nfeat = self.x.shape[1]

        # 'initial' takes part in the reduction, so a minimum starts from
        # the upper limit and a maximum from the lower limit.
        if np.issubdtype(self.x.dtype, np.number):
            self.xmin = self.x.min(0, initial=max_allow_x)
            self.xmax = self.x.max(0, initial=-max_allow_x)

        self.index = np.arange(0, self.x.shape[0])

    else:
        self.nsamp = 0
        self.nvar = 0

    if self.y is not None and np.issubdtype(self.y.dtype, np.number):
        self.ymin = self.y.min(initial=max_allow_y)
        self.ymax = self.y.max(initial=-max_allow_y)

    # text preview: a header line followed by at most the first ten samples
    if self.x is not None and self.y is not None:
        n_preview = min(10, len(self.x), len(self.y))
        lines = [' '.join(str(name)
                          for name in list(self.x_name) + [self.y_name])]
        for i in range(n_preview):
            lines.append(' '.join(
                str(elem_) for elem_ in self.x[i, :].tolist() + [self.y[i]]))
        self.head = '\n'.join(lines)
    else:
        self.head = '<empty>'