def test_check_sa(self):
    """Exercise utils.check_sa on accepted inputs, rejected inputs and size checks."""
    structured = np.array([(1, 'a'), (2, 'b'), (3, 'c')],
                          dtype=[('int', int), ('s', 'S0')])
    homogeneous = np.array([[1, 2, 3], [4, 5, 6]])
    nested_lists = [[1, 2, 3], [4, 5, 6]]
    data_frame = pd.DataFrame(structured)

    # Each of these inputs must come back as something is_sa accepts.
    for candidate in (structured, homogeneous, nested_lists, data_frame):
        converted = utils.check_sa(candidate)
        self.assertTrue(utils.is_sa(converted))

    # These inputs must be rejected with ValueError.
    for rejected in (None, "lalala"):
        self.assertRaises(ValueError, utils.check_sa, rejected)

    # Matching dimensions pass without raising; mismatched ones raise.
    utils.check_sa(structured, n_rows=3, n_cols=2)
    self.assertRaises(ValueError, utils.check_sa, structured, n_rows=4)
    self.assertRaises(ValueError, utils.check_sa, structured, n_cols=3)
def pprint_sa(M, row_labels=None, col_labels=None):
    """Prints a nicely formatted Structured array (or similar object) to console

    Parameters
    ----------
    M : numpy.ndarray or list of lists
        structured array or homogeneous array or list of lists to print
    row_labels : list or None
        labels to put in front of rows. Defaults to row number
    col_labels : list of str or None
        names to label columns with. If M is a structured array, its column
        names will be used instead

    """
    M = utils.check_sa(M, col_names_if_converted=col_labels)
    if row_labels is None:
        row_labels = range(M.shape[0])
    else:
        # The labels are walked twice below (width pass, then print pass),
        # so materialise them in case a one-shot iterable was passed.
        row_labels = list(row_labels)
    col_labels = M.dtype.names
    # From http://stackoverflow.com/questions/9535954/python-printing-lists-as-tabular-data
    # Each max() is seeded with a floor value so that an array with no rows
    # prints just the header instead of raising ValueError on an empty list.
    col_lens = [max([len(name)] +
                    [len('{}'.format(cell)) for cell in M[name]])
                for name in col_labels]
    row_label_len = max([0] +
                        [len('{}'.format(label)) for label in row_labels])
    row_format = ('{{:>{}}} '.format(row_label_len) +
                  ' '.join(['{{:>{}}}'.format(col_len)
                            for col_len in col_lens]))
    # print(...) with a single argument produces the same output whether it is
    # parsed as a Python 2 print statement or called as the Python 3 function.
    print(row_format.format("", *col_labels))
    for row_name, row in zip(row_labels, M):
        print(row_format.format(row_name, *row))
def choose_cols_where(M, arguments):
    """Select the columns of a structured array that satisfy every query

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for its 'func' and
        'vals' entries, and 'func' is called as func(M, vals)

    Returns
    -------
    numpy.ndarray
        Structured array with only specified columns

    """
    M = utils.check_sa(M)
    # The checker's return value is never read in this function; the call is
    # kept for whatever checks it performs.
    __check_args_col_select(arguments)

    keep_mask = np.ones(len(M.dtype), dtype=bool)
    for query in arguments:
        predicate = query['func']
        operand = query['vals']
        # A column survives only while every query so far has accepted it
        keep_mask = np.logical_and(keep_mask, predicate(M, operand))

    selected = []
    for name, keep in zip(M.dtype.names, keep_mask):
        if keep:
            selected.append(name)
    return M[selected]
def plot_correlation_matrix(M, verbose=True):
    """Plot correlation between variables in M

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # http://glowingpython.blogspot.com/2012/10/visualizing-correlation-matrices.html
    # TODO work on structured arrays or not
    # TODO ticks are col names
    data = cast_np_sa_to_nd(utils.check_sa(M))
    # rowvar=0 so that rows are items and cols are features
    correlations = np.corrcoef(data, rowvar=0)

    fig = plt.figure()
    plt.pcolor(correlations)
    plt.colorbar()
    # Ticks sit at the centre of each cell and are labelled by column index
    n_features = data.shape[1]
    tick_positions = np.arange(0.5, n_features + 0.5)
    plt.yticks(tick_positions, range(0, n_features))
    plt.xticks(tick_positions, range(0, n_features))
    if verbose:
        plt.show()
    return fig
def remove_cols_where(M, arguments):
    """Drop the columns of a structured array that satisfy every query

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for its 'func' and
        'vals' entries, and 'func' is called as func(M, vals)

    Returns
    -------
    numpy.ndarray
        Structured array without specified columns

    """
    M = utils.check_sa(M)
    # The checker's return value is never read in this function; the call is
    # kept for whatever checks it performs.
    __check_args_col_select(arguments)

    drop_mask = np.ones(len(M.dtype), dtype=bool)
    for query in arguments:
        predicate = query['func']
        operand = query['vals']
        # A column stays marked for removal only while every query matches it
        drop_mask = np.logical_and(drop_mask, predicate(M, operand))

    doomed = []
    for name, drop in zip(M.dtype.names, drop_mask):
        if drop:
            doomed.append(name)
    return remove_cols(M, doomed)
def label_encode(M):
    """Changes string cols to ints so that there is a 1-1 mapping between
    strings and ints

    Parameters
    ----------
    M : numpy.ndarray
        structured array

    Returns
    -------
    (numpy.ndarray, dict of str: array)
        A tuple: the first element is structured array with strings mapped
        to ints. The second element is a dictionary where keys are column
        names and values are arrays of the strings that belong to each class,
        as in:
        http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

    """
    M = utils.check_sa(M)
    new_dtype = []
    result_arrays = []
    classes = {}
    for (col_name, fmt) in M.dtype.descr:
        col = M[col_name]
        # Columns whose dtype code contains 'S' or 'O' are encoded;
        # everything else is carried over with its original format.
        if 'S' in fmt or 'O' in fmt:
            le = preprocessing.LabelEncoder()
            le.fit(col)
            classes[col_name] = le.classes_
            col = le.transform(col)
            fmt = int
        result_arrays.append(col)
        new_dtype.append((col_name, fmt))
    # Fill a pre-allocated structured array column by column. The previous
    # np.array(zip(*cols), dtype=...) relied on zip() returning a list, which
    # is only true on Python 2, and built one Python tuple per row.
    encoded = np.empty(M.shape, dtype=new_dtype)
    for (col_name, _), col in zip(new_dtype, result_arrays):
        encoded[col_name] = col
    return (encoded, classes)
def remove_rows_where(M, arguments):
    """Drop the rows of a structured array that satisfy every query

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for its 'func',
        'col_name' and 'vals' entries, and 'func' is called as
        func(M, col_name, vals)

    Returns
    -------
    numpy.ndarray
        Structured array without specified rows

    """
    M = utils.check_sa(M)
    # The checker's return value is never read in this function; the call is
    # kept for whatever checks it performs.
    __check_args_row_select(M, arguments)

    drop_mask = np.ones(M.size, dtype=bool)
    for query in arguments:
        predicate = query['func']
        column = query['col_name']
        operand = query['vals']
        # A row stays marked for removal only while every query matches it
        drop_mask = np.logical_and(drop_mask, predicate(M, column, operand))

    survivors = np.logical_not(drop_mask)
    return M[survivors]
def where_all_are_true(M, arguments):
    """Build a boolean mask of the rows that satisfy every query

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for its 'func',
        'col_name' and 'vals' entries, and 'func' is called as
        func(M, col_name, vals)

    Returns
    -------
    numpy.ndarray
        boolean array specifying which rows pass a query

    """
    M = utils.check_sa(M)
    # The checker's return value is never read in this function; the call is
    # kept for whatever checks it performs.
    __check_args_row_select(M, arguments)

    passing = np.ones(M.size, dtype=bool)
    for query in arguments:
        predicate = query['func']
        column = query['col_name']
        operand = query['vals']
        # AND each query's result into the running mask
        passing = np.logical_and(passing, predicate(M, column, operand))
    return passing
def describe_cols(M, verbose=True):
    """Returns summary statistics for a numpy array

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, also print the summary with pprint_sa

    Returns
    -------
    numpy.ndarray
        structured array of summary statistics for M

    """
    M = utils.check_sa(M)

    summary_rows = []
    for name, type_code in M.dtype.descr:
        is_numeric = 'f' in type_code or 'i' in type_code
        if is_numeric:
            values = M[name]
            stats = [metric(values) for _, metric in __describe_cols_metrics]
        else:
            # Columns without an 'f' or 'i' type code get the filler row
            stats = __describe_cols_fill
        summary_rows.append([name] + stats)

    header = ['Column Name']
    header.extend(label for label, _ in __describe_cols_metrics)

    ret = convert_to_sa(summary_rows, col_names=header)
    if verbose:
        pprint_sa(ret)
    return ret
def pprint_sa(M, row_labels=None, col_labels=None):
    """Prints a nicely formatted Structured array (or similar object) to console

    Parameters
    ----------
    M : numpy.ndarray or list of lists
        structured array or homogeneous array or list of lists to print
    row_labels : list or None
        labels to put in front of rows. Defaults to row number
    col_labels : list of str or None
        names to label columns with. If M is a structured array, its column
        names will be used instead

    """
    M = utils.check_sa(M, col_names_if_converted=col_labels)
    if row_labels is None:
        row_labels = range(M.shape[0])
    else:
        # The labels are walked twice below (width pass, then print pass),
        # so materialise them in case a one-shot iterable was passed.
        row_labels = list(row_labels)
    col_labels = M.dtype.names
    # From http://stackoverflow.com/questions/9535954/python-printing-lists-as-tabular-data
    # Each max() is seeded with a floor value so that an array with no rows
    # prints just the header instead of raising ValueError on an empty list.
    col_lens = [
        max([len(name)] + [len('{}'.format(cell)) for cell in M[name]])
        for name in col_labels
    ]
    row_label_len = max(
        [0] + [len('{}'.format(label)) for label in row_labels])
    row_format = (
        '{{:>{}}} '.format(row_label_len) +
        ' '.join(['{{:>{}}}'.format(col_len) for col_len in col_lens]))
    # print(...) with a single argument produces the same output whether it is
    # parsed as a Python 2 print statement or called as the Python 3 function.
    print(row_format.format("", *col_labels))
    for row_name, row in zip(row_labels, M):
        print(row_format.format(row_name, *row))
def where_all_are_true(M, arguments):
    """Build a boolean mask of the rows that satisfy every query

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for its 'func',
        'col_name' and 'vals' entries, and 'func' is called as
        func(M, col_name, vals)

    Returns
    -------
    numpy.ndarray
        boolean array specifying which rows pass a query

    """
    M = utils.check_sa(M)
    # The checker's return value is never read in this function; the call is
    # kept for whatever checks it performs.
    __check_args_row_select(M, arguments)

    row_mask = np.ones(M.size, dtype=bool)
    for query in arguments:
        predicate = query['func']
        column = query['col_name']
        operand = query['vals']
        row_mask = np.logical_and(row_mask, predicate(M, column, operand))
        # Once no row is left, the remaining queries are not evaluated
        if not np.any(row_mask):
            break
    return row_mask
def describe_cols(M, verbose=True):
    """Returns summary statistics for a numpy array

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, also print the summary with pprint_sa

    Returns
    -------
    numpy.ndarray
        structured array of summary statistics for M

    """
    M = utils.check_sa(M)

    # Header: the label column followed by one column per metric name
    header = ['Column Name']
    header.extend(label for label, _ in __describe_cols_metrics)

    table = []
    for name, type_code in M.dtype.descr:
        if 'f' in type_code or 'i' in type_code:
            values = M[name]
            cells = [metric(values) for _, metric in __describe_cols_metrics]
        else:
            # Columns without an 'f' or 'i' type code get the filler row
            cells = __describe_cols_fill
        table.append([name] + cells)

    ret = convert_to_sa(table, col_names=header)
    if verbose:
        pprint_sa(ret)
    return ret
def replace_missing_vals(M, strategy, missing_val=np.nan, constant=0):
    """Replace values signifying missing data with some substitute

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    strategy : {'mean', 'median', 'most_frequent', 'constant'}
        method to use to replace missing data
    missing_val : value that M uses to represent missing data. i.e.
        numpy.nan for floats or -999 for integers
    constant : int
        If the 'constant' strategy is chosen, this is the value to
        replace missing_val with

    Returns
    -------
    numpy.ndarray
        copy of the checked array with the substitutions applied

    Raises
    ------
    ValueError
        if strategy is not one of the four supported names

    """
    # TODO support times, strings
    M = utils.check_sa(M)
    if strategy not in ('mean', 'median', 'most_frequent', 'constant'):
        raise ValueError('Invalid strategy')

    result = M.copy()
    descr = result.dtype.descr

    if strategy != 'constant':
        # we're doing one of the sklearn imputer strategies
        imputer = preprocessing.Imputer(missing_values=missing_val,
                                        strategy=strategy, axis=1)
        for name, type_code in descr:
            # The Imputer only works on float and int columns
            if 'f' in type_code or 'i' in type_code:
                column = result[name]
                column[:] = imputer.fit_transform(column)
        return result

    try:
        nan_is_missing = np.isnan(missing_val)
    except TypeError:
        # missing_val is not a float
        nan_is_missing = False

    if nan_is_missing:
        # we need to be careful about handling nan: it never compares equal
        # to itself, so float columns are matched with np.isnan instead
        for name, type_code in descr:
            if 'f' in type_code:
                column = result[name]
                column[np.isnan(column)] = constant
    else:
        for name, type_code in descr:
            if 'i' in type_code or 'f' in type_code:
                column = result[name]
                column[column == missing_val] = constant
    return result
def label_encode(M, force_columns=None):
    """Changes string cols to ints so that there is a 1-1 mapping between
    strings and ints

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    force_columns : list of str or None
        By default, label_encode will only encode string columns. If the name
        of a numerical column is also present in force_columns, then that
        column will also be label encoded. None (the default) forces no
        columns

    Returns
    -------
    (numpy.ndarray, dict of str: array)
        A tuple: the first element is structured array with strings mapped
        to ints. The second element is a dictionary where keys are column
        names and values are arrays of the strings that belong to each class,
        as in:
        http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

    """
    M = utils.check_sa(M)
    # None replaces the former mutable [] default; callers that pass a list
    # (or nothing) see the same behaviour as before.
    forced = () if force_columns is None else force_columns
    new_dtype = []
    result_arrays = []
    classes = {}
    for (col_name, fmt) in M.dtype.descr:
        col = M[col_name]
        # Columns whose dtype code contains 'S' or 'O', plus any explicitly
        # forced column, are encoded; the rest keep their original format.
        if 'S' in fmt or 'O' in fmt or col_name in forced:
            le = preprocessing.LabelEncoder()
            le.fit(col)
            classes[col_name] = le.classes_
            col = le.transform(col)
            fmt = int
        result_arrays.append(col)
        new_dtype.append((col_name, fmt))
    # Fill a pre-allocated structured array column by column. The previous
    # np.array(zip(*cols), dtype=...) relied on zip() returning a list, which
    # is only true on Python 2, and built one Python tuple per row.
    encoded = np.empty(M.shape, dtype=new_dtype)
    for (col_name, _), col in zip(new_dtype, result_arrays):
        encoded[col_name] = col
    return (encoded, classes)
def add_table(self, M):
    """Adds structured array to report

    M is passed through utils.check_sa, written by
    self.__np_to_html_table into an in-memory StringIO buffer, and the
    buffer's contents are appended to self.__objects.
    """
    table = utils.check_sa(M)
    buf = StringIO.StringIO()
    self.__np_to_html_table(table, buf)
    rendered = buf.getvalue()
    self.__objects.append(rendered)
def plot_correlation_scatter_plot(M, verbose=True):
    """Makes a grid of scatter plots representing relationship between variables

    Each scatter plot is one variable plotted against another variable

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # TODO work for all three types that M might be
    # TODO ignore classification variables
    # adapted from the excellent
    # http://stackoverflow.com/questions/7941207/is-there-a-function-to-make-scatterplot-matrices-in-matplotlib
    M = utils.check_sa(M)
    numvars = len(M.dtype)
    names = M.dtype.names
    # squeeze=False makes subplots always return a 2-D array of axes. Without
    # it a single-column M yields a bare Axes object, and the axes.flat /
    # axes[i, j] accesses below fail.
    fig, axes = plt.subplots(numvars, numvars, squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    # NOTE(review): Axes.is_first_col() and friends were deprecated in
    # Matplotlib 3.4 in favour of ax.get_subplotspec().is_first_col();
    # confirm which Matplotlib versions this module must support.
    for ax in axes.flat:
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots...
        if ax.is_first_col():
            ax.yaxis.set_ticks_position('left')
        if ax.is_last_col():
            ax.yaxis.set_ticks_position('right')
        if ax.is_first_row():
            ax.xaxis.set_ticks_position('top')
        if ax.is_last_row():
            ax.xaxis.set_ticks_position('bottom')

    # Plot the M: each pair above the diagonal is drawn in both orientations
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i, j), (j, i)]:
            axes[x, y].plot(M[names[x]], M[names[y]], '.')

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i, i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                            ha='center', va='center')

    # Turn on the proper x or y axes ticks: alternate between the last and
    # the first row/column so neighbouring tick labels do not collide.
    for i, j in zip(range(numvars), it.cycle((-1, 0))):
        axes[j, i].xaxis.set_visible(True)
        axes[i, j].yaxis.set_visible(True)
    if verbose:
        plt.show()
    return fig
def plot_correlation_scatter_plot(M, verbose=True):
    """Makes a grid of scatter plots representing relationship between variables

    Each scatter plot is one variable plotted against another variable

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # TODO work for all three types that M might be
    # TODO ignore classification variables
    # adapted from the excellent
    # http://stackoverflow.com/questions/7941207/is-there-a-function-to-make-scatterplot-matrices-in-matplotlib
    M = utils.check_sa(M)
    numvars = len(M.dtype)
    names = M.dtype.names
    # squeeze=False makes subplots always return a 2-D array of axes. Without
    # it a single-column M yields a bare Axes object, and the axes.flat /
    # axes[i, j] accesses below fail.
    fig, axes = plt.subplots(numvars, numvars, squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    # NOTE(review): Axes.is_first_col() and friends were deprecated in
    # Matplotlib 3.4 in favour of ax.get_subplotspec().is_first_col();
    # confirm which Matplotlib versions this module must support.
    for ax in axes.flat:
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots...
        if ax.is_first_col():
            ax.yaxis.set_ticks_position('left')
        if ax.is_last_col():
            ax.yaxis.set_ticks_position('right')
        if ax.is_first_row():
            ax.xaxis.set_ticks_position('top')
        if ax.is_last_row():
            ax.xaxis.set_ticks_position('bottom')

    # Plot the M: each pair above the diagonal is drawn in both orientations
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i, j), (j, i)]:
            axes[x, y].plot(M[names[x]], M[names[y]], '.')

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i, i].annotate(
            label, (0.5, 0.5),
            xycoords='axes fraction',
            ha='center',
            va='center')

    # Turn on the proper x or y axes ticks: alternate between the last and
    # the first row/column so neighbouring tick labels do not collide.
    for i, j in zip(range(numvars), it.cycle((-1, 0))):
        axes[j, i].xaxis.set_visible(True)
        axes[i, j].yaxis.set_visible(True)
    if verbose:
        plt.show()
    return fig