Example #1
0
    def test_check_sa(self):
        """check_sa accepts table-like inputs, rejects others, checks shape"""
        structured = np.array(
            [(1, 'a'), (2, 'b'), (3, 'c')],
            dtype=[('int', int), ('s', 'S0')])
        acceptable_inputs = (
            structured,                        # structured array
            np.array([[1, 2, 3], [4, 5, 6]]),  # homogeneous 2-D array
            [[1, 2, 3], [4, 5, 6]],            # list of lists
            pd.DataFrame(structured),          # pandas DataFrame
        )
        for candidate in acceptable_inputs:
            # Whatever goes in, a structured array must come out
            self.assertTrue(utils.is_sa(utils.check_sa(candidate)))

        # Inputs that are not table-like must be rejected
        for bad_input in (None, "lalala"):
            self.assertRaises(ValueError, utils.check_sa, bad_input)

        # Matching dimensions pass silently; a wrong row or column count
        # raises
        utils.check_sa(structured, n_rows=3, n_cols=2)
        for wrong_shape in ({'n_rows': 4}, {'n_cols': 3}):
            self.assertRaises(ValueError, utils.check_sa, structured,
                              **wrong_shape)
Example #2
0
def pprint_sa(M, row_labels=None, col_labels=None):
    """Prints a nicely formatted Structured array (or similar object) to console

    Every column is right-aligned and made wide enough for its name and for
    its widest cell. Input with zero rows prints only the header line.

    Parameters
    ----------
    M : numpy.ndarray or list of lists
        structured array or homogeneous array or list of lists to print
    row_labels : iterable or None
        labels to put in front of rows. Defaults to row number
    col_labels : list of str or None
        names to label columns with. If M is a structured array, its column
        names will be used instead

    """
    M = utils.check_sa(M, col_names_if_converted=col_labels)
    if row_labels is None:
        row_labels = range(M.shape[0])
    # The labels are walked twice (once to measure, once to print), so a
    # one-shot iterator has to be materialized first or no rows would print.
    row_labels = list(row_labels)
    # Whatever was passed in, the header always shows the names carried by
    # the (possibly converted) structured array.
    col_labels = M.dtype.names
    # From http://stackoverflow.com/questions/9535954/python-printing-lists-as-tabular-data
    # Each max() is seeded with a baseline value so that zero-row input does
    # not raise "max() arg is an empty sequence".
    col_lens = [
        max([len(name)] + [len('{}'.format(cell)) for cell in M[name]])
        for name in col_labels
    ]
    row_label_len = max(
        [0] + [len('{}'.format(label)) for label in row_labels])
    row_format = ('{{:>{}}} '.format(row_label_len) +
                  ' '.join(['{{:>{}}}'.format(col_len)
                            for col_len in col_lens]))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(row_format.format("", *col_labels))
    for row_name, row in zip(row_labels, M):
        print(row_format.format(row_name, *row))
Example #3
0
def choose_cols_where(M, arguments):
    """Returns a structured array holding only the columns that pass a query

    A column is kept only if every directive in `arguments` selects it.

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for the keys 'func' and
        'vals'; 'func' is called as func(M, vals) and its result is combined
        into a per-column boolean mask

    Returns
    -------
    numpy.ndarray
        Structured array with only specified columns

    """
    M = utils.check_sa(M)
    # The return value is not used below; presumably this call validates
    # `arguments` -- confirm against __check_args_col_select.
    __check_args_col_select(arguments)

    col_mask = np.ones(len(M.dtype), dtype=bool)
    for directive in arguments:
        predicate = directive['func']
        predicate_vals = directive['vals']
        col_mask = np.logical_and(col_mask, predicate(M, predicate_vals))

    selected_names = []
    for col_name, is_selected in zip(M.dtype.names, col_mask):
        if is_selected:
            selected_names.append(col_name)
    return M[selected_names]
Example #4
0
def plot_correlation_matrix(M, verbose=True):
    """Plot the pairwise correlation coefficients of the columns of M

    Tick marks sit at half-integer positions and are labelled with column
    indices.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # http://glowingpython.blogspot.com/2012/10/visualizing-correlation-matrices.html
    # TODO work on structured arrays or not
    # TODO ticks are col names
    data = cast_np_sa_to_nd(utils.check_sa(M))

    # rowvar=0: rows are items, columns are the variables being correlated
    corr = np.corrcoef(data, rowvar=0)

    n_vars = data.shape[1]
    tick_positions = np.arange(0.5, n_vars + 0.5)

    fig = plt.figure()
    plt.pcolor(corr)
    plt.colorbar()
    for set_ticks in (plt.yticks, plt.xticks):
        set_ticks(tick_positions, range(0, n_vars))
    if verbose:
        plt.show()
    return fig
Example #5
0
def plot_correlation_matrix(M, verbose=True):
    """Draw a pseudocolor plot of the correlation matrix of M's columns

    Both axes carry one tick per column, positioned at index + 0.5 and
    labelled with the column index.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # http://glowingpython.blogspot.com/2012/10/visualizing-correlation-matrices.html
    # TODO work on structured arrays or not
    # TODO ticks are col names
    structured = utils.check_sa(M)
    homogeneous = cast_np_sa_to_nd(structured)

    # rowvar=0 tells corrcoef that columns, not rows, are the variables
    corr_coeffs = np.corrcoef(homogeneous, rowvar=0)

    col_count = homogeneous.shape[1]
    col_indices = range(0, col_count)
    half_offsets = np.arange(0.5, col_count + 0.5)

    fig = plt.figure()
    plt.pcolor(corr_coeffs)
    plt.colorbar()
    plt.yticks(half_offsets, col_indices)
    plt.xticks(half_offsets, col_indices)
    if verbose:
        plt.show()
    return fig
Example #6
0
def remove_cols_where(M, arguments):
    """Returns a structured array without the columns that match a query

    A column is removed only if every directive in `arguments` selects it.

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for the keys 'func' and
        'vals'; 'func' is called as func(M, vals) and its result is combined
        into a per-column boolean mask

    Returns
    -------
    numpy.ndarray
        Structured array without specified columns

    """
    M = utils.check_sa(M)
    # The return value is not used below; presumably this call validates
    # `arguments` -- confirm against __check_args_col_select.
    __check_args_col_select(arguments)

    drop_mask = np.ones(len(M.dtype), dtype=bool)
    for directive in arguments:
        predicate = directive['func']
        predicate_vals = directive['vals']
        drop_mask = np.logical_and(drop_mask, predicate(M, predicate_vals))

    doomed_names = []
    for col_name, is_doomed in zip(M.dtype.names, drop_mask):
        if is_doomed:
            doomed_names.append(col_name)
    return remove_cols(M, doomed_names)
Example #7
0
def label_encode(M):
    """Changes string cols to ints so that there is a 1-1 mapping between
    strings and ints

    Columns whose dtype description contains 'S' or 'O' are encoded; all
    other columns are copied through with their original dtype.

    Parameters
    ----------
    M : numpy.ndarray
        structured array

    Returns
    -------
    (numpy.ndarray, dict of str: array)
        A tuple: the first element is structured array with strings mapped to
        ints. The second element is a dictionary where keys are column names
        and values are arrays of the strings that belong to each class, as in:
        http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
    """
    M = utils.check_sa(M)
    new_dtype = []
    result_arrays = []
    classes = {}
    for (col_name, fmt) in M.dtype.descr:
        col = M[col_name]
        # NOTE(review): unicode ('U') columns are not matched here and pass
        # through unencoded -- confirm whether that is intended.
        if 'S' in fmt or 'O' in fmt:
            le = preprocessing.LabelEncoder()
            le.fit(col)
            classes[col_name] = le.classes_
            result_arrays.append(le.transform(col))
            new_dtype.append((col_name, int))
        else:
            result_arrays.append(col)
            new_dtype.append((col_name, fmt))
    # zip() is lazy on Python 3 and np.array cannot build a structured array
    # from an iterator, so the row tuples are materialized first. On Python 2
    # zip() already returns a list and list() is a cheap copy.
    rows = list(zip(*result_arrays))
    return (np.array(rows, dtype=new_dtype), classes)
Example #8
0
def remove_rows_where(M, arguments):
    """Returns a structured array without the rows that match a query

    A row is removed only if every directive in `arguments` selects it.

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for the keys 'func',
        'col_name' and 'vals'; 'func' is called as func(M, col_name, vals)
        and its result is combined into a per-row boolean mask

    Returns
    -------
    numpy.ndarray
        Structured array without specified rows

    """
    M = utils.check_sa(M)
    # The return value is not used below; presumably this call validates
    # `arguments` -- confirm against __check_args_row_select.
    __check_args_row_select(M, arguments)

    drop_mask = np.ones(M.size, dtype=bool)
    for directive in arguments:
        predicate = directive['func']
        column = directive['col_name']
        predicate_vals = directive['vals']
        drop_mask = np.logical_and(drop_mask,
                                   predicate(M, column, predicate_vals))
    keep_mask = np.logical_not(drop_mask)
    return M[keep_mask]
Example #9
0
def where_all_are_true(M, arguments):
    """Returns a boolean mask marking the rows that pass every directive

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for the keys 'func',
        'col_name' and 'vals'; 'func' is called as func(M, col_name, vals)
        and its result is combined into a per-row boolean mask

    Returns
    -------
    numpy.ndarray
        boolean array specifying which rows pass a query

    """
    M = utils.check_sa(M)
    # The return value is not used below; presumably this call validates
    # `arguments` -- confirm against __check_args_row_select.
    __check_args_row_select(M, arguments)

    row_mask = np.ones(M.size, dtype=bool)
    for directive in arguments:
        predicate = directive['func']
        column = directive['col_name']
        predicate_vals = directive['vals']
        row_mask = np.logical_and(row_mask,
                                  predicate(M, column, predicate_vals))
    return row_mask
Example #10
0
def describe_cols(M, verbose=True):
    """Returns summary statistics for each column of a numpy array

    Columns whose dtype description contains 'f' or 'i' get one value per
    entry of __describe_cols_metrics; every other column gets the
    placeholder values in __describe_cols_fill.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, also print the summary table with pprint_sa

    Returns
    -------
    numpy.ndarray
        structured array of summary statistics for M

    """
    M = utils.check_sa(M)
    summary_rows = []
    for col_name, col_type in M.dtype.descr:
        is_numeric = 'f' in col_type or 'i' in col_type
        if is_numeric:
            col = M[col_name]
            stats = [metric(col) for _, metric in __describe_cols_metrics]
        else:
            stats = __describe_cols_fill
        summary_rows.append([col_name] + stats)

    metric_names = [metric_name for metric_name, _ in __describe_cols_metrics]
    summary = convert_to_sa(summary_rows,
                            col_names=['Column Name'] + metric_names)
    if verbose:
        pprint_sa(summary)
    return summary
Example #11
0
def pprint_sa(M, row_labels=None, col_labels=None):
    """Prints a nicely formatted Structured array (or similar object) to console

    Columns are right-aligned; each is as wide as the longer of its name and
    its widest formatted cell. If there are no rows, only the header line is
    printed.

    Parameters
    ----------
    M : numpy.ndarray or list of lists
        structured array or homogeneous array or list of lists to print
    row_labels : iterable or None
        labels to put in front of rows. Defaults to row number
    col_labels : list of str or None
        names to label columns with. If M is a structured array, its column
        names will be used instead

    """
    M = utils.check_sa(M, col_names_if_converted=col_labels)
    if row_labels is None:
        row_labels = range(M.shape[0])
    # Labels are consumed twice below (width measurement, then printing);
    # copying them into a list keeps a generator from being used up early.
    row_labels = list(row_labels)
    # The header always uses the names of the checked/converted array.
    col_labels = M.dtype.names
    # From http://stackoverflow.com/questions/9535954/python-printing-lists-as-tabular-data
    # The name length / zero seeds guarantee max() never sees an empty
    # sequence, which used to raise ValueError for zero-row input.
    col_lens = [
        max([len(name)] + [len('{}'.format(cell)) for cell in M[name]])
        for name in col_labels
    ]
    row_label_len = max(
        [0] + [len('{}'.format(label)) for label in row_labels])
    row_format = (
        '{{:>{}}} '.format(row_label_len) +
        ' '.join(['{{:>{}}}'.format(col_len) for col_len in col_lens]))
    # print(...) with one argument is valid on both Python 2 and Python 3.
    print(row_format.format("", *col_labels))
    for row_name, row in zip(row_labels, M):
        print(row_format.format(row_name, *row))
Example #12
0
def where_all_are_true(M, arguments):
    """Returns a boolean mask marking the rows that pass every directive

    Evaluation stops as soon as no row is still selected; the remaining
    directives are then not called.

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation. Each dict is read for the keys 'func',
        'col_name' and 'vals'; 'func' is called as func(M, col_name, vals)
        and its result is combined into a per-row boolean mask

    Returns
    -------
    numpy.ndarray
        boolean array specifying which rows pass a query

    """
    M = utils.check_sa(M)
    # The return value is not used below; presumably this call validates
    # `arguments` -- confirm against __check_args_row_select.
    __check_args_row_select(M, arguments)

    row_mask = np.ones(M.size, dtype=bool)
    for directive in arguments:
        predicate = directive['func']
        column = directive['col_name']
        predicate_vals = directive['vals']
        row_mask = np.logical_and(row_mask,
                                  predicate(M, column, predicate_vals))
        nothing_left = not np.any(row_mask)
        if nothing_left:
            # Further AND-ing cannot turn any entry back on
            break
    return row_mask
Example #13
0
def describe_cols(M, verbose=True):
    """Returns a table of summary statistics, one row per column of M

    A column counts as numeric when its dtype description contains 'f' or
    'i'; numeric columns are passed to every function in
    __describe_cols_metrics, the others are given __describe_cols_fill.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, also print the summary table with pprint_sa

    Returns
    -------
    numpy.ndarray
        structured array of summary statistics for M

    """
    M = utils.check_sa(M)

    def summarize(col_name, col_type):
        # One output row: the column name followed by its statistics
        if 'f' in col_type or 'i' in col_type:
            col = M[col_name]
            return [col_name] + [
                func(col) for _, func in __describe_cols_metrics]
        return [col_name] + __describe_cols_fill

    descr_rows = []
    for col_name, col_type in M.dtype.descr:
        descr_rows.append(summarize(col_name, col_type))

    header = ['Column Name'] + [
        stat_name for stat_name, _ in __describe_cols_metrics]
    table = convert_to_sa(descr_rows, col_names=header)
    if verbose:
        pprint_sa(table)
    return table
Example #14
0
def replace_missing_vals(M, strategy, missing_val=np.nan, constant=0):
    """Replace values signifying missing data with some substitute

    Works on a copy; M itself is not modified. Only columns whose dtype
    description contains 'f' or 'i' are touched.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    strategy : {'mean', 'median', 'most_frequent', 'constant'}
        method to use to replace missing data
    missing_val : value that M uses to represent missing data. i.e.
        numpy.nan for floats or -999 for integers
    constant : int
        If the 'constant' strategy is chosen, this is the value to
        replace missing_val with

    Returns
    -------
    numpy.ndarray
        copy of M with missing values replaced

    Raises
    ------
    ValueError
        if strategy is not one of the four supported names

    """
    # TODO support times, strings
    M = utils.check_sa(M)

    if strategy not in ('mean', 'median', 'most_frequent', 'constant'):
        raise ValueError('Invalid strategy')

    result = M.copy()

    if strategy != 'constant':
        # One of the sklearn imputer strategies
        imputer = preprocessing.Imputer(
            missing_values=missing_val, strategy=strategy, axis=1)
        for col_name, col_type in result.dtype.descr:
            # The Imputer only works on float and int columns
            if not ('f' in col_type or 'i' in col_type):
                continue
            col = result[col_name]
            col[:] = imputer.fit_transform(col)
        return result

    try:
        missing_is_nan = np.isnan(missing_val)
    except TypeError:
        # missing_val is not a float
        missing_is_nan = False

    if missing_is_nan:
        # nan never compares equal to itself, so it has to be located with
        # isnan; only float columns are examined
        for col_name, col_type in result.dtype.descr:
            if 'f' not in col_type:
                continue
            col = result[col_name]
            col[np.isnan(col)] = constant
    else:
        for col_name, col_type in result.dtype.descr:
            if not ('i' in col_type or 'f' in col_type):
                continue
            col = result[col_name]
            col[col == missing_val] = constant
    return result
Example #15
0
def replace_missing_vals(M, strategy, missing_val=np.nan, constant=0):
    """Return a copy of M in which missing-data markers are substituted

    The input is left untouched. Columns are considered only when their
    dtype description contains 'f' or 'i'.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    strategy : {'mean', 'median', 'most_frequent', 'constant'}
        method to use to replace missing data
    missing_val : value that M uses to represent missing data. i.e.
        numpy.nan for floats or -999 for integers
    constant : int
        If the 'constant' strategy is chosen, this is the value to
        replace missing_val with

    Returns
    -------
    numpy.ndarray
        copy of M with missing values replaced

    Raises
    ------
    ValueError
        if strategy is not one of the four supported names

    """
    # TODO support times, strings
    M = utils.check_sa(M)

    known_strategies = ['mean', 'median', 'most_frequent', 'constant']
    if strategy not in known_strategies:
        raise ValueError('Invalid strategy')

    filled = M.copy()

    def columns_matching(type_codes):
        # Lazily yield views of the columns whose dtype description contains
        # any of the given type codes, in dtype order
        for col_name, col_type in filled.dtype.descr:
            if any(code in col_type for code in type_codes):
                yield filled[col_name]

    if strategy == 'constant':
        try:
            missing_is_nan = np.isnan(missing_val)
        except TypeError:
            # missing_val is not a float
            missing_is_nan = False

        if missing_is_nan:
            # nan != nan, so equality cannot find it; use isnan on the
            # float columns instead
            for col in columns_matching(('f',)):
                col[np.isnan(col)] = constant
            return filled

        for col in columns_matching(('i', 'f')):
            col[col == missing_val] = constant
        return filled

    # Remaining strategies are delegated to the sklearn imputer
    imputer = preprocessing.Imputer(
        missing_values=missing_val, strategy=strategy, axis=1)
    # The Imputer only works on float and int columns
    for col in columns_matching(('f', 'i')):
        col[:] = imputer.fit_transform(col)
    return filled
Example #16
0
def label_encode(M, force_columns=None):
    """Changes string cols to ints so that there is a 1-1 mapping between
    strings and ints

    Columns whose dtype description contains 'S' or 'O', and columns named in
    force_columns, are encoded; all other columns are copied through with
    their original dtype.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    force_columns : list of str or None
        By default, label_encode will only encode string columns. If the name
        of a numerical column is also present in force_columns, then that
        column will also be label encoded. None (the default) forces no
        columns

    Returns
    -------
    (numpy.ndarray, dict of str: array)
        A tuple: the first element is structured array with strings mapped to
        ints. The second element is a dictionary where keys are column names
        and values are arrays of the strings that belong to each class, as in:
        http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
    """
    # A None sentinel replaces the former mutable `[]` default, which is
    # shared across calls; passing an explicit list works exactly as before.
    if force_columns is None:
        force_columns = ()
    M = utils.check_sa(M)
    new_dtype = []
    result_arrays = []
    classes = {}
    for (col_name, fmt) in M.dtype.descr:
        col = M[col_name]
        # NOTE(review): unicode ('U') columns are only encoded when forced --
        # confirm whether that is intended.
        if 'S' in fmt or 'O' in fmt or col_name in force_columns:
            le = preprocessing.LabelEncoder()
            le.fit(col)
            classes[col_name] = le.classes_
            result_arrays.append(le.transform(col))
            new_dtype.append((col_name, int))
        else:
            result_arrays.append(col)
            new_dtype.append((col_name, fmt))
    # zip() is lazy on Python 3 and np.array cannot build a structured array
    # from an iterator, so the row tuples are materialized first. On Python 2
    # zip() already returns a list and list() is a cheap copy.
    rows = list(zip(*result_arrays))
    return (np.array(rows, dtype=new_dtype), classes)
Example #17
0
 def add_table(self, M):
     """Adds structured array to report

     M is passed through utils.check_sa, written into an in-memory buffer by
     __np_to_html_table, and the buffer's text is appended to the report's
     object list.
     """
     table = utils.check_sa(M)
     html_buffer = StringIO.StringIO()
     self.__np_to_html_table(table, html_buffer)
     rendered = html_buffer.getvalue()
     self.__objects.append(rendered)
Example #18
0
def plot_correlation_scatter_plot(M, verbose=True):
    """Makes a grid of scatter plots representing relationship between variables

    Each scatter plot is one variable plotted against another variable. The
    diagonal cells hold the column names instead of a plot. An array with a
    single column yields a 1x1 grid containing only that column's name.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # TODO work for all three types that M might be
    # TODO ignore classification variables
    # adapted from the excellent
    # http://stackoverflow.com/questions/7941207/is-there-a-function-to-make-scatterplot-matrices-in-matplotlib

    M = utils.check_sa(M)

    names = M.dtype.names
    numvars = len(names)
    # squeeze=False always returns a 2-D array of Axes. With the default,
    # a 1x1 grid comes back as a bare Axes object, which has no `.flat` and
    # cannot be indexed as axes[i, j], so single-column input used to crash.
    fig, axes = plt.subplots(numvars, numvars, squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    for ax in axes.flat:
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots...
        if ax.is_first_col():
            ax.yaxis.set_ticks_position('left')
        if ax.is_last_col():
            ax.yaxis.set_ticks_position('right')
        if ax.is_first_row():
            ax.xaxis.set_ticks_position('top')
        if ax.is_last_row():
            ax.xaxis.set_ticks_position('bottom')

    # Plot the data: every pair above the diagonal is drawn twice, once in
    # each orientation, so the grid is filled on both sides of the diagonal.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i, j), (j, i)]:
            axes[x, y].plot(M[names[x]], M[names[y]], '.')

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i, i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                            ha='center', va='center')

    # Turn on the proper x or y axes ticks: the visible tick axis alternates
    # between the last (-1) and first (0) row/column from one variable to
    # the next.
    for i, j in zip(range(numvars), it.cycle((-1, 0))):
        axes[j, i].xaxis.set_visible(True)
        axes[i, j].yaxis.set_visible(True)
    if verbose:
        plt.show()
    return fig
Example #19
0
 def add_table(self, M):
     """Adds structured array to report

     The array is checked with utils.check_sa, written out through
     __np_to_html_table into a StringIO buffer, and the resulting string is
     stored in the report's list of objects.
     """
     checked = utils.check_sa(M)
     out = StringIO.StringIO()
     self.__np_to_html_table(checked, out)
     markup = out.getvalue()
     self.__objects.append(markup)
Example #20
0
def plot_correlation_scatter_plot(M, verbose=True):
    """Makes a grid of scatter plots representing relationship between variables

    Each scatter plot is one variable plotted against another variable.
    Cells on the diagonal show the column name rather than a plot. Input
    with exactly one column produces a 1x1 grid holding just that name.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, display the graph

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing plot

    """
    # TODO work for all three types that M might be
    # TODO ignore classification variables
    # adapted from the excellent
    # http://stackoverflow.com/questions/7941207/is-there-a-function-to-make-scatterplot-matrices-in-matplotlib

    M = utils.check_sa(M)

    names = M.dtype.names
    numvars = len(names)
    # Without squeeze=False, plt.subplots(1, 1) hands back a single Axes
    # instead of a 2-D array, and the `.flat` / axes[i, j] accesses below
    # fail for single-column input.
    fig, axes = plt.subplots(numvars, numvars, squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    for ax in axes.flat:
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # Set up ticks only on one side for the "edge" subplots...
        if ax.is_first_col():
            ax.yaxis.set_ticks_position('left')
        if ax.is_last_col():
            ax.yaxis.set_ticks_position('right')
        if ax.is_first_row():
            ax.xaxis.set_ticks_position('top')
        if ax.is_last_row():
            ax.xaxis.set_ticks_position('bottom')

    # Plot the data. Only index pairs above the diagonal are enumerated;
    # each is drawn in both orientations to fill the mirrored cell too.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i, j), (j, i)]:
            axes[x, y].plot(M[names[x]], M[names[y]], '.')

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i, i].annotate(label, (0.5, 0.5),
                            xycoords='axes fraction',
                            ha='center',
                            va='center')

    # Turn on the proper x or y axes ticks. cycle((-1, 0)) alternates the
    # visible tick axis between the last and the first row/column.
    for i, j in zip(range(numvars), it.cycle((-1, 0))):
        axes[j, i].xaxis.set_visible(True)
        axes[i, j].yaxis.set_visible(True)
    if verbose:
        plt.show()
    return fig