コード例 #1
0
    def describe_1d(data):
        """Compute summary statistics for a single pandas Series.

        The common statistics (counts, missing ratio, uniqueness, mode and
        memory usage) are computed first; the statistics returned by one
        type-specific helper, chosen from the number of distinct values and
        the dtype of ``data``, are appended to them.

        Parameters
        ----------
        data : pandas.Series
            The column to describe.

        Returns
        -------
        pandas.Series
            The common statistics followed by the helper's statistics.
        """
        names = ['count', 'distinct_count', 'p_missing', 'n_missing', 'is_unique', 'mode', 'p_unique', 'memorysize']
        count = data.count()  # number of non-NaN observations
        leng = len(data)  # number of observations, NaN included
        distinct_count = data.nunique(dropna=False)  # NaN counts as a value
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            # Fall back to the first observation.  This must be positional:
            # ``data[0]`` is a label lookup and raises KeyError whenever the
            # index has no label 0 (string index, filtered rows, ...).
            mode = data.iloc[0]

        results_data = [count, distinct_count, 1 - count / leng, leng - count, distinct_count == leng, mode,
                        distinct_count / count, data.memory_usage()]
        result = pd.Series(results_data, index=names, name=data.name)

        # Exactly one helper contributes the type-specific statistics; the
        # constant check comes first so it wins regardless of dtype.
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result
コード例 #2
0
ファイル: base.py プロジェクト: danilito19/pandas-profiling
    def describe_1d(data):
        """Describe one pandas Series with general and type-specific statistics.

        General statistics (observation counts, share of missing values,
        uniqueness, mode, memory usage) are collected into a Series, and the
        output of a single type-specific helper is appended to it.

        Parameters
        ----------
        data : pandas.Series
            The column to describe.

        Returns
        -------
        pandas.Series
            General statistics followed by the selected helper's statistics.
        """
        names = ['count', 'distinct_count', 'p_missing', 'n_missing', 'is_unique', 'mode', 'p_unique', 'memorysize']
        count = data.count()  # number of non-NaN observations
        leng = len(data)  # number of observations, NaN included
        distinct_count = data.nunique(dropna=False)  # NaN counts as a value
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            # Use the first observation, addressed by position.  The label
            # lookup ``data[0]`` fails with KeyError when the index does not
            # contain the label 0.
            mode = data.iloc[0]

        results_data = [count, distinct_count, 1 - count / leng, leng - count, distinct_count == leng, mode,
                        distinct_count / count, data.memory_usage()]
        result = pd.Series(results_data, index=names, name=data.name)

        # The first matching branch decides which helper is used.
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result
コード例 #3
0
    def restore_type(self, dtype, sample=None):
        """Return the type name for a pandas dtype and an optional sample value.

        The dtype is examined first.  Only when it is neither boolean,
        datetime64, integer nor numeric is the Python type of ``sample``
        consulted.  Anything unrecognised is reported as ``'string'``.
        """
        # Dtype-driven detection; the first matching predicate wins.  The
        # integer test has to run before the broader numeric test.
        dtype_checks = (
            (pdc.is_bool_dtype, 'boolean'),
            (pdc.is_datetime64_any_dtype, 'datetime'),
            (pdc.is_integer_dtype, 'integer'),
            (pdc.is_numeric_dtype, 'number'),
        )
        for matches, type_name in dtype_checks:
            if matches(dtype):
                return type_name

        if sample is None:
            return 'string'

        # Value-driven detection, in the same order as the dtype fallbacks
        # are documented above.
        if isinstance(sample, (list, tuple)):
            return 'array'
        if isinstance(sample, datetime.date):
            return 'date'
        if isinstance(sample, isodate.Duration):
            return 'duration'
        if isinstance(sample, dict):
            return 'object'
        if isinstance(sample, six.string_types):
            return 'string'
        if isinstance(sample, datetime.time):
            return 'time'
        return 'string'
コード例 #4
0
ファイル: geom.py プロジェクト: jfear/snakemakelib-core
def dotplot(x, y, df, return_source=False, marker='circle',
            **kwargs):
    """Draw one or several y columns against a non-numeric x column.

    Args:
      x (str): column used for the x axis; must not have a numeric dtype
      y (str or list): a column name, or a list of column names, for the y axis
      df: data handed to ``utils.df_to_source``
      return_source (bool): accepted but never read by this function
        (NOTE(review): confirm whether the source was meant to be returned)
      marker (str): marker passed to ``utils.add_glyph``
      kwargs: ``plot_height`` and ``plot_width`` are removed and used for
        figure creation; ``color`` may be a single value or a list with one
        entry per y column; the remaining options are forwarded to the
        figure, axis and glyph helpers

    Returns:
      the figure produced by the ``utils`` helpers

    Raises:
      TypeError: if column ``x`` has a numeric dtype
    """
    # The size options are taken out of kwargs before kwargs is forwarded.
    height = kwargs.pop('plot_height', None)
    width = kwargs.pop('plot_width', None)
    fig = utils.create_bokeh_fig_set_props(plot_height=height,
                                           plot_width=width,
                                           **kwargs)
    xaxis(fig, **kwargs)
    yaxis(fig, **kwargs)
    source = utils.df_to_source(df)
    x_column = source.to_df()[x]
    if com.is_numeric_dtype(x_column):
        raise TypeError("{}: dependant variable must not be numerical type".format(__name__))

    # Single y column: one glyph and done.
    if not isinstance(y, list):
        return utils.add_glyph(fig, x, y, source, marker, **kwargs)

    # Several y columns: work out one colour per column.
    n_series = len(y)
    if 'color' not in kwargs:
        colors = [None] * n_series
    elif isinstance(kwargs['color'], list) and len(kwargs['color']) == n_series:
        colors = kwargs['color']
    else:
        colors = [kwargs['color']] * n_series

    for y_name, shade in zip(y, colors):
        if shade is not None:
            kwargs['color'] = shade
        fig = utils.add_glyph(fig, x, y_name, source, marker, **kwargs)
    return fig
コード例 #5
0
 def _get_columns_info(self, stats):
     """Group the columns of ``self.df`` into type buckets.

     Buckets are filled in order, and each later bucket is chosen only
     from the columns ``self.get_columns`` still returns after the
     earlier buckets were passed to it as exclusions:

     * ``TYPE_CONSTANT``: ``stats['uniques']`` equals 1
     * ``TYPE_BOOL``: ``stats['uniques']`` equals 2
     * ``TYPE_NUMERIC``: numeric dtype
     * ``TYPE_DATE``: datetime64 dtype
     * ``TYPE_UNIQUE``: ``stats['uniques']`` equals ``stats['counts']``
     * ``TYPE_CATEGORICAL``: everything left over

     Parameters
     ----------
     stats : mapping
         Must provide ``'uniques'`` and ``'counts'`` entries that can be
         indexed by column name.

     Returns
     -------
     dict
         Maps each ``self.TYPE_*`` key to a pandas Index of column names.
     """
     uniques = stats['uniques']
     column_info = {}
     column_info[self.TYPE_CONSTANT] = uniques[uniques == 1].index
     column_info[self.TYPE_BOOL] = uniques[uniques == 2].index
     # Read the buckets back through the same TYPE_* constants they were
     # stored under, so the lookups cannot drift from the constants' values.
     rest_columns = self.get_columns(
         self.df, self.EXCLUDE,
         column_info[self.TYPE_CONSTANT].union(column_info[self.TYPE_BOOL]))
     column_info[self.TYPE_NUMERIC] = pd.Index(
         [c for c in rest_columns if common.is_numeric_dtype(self.df[c])])
     rest_columns = self.get_columns(self.df[rest_columns], self.EXCLUDE,
                                     column_info[self.TYPE_NUMERIC])
     column_info[self.TYPE_DATE] = pd.Index([
         c for c in rest_columns if common.is_datetime64_dtype(self.df[c])
     ])
     rest_columns = self.get_columns(self.df[rest_columns], self.EXCLUDE,
                                     column_info[self.TYPE_DATE])
     # Of the remaining columns, those with as many distinct values as
     # counted values are "unique"; the others are "categorical".
     unique_columns = uniques[rest_columns] == stats['counts'][
         rest_columns]
     column_info[self.TYPE_UNIQUE] = uniques[rest_columns][
         unique_columns].index
     column_info[self.TYPE_CATEGORICAL] = uniques[rest_columns][
         ~unique_columns].index
     return column_info
コード例 #6
0
ファイル: base.py プロジェクト: akansal1/pandas-profiling
    def describe_1d(data):
        """Compute summary statistics for a single pandas Series.

        General statistics are gathered into a dict, turned into a Series,
        and extended with the output of one type-specific helper selected
        from the number of distinct values and the dtype of ``data``.

        Parameters
        ----------
        data : pandas.Series
            The column to describe.

        Returns
        -------
        pandas.Series
            General statistics followed by the helper's statistics.
        """
        count = data.count()  # number of non-NaN observations
        leng = len(data)  # number of observations, NaN included
        distinct_count = data.nunique(dropna=False)  # NaN counts as a value
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            # First observation by position; ``data[0]`` would be a label
            # lookup and raise KeyError when the index has no label 0.
            mode = data.iloc[0]

        results_data = {'count': count, 'distinct_count': distinct_count, 'p_missing': 1 - count / leng,
                        'n_missing': leng - count,
                        'is_unique': distinct_count == leng,
                        'mode': mode,
                        'p_unique': distinct_count / count}
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except Exception:
            # Best effort: report 0 when the memory usage is unavailable,
            # without also swallowing KeyboardInterrupt / SystemExit.
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        # The first matching branch decides which helper is used.
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result
コード例 #7
0
def mdl_1d_cat(x, y):
    """Build a univariate model on ``x`` and compute its AUC against ``y``.

    ``x`` is one-hot encoded (a numeric ``x`` with more than 10 distinct
    values is first passed through ``sb_cutz``), a cross-validated logistic
    regression is fitted on the encoding, and the ROC AUC of its predicted
    probabilities is computed.  The figure from ``plot_cat`` is saved to an
    in-memory buffer and returned as a ``data:image/png;base64,`` URI.

    Parameters
    ----------
    x : pandas.Series
        The explanatory variable.
    y : array-like
        The target passed to the model and to ``roc_auc_score``.

    Returns
    -------
    tuple
        ``(auc, image_uri)``.

    Raises
    ------
    ValueError
        Propagated from ``predict_proba`` if prediction fails.
    """
    if x.nunique() > 10 and com.is_numeric_dtype(x):
        x = sb_cutz(x)

    series = pd.get_dummies(x, dummy_na=True)
    lr = LogisticRegressionCV(scoring='roc_auc')

    lr.fit(series, y)

    # Scores from the last column of predict_proba.  A ValueError is left to
    # propagate: trapping it with an interactive debugger breakpoint would
    # hang non-interactive callers and leave ``preds`` unbound afterwards.
    preds = lr.predict_proba(series)[:, -1]

    plot = plot_cat(x, y)

    imgdata = BytesIO()
    plot.savefig(imgdata)
    imgdata.seek(0)

    aucz = roc_auc_score(y, preds)
    cmatrix = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdata.getvalue()))
    plt.close()
    return aucz, cmatrix
コード例 #8
0
ファイル: mgeom.py プロジェクト: percyfal/bokehutils
def mdotplot(fig, x, y, df=None, source=None,
             color=False, legend=False, binaxis="x", **kwargs):
    """mdotplot: add one dotplot per y column to a figure.

    The explanatory variable ``x`` is treated as a factor, so its column
    must not have a numeric dtype.

    Args:
      fig (:py:class:`~bokeh.plotting.Plot`): bokeh Plot object
      x (str): name of the x column
      y (list): names of the y columns; ``dotplot`` is called once per entry
      df (:py:class:`~pandas.DataFrame`): pandas DataFrame
      source (:py:class:`~bokeh.models.sources.ColumnDataSource`): bokeh sources.ColumnDataSource object
      color (bool): set color (placeholder, currently without effect)
      legend (bool): set legend (placeholder, currently without effect)
      binaxis (str): axis to bin dots on
      kwargs: keyword arguments forwarded to ``dotplot``

    Raises:
      TypeError: if the ``x`` column of ``source`` has a numeric dtype

    Example:

      .. bokeh-plot::
          :source-position: above

          import pandas as pd
          from bokeh.plotting import figure, show
          from bokehutils.mgeom import mdotplot

          df = pd.DataFrame([[1,2,"A"], [2,5,"B"], [3,9,"A"]],
                            columns=["x", "y", "foo"])
          # NB: currently *must* set the range here
          f = figure(title="Dotplot", plot_width=400, plot_height=400,
                     x_range=list(set(df["foo"])))
          mdotplot(f, "foo", ["y", "x"], df)

          show(f)

    Note that in the example the range must be set in the call to
    figure; otherwise figure uses a linear axis by default. It is
    currently cumbersome to change axis types in an existing figure.
    See `categorical axes
    <http://bokeh.pydata.org/en/latest/docs/user_guide/plotting.html#categorical-axes>`_
    for more information.
    """
    # NOTE(review): only ``source`` is read below; ``df`` is never referenced
    # in this body - presumably it is converted to ``source`` before the body
    # runs (decorator or caller). Confirm.
    logger.debug("Adding dotplot to figure {}".format(fig))
    x_column = source.to_df()[x]
    if com.is_numeric_dtype(x_column):
        raise TypeError("{}: dependant variable must not be numerical type".format(__name__))
    for y_name in y:
        dotplot(fig=fig, x=x, y=y_name, source=source, **kwargs)
        # TODO: when ``color`` is set, add color here, e.g.
        #       color = brewer["PiYG"][min(max(3, len(y)), 10)]
        # TODO: when ``legend`` is set, add legend here via legend function
コード例 #9
0
def dtype_to_jtstype(dtype):
    """Return the type name matching a pandas dtype.

    The result is one of ``'boolean'``, ``'integer'``, ``'number'`` or
    ``'datetime'``; any other dtype maps to ``'string'``.
    """
    # The first matching predicate wins; the integer test has to precede
    # the wider numeric test.
    checks = (
        (pdc.is_bool_dtype, 'boolean'),
        (pdc.is_integer_dtype, 'integer'),
        (pdc.is_numeric_dtype, 'number'),
        (pdc.is_datetime64_any_dtype, 'datetime'),
    )
    for predicate, type_name in checks:
        if predicate(dtype):
            return type_name
    return 'string'
コード例 #10
0
    def describe_1d(data):
        """Compute summary statistics for a single pandas Series.

        Infinite values in ``data`` are replaced with NaN **in place**, so
        the caller's Series is modified.  General statistics are then
        collected and extended with the output of one type-specific helper.

        Parameters
        ----------
        data : pandas.Series
            The column to describe; mutated as described above.

        Returns
        -------
        pandas.Series
            General statistics followed by the helper's statistics.  For
            the categorical branch a ``'type'`` entry set to ``'CAT'`` is
            written as well.
        """
        leng = len(data)  # number of observations in the Series
        count = data.count()  # number of non-NaN observations in the Series

        # Replace infinite values with NaNs to avoid issues with
        # histograms later.  ``-np.inf`` is used for negative infinity
        # because the ``np.NINF`` / ``np.PINF`` aliases no longer exist
        # in NumPy 2.0.
        data.replace(to_replace=[np.inf, -np.inf],
                     value=np.nan,
                     inplace=True)

        # ``count`` was taken before the replacement, so the difference is
        # the number of infinite observations in the Series.
        n_infinite = count - data.count()

        distinct_count = data.nunique(
            dropna=False)  # number of unique elements in the Series
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            # First observation by position; ``data[0]`` would be a label
            # lookup and raise KeyError when the index has no label 0.
            mode = data.iloc[0]

        results_data = {
            'count': count,
            'distinct_count': distinct_count,
            'p_missing': 1 - count / leng,
            'n_missing': leng - count,
            'p_infinite': n_infinite / leng,
            'n_infinite': n_infinite,
            'is_unique': distinct_count == leng,
            'mode': mode,
            'p_unique': distinct_count / count
        }
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except Exception:
            # Best effort: report 0 when the memory usage is unavailable,
            # without also swallowing KeyboardInterrupt / SystemExit.
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        # The first matching branch decides which helper is used.
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
            result['type'] = 'CAT'

        return result
コード例 #11
0
def mdl_1d(x, y):
    """Build a univariate model on ``x`` and compute its AUC against ``y``.

    ``x`` is one-hot encoded (a numeric ``x`` with more than 10 distinct
    values is first passed through ``sb_cutz``), a cross-validated logistic
    regression is fitted on the encoding, and the ROC AUC of its predicted
    probabilities is computed.  Two figures built from
    ``num_bin_stats(x, y)`` are saved to in-memory buffers and returned as
    ``data:image/png;base64,`` URIs.

    Parameters
    ----------
    x : pandas.Series
        The explanatory variable.
    y : array-like
        The target passed to the model and to ``roc_auc_score``.

    Returns
    -------
    tuple
        ``(auc, num_plot_uri, bubble_plot_uri)``.

    Raises
    ------
    ValueError
        Propagated from ``predict_proba`` if prediction fails.
    """
    lr = LogisticRegressionCV(scoring='roc_auc')

    # Only the encoding uses the binned values; the statistics further down
    # are computed from the unbinned ``x``.
    if x.nunique() > 10 and com.is_numeric_dtype(x):
        series = pd.get_dummies(sb_cutz(x), dummy_na=True)
    else:
        series = pd.get_dummies(x, dummy_na=True)

    # Only the logistic model is fitted: a second (LassoLarsIC) model used
    # to be fitted here as well, but its result was never read.
    lr.fit(series, y)

    # Scores from the last column of predict_proba.  A ValueError is left to
    # propagate: trapping it with an interactive debugger breakpoint would
    # hang non-interactive callers and leave ``preds`` unbound afterwards.
    preds = lr.predict_proba(series)[:, -1]

    aucz = roc_auc_score(y, preds)

    ns = num_bin_stats(x, y)

    nplot = plot_num(ns)

    imgdata = BytesIO()
    nplot.savefig(imgdata)
    imgdata.seek(0)
    nplot = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdata.getvalue()))
    plt.close()

    bplot = plot_bubble(ns)
    imgdatab = BytesIO()
    bplot.savefig(imgdatab)
    imgdatab.seek(0)
    bplot = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdatab.getvalue()))
    plt.close()

    return aucz, nplot, bplot
コード例 #12
0
ファイル: base.py プロジェクト: bartlesy/pandas-profiling
    def describe_1d(data):
        """Compute summary statistics for a single pandas Series.

        Infinite values in ``data`` are replaced with NaN **in place**, so
        the caller's Series is modified.  General statistics, including the
        ``ft_names`` entry for the Series name, are then collected and
        extended with the output of one type-specific helper.

        Parameters
        ----------
        data : pandas.Series
            The column to describe; mutated as described above.

        Returns
        -------
        pandas.Series
            General statistics followed by the helper's statistics.
        """
        leng = len(data)  # number of observations in the Series
        count = data.count()  # number of non-NaN observations in the Series

        # Replace infinite values with NaNs to avoid issues with
        # histograms later.  ``-np.inf`` is used for negative infinity
        # because the ``np.NINF`` / ``np.PINF`` aliases no longer exist
        # in NumPy 2.0.
        data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

        # ``count`` was taken before the replacement, so the difference is
        # the number of infinite observations in the Series.
        n_infinite = count - data.count()

        # number of unique elements in the Series
        distinct_count = data.nunique(dropna=False)
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            # First observation by position; ``data[0]`` would be a label
            # lookup and raise KeyError when the index has no label 0.
            mode = data.iloc[0]

        results_data = {'count': count,
                        'distinct_count': distinct_count,
                        'p_missing': 1 - count / leng,
                        'n_missing': leng - count,
                        'p_infinite': n_infinite / leng,
                        'n_infinite': n_infinite,
                        'is_unique': distinct_count == leng,
                        'mode': mode,
                        'p_unique': distinct_count / count,
                        'ft_dfn': ft_names.get(data.name, '')}
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except Exception:
            # Best effort: report 0 when the memory usage is unavailable,
            # without also swallowing KeyboardInterrupt / SystemExit.
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        # The first matching branch decides which helper is used.
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))

        return result
コード例 #13
0
def as_json_table_type(x):
    """
    Map a NumPy / pandas type to the name of its Table Schema type.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data type

    Notes
    -----
    The checks run in the order listed; the first one that matches
    decides the result, and anything unmatched is reported as ``any``.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    bool             boolean
    float64          number
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    categorical      any
    object           string
    ===============  =================
    """
    if is_integer_dtype(x):
        return 'integer'
    if is_bool_dtype(x):
        return 'boolean'
    if is_numeric_dtype(x):
        return 'number'
    if is_datetime64_dtype(x) or is_datetime64tz_dtype(x):
        return 'datetime'
    if is_timedelta64_dtype(x):
        return 'duration'
    # Categoricals are tested before the string check and always map to
    # 'any', whatever their categories hold.
    if not is_categorical_dtype(x) and is_string_dtype(x):
        return 'string'
    return 'any'
コード例 #14
0
 def _get_columns_info(self, stats):
     """Group the columns of ``self.df`` into type buckets.

     Buckets are filled in order, and each later bucket is chosen only
     from the columns ``self.get_columns`` still returns after the
     earlier buckets were passed to it as exclusions:

     * ``TYPE_CONSTANT``: ``stats['uniques']`` equals 1
     * ``TYPE_BOOL``: ``stats['uniques']`` equals 2
     * ``TYPE_NUMERIC``: numeric dtype
     * ``TYPE_DATE``: datetime64 dtype
     * ``TYPE_UNIQUE``: ``stats['uniques']`` equals ``stats['counts']``
     * ``TYPE_CATEGORICAL``: everything left over

     Parameters
     ----------
     stats : mapping
         Must provide ``'uniques'`` and ``'counts'`` entries that can be
         indexed by column name.

     Returns
     -------
     dict
         Maps each ``self.TYPE_*`` key to a pandas Index of column names.
     """
     uniques = stats['uniques']
     column_info = {}
     column_info[self.TYPE_CONSTANT] = uniques[uniques == 1].index
     column_info[self.TYPE_BOOL] = uniques[uniques == 2].index
     # Read the buckets back through the same TYPE_* constants they were
     # stored under, so the lookups cannot drift from the constants' values.
     rest_columns = self.get_columns(self.df,
                                     self.EXCLUDE,
                                     column_info[self.TYPE_CONSTANT].union(column_info[self.TYPE_BOOL]))
     column_info[self.TYPE_NUMERIC] = pd.Index([c for c in rest_columns
                                                if common.is_numeric_dtype(self.df[c])])
     rest_columns = self.get_columns(self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_NUMERIC])
     column_info[self.TYPE_DATE] = pd.Index([c for c in rest_columns
                                             if common.is_datetime64_dtype(self.df[c])])
     rest_columns = self.get_columns(self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_DATE])
     # Of the remaining columns, those with as many distinct values as
     # counted values are "unique"; the others are "categorical".
     unique_columns = uniques[rest_columns] == stats['counts'][rest_columns]
     column_info[self.TYPE_UNIQUE] = uniques[rest_columns][unique_columns].index
     column_info[self.TYPE_CATEGORICAL] = uniques[rest_columns][~unique_columns].index
     return column_info
コード例 #15
0
ファイル: table_schema.py プロジェクト: quantopian/qgrid
def as_json_table_type(x):
    """
    Return the Table Schema type name for a NumPy / pandas type.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data type

    Notes
    -----
    Checks are applied top to bottom and the first match wins; a type
    that matches none of them is reported as ``any``.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    bool             boolean
    float64          number
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    categorical      any
    object           string
    ===============  =================
    """
    if is_integer_dtype(x):
        return 'integer'
    if is_bool_dtype(x):
        return 'boolean'
    if is_numeric_dtype(x):
        return 'number'
    if is_datetime64_dtype(x) or is_datetime64tz_dtype(x):
        return 'datetime'
    if is_timedelta64_dtype(x):
        return 'duration'
    # A categorical never reaches the string check: it is always 'any'.
    if not is_categorical_dtype(x) and is_string_dtype(x):
        return 'string'
    return 'any'
コード例 #16
0
def dotplot(fig, x, y, df=None, source=None,
            binaxis="x", **kwargs):
    """dotplot: draw circles for ``y`` against a factor ``x`` on a figure.

    The explanatory variable ``x`` is treated as a factor, so its column
    must not have a numeric dtype.

    Args:
      fig (:py:class:`~bokeh.plotting.Plot`): bokeh Plot object
      x (str): name of the x column
      y (str): name of the y column
      df (:py:class:`~pandas.DataFrame`): pandas DataFrame
      source (:py:class:`~bokeh.models.ColumnDataSource`): bokeh ColumnDataSource object
      binaxis (str): axis to bin dots on
      kwargs: keyword arguments passed on to ``fig.circle``

    Raises:
      TypeError: if the ``x`` column of ``source`` has a numeric dtype

    Example:

      .. bokeh-plot::
          :source-position: above

          import pandas as pd
          from bokeh.plotting import figure, show
          from bokehutils.geom import dotplot
          from bokehutils.axes import grid

          df = pd.DataFrame([[1,2,"A"], [2,5,"B"], [3,9,"A"]], columns=["x", "y", "foo"])

          # NB: currently *must* set the range here, otherwise figure
          # will use linear axis by default. It is currently cumbersome
          # to change axes types in an existing figure.
          f = figure(title="Dotplot", plot_width=400, plot_height=400, x_range=list(df["foo"]))
          dotplot(f, "foo", "y", df)
          grid(f, grid_line_color=None)

          show(f)

    """
    # NOTE(review): only ``source`` is read below; ``df`` is never referenced
    # in this body - presumably it is converted to ``source`` before the body
    # runs (decorator or caller). Confirm.
    logger.debug("Adding dotplot to figure {}".format(fig))
    # FIXME: once axes can be modified, numerical ranges could also be
    # transformed into factors on the fly
    x_column = source.to_df()[x]
    if com.is_numeric_dtype(x_column):
        raise TypeError("{}: dependant variable must not be numerical type".format(__name__))
    fig.circle(x=x, y=y, source=source, **kwargs)
コード例 #17
0
ファイル: base.py プロジェクト: ericmuijs/pandas-profiling
    def describe_1d(data):
        """Compute summary statistics for a single pandas Series.

        General statistics are gathered into a dict, turned into a Series,
        and extended with the output of one type-specific helper selected
        from the number of distinct values and the dtype of ``data``.

        Parameters
        ----------
        data : pandas.Series
            The column to describe.

        Returns
        -------
        pandas.Series
            General statistics followed by the helper's statistics.
        """
        count = data.count()  # number of non-NaN observations
        leng = len(data)  # number of observations, NaN included
        distinct_count = data.nunique(dropna=False)  # NaN counts as a value
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            # First observation by position; ``data[0]`` would be a label
            # lookup and raise KeyError when the index has no label 0.
            mode = data.iloc[0]

        results_data = {
            'count': count,
            'distinct_count': distinct_count,
            'p_missing': 1 - count / leng,
            'n_missing': leng - count,
            'is_unique': distinct_count == leng,
            'mode': mode,
            'p_unique': distinct_count / count
        }
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except Exception:
            # Best effort: report 0 when the memory usage is unavailable,
            # without also swallowing KeyboardInterrupt / SystemExit.
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        # The first matching branch decides which helper is used.
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result
コード例 #18
0
ファイル: util.py プロジェクト: adneu/pandas
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, Index, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8); only
          attempted when the data holds no negative value
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, Index if Index,
        scalar if scalar, otherwise ndarray

    Raises
    ------
    ValueError
        If ``downcast`` is not one of the accepted values.
    TypeError
        If ``arg`` has more than one dimension.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    # Remember the container kind so the result can be re-wrapped at the end.
    is_series = False
    is_index = False
    is_scalar = False

    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        # Prefer the int64 view when the Index provides one.
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        # A scalar that already is a number is returned untouched.
        if com.is_number(arg):
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if com.is_numeric_dtype(values):
            pass
        elif com.is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = com._ensure_object(values)
            coerce_numeric = False if errors in ('ignore', 'raise') else True

            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)

    except Exception:
        # With errors != 'raise' a failed conversion leaves ``values``
        # as it was before the failing step.
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and com.is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and (not np.size(values) or
                                         np.min(values) >= 0):
            # Zero is representable as unsigned, hence ``>= 0``; empty data
            # is let through because ``np.min`` raises on a zero-size array.
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize < values.dtype.itemsize:
                    values = com._possibly_downcast_to_dtype(
                        values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
コード例 #19
0
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, Index, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8); only
          attempted when the data holds no negative value
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, Index if Index,
        scalar if scalar, otherwise ndarray

    Raises
    ------
    ValueError
        If ``downcast`` is not one of the accepted values.
    TypeError
        If ``arg`` has more than one dimension.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    # Remember the container kind so the result can be re-wrapped at the end.
    is_series = False
    is_index = False
    is_scalar = False

    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        # Prefer the int64 view when the Index provides one.
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        # A scalar that already is a number is returned untouched.
        if com.is_number(arg):
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if com.is_numeric_dtype(values):
            pass
        elif com.is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = com._ensure_object(values)
            coerce_numeric = False if errors in ('ignore', 'raise') else True

            values = lib.maybe_convert_numeric(values,
                                               set(),
                                               coerce_numeric=coerce_numeric)

    except Exception:
        # With errors != 'raise' a failed conversion leaves ``values``
        # as it was before the failing step.
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and com.is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and (not np.size(values) or
                                         np.min(values) >= 0):
            # Zero is representable as unsigned, hence ``>= 0``; empty data
            # is let through because ``np.min`` raises on a zero-size array.
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize < values.dtype.itemsize:
                    values = com._possibly_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
コード例 #20
0
ファイル: util.py プロジェクト: AkiraKane/pandas
def to_numeric(arg, errors='raise'):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : list, tuple, 1-d array, Index, Series, or scalar
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input

        Any value other than 'ignore' or 'raise' is handled like 'coerce'.

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, Index if Index,
        a scalar if scalar, otherwise ndarray

    Raises
    ------
    TypeError
        If ``arg`` is not a Series, Index, list, tuple or scalar and has
        more than one dimension.
    Exception
        Whatever the underlying conversion raises, re-raised unchanged
        when ``errors='raise'``.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    >>> pd.to_numeric(s, errors='coerce')
    """
    is_series = False
    is_index = False
    is_scalar = False

    # Normalise the input to an array of values, remembering the input
    # kind so the result can be re-wrapped in the same container.
    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        # prefer the int64 view when the index provides one
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        if com.is_number(arg):
            # already numeric, nothing to convert
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    if com.is_numeric_dtype(values):
        pass
    elif com.is_datetime_or_timedelta_dtype(values):
        values = values.astype(np.int64)
    else:
        values = com._ensure_object(values)
        coerce_numeric = errors not in ('ignore', 'raise')

        try:
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)
        except Exception:
            # Only ordinary errors are suppressed; KeyboardInterrupt and
            # SystemExit must always propagate.  When not raising, the
            # unconverted object values are returned.
            if errors == 'raise':
                raise

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
コード例 #21
0
def to_numeric(arg, errors='raise'):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : list, tuple, 1-d array, Index, Series, or scalar
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input

        Any value other than 'ignore' or 'raise' is handled like 'coerce'.

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, Index if Index,
        a scalar if scalar, otherwise ndarray

    Raises
    ------
    TypeError
        If ``arg`` is not a Series, Index, list, tuple or scalar and has
        more than one dimension.
    Exception
        Whatever the underlying conversion raises, re-raised unchanged
        when ``errors='raise'``.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    >>> pd.to_numeric(s, errors='coerce')
    """
    is_series = False
    is_index = False
    is_scalar = False

    # Normalise the input to an array of values, remembering the input
    # kind so the result can be re-wrapped in the same container.
    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        # prefer the int64 view when the index provides one
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        if com.is_number(arg):
            # already numeric, nothing to convert
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    if com.is_numeric_dtype(values):
        pass
    elif com.is_datetime_or_timedelta_dtype(values):
        values = values.astype(np.int64)
    else:
        values = com._ensure_object(values)
        coerce_numeric = errors not in ('ignore', 'raise')

        try:
            values = lib.maybe_convert_numeric(values,
                                               set(),
                                               coerce_numeric=coerce_numeric)
        except Exception:
            # Only ordinary errors are suppressed; KeyboardInterrupt and
            # SystemExit must always propagate.  When not raising, the
            # unconverted object values are returned.
            if errors == 'raise':
                raise

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
コード例 #22
0
ファイル: mgeom.py プロジェクト: radovankavicky/bokehutils
def mdotplot(fig,
             x,
             y,
             df=None,
             source=None,
             color=False,
             legend=False,
             binaxis="x",
             **kwargs):
    """mdotplot: make a mdotplot.

    Draws one dotplot per entry of ``y`` onto ``fig`` by calling
    ``dotplot`` repeatedly with the same ``x`` and ``source``.

    In this implementation, the explanatory variable is treated as a
    factor.

    Args:
      fig (:py:class:`~bokeh.plotting.Plot`): bokeh Plot object
      x (str): string for x component
      y (list): sequence of strings, one per y component; it is indexed
        element by element, so a plain string would be split into
        characters
      df (:py:class:`~pandas.DataFrame`): pandas DataFrame
      source (:py:class:`~bokeh.models.sources.ColumnDataSource`): bokeh sources.ColumnDataSource object
      color (bool): set color (currently a no-op placeholder)
      legend (bool): set legend (currently a no-op placeholder)
      binaxis (str): axis to bin dots on
      kwargs: keyword arguments to pass to glyph drawing function

    Raises:
      TypeError: if column ``x`` of ``source.to_df()`` has a numeric dtype

    Example:

      .. bokeh-plot::
          :source-position: above

          import pandas as pd
          from bokeh.plotting import figure, show
          from bokehutils.mgeom import mdotplot

          df = pd.DataFrame([[1,2,"A"], [2,5,"B"], [3,9,"A"]],
                            columns=["x", "y", "foo"])
          # NB: currently *must* set the range here
          f = figure(title="Dotplot", plot_width=400, plot_height=400,
                     x_range=list(set(df["foo"])))
          mdotplot(f, "foo", ["y", "x"], df)

          show(f)

    Note that in the example we must set the range in the call to
    figure, otherwise figure will use linear axis by default. It is
    currently cumbersome to change axes types in an existing figure.
    See `categorical axes
    <http://bokeh.pydata.org/en/latest/docs/user_guide/plotting.html#categorical-axes>`_
    for more information.
    """
    logger.debug("Adding dotplot to figure {}".format(fig))
    # NOTE(review): ``source`` defaults to None but is dereferenced
    # unconditionally here; presumably a decorator or the caller converts
    # ``df`` into ``source`` beforehand -- TODO confirm.  ``df`` and
    # ``binaxis`` are not referenced in the visible body.
    if com.is_numeric_dtype(source.to_df()[x]) == True:
        raise TypeError(
            "{}: dependant variable must not be numerical type".format(
                __name__))
    # one dotplot per requested y column, all sharing x and source
    for i in range(len(y)):
        dotplot(fig=fig, x=x, y=y[i], source=source, **kwargs)
        if color:
            # Add color here
            # color = brewer["PiYG"][min(max(3, len(y)), 10)]
            pass
        if legend:
            # Add legend here via legend function
            pass