def describe_1d(data):
    """Compute summary statistics for a single pandas Series.

    Generic statistics are computed here; the remaining entries come from
    exactly one type-specific helper, chosen as follows:

    * at most one distinct value -> ``describe_constant_1d``
    * numeric dtype              -> ``describe_numeric_1d``
    * datetime64 dtype           -> ``describe_date_1d``
    * every value distinct       -> ``describe_unique_1d``
    * anything else              -> ``describe_categorical_1d``

    Parameters
    ----------
    data : pandas.Series
        Column to describe.

    Returns
    -------
    pandas.Series
        The generic statistics (``count``, ``distinct_count``, ``p_missing``,
        ``n_missing``, ``is_unique``, ``mode``, ``p_unique``, ``memorysize``)
        followed by whatever the selected helper returned.
    """
    names = ['count', 'distinct_count', 'p_missing', 'n_missing', 'is_unique',
             'mode', 'p_unique', 'memorysize']

    count = data.count()  # non-null observations
    leng = len(data)  # all observations
    distinct_count = data.nunique(dropna=False)  # NaN counts as a value

    if count > distinct_count > 1:
        mode = data.mode().iloc[0]
    else:
        # Positional access: ``data[0]`` is a *label* lookup and raises when
        # the index has no label 0 (string index, filtered frame, ...).
        mode = data.iloc[0]

    results_data = [count,
                    distinct_count,
                    1 - count / leng,       # share of missing values
                    leng - count,           # number of missing values
                    distinct_count == leng,  # is unique
                    mode,
                    distinct_count / count,
                    data.memory_usage()]
    result = pd.Series(results_data, index=names, name=data.name)

    if distinct_count <= 1:
        extra = describe_constant_1d(data)
    elif com.is_numeric_dtype(data):
        extra = describe_numeric_1d(data, result)
    elif com.is_datetime64_dtype(data):
        extra = describe_date_1d(data, result)
    elif distinct_count == leng:
        extra = describe_unique_1d(data)
    else:
        extra = describe_categorical_1d(data)

    # ``Series.append`` no longer exists in pandas >= 2.0; ``pd.concat`` is
    # what it delegated to and is available in every pandas version.
    return pd.concat([result, extra])
def restore_type(self, dtype, sample=None):
    """Derive a type name from a pandas dtype and an optional sample value.

    The dtype is consulted first.  Only when it is neither boolean,
    datetime, integer nor numeric is the Python type of ``sample``
    inspected.  ``'string'`` is returned when nothing else matches.
    """
    # Pandas dtypes take precedence over the sample value.
    if pdc.is_bool_dtype(dtype):
        return 'boolean'
    if pdc.is_datetime64_any_dtype(dtype):
        return 'datetime'
    if pdc.is_integer_dtype(dtype):
        return 'integer'
    if pdc.is_numeric_dtype(dtype):
        return 'number'

    # Without a sample there is nothing more to look at.
    if sample is None:
        return 'string'

    # Plain Python values.  The checks are evaluated top to bottom and the
    # first match wins; note that ``datetime.datetime`` instances also
    # satisfy the ``datetime.date`` check.
    if isinstance(sample, (list, tuple)):
        return 'array'
    if isinstance(sample, datetime.date):
        return 'date'
    if isinstance(sample, isodate.Duration):
        return 'duration'
    if isinstance(sample, dict):
        return 'object'
    if isinstance(sample, six.string_types):
        return 'string'
    if isinstance(sample, datetime.time):
        return 'time'

    return 'string'
def dotplot(x, y, df, return_source=False, marker='circle', **kwargs):
    """Create a figure and add one glyph series per response column.

    Args:
      x (str): name of the explanatory column; it must not be numeric
      y (str or list): response column name, or a list of names that are
        each drawn against ``x``
      df: data passed to ``utils.df_to_source``
      return_source (bool): accepted for interface compatibility; not used
        by this function
      marker (str): marker name forwarded to ``utils.add_glyph``
      kwargs: ``plot_height`` and ``plot_width`` are consumed for figure
        creation; everything else is forwarded to the figure, axis and
        glyph helpers.  When ``y`` is a list, ``color`` may be a single
        value (applied to every series) or a list with one entry per
        response column.

    Returns:
      The figure returned by the last ``utils.add_glyph`` call.

    Raises:
      TypeError: if column ``x`` has a numeric dtype.
    """
    # setup figure
    fig = utils.create_bokeh_fig_set_props(
        plot_height=kwargs.pop('plot_height', None),
        plot_width=kwargs.pop('plot_width', None),
        **kwargs)
    xaxis(fig, **kwargs)
    yaxis(fig, **kwargs)

    source = utils.df_to_source(df)
    if com.is_numeric_dtype(source.to_df()[x]):
        raise TypeError("{}: dependant variable must not be numerical type".format(__name__))

    if not isinstance(y, list):
        return utils.add_glyph(fig, x, y, source, marker, **kwargs)

    # One colour per response column: either supplied as a matching list,
    # or a single value repeated for every column.
    if 'color' not in kwargs:
        colors = [None] * len(y)
    elif isinstance(kwargs['color'], list) and len(kwargs['color']) == len(y):
        colors = kwargs['color']
    else:
        colors = [kwargs['color']] * len(y)

    for column, glyph_color in zip(y, colors):
        if glyph_color is not None:
            kwargs['color'] = glyph_color
        fig = utils.add_glyph(fig, x, column, source, marker, **kwargs)
    return fig
def _get_columns_info(self, stats):
    """Assign every column of ``self.df`` to one column-type bucket.

    Buckets are filled in order and each column lands in the first one it
    qualifies for: constant (one unique value), bool (two unique values),
    numeric dtype, datetime64 dtype, unique (as many unique values as
    counted values) and finally categorical.

    Parameters
    ----------
    stats : mapping
        Must provide ``'uniques'`` and ``'counts'``, both indexed by column
        name.

    Returns
    -------
    dict
        Maps each ``self.TYPE_*`` key to an index of column names.
    """
    uniques = stats['uniques']
    column_info = {}

    column_info[self.TYPE_CONSTANT] = uniques[uniques == 1].index
    column_info[self.TYPE_BOOL] = uniques[uniques == 2].index

    # Read the buckets back through the same TYPE_* constants they were
    # stored under, rather than through hard-coded string literals.
    rest_columns = self.get_columns(
        self.df, self.EXCLUDE,
        column_info[self.TYPE_CONSTANT].union(column_info[self.TYPE_BOOL]))

    column_info[self.TYPE_NUMERIC] = pd.Index(
        [c for c in rest_columns if common.is_numeric_dtype(self.df[c])])
    rest_columns = self.get_columns(
        self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_NUMERIC])

    column_info[self.TYPE_DATE] = pd.Index(
        [c for c in rest_columns if common.is_datetime64_dtype(self.df[c])])
    rest_columns = self.get_columns(
        self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_DATE])

    # Of what is left, a column is "unique" when every counted value is
    # distinct; the remainder is categorical.
    rest_uniques = uniques[rest_columns]
    unique_columns = rest_uniques == stats['counts'][rest_columns]
    column_info[self.TYPE_UNIQUE] = rest_uniques[unique_columns].index
    column_info[self.TYPE_CATEGORICAL] = rest_uniques[~unique_columns].index

    return column_info
def describe_1d(data):
    """Compute summary statistics for a single pandas Series.

    Generic statistics are computed here; the remaining entries come from
    exactly one type-specific helper, chosen as follows:

    * at most one distinct value -> ``describe_constant_1d``
    * numeric dtype              -> ``describe_numeric_1d``
    * datetime64 dtype           -> ``describe_date_1d``
    * every value distinct       -> ``describe_unique_1d``
    * anything else              -> ``describe_categorical_1d``

    Parameters
    ----------
    data : pandas.Series
        Column to describe.

    Returns
    -------
    pandas.Series
        The generic statistics (``count``, ``distinct_count``, ``p_missing``,
        ``n_missing``, ``is_unique``, ``mode``, ``p_unique``, ``memorysize``)
        followed by whatever the selected helper returned.
    """
    count = data.count()  # non-null observations
    leng = len(data)  # all observations
    distinct_count = data.nunique(dropna=False)  # NaN counts as a value

    if count > distinct_count > 1:
        mode = data.mode().iloc[0]
    else:
        # Positional access: ``data[0]`` is a *label* lookup and raises when
        # the index has no label 0 (string index, filtered frame, ...).
        mode = data.iloc[0]

    results_data = {'count': count,
                    'distinct_count': distinct_count,
                    'p_missing': 1 - count / leng,
                    'n_missing': leng - count,
                    'is_unique': distinct_count == leng,
                    'mode': mode,
                    'p_unique': distinct_count / count}
    try:
        # pandas 0.17 onwards
        results_data['memorysize'] = data.memory_usage()
    except Exception:
        # Best effort: report 0 when the size cannot be determined, but do
        # not swallow KeyboardInterrupt / SystemExit.
        results_data['memorysize'] = 0

    result = pd.Series(results_data, name=data.name)

    if distinct_count <= 1:
        extra = describe_constant_1d(data)
    elif com.is_numeric_dtype(data):
        extra = describe_numeric_1d(data, result)
    elif com.is_datetime64_dtype(data):
        extra = describe_date_1d(data, result)
    elif distinct_count == leng:
        extra = describe_unique_1d(data)
    else:
        extra = describe_categorical_1d(data)

    # ``Series.append`` no longer exists in pandas >= 2.0; ``pd.concat`` is
    # what it delegated to and is available in every pandas version.
    return pd.concat([result, extra])
def mdl_1d_cat(x, y):
    """Build a univariate model on ``x`` and report its AUC with a plot.

    ``x`` is one-hot encoded (with an extra indicator for missing values)
    and a cross-validated logistic regression is fitted against ``y``.  A
    numeric ``x`` with more than ten distinct values is first passed through
    ``sb_cutz`` (presumably a binning helper -- confirm against its
    definition).

    Parameters
    ----------
    x : pandas.Series
        Explanatory variable.
    y : array-like
        Target passed to the classifier and to ``roc_auc_score``.

    Returns
    -------
    tuple
        ``(auc, image)``: the ROC AUC of the predictions made on the
        training data, and the figure returned by ``plot_cat`` rendered as
        a ``data:image/png;base64,`` URI.

    Raises
    ------
    ValueError
        Propagated from ``predict_proba`` when prediction fails.
    """
    if x.nunique() > 10 and com.is_numeric_dtype(x):
        x = sb_cutz(x)

    series = pd.get_dummies(x, dummy_na=True)
    lr = LogisticRegressionCV(scoring='roc_auc')
    lr.fit(series, y)

    # No debugger trap here: a failure must surface to the caller instead of
    # opening an interactive session and leaving ``preds`` unbound.
    preds = lr.predict_proba(series)[:, -1]

    plot = plot_cat(x, y)
    imgdata = BytesIO()
    plot.savefig(imgdata)
    imgdata.seek(0)

    aucz = roc_auc_score(y, preds)
    cmatrix = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdata.getvalue()))
    plt.close()
    return aucz, cmatrix
def mdotplot(fig, x, y, df=None, source=None, color=False, legend=False, binaxis="x", **kwargs):
    """mdotplot: make a mdotplot.

    In this implementation, the explanatory variable is treated as a
    factor.

    Args:
      fig (:py:class:`~bokeh.plotting.Plot`): bokeh Plot object
      x (str): string for x component
      y (list): strings for the y components; one dotplot is added per entry
      df (:py:class:`~pandas.DataFrame`): pandas DataFram
      source (:py:class:`~bokeh.models.sources.ColumnDataSource`): bokeh sources.ColumnDataSource object
      color (bool): set color (not implemented yet)
      legend (bool): set legend (not implemented yet)
      binaxis (str): axis to bin dots on
      kwargs: keyword arguments to pass to glyph drawing function

    Raises:
      TypeError: if column ``x`` of ``source`` has a numeric dtype.

    NOTE(review): the body only ever reads ``source``; ``df`` and
    ``binaxis`` are not used here.  The example below passes ``df``
    positionally, so presumably something outside this function converts
    it into ``source`` -- confirm against the call sites.

    Example:

      .. bokeh-plot::
          :source-position: above

          import pandas as pd
          from bokeh.plotting import figure, show
          from bokehutils.mgeom import mdotplot

          df = pd.DataFrame([[1,2,"A"], [2,5,"B"], [3,9,"A"]], columns=["x", "y", "foo"])

          # NB: currently *must* set the range here
          f = figure(title="Dotplot", plot_width=400, plot_height=400, x_range=list(set(df["foo"])))
          mdotplot(f, "foo", ["y", "x"], df)

          show(f)

    Note that in the example we must set the range in the call to figure,
    otherwise figure will use linear axis by default. It is currently
    cumbersome to change axes types in an existing figure. See
    `categorical axes <http://bokeh.pydata.org/en/latest/docs/user_guide/plotting.html#categorical-axes>`_
    for more information.
    """
    logger.debug("Adding dotplot to figure %s", fig)
    if com.is_numeric_dtype(source.to_df()[x]):
        raise TypeError("{}: dependant variable must not be numerical type".format(__name__))

    for column in y:
        dotplot(fig=fig, x=x, y=column, source=source, **kwargs)

    # TODO: honour ``color``, e.g. with a palette such as
    #       brewer["PiYG"][min(max(3, len(y)), 10)]
    # TODO: honour ``legend`` via the legend function
def dtype_to_jtstype(dtype):
    """Translate a pandas dtype into a type-name string.

    The predicates are tried top to bottom and the first match wins;
    ``'string'`` is the fallback.
    """
    ordered_checks = (
        (pdc.is_bool_dtype, 'boolean'),
        (pdc.is_integer_dtype, 'integer'),
        (pdc.is_numeric_dtype, 'number'),
        (pdc.is_datetime64_any_dtype, 'datetime'),
    )
    for matches, type_name in ordered_checks:
        if matches(dtype):
            return type_name
    return 'string'
def describe_1d(data):
    """Compute summary statistics for a single pandas Series.

    Infinite values are replaced by NaN **in place** (the caller's Series
    is modified) to avoid issues with histograms later.  Generic statistics
    are then computed and extended by exactly one type-specific helper:

    * at most one distinct value -> ``describe_constant_1d``
    * numeric dtype              -> ``describe_numeric_1d``
    * datetime64 dtype           -> ``describe_date_1d``
    * every value distinct       -> ``describe_unique_1d``
    * anything else              -> ``describe_categorical_1d`` (and the
      ``type`` entry is set to ``'CAT'``)

    Parameters
    ----------
    data : pandas.Series
        Column to describe.

    Returns
    -------
    pandas.Series
        The generic statistics (``count``, ``distinct_count``, ``p_missing``,
        ``n_missing``, ``p_infinite``, ``n_infinite``, ``is_unique``,
        ``mode``, ``p_unique``, ``memorysize``) followed by whatever the
        selected helper returned.
    """
    leng = len(data)  # number of observations in the Series
    # Number of non-NaN observations; taken before the replacement below,
    # so infinite values are still included here.
    count = data.count()

    # Replace infinite values with NaNs to avoid issues with histograms
    # later.  ``np.NINF`` / ``np.PINF`` no longer exist in NumPy >= 2.0.
    data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

    n_infinite = count - data.count()  # number of infinite observations
    distinct_count = data.nunique(dropna=False)  # number of unique elements

    if count > distinct_count > 1:
        mode = data.mode().iloc[0]
    else:
        # Positional access: ``data[0]`` is a *label* lookup and raises when
        # the index has no label 0 (string index, filtered frame, ...).
        mode = data.iloc[0]

    results_data = {
        'count': count,
        'distinct_count': distinct_count,
        'p_missing': 1 - count / leng,
        'n_missing': leng - count,
        'p_infinite': n_infinite / leng,
        'n_infinite': n_infinite,
        'is_unique': distinct_count == leng,
        'mode': mode,
        'p_unique': distinct_count / count,
    }
    try:
        # pandas 0.17 onwards
        results_data['memorysize'] = data.memory_usage()
    except Exception:
        # Best effort: report 0 when the size cannot be determined, but do
        # not swallow KeyboardInterrupt / SystemExit.
        results_data['memorysize'] = 0

    result = pd.Series(results_data, name=data.name)

    # ``Series.append`` no longer exists in pandas >= 2.0; ``pd.concat`` is
    # what it delegated to and is available in every pandas version.
    if distinct_count <= 1:
        result = pd.concat([result, describe_constant_1d(data)])
    elif com.is_numeric_dtype(data):
        result = pd.concat([result, describe_numeric_1d(data, result)])
    elif com.is_datetime64_dtype(data):
        result = pd.concat([result, describe_date_1d(data, result)])
    elif distinct_count == leng:
        result = pd.concat([result, describe_unique_1d(data)])
    else:
        result = pd.concat([result, describe_categorical_1d(data)])
        result['type'] = 'CAT'
    return result
def _figure_to_data_uri(figure):
    """Render ``figure`` as a PNG data URI and close the current pyplot figure."""
    buf = BytesIO()
    figure.savefig(buf)
    buf.seek(0)
    uri = 'data:image/png;base64,' + \
        quote(base64.b64encode(buf.getvalue()))
    plt.close()
    return uri


def mdl_1d(x, y):
    """Build a univariate model on ``x`` and report its AUC with two plots.

    ``x`` is one-hot encoded (with an extra indicator for missing values)
    and a cross-validated logistic regression is fitted against ``y``.  A
    numeric ``x`` with more than ten distinct values is first passed through
    ``sb_cutz`` (presumably a binning helper -- confirm against its
    definition); the plots are always built from the original ``x``.

    Parameters
    ----------
    x : pandas.Series
        Explanatory variable.
    y : array-like
        Target passed to the models and to ``roc_auc_score``.

    Returns
    -------
    tuple
        ``(auc, nplot, bplot)``: the ROC AUC of the predictions made on the
        training data, followed by the figures from ``plot_num`` and
        ``plot_bubble`` rendered as ``data:image/png;base64,`` URIs.

    Raises
    ------
    ValueError
        Propagated from ``predict_proba`` when prediction fails.
    """
    lr = LogisticRegressionCV(scoring='roc_auc')
    lars = LassoLarsIC(criterion='aic')

    if x.nunique() > 10 and com.is_numeric_dtype(x):
        series = pd.get_dummies(sb_cutz(x), dummy_na=True)
    else:
        series = pd.get_dummies(x, dummy_na=True)

    lr.fit(series, y)
    # NOTE(review): the LassoLarsIC model is fitted but its result is never
    # read in this function -- confirm whether the fit can be dropped.
    lars.fit(series, y)

    # No debugger trap here: a failure must surface to the caller instead of
    # opening an interactive session and leaving ``preds`` unbound.
    preds = lr.predict_proba(series)[:, -1]

    aucz = roc_auc_score(y, preds)
    ns = num_bin_stats(x, y)

    nplot = _figure_to_data_uri(plot_num(ns))
    bplot = _figure_to_data_uri(plot_bubble(ns))
    return aucz, nplot, bplot
def describe_1d(data):
    """Compute summary statistics for a single pandas Series.

    Infinite values are replaced by NaN **in place** (the caller's Series
    is modified) to avoid issues with histograms later.  Generic statistics
    are then computed and extended by exactly one type-specific helper:

    * at most one distinct value -> ``describe_constant_1d``
    * numeric dtype              -> ``describe_numeric_1d``
    * datetime64 dtype           -> ``describe_date_1d``
    * every value distinct       -> ``describe_unique_1d``
    * anything else              -> ``describe_categorical_1d``

    Parameters
    ----------
    data : pandas.Series
        Column to describe.  Its name is also looked up in ``ft_names``.

    Returns
    -------
    pandas.Series
        The generic statistics (``count``, ``distinct_count``, ``p_missing``,
        ``n_missing``, ``p_infinite``, ``n_infinite``, ``is_unique``,
        ``mode``, ``p_unique``, ``ft_dfn``, ``memorysize``) followed by
        whatever the selected helper returned.
    """
    leng = len(data)  # number of observations in the Series
    # Number of non-NaN observations; taken before the replacement below,
    # so infinite values are still included here.
    count = data.count()

    # Replace infinite values with NaNs to avoid issues with histograms
    # later.  ``np.NINF`` / ``np.PINF`` no longer exist in NumPy >= 2.0.
    data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

    n_infinite = count - data.count()  # number of infinite observations
    distinct_count = data.nunique(dropna=False)  # number of unique elements

    if count > distinct_count > 1:
        mode = data.mode().iloc[0]
    else:
        # Positional access: ``data[0]`` is a *label* lookup and raises when
        # the index has no label 0 (string index, filtered frame, ...).
        mode = data.iloc[0]

    results_data = {'count': count,
                    'distinct_count': distinct_count,
                    'p_missing': 1 - count / leng,
                    'n_missing': leng - count,
                    'p_infinite': n_infinite / leng,
                    'n_infinite': n_infinite,
                    'is_unique': distinct_count == leng,
                    'mode': mode,
                    'p_unique': distinct_count / count,
                    # entry registered for this column name, '' when absent
                    'ft_dfn': ft_names.get(data.name, '')}
    try:
        # pandas 0.17 onwards
        results_data['memorysize'] = data.memory_usage()
    except Exception:
        # Best effort: report 0 when the size cannot be determined, but do
        # not swallow KeyboardInterrupt / SystemExit.
        results_data['memorysize'] = 0

    result = pd.Series(results_data, name=data.name)

    if distinct_count <= 1:
        extra = describe_constant_1d(data)
    elif com.is_numeric_dtype(data):
        extra = describe_numeric_1d(data, result)
    elif com.is_datetime64_dtype(data):
        extra = describe_date_1d(data, result)
    elif distinct_count == leng:
        extra = describe_unique_1d(data)
    else:
        extra = describe_categorical_1d(data)

    # ``Series.append`` no longer exists in pandas >= 2.0; ``pd.concat`` is
    # what it delegated to and is available in every pandas version.
    return pd.concat([result, extra])
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           string
    categorical      any
    ===============  =================

    Anything not matched by one of the checks is reported as ``any``.
    """
    # The checks run top to bottom; the first one that matches decides.
    if is_integer_dtype(x):
        return 'integer'
    if is_bool_dtype(x):
        return 'boolean'
    if is_numeric_dtype(x):
        return 'number'
    if is_datetime64_dtype(x) or is_datetime64tz_dtype(x):
        return 'datetime'
    if is_timedelta64_dtype(x):
        return 'duration'
    if is_categorical_dtype(x):
        return 'any'
    if is_string_dtype(x):
        return 'string'
    return 'any'
def _get_columns_info(self, stats):
    """Assign every column of ``self.df`` to one column-type bucket.

    Buckets are filled in order and each column lands in the first one it
    qualifies for: constant (one unique value), bool (two unique values),
    numeric dtype, datetime64 dtype, unique (as many unique values as
    counted values) and finally categorical.

    Parameters
    ----------
    stats : mapping
        Must provide ``'uniques'`` and ``'counts'``, both indexed by column
        name.

    Returns
    -------
    dict
        Maps each ``self.TYPE_*`` key to an index of column names.
    """
    uniques = stats['uniques']
    column_info = {}

    column_info[self.TYPE_CONSTANT] = uniques[uniques == 1].index
    column_info[self.TYPE_BOOL] = uniques[uniques == 2].index

    # Read the buckets back through the same TYPE_* constants they were
    # stored under, rather than through hard-coded string literals.
    rest_columns = self.get_columns(
        self.df, self.EXCLUDE,
        column_info[self.TYPE_CONSTANT].union(column_info[self.TYPE_BOOL]))

    column_info[self.TYPE_NUMERIC] = pd.Index(
        [c for c in rest_columns if common.is_numeric_dtype(self.df[c])])
    rest_columns = self.get_columns(
        self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_NUMERIC])

    column_info[self.TYPE_DATE] = pd.Index(
        [c for c in rest_columns if common.is_datetime64_dtype(self.df[c])])
    rest_columns = self.get_columns(
        self.df[rest_columns], self.EXCLUDE, column_info[self.TYPE_DATE])

    # Of what is left, a column is "unique" when every counted value is
    # distinct; the remainder is categorical.
    rest_uniques = uniques[rest_columns]
    unique_columns = rest_uniques == stats['counts'][rest_columns]
    column_info[self.TYPE_UNIQUE] = rest_uniques[unique_columns].index
    column_info[self.TYPE_CATEGORICAL] = rest_uniques[~unique_columns].index

    return column_info
def dotplot(fig, x, y, df=None, source=None, binaxis="x", **kwargs):
    """dotplot: make a dotplot.

    In this implementation, the explanatory variable is treated as a
    factor.

    Args:
      fig (:py:class:`~bokeh.plotting.Plot`): bokeh Plot object
      x (str): string for x component
      y (str): string for y component
      df (:py:class:`~pandas.DataFrame`): pandas DataFram
      source (:py:class:`~bokeh.models.ColumnDataSource`): bokeh ColumnDataSource object
      binaxis (str): axis to bin dots on
      kwargs: keyword arguments to pass to glyph drawing function

    Raises:
      TypeError: if column ``x`` of ``source`` has a numeric dtype.

    NOTE(review): the body only ever reads ``source``; ``df`` and
    ``binaxis`` are not used here.  The example below passes ``df``
    positionally, so presumably something outside this function converts
    it into ``source`` -- confirm against the call sites.

    Example:

      .. bokeh-plot::
          :source-position: above

          import pandas as pd
          from bokeh.plotting import figure, show
          from bokehutils.geom import dotplot
          from bokehutils.axes import grid

          df = pd.DataFrame([[1,2,"A"], [2,5,"B"], [3,9,"A"]], columns=["x", "y", "foo"])

          # NB: currently *must* set the range here, otherwise figure
          # will use linear axis by default. It is currently cumbersome
          # to change axes types in an existing figure.
          f = figure(title="Dotplot", plot_width=400, plot_height=400, x_range=list(df["foo"]))

          dotplot(f, "foo", "y", df)
          grid(f, grid_line_color=None)

          show(f)

    """
    logger.debug("Adding dotplot to figure %s", fig)
    # FIXME: once axes can be modified, one could also transform
    # numerical ranges into factors on the fly
    if com.is_numeric_dtype(source.to_df()[x]):
        raise TypeError("{}: dependant variable must not be numerical type".format(__name__))
    fig.circle(x=x, y=y, source=source, **kwargs)
def describe_1d(data):
    """Compute summary statistics for a single pandas Series.

    Generic statistics are computed here; the remaining entries come from
    exactly one type-specific helper, chosen as follows:

    * at most one distinct value -> ``describe_constant_1d``
    * numeric dtype              -> ``describe_numeric_1d``
    * datetime64 dtype           -> ``describe_date_1d``
    * every value distinct       -> ``describe_unique_1d``
    * anything else              -> ``describe_categorical_1d``

    Parameters
    ----------
    data : pandas.Series
        Column to describe.

    Returns
    -------
    pandas.Series
        The generic statistics (``count``, ``distinct_count``, ``p_missing``,
        ``n_missing``, ``is_unique``, ``mode``, ``p_unique``, ``memorysize``)
        followed by whatever the selected helper returned.
    """
    count = data.count()  # non-null observations
    leng = len(data)  # all observations
    distinct_count = data.nunique(dropna=False)  # NaN counts as a value

    if count > distinct_count > 1:
        mode = data.mode().iloc[0]
    else:
        # Positional access: ``data[0]`` is a *label* lookup and raises when
        # the index has no label 0 (string index, filtered frame, ...).
        mode = data.iloc[0]

    results_data = {
        'count': count,
        'distinct_count': distinct_count,
        'p_missing': 1 - count / leng,
        'n_missing': leng - count,
        'is_unique': distinct_count == leng,
        'mode': mode,
        'p_unique': distinct_count / count,
    }
    try:
        # pandas 0.17 onwards
        results_data['memorysize'] = data.memory_usage()
    except Exception:
        # Best effort: report 0 when the size cannot be determined, but do
        # not swallow KeyboardInterrupt / SystemExit.
        results_data['memorysize'] = 0

    result = pd.Series(results_data, name=data.name)

    if distinct_count <= 1:
        extra = describe_constant_1d(data)
    elif com.is_numeric_dtype(data):
        extra = describe_numeric_1d(data, result)
    elif com.is_datetime64_dtype(data):
        extra = describe_date_1d(data, result)
    elif distinct_count == leng:
        extra = describe_unique_1d(data)
    else:
        extra = describe_categorical_1d(data)

    # ``Series.append`` no longer exists in pandas >= 2.0; ``pd.concat`` is
    # what it delegated to and is available in every pandas version.
    return pd.concat([result, extra])
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, Index or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8);
          only attempted when the data holds no negative values
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, Index if Index,
        a scalar if a scalar was passed, otherwise ndarray

    Raises
    ------
    ValueError
        If ``downcast`` is not one of the accepted values.
    TypeError
        If ``arg`` has more than one dimension.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    is_series = False
    is_index = False
    is_scalar = False

    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        if com.is_number(arg):
            # already numeric: returned untouched, downcast does not apply
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if com.is_numeric_dtype(values):
            pass
        elif com.is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = com._ensure_object(values)
            coerce_numeric = errors not in ('ignore', 'raise')
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)
    except Exception:
        # 'ignore' / 'coerce': keep whatever ``values`` holds at this point
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and com.is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and (not len(values) or
                                         np.min(values) >= 0):
            # Zero is a valid unsigned value, and ``np.min`` must not be
            # called on empty data (it raises ValueError).
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize < values.dtype.itemsize:
                    values = com._possibly_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, Index or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8);
          only attempted when the data holds no negative values
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, Index if Index,
        a scalar if a scalar was passed, otherwise ndarray

    Raises
    ------
    ValueError
        If ``downcast`` is not one of the accepted values.
    TypeError
        If ``arg`` has more than one dimension.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    is_series = False
    is_index = False
    is_scalar = False

    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        if com.is_number(arg):
            # already numeric: returned untouched, downcast does not apply
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if com.is_numeric_dtype(values):
            pass
        elif com.is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = com._ensure_object(values)
            coerce_numeric = errors not in ('ignore', 'raise')
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)
    except Exception:
        # 'ignore' / 'coerce': keep whatever ``values`` holds at this point
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and com.is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and (not len(values) or
                                         np.min(values) >= 0):
            # Zero is a valid unsigned value, and ``np.min`` must not be
            # called on empty data (it raises ValueError).
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize < values.dtype.itemsize:
                    values = com._possibly_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
def to_numeric(arg, errors='raise'):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, Index or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, Index if Index,
        a scalar if a scalar was passed, otherwise ndarray

    Raises
    ------
    TypeError
        If ``arg`` has more than one dimension.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    is_series = False
    is_index = False
    is_scalar = False

    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        if com.is_number(arg):
            # already numeric: returned untouched
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    if com.is_numeric_dtype(values):
        pass
    elif com.is_datetime_or_timedelta_dtype(values):
        values = values.astype(np.int64)
    else:
        values = com._ensure_object(values)
        coerce_numeric = errors not in ('ignore', 'raise')

        try:
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)
        except Exception:
            # 'ignore' / 'coerce': fall through with the unconverted object
            # values.  ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt / SystemExit are never swallowed.
            if errors == 'raise':
                raise

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
def mdotplot(fig, x, y, df=None, source=None, color=False, legend=False, binaxis="x", **kwargs):
    """mdotplot: make a mdotplot.

    In this implementation, the explanatory variable is treated as a
    factor.

    Args:
      fig (:py:class:`~bokeh.plotting.Plot`): bokeh Plot object
      x (str): string for x component
      y (list): strings for the y components; one dotplot is added per entry
      df (:py:class:`~pandas.DataFrame`): pandas DataFram
      source (:py:class:`~bokeh.models.sources.ColumnDataSource`): bokeh sources.ColumnDataSource object
      color (bool): set color (not implemented yet)
      legend (bool): set legend (not implemented yet)
      binaxis (str): axis to bin dots on
      kwargs: keyword arguments to pass to glyph drawing function

    Raises:
      TypeError: if column ``x`` of ``source`` has a numeric dtype.

    NOTE(review): the body only ever reads ``source``; ``df`` and
    ``binaxis`` are not used here.  The example below passes ``df``
    positionally, so presumably something outside this function converts
    it into ``source`` -- confirm against the call sites.

    Example:

      .. bokeh-plot::
          :source-position: above

          import pandas as pd
          from bokeh.plotting import figure, show
          from bokehutils.mgeom import mdotplot

          df = pd.DataFrame([[1,2,"A"], [2,5,"B"], [3,9,"A"]], columns=["x", "y", "foo"])

          # NB: currently *must* set the range here
          f = figure(title="Dotplot", plot_width=400, plot_height=400, x_range=list(set(df["foo"])))
          mdotplot(f, "foo", ["y", "x"], df)

          show(f)

    Note that in the example we must set the range in the call to figure,
    otherwise figure will use linear axis by default. It is currently
    cumbersome to change axes types in an existing figure. See
    `categorical axes <http://bokeh.pydata.org/en/latest/docs/user_guide/plotting.html#categorical-axes>`_
    for more information.
    """
    logger.debug("Adding dotplot to figure %s", fig)
    if com.is_numeric_dtype(source.to_df()[x]):
        raise TypeError(
            "{}: dependant variable must not be numerical type".format(
                __name__))

    for column in y:
        dotplot(fig=fig, x=x, y=column, source=source, **kwargs)

    # TODO: honour ``color``, e.g. with a palette such as
    #       brewer["PiYG"][min(max(3, len(y)), 10)]
    # TODO: honour ``legend`` via the legend function