def factorize(values, sort=False, order=None, na_sentinel=-1): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values order : na_sentinel: int, default -1 Value to mark "not found" Returns ------- """ from pandas.tseries.period import PeriodIndex vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: try: sorter = uniques.argsort() except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) t.map_locations(com._ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ np.sort(np.array([ e for i, e in enumerate(uniques) if f(e) ],dtype=object)) for f in [ lambda x: not isinstance(x,string_types), lambda x: isinstance(x,string_types) ] ]) sorter = com._ensure_platform_int(t.lookup(com._ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.astype('M8[ns]') if isinstance(values, PeriodIndex): uniques = PeriodIndex(ordinal=uniques, freq=values.freq) return labels, uniques
def na_op(x, y): try: result = op(x, y) except TypeError: if isinstance(y, list): y = lib.list_to_object_array(y) if isinstance(y, (np.ndarray, pd.Series)): if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)): result = op(x, y) # when would this be hit? else: x = com._ensure_object(x) y = com._ensure_object(y) result = lib.vec_binop(x, y, op) else: try: # let null fall thru if not isnull(y): y = bool(y) result = lib.scalar_binop(x, y, op) except: raise TypeError("cannot compare a dtyped [{0}] array with " "a scalar of type [{1}]".format( x.dtype, type(y).__name__)) return result
def _get_data_algo(values, func_map): mask = None if com.is_float_dtype(values): f = func_map['float64'] values = com._ensure_float64(values) elif com.needs_i8_conversion(values): # if we have NaT, punt to object dtype mask = com.isnull(values) if mask.ravel().any(): f = func_map['generic'] values = com._ensure_object(values) values[mask] = np.nan else: f = func_map['int64'] values = values.view('i8') elif com.is_integer_dtype(values): f = func_map['int64'] values = com._ensure_int64(values) else: f = func_map['generic'] values = com._ensure_object(values) return f, values
def _factorize_keys(lk, rk, sort=True): if com._is_int_or_datetime_dtype(lk) and com._is_int_or_datetime_dtype(rk): klass = lib.Int64Factorizer lk = com._ensure_int64(lk) rk = com._ensure_int64(rk) else: klass = lib.Factorizer lk = com._ensure_object(lk) rk = com._ensure_object(rk) rizer = klass(max(len(lk), len(rk))) llab = rizer.factorize(lk) rlab = rizer.factorize(rk) count = rizer.get_count() if sort: uniques = rizer.uniques.to_array() llab, rlab = _sort_labels(uniques, llab, rlab) # NA group lmask = llab == -1; lany = lmask.any() rmask = rlab == -1; rany = rmask.any() if lany or rany: if lany: np.putmask(llab, lmask, count) if rany: np.putmask(rlab, rmask, count) count += 1 return llab, rlab, count
def _from_arraylike(cls, data, freq, tz): if not isinstance(data, (np.ndarray, PeriodIndex, DatetimeIndex, Int64Index)): if np.isscalar(data) or isinstance(data, Period): raise ValueError('PeriodIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) # other iterable of some kind if not isinstance(data, (list, tuple)): data = list(data) try: data = com._ensure_int64(data) if freq is None: raise ValueError('freq not specified') data = np.array([Period(x, freq=freq).ordinal for x in data], dtype=np.int64) except (TypeError, ValueError): data = com._ensure_object(data) if freq is None and len(data) > 0: freq = getattr(data[0], 'freq', None) if freq is None: raise ValueError('freq not specified and cannot be ' 'inferred from first element') data = _get_ordinals(data, freq) else: if isinstance(data, PeriodIndex): if freq is None or freq == data.freq: freq = data.freq data = data.values else: base1, _ = _gfc(data.freq) base2, _ = _gfc(freq) data = period.period_asfreq_arr(data.values, base1, base2, 1) else: if freq is None and len(data) > 0: freq = getattr(data[0], 'freq', None) if freq is None: raise ValueError('freq not specified and cannot be ' 'inferred from first element') if data.dtype != np.int64: if np.issubdtype(data.dtype, np.datetime64): data = dt64arr_to_periodarr(data, freq, tz) else: try: data = com._ensure_int64(data) except (TypeError, ValueError): data = com._ensure_object(data) data = _get_ordinals(data, freq) return data, freq
def _get_codes_for_values(values, levels): from pandas.core.algorithms import _get_data_algo, _hashtables if values.dtype != levels.dtype: values = com._ensure_object(values) levels = com._ensure_object(levels) (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) t = hash_klass(len(levels)) t.map_locations(levels) return com._ensure_platform_int(t.lookup(values))
def _get_codes_for_values(values, categories): """" utility routine to turn values into codes given the specified categories """ from pandas.core.algorithms import _get_data_algo, _hashtables if values.dtype != categories.dtype: values = com._ensure_object(values) categories = com._ensure_object(categories) (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) t = hash_klass(len(categories)) t.map_locations(com._values_from_object(categories)) return com._ensure_platform_int(t.lookup(values))
def duplicated(self, keep="first"): keys = com._values_from_object(com._ensure_object(self.values)) duplicated = lib.duplicated(keys, keep=keep) try: return self._constructor(duplicated, index=self.index).__finalize__(self) except AttributeError: return np.array(duplicated, dtype=bool)
def duplicated(self, take_last=False): keys = com._ensure_object(self.values) duplicated = lib.duplicated(keys, take_last=take_last) try: return self._constructor(duplicated, index=self.index).__finalize__(self) except AttributeError: return np.array(duplicated, dtype=bool)
def _convert_listlike(arg, box): if isinstance(arg, (list,tuple)): arg = np.array(arg, dtype='O') if com.is_datetime64_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError as e: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) return arg arg = com._ensure_object(arg) try: if format is not None: result = tslib.array_strptime(arg, format) else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def _convert_listlike(arg, box, unit): if isinstance(arg, (list,tuple)) or ((hasattr(arg,'__iter__') and not hasattr(arg,'dtype'))): arg = np.array(list(arg), dtype='O') if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): # these are shortcutable value = arg.astype('timedelta64[{0}]'.format(unit)).astype('timedelta64[ns]') else: try: value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, coerce=coerce) except: # try to process strings fast; may need to fallback try: value = np.array([ _get_string_converter(r, unit=unit)() for r in arg ],dtype='m8[ns]') except: value = np.array([ _coerce_scalar_to_timedelta_type(r, unit=unit, coerce=coerce) for r in arg ]) value = value.astype('timedelta64[ns]', copy=False) if box: from pandas import TimedeltaIndex value = TimedeltaIndex(value,unit='ns') return value
def mode(values): """Returns the mode or mode(s) of the passed Series or ndarray (sorted)""" # must sort because hash order isn't necessarily defined. from pandas.core.series import Series if isinstance(values, Series): constructor = values._constructor values = values.values else: values = np.asanyarray(values) constructor = Series dtype = values.dtype if com.is_integer_dtype(values.dtype): values = com._ensure_int64(values) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): dtype = values.dtype values = values.view(np.int64) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) else: mask = com.isnull(values) values = com._ensure_object(values) res = htable.mode_object(values, mask) try: res = sorted(res) except TypeError as e: warn("Unable to sort modes: %s" % e) result = constructor(res, dtype=dtype) return result
def _sqlalchemy_type(self, col): from sqlalchemy.types import (BigInteger, Float, Text, Boolean, DateTime, Date, Time, Interval) if com.is_datetime64_dtype(col): try: tz = col.tzinfo return DateTime(timezone=True) except: return DateTime if com.is_timedelta64_dtype(col): warnings.warn("the 'timedelta' type is not supported, and will be " "written as integer values (ns frequency) to the " "database.", UserWarning) return BigInteger elif com.is_float_dtype(col): return Float elif com.is_integer_dtype(col): # TODO: Refine integer size. return BigInteger elif com.is_bool_dtype(col): return Boolean inferred = lib.infer_dtype(com._ensure_object(col)) if inferred == 'date': return Date if inferred == 'time': return Time return Text
def _sql_type_name(self, col): pytype = col.dtype.type pytype_name = "text" if issubclass(pytype, np.floating): pytype_name = "float" elif com.is_timedelta64_dtype(pytype): warnings.warn("the 'timedelta' type is not supported, and will be " "written as integer values (ns frequency) to the " "database.", UserWarning) pytype_name = "int" elif issubclass(pytype, np.integer): pytype_name = "int" elif issubclass(pytype, np.datetime64) or pytype is datetime: # Caution: np.datetime64 is also a subclass of np.number. pytype_name = "datetime" elif issubclass(pytype, np.bool_): pytype_name = "bool" elif issubclass(pytype, np.object): pytype = lib.infer_dtype(com._ensure_object(col)) if pytype == "date": pytype_name = "date" elif pytype == "time": pytype_name = "time" return _SQL_TYPES[pytype_name][self.pd_sql.flavor]
def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True): """ Convert argument to datetime Parameters ---------- arg : string, datetime, array of strings (with possible NAs) errors : {'ignore', 'raise'}, default 'ignore' Errors are ignored by default (values left untouched) utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well) Returns ------- ret : datetime if parsing succeeded """ from pandas.core.series import Series from pandas.tseries.index import DatetimeIndex if arg is None: return arg elif isinstance(arg, datetime): return arg elif isinstance(arg, Series): values = lib.array_to_datetime(com._ensure_object(arg.values), raise_=errors == 'raise', utc=utc, dayfirst=dayfirst) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (np.ndarray, list)): if isinstance(arg, list): arg = np.array(arg, dtype='O') result = lib.array_to_datetime(com._ensure_object(arg), raise_=errors == 'raise', utc=utc, dayfirst=dayfirst) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result try: if not arg: return arg return parse(arg, dayfirst=dayfirst) except Exception: if errors == 'raise': raise return arg
def duplicated(self, take_last=False): keys = com._ensure_object(self.values) duplicated = lib.duplicated(keys, take_last=take_last) try: return self._constructor(duplicated, index=self.index).__finalize__(self) except AttributeError: from pandas.core.index import Index return Index(duplicated)
def _convert_listlike(arg, format): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = com._ensure_object(arg) if infer_time_format and format is None: format = _guess_time_format_for_array(arg) times = [] if format is not None: for element in arg: try: times.append(datetime.strptime(element, format).time()) except (ValueError, TypeError): if errors == 'raise': raise ValueError("Cannot convert %s to a time with " "given format %s" % (element, format)) elif errors == 'ignore': return arg else: times.append(None) else: formats = _time_formats[:] format_found = False for element in arg: time_object = None for time_format in formats: try: time_object = datetime.strptime(element, time_format).time() if not format_found: # Put the found format in front fmt = formats.pop(formats.index(time_format)) formats.insert(0, fmt) format_found = True break except (ValueError, TypeError): continue if time_object is not None: times.append(time_object) elif errors == 'raise': raise ValueError("Cannot convert arg {arg} to " "a time".format(arg=arg)) elif errors == 'ignore': return arg else: times.append(None) return times
def to_datetime(arg, errors='ignore', dayfirst=False): """ Convert argument to datetime Parameters ---------- arg : string, datetime, array of strings (with possible NAs) errors : {'ignore', 'raise'}, default 'ignore' Errors are ignored by default (values left untouched) Returns ------- ret : datetime if parsing succeeded """ from pandas.core.series import Series from pandas.tseries.index import DatetimeIndex if arg is None: return arg elif isinstance(arg, datetime): return arg elif isinstance(arg, Series): values = lib.string_to_datetime(com._ensure_object(arg.values), raise_=errors == 'raise', dayfirst=dayfirst) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (np.ndarray, list)): if isinstance(arg, list): arg = np.array(arg, dtype='O') result = lib.string_to_datetime(com._ensure_object(arg), raise_=errors == 'raise', dayfirst=dayfirst) if com.is_datetime64_dtype(result): result = DatetimeIndex(result) return result try: if not arg: return arg return _dtparser.parse(arg, dayfirst=dayfirst) except Exception: if errors == 'raise': raise return arg
def _get_data_algo(values, func_map): if com.is_float_dtype(values): f = func_map['float64'] values = com._ensure_float64(values) elif com.is_integer_dtype(values): f = func_map['int64'] values = com._ensure_int64(values) else: f = func_map['generic'] values = com._ensure_object(values) return f, values
def _get_hash_table_and_cast(values): if com.is_float_dtype(values): klass = lib.Float64HashTable values = com._ensure_float64(values) elif com.is_integer_dtype(values): klass = lib.Int64HashTable values = com._ensure_int64(values) else: klass = lib.PyObjectHashTable values = com._ensure_object(values) return klass, values
def to_numeric(arg, errors='raise'): """ Convert argument to a numeric type. Parameters ---------- arg : list, tuple, 1-d array, or Series errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input Returns ------- ret : numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray Examples -------- Take separate series and convert to numeric, coercing when told to >>> import pandas as pd >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') >>> pd.to_numeric(s, errors='coerce') """ index = name = None if isinstance(arg, pd.Series): index, name = arg.index, arg.name elif isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a list, tuple, 1-d array, or Series') conv = arg arg = com._ensure_object(arg) coerce_numeric = False if errors in ('ignore', 'raise') else True try: conv = lib.maybe_convert_numeric(arg, set(), coerce_numeric=coerce_numeric) except: if errors == 'raise': raise if index is not None: return pd.Series(conv, index=index, name=name) else: return conv
def _value_counts_arraylike(values, dropna=True): is_datetimetz = com.is_datetimetz(values) is_period = (isinstance(values, gt.ABCPeriodIndex) or com.is_period_arraylike(values)) orig = values from pandas.core.series import Series values = Series(values).values dtype = values.dtype if com.is_datetime_or_timedelta_dtype(dtype) or is_period: from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex if is_period: values = PeriodIndex(values) freq = values.freq values = values.view(np.int64) keys, counts = htable.value_count_scalar64(values, dropna) if dropna: msk = keys != iNaT keys, counts = keys[msk], counts[msk] # convert the keys back to the dtype we came in keys = keys.astype(dtype) # dtype handling if is_datetimetz: if isinstance(orig, gt.ABCDatetimeIndex): tz = orig.tz else: tz = orig.dt.tz keys = DatetimeIndex._simple_new(keys, tz=tz) if is_period: keys = PeriodIndex._simple_new(keys, freq=freq) elif com.is_integer_dtype(dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_scalar64(values, dropna) elif com.is_float_dtype(dtype): values = com._ensure_float64(values) keys, counts = htable.value_count_scalar64(values, dropna) else: values = com._ensure_object(values) mask = com.isnull(values) keys, counts = htable.value_count_object(values, mask) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) counts = np.insert(counts, 0, mask.sum()) return keys, counts
def _get_data_algo(values, func_map): if com.is_float_dtype(values): f = func_map["float64"] values = com._ensure_float64(values) elif com.is_datetime64_dtype(values): f = func_map["int64"] values = values.view("i8") elif com.is_integer_dtype(values): f = func_map["int64"] values = com._ensure_int64(values) else: f = func_map["generic"] values = com._ensure_object(values) return f, values
def _factorize_keys(lk, rk, sort=True): if com.is_integer_dtype(lk) and com.is_integer_dtype(rk): klass = lib.Int64Factorizer lk = com._ensure_int64(lk) rk = com._ensure_int64(rk) else: klass = lib.Factorizer lk = com._ensure_object(lk) rk = com._ensure_object(rk) rizer = klass(max(len(lk), len(rk))) llab, _ = rizer.factorize(lk) rlab, _ = rizer.factorize(rk) count = rizer.get_count() if sort: llab, rlab = _sort_labels(rizer.uniques, llab, rlab) # TODO: na handling return llab, rlab, count
def value_counts(values, sort=True, ascending=False, normalize=False): """ Compute a histogram of the counts of non-null values Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order normalize: boolean, default False If True then compute a relative histogram Returns ------- value_counts : Series """ from pandas.core.series import Series values = np.asarray(values) if com.is_integer_dtype(values.dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)): dtype = values.dtype values = values.view(np.int64) keys, counts = htable.value_count_int64(values) # convert the keys back to the dtype we came in keys = Series(keys,dtype=dtype) else: mask = com.isnull(values) values = com._ensure_object(values) keys, counts = htable.value_count_object(values, mask) result = Series(counts, index=keys) if sort: result.sort() if not ascending: result = result[::-1] if normalize: result = result / float(values.size) return result
def _convert_f(arg): arg = com._ensure_object(arg) try: result = lib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError, e: try: values, tz = lib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def _get_data_algo(values, func_map): mask = None if com.is_float_dtype(values): f = func_map["float64"] values = com._ensure_float64(values) elif com.is_datetime64_dtype(values): # if we have NaT, punt to object dtype mask = com.isnull(values) if mask.ravel().any(): f = func_map["generic"] values = com._ensure_object(values) values[mask] = np.nan else: f = func_map["int64"] values = values.view("i8") elif com.is_integer_dtype(values): f = func_map["int64"] values = com._ensure_int64(values) else: f = func_map["generic"] values = com._ensure_object(values) return f, values
def _convert_listlike(arg, box): if isinstance(arg, (list,tuple)): arg = np.array(arg, dtype='O') if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError: pass return arg arg = com._ensure_object(arg) try: if format is not None: result = None # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg) except: raise ValueError("cannot convert the input to '%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, coerce=coerce) except (tslib.OutOfBoundsDatetime): if errors == 'raise': raise result = arg else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def _get_data_algo(values, func_map): if com.is_float_dtype(values): f = func_map['float64'] values = com._ensure_float64(values) elif com.needs_i8_conversion(values): f = func_map['int64'] values = values.view('i8') elif com.is_integer_dtype(values): f = func_map['int64'] values = com._ensure_int64(values) else: f = func_map['generic'] values = com._ensure_object(values) return f, values
def unique1d(values): """ Hash table-based unique """ if np.issubdtype(values.dtype, np.floating): table = _hash.Float64HashTable(len(values)) uniques = np.array(table.unique(com._ensure_float64(values)), dtype=np.float64) elif np.issubdtype(values.dtype, np.datetime64): table = _hash.Int64HashTable(len(values)) uniques = table.unique(com._ensure_int64(values)) uniques = uniques.view("M8[ns]") elif np.issubdtype(values.dtype, np.integer): table = _hash.Int64HashTable(len(values)) uniques = table.unique(com._ensure_int64(values)) else: table = _hash.PyObjectHashTable(len(values)) uniques = table.unique(com._ensure_object(values)) return uniques
def unique1d(values): """ Hash table-based unique """ if np.issubdtype(values.dtype, np.floating): table = _hash.Float64HashTable(len(values)) uniques = np.array(table.unique(_ensure_float64(values)), dtype=np.float64) elif np.issubdtype(values.dtype, np.datetime64): table = _hash.Int64HashTable(len(values)) uniques = table.unique(_ensure_int64(values)) uniques = uniques.view('M8[ns]') elif np.issubdtype(values.dtype, np.timedelta64): table = _hash.Int64HashTable(len(values)) uniques = table.unique(_ensure_int64(values)) uniques = uniques.view('m8[ns]') elif np.issubdtype(values.dtype, np.integer): table = _hash.Int64HashTable(len(values)) uniques = table.unique(_ensure_int64(values)) else: table = _hash.PyObjectHashTable(len(values)) uniques = table.unique(_ensure_object(values)) return uniques
def _convert_listlike(arg, box, unit): if isinstance(arg, (list, tuple)) or ((hasattr(arg, '__iter__') and not hasattr(arg, 'dtype'))): arg = np.array(list(arg), dtype='O') if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): # these are shortcutable value = arg.astype( 'timedelta64[{0}]'.format(unit)).astype('timedelta64[ns]') else: try: value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, coerce=coerce) except: # try to process strings fast; may need to fallback try: value = np.array( [_get_string_converter(r, unit=unit)() for r in arg], dtype='m8[ns]') except: value = np.array([ _coerce_scalar_to_timedelta_type(r, unit=unit, coerce=coerce) for r in arg ]) if box: from pandas import TimedeltaIndex value = TimedeltaIndex(value, unit='ns') return value
def value_counts(values, sort=True, ascending=False): """ Compute a histogram of the counts of non-null values Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order Returns ------- value_counts : Series """ from pandas.core.series import Series values = np.asarray(values) if com.is_integer_dtype(values.dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) else: mask = com.isnull(values) values = com._ensure_object(values) keys, counts = htable.value_count_object(values, mask) result = Series(counts, index=keys) if sort: result.sort() if not ascending: result = result[::-1] return result
def _convert_listlike(arg, box): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') if com.is_datetime64_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError as e: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) return arg arg = com._ensure_object(arg) try: if format is not None: result = tslib.array_strptime(arg, format) else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True): """ Compute a histogram of the counts of non-null values. Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order normalize: boolean, default False If True then compute a relative histogram bins : integer, optional Rather than count values, group them into half-open bins, convenience for pd.cut, only works with numeric data dropna : boolean, default True Don't include counts of NaN Returns ------- value_counts : Series """ from pandas.core.series import Series from pandas.tools.tile import cut from pandas import Index, PeriodIndex, DatetimeIndex name = getattr(values, 'name', None) values = Series(values).values if bins is not None: try: cat, bins = cut(values, bins, retbins=True) except TypeError: raise TypeError("bins argument only works with numeric data.") values = cat.codes if com.is_categorical_dtype(values.dtype): result = values.value_counts(dropna) else: dtype = values.dtype is_period = com.is_period_arraylike(values) is_datetimetz = com.is_datetimetz(values) if com.is_datetime_or_timedelta_dtype( dtype) or is_period or is_datetimetz: if is_period: values = PeriodIndex(values) elif is_datetimetz: tz = getattr(values, 'tz', None) values = DatetimeIndex(values).tz_localize(None) values = values.view(np.int64) keys, counts = htable.value_count_scalar64(values, dropna) if dropna: from pandas.tslib import iNaT msk = keys != iNaT keys, counts = keys[msk], counts[msk] # localize to the original tz if necessary if is_datetimetz: keys = DatetimeIndex(keys).tz_localize(tz) # convert the keys back to the dtype we came in else: keys = keys.astype(dtype) elif com.is_integer_dtype(dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_scalar64(values, dropna) elif com.is_float_dtype(dtype): values = com._ensure_float64(values) keys, counts = htable.value_count_scalar64(values, dropna) else: values = com._ensure_object(values) mask = com.isnull(values) keys, counts = htable.value_count_object(values, mask) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) counts = np.insert(counts, 0, mask.sum()) if not isinstance(keys, Index): keys = Index(keys) result = Series(counts, index=keys, name=name) if bins is not None: # TODO: This next line should be more efficient result = result.reindex(np.arange(len(cat.categories)), fill_value=0) result.index = bins[:-1] if sort: result = result.sort_values(ascending=ascending) if normalize: result = result / float(values.size) return result
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values order : deprecated na_sentinel : int, default -1 Value to mark "not found" size_hint : hint to the hashtable sizer Returns ------- labels : the indexer to the original array uniques : ndarray (1-d) or Index the unique values. Index is returned when passed values is Index or Series note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ if order is not None: msg = "order is deprecated. See https://github.com/pydata/pandas/issues/6926" warn(msg, FutureWarning, stacklevel=2) from pandas.core.index import Index from pandas.core.series import Series vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) is_timedelta = com.is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: try: sorter = uniques.argsort() except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) t.map_locations(com._ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ np.sort( np.array([e for i, e in enumerate(uniques) if f(e)], dtype=object)) for f in [ lambda x: not isinstance(x, string_types), lambda x: isinstance(x, string_types) ] ]) sorter = com._ensure_platform_int( t.lookup(com._ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.astype('M8[ns]') elif is_timedelta: uniques = uniques.astype('m8[ns]') if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): uniques = Index(uniques) return labels, uniques
def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, format=None, coerce=False, unit='ns'): """ Convert argument to datetime Parameters ---------- arg : string, datetime, array of strings (with possible NAs) errors : {'ignore', 'raise'}, default 'ignore' Errors are ignored by default (values left untouched) dayfirst : boolean, default False If True parses dates with the day first, eg 20/01/2005 Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug). utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well) box : boolean, default True If True returns a DatetimeIndex, if False returns ndarray of values format : string, default None strftime to parse time, eg "%d/%m/%Y" coerce : force errors to NaT (False by default) unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. a unix timestamp), which is an integer/float number Returns ------- ret : datetime if parsing succeeded """ from pandas import Timestamp from pandas.core.series import Series from pandas.tseries.index import DatetimeIndex def _convert_listlike(arg, box): if isinstance(arg, (list,tuple)): arg = np.array(arg, dtype='O') if com.is_datetime64_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError, e: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) return arg arg = com._ensure_object(arg) try: if format is not None: result = tslib.array_strptime(arg, format) else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError, e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def _convert_listlike(arg, box, format): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError: pass return arg arg = com._ensure_object(arg) if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = ('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or '%Y-%m-%d %H:%M:%S.%f'.startswith(format)) if format_is_iso8601: format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, coerce=coerce) except: raise ValueError( "cannot convert the input to '%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, coerce=coerce) except (tslib.OutOfBoundsDatetime): if errors == 'raise': raise result = arg except ValueError: # Only raise this error if the user provided the # datetime format, and not when it was inferred if not infer_datetime_format: raise if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True): """ Compute a histogram of the counts of non-null values. Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order normalize: boolean, default False If True then compute a relative histogram bins : integer, optional Rather than count values, group them into half-open bins, convenience for pd.cut, only works with numeric data dropna : boolean, default True Don't include counts of NaN Returns ------- value_counts : Series """ from pandas.core.series import Series from pandas.tools.tile import cut from pandas.tseries.period import PeriodIndex is_period = com.is_period_arraylike(values) values = Series(values).values is_category = com.is_categorical_dtype(values.dtype) if bins is not None: try: cat, bins = cut(values, bins, retbins=True) except TypeError: raise TypeError("bins argument only works with numeric data.") values = cat.codes elif is_category: bins = values.categories cat = values values = cat.codes dtype = values.dtype if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)) or is_period: if is_period: values = PeriodIndex(values) values = values.view(np.int64) keys, counts = htable.value_count_int64(values) if dropna: from pandas.tslib import iNaT msk = keys != iNaT keys, counts = keys[msk], counts[msk] # convert the keys back to the dtype we came in keys = keys.astype(dtype) elif com.is_integer_dtype(dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) else: values = com._ensure_object(values) mask = com.isnull(values) keys, counts = htable.value_count_object(values, mask) if not dropna: keys = np.insert(keys, 0, np.NaN) counts = np.insert(counts, 0, mask.sum()) result = Series(counts, index=com._values_from_object(keys)) if bins is not None: # TODO: This next line should be more efficient result = result.reindex(np.arange(len(cat.categories)), fill_value=0) if not is_category: result.index = bins[:-1] else: result.index = cat.categories if sort: result.sort() if not ascending: result = result[::-1] if normalize: result = result / float(values.size) return result
def _convert_listlike(arg, box, format, name=None): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None, name=name) except ValueError: pass return arg elif com.is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz='utc' if utc else None) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index return Index(result) return DatetimeIndex(result, tz='utc' if utc else None, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = com._ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, freq=freq, require_iso8601=require_iso8601) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e
def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. Parameters ---------- arg : list, tuple, 1-d array, or Series errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input downcast : {'integer', 'signed', 'unsigned', 'float'} , default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) As this behaviour is separate from the core conversion to numeric values, any errors raised during the downcasting will be surfaced regardless of the value of the 'errors' input. In addition, downcasting will only occur if the size of the resulting data's dtype is strictly larger than the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. .. versionadded:: 0.19.0 Returns ------- ret : numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray Examples -------- Take separate series and convert to numeric, coercing when told to >>> import pandas as pd >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 >>> pd.to_numeric(s, downcast='float') 0 1.0 1 2.0 2 -3.0 dtype: float32 >>> pd.to_numeric(s, downcast='signed') 0 1 1 2 2 -3 dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') 0 apple 1 1.0 2 2 3 -3 dtype: object >>> pd.to_numeric(s, errors='coerce') 0 NaN 1 1.0 2 2.0 3 -3.0 dtype: float64 """ if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): raise ValueError('invalid downcasting method provided') is_series = False is_index = False is_scalar = False if isinstance(arg, pd.Series): is_series = True values = arg.values elif isinstance(arg, pd.Index): is_index = True values = arg.asi8 if values is None: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif np.isscalar(arg): if com.is_number(arg): return arg is_scalar = True values = np.array([arg], dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a list, tuple, 1-d array, or Series') else: values = arg try: if com.is_numeric_dtype(values): pass elif com.is_datetime_or_timedelta_dtype(values): values = values.astype(np.int64) else: values = com._ensure_object(values) coerce_numeric = False if errors in ('ignore', 'raise') else True values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) except Exception: if errors == 'raise': raise # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and com.is_numeric_dtype(values): typecodes = None if downcast in ('integer', 'signed'): typecodes = np.typecodes['Integer'] elif downcast == 'unsigned' and np.min(values) > 0: typecodes = np.typecodes['UnsignedInteger'] elif downcast == 'float': typecodes = np.typecodes['Float'] # pandas support goes only to np.float32, # as float dtypes smaller than that are # extremely rare and not well supported float_32_char = np.dtype(np.float32).char float_32_ind = typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: # from smallest to largest for dtype in typecodes: if np.dtype(dtype).itemsize < values.dtype.itemsize: values = com._possibly_downcast_to_dtype(values, dtype) # successful conversion if values.dtype == dtype: break if is_series: return pd.Series(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy_with_infer return Index(values, name=arg.name) elif is_scalar: return values[0] else: return values
def _convert_listlike(arg, box, format, name=None): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None, name=name) except ValueError: pass return arg elif format is None and com.is_integer_dtype(arg) and unit == 'ns': result = arg.astype('datetime64[ns]') if box: return DatetimeIndex(result, tz='utc' if utc else None, name=name) return result arg = com._ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = (('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or '%Y-%m-%d %H:%M:%S.%f'.startswith(format)) and format != '%Y') if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError( "cannot convert the input to '%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, errors=errors) except (tslib.OutOfBoundsDatetime): if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, freq=freq, unit=unit, require_iso8601=require_iso8601) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None): """ Compute a histogram of the counts of non-null values Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order normalize: boolean, default False If True then compute a relative histogram bins : integer, optional Rather than count values, group them into half-open bins, convenience for pd.cut, only works with numeric data Returns ------- value_counts : Series """ from pandas.core.series import Series from pandas.tools.tile import cut values = Series(values).values if bins is not None: try: cat, bins = cut(values, bins, retbins=True) except TypeError: raise TypeError("bins argument only works with numeric data.") values = cat.labels if com.is_integer_dtype(values.dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): dtype = values.dtype values = values.view(np.int64) keys, counts = htable.value_count_int64(values) # convert the keys back to the dtype we came in keys = Series(keys, dtype=dtype) else: mask = com.isnull(values) values = com._ensure_object(values) keys, counts = htable.value_count_object(values, mask) result = Series(counts, index=com._values_from_object(keys)) if bins is not None: # TODO: This next line should be more efficient result = result.reindex(np.arange(len(cat.levels)), fill_value=0) result.index = bins[:-1] if sort: result.sort() if not ascending: result = result[::-1] if normalize: result = result / float(values.size) return result
def factorize(values, sort=False, order=None, na_sentinel=-1): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values order : na_sentinel: int, default -1 Value to mark "not found" Returns ------- labels : the indexer to the original array uniques : the unique values note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ from pandas.tseries.period import PeriodIndex vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: try: sorter = uniques.argsort() except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) t.map_locations(com._ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ np.sort( np.array([e for i, e in enumerate(uniques) if f(e)], dtype=object)) for f in [ lambda x: not isinstance(x, string_types), lambda x: isinstance(x, string_types) ] ]) sorter = com._ensure_platform_int( t.lookup(com._ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.astype('M8[ns]') if isinstance(values, PeriodIndex): uniques = PeriodIndex(ordinal=uniques, freq=values.freq) return labels, uniques
def to_numeric(arg, errors='raise'): """ Convert argument to a numeric type. Parameters ---------- arg : list, tuple, 1-d array, or Series errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input Returns ------- ret : numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray Examples -------- Take separate series and convert to numeric, coercing when told to >>> import pandas as pd >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') >>> pd.to_numeric(s, errors='coerce') """ is_series = False is_index = False is_scalar = False if isinstance(arg, pd.Series): is_series = True values = arg.values elif isinstance(arg, pd.Index): is_index = True values = arg.asi8 if values is None: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif np.isscalar(arg): if com.is_number(arg): return arg is_scalar = True values = np.array([arg], dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a list, tuple, 1-d array, or Series') else: values = arg if com.is_numeric_dtype(values): pass elif com.is_datetime_or_timedelta_dtype(values): values = values.astype(np.int64) else: values = com._ensure_object(values) coerce_numeric = False if errors in ('ignore', 'raise') else True try: values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) except: if errors == 'raise': raise if is_series: return pd.Series(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy_with_infer return Index(values, name=arg.name) elif is_scalar: return values[0] else: return values