Example 1
def test_conversions(data_missing):

    # astype to object series
    df = pd.DataFrame({'A': data_missing})
    result = df['A'].astype('object')
    expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A')
    tm.assert_series_equal(result, expected)

    # convert to object ndarray
    # we assert that we are exactly equal
    # including type conversions of scalars
    result = df['A'].astype('object').values
    expected = np.array([np.nan, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for r, e in zip(result, expected):
        if pd.isnull(r):
            assert pd.isnull(e)
        elif is_integer(r):
            # PY2 can be int or long
            assert r == e
            assert is_integer(e)
        else:
            assert r == e
            assert type(r) == type(e)
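
A note on the fixture: `data_missing` here is the standard pandas extension-test fixture, a length-2 array whose first element is missing and whose second is the value 1. A hedged sketch of what it might look like for a nullable integer array:

    # Hypothetical fixture value, assuming a nullable integer extension array
    import pandas as pd
    data_missing = pd.array([None, 1], dtype="Int64")  # -> <IntegerArray> [<NA>, 1]
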
Example 2
        def scalar_add(a, b):

            # TODO: this should really be a type-specific NA
            if pd.isna(a) or pd.isna(b):
                return np.nan
            if is_integer(a):
                a = int(a)
            elif is_integer(b):
                b = int(b)
            return a + b
Example 3
def _maybe_convert_usecols(usecols):
    """
    Convert `usecols` into a compatible format for parsing in `parsers.py`.
    Parameters

    COPIED from:
    https://github.com/pandas-dev/pandas/blob/d47fc0cba3cf94ebd289ad3776bf7ff3fe60dfb8/pandas/io/excel/_util.py#L119

    ----------
    usecols : object
        The use-columns object to potentially convert.
    Returns
    -------
    converted : object
        The compatible format of `usecols`.
    """
    if usecols is None:
        return usecols

    if is_integer(usecols):
        import warnings

        warnings.warn(
            ("Passing in an integer for `usecols` has been "
             "deprecated. Please pass in a list of int from "
             "0 to `usecols` inclusive instead."),
            FutureWarning,
            stacklevel=2,
        )
        return list(range(usecols + 1))

    if isinstance(usecols, str):
        return _range2cols(usecols)

    return usecols
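
A short usage sketch (hypothetical calls; it assumes `_range2cols` expands Excel-style column letters into zero-based indices, as in pandas):

    _maybe_convert_usecols(None)       # -> None, passed through
    _maybe_convert_usecols("A:C")      # -> [0, 1, 2] via _range2cols
    _maybe_convert_usecols([0, 2, 3])  # -> [0, 2, 3], returned unchanged
    _maybe_convert_usecols(3)          # -> [0, 1, 2, 3], with a FutureWarning
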
Example 4
def series2col(s, name):
    kw = {
        'name': name,
        'kind': fpb.Column.SLICE,
    }

    if is_integer(s.dtype):
        kw['dtype'] = fpb.INTEGER
        kw['ints'] = s
    elif is_float(s.dtype):
        kw['dtype'] = fpb.FLOAT
        kw['floats'] = s
    elif s.dtype == np.object:  # Pandas dtype for str is object
        kw['strings'] = s
        kw['dtype'] = fpb.STRING
    elif is_bool(s.dtype):
        kw['bools'] = s
        kw['dtype'] = fpb.BOOLEAN
    elif is_datetime(s.dtype):
        if s.dt.tz:
            try:
                s = s.dt.tz_localize(pytz.UTC)
            except TypeError:
                s = s.dt.tz_convert('UTC')
        kw['times'] = s.astype(np.int64)
        kw['dtype'] = fpb.TIME
    elif is_categorical_dtype(s.dtype):
        # We assume categorical data is strings
        kw['strings'] = s.astype(str)
        kw['dtype'] = fpb.STRING
    else:
        raise WriteError('{} - unsupported type - {}'.format(s.name, s.dtype))

    return fpb.Column(**kw)
Example 5
    def insert(self, loc, column, value, allow_duplicates=False):
        if not is_integer(loc):
            raise TypeError("'loc' must be an integer")

        elif loc < 0:
            raise ValueError("unbounded slice")

        elif loc > len(self.columns):
            raise IndexError(f"index {loc} is out of bounds for axis 0 with "
                             f"size {len(self.columns)}")

        elif not allow_duplicates and column in self.columns:
            raise ValueError(f"cannot insert {column}, already exists")

        value = self._ensure_valid_frame(value)

        if not is_scalar(value):
            if not value._is_series and len(value.columns) != 1:
                raise ValueError(
                    "Wrong number of items passed 2, placement implies 1")

            _, value = self._align_frame(value, join="left", axis=0)
            value = value._frame

        if self.empty and self._raw_index is None:
            if is_scalar(value):
                frame = DataFrame(columns=[column])._frame
                self._update_frame(frame)
            else:
                self._update_frame(value)
            self._replace_columns([column])
        else:
            self._update_frame(self._frame.insert(loc, value))
            self._replace_columns(self.columns.insert(loc, column))
Example 6
def _rename_chroms(grp, rename_dict, h5opts):
    chroms = get(grp["chroms"]).set_index("name")
    n_chroms = len(chroms)
    new_names = np.array(chroms.rename(rename_dict).index.values,
                         dtype=CHROM_DTYPE)  # auto-adjusts char length

    del grp["chroms/name"]
    grp["chroms"].create_dataset("name",
                                 shape=(n_chroms, ),
                                 dtype=new_names.dtype,
                                 data=new_names,
                                 **h5opts)

    bins = get(grp["bins"])
    n_bins = len(bins)
    idmap = dict(zip(new_names, range(n_chroms)))
    if is_categorical(bins["chrom"]) or is_integer(bins["chrom"]):
        chrom_ids = bins["chrom"].cat.codes
        chrom_dtype = h5py.special_dtype(enum=(CHROMID_DTYPE, idmap))
        del grp["bins/chrom"]
        try:
            grp["bins"].create_dataset("chrom",
                                       shape=(n_bins, ),
                                       dtype=chrom_dtype,
                                       data=chrom_ids,
                                       **h5opts)
        except ValueError:
            # If HDF5 enum header would be too large,
            # try storing chrom IDs as raw int instead
            chrom_dtype = CHROMID_DTYPE
            grp["bins"].create_dataset("chrom",
                                       shape=(n_bins, ),
                                       dtype=chrom_dtype,
                                       data=chrom_ids,
                                       **h5opts)
Example 7
def _random_state(state=None):
    """
    Helper function for processing random_state arguments.

    Parameters
    ----------
    state : int, np.random.RandomState, None.
        If receives an int, passes to np.random.RandomState() as seed.
        If receives an np.random.RandomState object, just returns object.
        If receives `None`, returns np.random.
        If receives anything else, raises an informative ValueError.
        Default None.

    Returns
    -------
    np.random.RandomState
    """

    if types.is_integer(state):
        return np.random.RandomState(state)
    elif isinstance(state, np.random.RandomState):
        return state
    elif state is None:
        return np.random
    else:
        raise ValueError("random_state must be an integer, a numpy "
                         "RandomState, or None")
Example 8
 def classify(self, tree, sample):
     if is_integer(tree):
         return tree
     else:
         feat = list(tree.keys())[0]
         if sample[feat] > tree[feat]['splitVal']:
             return self.classify(tree[feat]['>'], sample)
         else:
             return self.classify(tree[feat]['<='], sample)
Example 9
    def __getitem__(self, item):
        """Select subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        if isinstance(item, slice):
            start = item.start or 0
            stop = item.stop if item.stop is not None else len(self.data)
            stop = min(stop, len(self.data))
            if stop - start == 0:
                return type(self)(xnd.xnd([], type=self.data.type))

        elif isinstance(item, Iterable):
            if not is_array_like(item):
                item = np.array(item)
            if is_integer_dtype(item):
                return self.take(item)
            elif is_bool_dtype(item):
                indices = np.array(item)
                indices = np.argwhere(indices).flatten()
                return self.take(indices)
            else:
                raise IndexError(
                    "Only integers, slices and integer or boolean "
                    "arrays are valid indices.")

        elif is_integer(item):
            if item < 0:
                item += len(self)
            if item >= len(self):
                return None
            else:
                return self.data[item]

        value = self.data[item]
        return type(self)(value)
Example 10
 def default_display_func(x):
     if self.na_rep is not None and pd.isna(x):
         return self.na_rep
     elif is_float(x):
         n_precision = len(str(int(x))) + self.precision
         display_format = f"{x:.{n_precision}n}"
         return display_format
     elif is_integer(x):
         display_format = f"{x:n}"
         return display_format
     else:
         return x
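
`default_display_func` reads `self.precision` and `self.na_rep` from the surrounding Styler-like object; a minimal standalone sketch of the same formatting logic, assuming `precision=2` and `na_rep="-"`:

    import pandas as pd
    from pandas.api.types import is_float, is_integer

    def display_func(x, precision=2, na_rep="-"):
        if na_rep is not None and pd.isna(x):
            return na_rep
        elif is_float(x):
            # total significant digits: digits before the point + precision
            n_precision = len(str(int(x))) + precision
            return f"{x:.{n_precision}n}"
        elif is_integer(x):
            return f"{x:n}"
        return x

    display_func(3.14159)       # -> '3.14'
    display_func(42)            # -> '42'
    display_func(float("nan"))  # -> '-'
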
Example 11
 def classify(self, tree, sample):
     if is_integer(tree):
         return tree
     else:
         axis = list(tree.keys())[0]
         ret = None
         try:
             ret = self.classify(tree[axis][sample[axis]], sample)
         except KeyError:
             print('feat: ', axis, ' feat value: ', sample[axis])
             if DEBUG: print('tree: ', tree[axis])
             ret = -1
         return ret
Example 12
def _get_indexer(columns, to_lookup, opt_name):
    indexer = []
    for val in to_lookup:
        if is_integer(val):
            indexer.append(val)
        elif isinstance(val, str):
            idxr = columns.get_indexer_for([val])
            if idxr[0] == -1:
                raise KeyError(val)
            indexer.append(idxr[0])
        else:
            raise ValueError(
                f"Unsupported value type {type(val)} for '{opt_name}'")
    return indexer
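
A hedged usage sketch with a plain pandas Index:

    import pandas as pd

    cols = pd.Index(["a", "b", "c"])
    _get_indexer(cols, ["b", 0], "parse_dates")  # -> [1, 0]
    _get_indexer(cols, ["z"], "parse_dates")     # -> raises KeyError('z')
    _get_indexer(cols, [1.5], "parse_dates")     # -> raises ValueError
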
Example 13
    def _validate_locators(self, tup):
        if util.is_tuple(tup) and len(tup) >= 1:
            if len(tup) > 2:
                raise ValueError("Too many indexers")
            row_loc = tup[0]
            col_loc = tup[1] if len(tup) == 2 else slice(None)
        else:
            row_loc = tup
            col_loc = slice(None)

        if isinstance(row_loc, slice) and row_loc.step is not None:
            raise err._unsupported_error(
                "row slicer cannot have a step for now")

        row_scalar = is_scalar(row_loc) or util.is_tuple(row_loc)
        col_scalar = is_scalar(col_loc) or util.is_tuple(col_loc)

        if self.is_at:
            if not util.is_tuple(tup) or len(tup) != 2:
                raise ValueError("Need two indexers")

            if self.is_loc:
                if not row_scalar or not col_scalar:
                    raise ValueError(
                        "At based indexing can only have scalar indexers")
            else:
                if not is_integer(row_loc) or not is_integer(col_loc):
                    raise ValueError(
                        "iAt based indexing can only have integer indexers")

        return (
            row_loc,
            [col_loc] if col_scalar else col_loc,
            row_scalar,
            col_scalar,
            _compute_ndim(row_loc, col_loc),
        )
Example 14
def get_actual_types(df):
    column_types = {}

    for col_name in df.columns:
        col = df[col_name]
        if is_integer(col.dtype):
            column_types[col.name] = fpb.INTEGER
        elif is_float(col.dtype):
            column_types[col.name] = fpb.FLOAT
        elif is_string(col.dtype):
            has_data = False
            for x in col:
                if pd.isnull(x):
                    continue
                if isinstance(x, str):
                    column_types[col.name] = fpb.STRING
                    has_data = True
                    break
                if isinstance(x, bool):
                    column_types[col.name] = fpb.BOOLEAN
                    has_data = True
                    break
                if isinstance(x, pd.Timestamp):
                    column_types[col.name] = fpb.TIME
                    has_data = True
                    break
                if isinstance(x, datetime):
                    column_types[col.name] = fpb.TIME
                    has_data = True
                    break
                raise WriteError(
                    '{} - contains an unsupported value type - {}'.format(
                        col_name, type(x)))
            # If all items in the column are None, the column type does not
            # matter; mark the column as NULL
            if not has_data:
                column_types[col.name] = fpb.NULL
        elif is_bool(col.dtype):
            column_types[col.name] = fpb.BOOLEAN
        elif is_datetime(col.dtype):
            column_types[col.name] = fpb.TIME
        elif is_categorical_dtype(col.dtype):
            # We assume categorical data is strings
            column_types[col.name] = fpb.STRING
        else:
            raise WriteError('{} - unsupported type - {}'.format(
                col_name, col.dtype))

    return column_types
Example 15
def get_meta(columns,
             dtype=None,
             index_columns=None,
             index_names=None,
             default_dtype=np.object):
    """
    Extracted and modified from pandas/io/parsers.py :
        _get_empty_meta (BSD licensed).

    """
    columns = list(columns)

    # Convert `dtype` to a defaultdict of some kind.
    # This will enable us to write `dtype[col_name]`
    # without worrying about KeyError issues later on.
    if not isinstance(dtype, dict):
        # if dtype == None, default will be default_dtype.
        dtype = defaultdict(lambda: dtype or default_dtype)
    else:
        # Save a copy of the dictionary.
        _dtype = dtype.copy()
        dtype = defaultdict(lambda: default_dtype)

        # Convert column indexes to column names.
        for k, v in six.iteritems(_dtype):
            col = columns[k] if is_integer(k) else k
            dtype[col] = v

    if index_columns is None or index_columns is False:
        index = pd.Index([])
    else:
        data = [pd.Series([], dtype=dtype[name]) for name in index_names]
        if len(data) == 1:
            index = pd.Index(data[0], name=index_names[0])
        else:
            index = pd.MultiIndex.from_arrays(data, names=index_names)
        index_columns.sort()
        for i, n in enumerate(index_columns):
            columns.pop(n - i)

    col_dict = {
        col_name: pd.Series([], dtype=dtype[col_name])
        for col_name in columns
    }

    return pd.DataFrame(col_dict, columns=columns, index=index)
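
A small usage sketch (hypothetical call): the result is an empty frame whose columns carry the requested dtypes, with anything unspecified falling back to `default_dtype`:

    meta = get_meta(["a", "b"], dtype={"a": "int64"})
    meta.dtypes  # -> a     int64
                 #    b    object
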
Example 16
    def __getitem__(self, item):
        # type (Any) -> Any
        """Select a subset of self.
        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
        Returns
        -------
        item : scalar or ExtensionArray
        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        # Workaround for Arrow bug that segfaults on empty slice.
        # This is fixed in Arrow master, will be released in 0.10
        if isinstance(item, slice):
            start = item.start or 0
            stop = item.stop if item.stop is not None else len(self.data)
            stop = min(stop, len(self.data))
            if stop - start == 0:
                return type(self)(pa.array([], type=self.data.type))

        elif isinstance(item, Iterable):
            # alternative: np.where(np.array(item))[0]
            indices = np.array(item)
            indices = np.argwhere(indices).flatten()
            return self.take(indices)
        elif is_integer(item):
            if item < 0:
                item += len(self)
            if item >= len(self):
                return None
        value = self.data[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            return value.as_py()
Example 17
    def __getitem__(self, item):
        # type (Any) -> Any
        """Select a subset of self.
        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
        Returns
        -------
        item : scalar or PintArray
        """
        if is_integer(item):
            return self._data[item] * self.units

        return self.__class__(self._data[item], self.dtype)
Example 18
 def classify(self, tree, sample):
     if is_integer(tree):
         return tree
     else:
         feat = list(tree.keys())[0]
         if feat in self.continueFeatVals:
             if sample[feat] > tree[feat]['splitVal']:
                 return self.classify(tree[feat]['>'], sample)
             else:
                 return self.classify(tree[feat]['<='], sample)
         else:
             ret = None
             try:
                 ret = self.classify(tree[feat][sample[feat]], sample)
             except KeyError:
                 print('feat: ', feat, ' feat value: ', sample[feat])
                 if DEBUG: print('tree: ', tree[feat])
                 ret = -1
             return ret
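
The `classify` methods above assume a nested-dict decision tree whose leaves are integer class labels; a hedged sketch of the structure they expect (feature names and values are invented for illustration):

    # Inner nodes are single-key dicts keyed by a feature name.
    # A continuous feature holds 'splitVal' plus '>' / '<=' subtrees;
    # a discrete feature maps each observed value to a subtree or leaf.
    tree = {
        'age': {
            'splitVal': 30,
            '>': {'color': {'red': 1, 'blue': 0}},
            '<=': 0,
        }
    }
    sample = {'age': 42, 'color': 'red'}
    # self.classify(tree, sample), assuming 'age' is in self.continueFeatVals:
    # age > 30 -> follow '>', then color == 'red' -> 1
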
Example 19
    def __getitem__(self, item):
        # type (Any) -> Any
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or FletcherArray
        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``FletcherArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``FletcherArray``, filtered
        to the values where ``item`` is True.
        """
        if PANDAS_GE_0_26_0:
            item = check_array_indexer(self, item)

        if is_integer(item):
            return self.data[int(item)].as_py()
        if (not isinstance(item, slice) and len(item) > 0
                and np.asarray(item[:1]).dtype.kind == "b"):
            item = np.argwhere(item).flatten()
        elif isinstance(item, slice):
            if item.step == 1 or item.step is None:
                return FletcherArray(self.data[item])
            else:
                item = np.arange(len(self), dtype=self._indices_dtype)[item]
        return self.take(item)
Example 20
    def _validate_locator(self, row_loc):
        if util.is_tuple(row_loc):
            if len(row_loc) > 1:
                raise ValueError("Too many indexers")
            row_loc = row_loc[0]

        if isinstance(row_loc, slice) and row_loc.step is not None:
            raise err._unsupported_error(
                "row slicer cannot have a step for now")

        row_scalar = is_scalar(row_loc) or util.is_tuple(row_loc)

        if self.is_at:
            if self.is_loc:
                if not row_scalar:
                    raise ValueError(
                        "At based indexing can only have scalar indexers")
            else:
                if not is_integer(row_loc):
                    raise ValueError(
                        "iAt based indexing can only have integer indexers")

        return (row_loc, row_scalar, _compute_ndim(row_loc))
Example 21
 def _get_level_number(self, level):
     names = self.names
     count = names.count(level)
     if count > 1:
         raise ValueError(
             f"The name {level} occurs multiple times, use a level number")
     try:
         level = self.names.index(level)
     except ValueError as e:
         if not is_integer(level):
             raise KeyError(f"Level {level} not found") from e
         elif level < 0:
             level += self.nlevels
             if level < 0:
                 orig_level = level - self.nlevels
                 raise IndexError(
                     f"Too many levels: Index has only {self.nlevels} "
                     f"levels, {orig_level} is not a valid level number"
                 ) from e
         elif level >= self.nlevels:
             raise IndexError(
                 f"Too many levels: Index has only {self.nlevels} levels, "
                 f"not {level}") from e
     return level
Example 22
    def _parse_excel(self,
                     sheetname=0,
                     header=0,
                     skiprows=None,
                     names=None,
                     skip_footer=0,
                     index_col=None,
                     has_index_names=None,
                     parse_cols=None,
                     parse_dates=False,
                     date_parser=None,
                     na_values=None,
                     thousands=None,
                     convert_float=True,
                     true_values=None,
                     false_values=None,
                     verbose=False,
                     dtype=None,
                     squeeze=False,
                     **kwds):

        skipfooter = kwds.pop('skipfooter', None)
        if skipfooter is not None:
            skip_footer = skipfooter

        _validate_header_arg(header)
        if has_index_names is not None:
            warn(
                "\nThe has_index_names argument is deprecated; index names "
                "will be automatically inferred based on index_col.\n"
                "This argmument is still necessary if reading Excel output "
                "from 0.16.2 or prior with index names.",
                FutureWarning,
                stacklevel=3)

        if 'chunksize' in kwds:
            raise NotImplementedError("chunksize keyword of read_excel "
                                      "is not implemented")

        if parse_dates is True and index_col is None:
            warn("The 'parse_dates=True' keyword of read_excel was provided"
                 " without an 'index_col' keyword value.")

        def _parse_cell(cell_contents, cell_typ):
            """converts the contents of the cell into a pandas
               appropriate object"""

            if cell_typ == XL_CELL_DATE:

                if xlrd_0_9_3:
                    # Use the newer xlrd datetime handling.
                    try:
                        cell_contents = \
                            xldate.xldate_as_datetime(cell_contents,
                                                      epoch1904)
                    except OverflowError:
                        return cell_contents
                    # Excel doesn't distinguish between dates and time,
                    # so we treat dates on the epoch as times only.
                    # Also, Excel supports 1900 and 1904 epochs.
                    year = (cell_contents.timetuple())[0:3]
                    if ((not epoch1904 and year == (1899, 12, 31))
                            or (epoch1904 and year == (1904, 1, 1))):
                        cell_contents = time(cell_contents.hour,
                                             cell_contents.minute,
                                             cell_contents.second,
                                             cell_contents.microsecond)
                else:
                    # Use the xlrd <= 0.9.2 date handling.
                    try:
                        dt = xldate.xldate_as_tuple(cell_contents, epoch1904)

                    except xldate.XLDateTooLarge:
                        return cell_contents

                    if dt[0] < MINYEAR:
                        cell_contents = time(*dt[3:])
                    else:
                        cell_contents = datetime(*dt)

            elif cell_typ == XL_CELL_ERROR:
                cell_contents = np.nan
            elif cell_typ == XL_CELL_BOOLEAN:
                cell_contents = bool(cell_contents)
            elif convert_float and cell_typ == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                val = int(cell_contents)
                if val == cell_contents:
                    cell_contents = val
            return cell_contents

        ret_dict = False
        if isinstance(sheetname, list):
            sheets = sheetname
            ret_dict = True
        elif sheetname is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheetname]

        # handle same-type duplicates.
        sheets = list(OrderedDict.fromkeys(sheets).keys())
        output = OrderedDict()

        import xlrd
        from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        epoch1904 = self.book.datemode

        # xlrd >= 0.9.3 can return datetime objects directly.
        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            xlrd_0_9_3 = True
        else:
            xlrd_0_9_3 = False

        # Keep sheetname to maintain backwards compatibility.
        for asheetname in sheets:
            if verbose:
                print("Reading sheet %s" % asheetname)
            if isinstance(asheetname, compat.string_types):
                sheet = self.book.sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.book.sheet_by_index(asheetname)

            data = []
            should_parse = {}

            if sheet.nrows > 5000:
                raise Exception(
                    "The raw file contains more than 5000 rows. Please check if it is correct or split the files (max: 5000 rows) for upload"
                )
            elif kwds.get('MaxTest'):
                continue

            for i in range(sheet.nrows):

                row = []
                for j, (value, typ) in enumerate(
                        zip(sheet.row_values(i), sheet.row_types(i))):
                    if parse_cols is not None and j not in should_parse:
                        should_parse[j] = self._should_parse(j, parse_cols)

                    if parse_cols is None or should_parse[j]:
                        row.append(_parse_cell(value, typ))
                data.append(row)
#            output[asheetname] = data
            if sheet.nrows == 0:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None:
                if is_list_like(header):
                    header_names = []
                    control_row = [True for x in data[0]]
                    for row in header:
                        if is_integer(skiprows):
                            row += skiprows

                        data[row], control_row = _fill_mi_header(
                            data[row], control_row)
                        header_name, data[row] = _pop_header_name(
                            data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # forward fill values for MultiIndex index
                if not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                for col in index_col:
                    last = data[offset][col]
                    for row in range(offset + 1, len(data)):
                        if data[row][col] == '' or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

            if is_list_like(header) and len(header) > 1:
                has_index_names = True

            if kwds.get('parsed'):
                try:
                    parser = TextParser(data,
                                        header=header,
                                        index_col=index_col,
                                        has_index_names=has_index_names,
                                        na_values=na_values,
                                        thousands=thousands,
                                        parse_dates=parse_dates,
                                        date_parser=date_parser,
                                        true_values=true_values,
                                        false_values=false_values,
                                        skiprows=skiprows,
                                        skipfooter=skip_footer,
                                        squeeze=squeeze,
                                        dtype=dtype,
                                        **kwds)
                    output[asheetname] = parser.read()
                    if names is not None:
                        output[asheetname].columns = names
                    if not squeeze or isinstance(output[asheetname],
                                                 DataFrame):
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)
                except EmptyDataError:
                    # No Data, return an empty DataFrame
                    output[asheetname] = DataFrame()
            else:
                output[asheetname] = data

        if ret_dict or kwds.get('MaxTest'):
            return output
        else:
            return output[asheetname]
Example 23
def read_csv(
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    na_values=None,
    skip_blank_lines=True,
    parse_dates=False,
    compression="infer",
    quotechar='"',
    quoting=0,
    doublequote=True,
    verify_header=False,
    **kwargs,
    # TODO: Put back these options once we figure out how to support them
    #       with the Arrows CSV reader.
    # skipinitialspace=False,  # GPU only
    # keep_default_na=True,  # GPU only
    # na_filter=True,  # GPU only
    # dayfirst=False, # GPU only
    # thousands=None,  # GPU only
    # decimal=".",  # GPU only
    # lineterminator=None, # GPU only
    # comment=None,  # GPU only
    # delim_whitespace=False,  # GPU only
):

    # Checks on filepath_or_buffer
    paths = util.to_list_if_scalar(filepath_or_buffer)

    if any(not isinstance(path, str) for path in paths):
        raise err._unsupported_error(
            "'filepath_or_buffer' must be a string or a list of strings")
    if len(paths) == 0:
        raise ValueError("'filepath_or_buffer' must be a non-empty list")

    for path in paths:
        if not os.path.exists(path):
            raise ValueError(f"{path} does not exist")

    if not isinstance(compression, str):
        raise err._unsupported_error("compression", compression)
    compressions = [
        _parse_compression(infer_compression(path, compression))
        for path in paths
    ]

    # Checks on sep and delimiter
    if sep is None and delimiter is None:
        raise ValueError("at least one of 'sep' or 'delimiter' must be given")
    sep = delimiter if delimiter is not None else sep
    if len(sep) > 1:
        raise ValueError("'sep' must be a 1-character string")

    # Checks on header
    if header == "infer":
        header = 0 if names is None else None

    if header not in (
            0,
            None,
    ):
        raise err._unsupported_error("header", header)

    # Checks on skiprows, skipfooter, and nrows
    skiprows = 0 if skiprows is None else skiprows
    if not is_integer(skiprows):
        raise ValueError("'skiprows' must be an integer")
    if not is_integer(skipfooter):
        raise ValueError("'skipfooter' must be an integer")
    if not (nrows is None or is_integer(nrows)):
        raise ValueError("'nrows' must be None or an integer")

    # If either column names or dtype is missing, infer them by parsing
    # the first few lines using Pandas
    # FIXME: We should use cuDF for this
    if names is None or dtype is None:
        engine = ("python" if skipfooter > 0 else "c", )
        column_names, dtypes = _extract_header_using_pandas(
            paths[0],
            sep,
            header,
            names,
            dtype,
            true_values,
            false_values,
            skiprows,
            na_values,
            skip_blank_lines,
            parse_dates,
            compression,
            quotechar,
            quoting,
            doublequote,
            engine,
            peek_rows=3,
        )
        if verify_header:
            for path in paths[1:]:
                result = _extract_header_using_pandas(
                    path,
                    sep,
                    header,
                    names,
                    dtype,
                    true_values,
                    false_values,
                    skiprows,
                    na_values,
                    skip_blank_lines,
                    parse_dates,
                    compression,
                    quotechar,
                    quoting,
                    doublequote,
                    engine,
                    peek_rows=3,
                )
                if not column_names.equals(result[0]):
                    raise ValueError(
                        f"{paths[0]} and {path} have different headers")

    else:
        column_names = pandas.Index(names)

        if is_dict_like(dtype):
            dtypes = []
            for name in names:
                if name not in dtype:
                    raise ValueError(f"'dtype' has no entry for '{name}'")
                dtypes.append(_ensure_dtype(dtype[name]))
        elif is_list_like(dtype):
            raise err._unsupported_error(
                "'dtype' must be a string, a dtype, or a dictionary")
        else:
            dtype = _ensure_dtype(dtype)
            dtypes = [dtype] * len(names)

    if column_names.has_duplicates:
        raise ValueError("Header must not have any duplicates")

    # Checks on unsupported options
    if prefix is not None:
        raise err._unsupported_error("prefix", prefix)
    if mangle_dupe_cols not in (True, ):
        raise err._unsupported_error("mangle_dupe_cols", mangle_dupe_cols)

    # If there was a header in the file, we should skip that line as well
    if header == 0:
        skiprows += 1

    # Checks on parse_dates
    _ERR_MSG_PARSE_DATES = (
        "'parse_dates' must be a list of integers or strings for now")

    if is_dict_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)

    parse_dates = parse_dates if parse_dates is not False else []
    if not is_list_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)

    date_cols = _get_indexer(column_names, parse_dates, "parse_dates")

    # Override dtypes for the datetime columns
    for idx in date_cols:
        dtypes[idx] = ty.ts_ns

    # If a column is given a datetime dtype but not added to the parse_dates,
    # we should record it
    for idx, dtype in enumerate(dtypes):
        if idx not in parse_dates:
            parse_dates.append(idx)

    # Checks on quoting
    if quoting != 0:
        raise err._unsupported_error("quoting", quoting)
    if len(quotechar) > 1:
        raise ValueError("'quotechar' must be a 1-character string")

    # Checks on index_col
    index_col = None if index_col is False else index_col
    if index_col is not None:
        if is_integer(index_col) or isinstance(index_col, str):
            index_col = [index_col]
        if not is_list_like(index_col):
            raise err._unsupported_error("index_col", index_col)
        index_col = _get_indexer(column_names, index_col, "index_col")

    # Checks on true_values, false_values, and na_values
    _check_string_list(true_values, "true_values")
    _check_string_list(false_values, "false_values")
    _check_string_list(na_values, "na_values")

    # Checks on nrows
    if skipfooter != 0 and nrows is not None:
        raise ValueError("'skipfooter' not supported with 'nrows'")

    df = DataFrame(
        frame=io.read_csv(
            paths,
            sep=sep,
            usecols=usecols,
            dtypes=dtypes,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            skipfooter=skipfooter,
            nrows=nrows,
            na_values=na_values,
            skip_blank_lines=skip_blank_lines,
            date_cols=date_cols,
            compressions=compressions,
            quotechar=quotechar,
            quoting=quoting,
            doublequote=doublequote,
        ),
        columns=column_names,
    )

    if index_col is not None:
        df = df.set_index(column_names[index_col])
        # Make sure we reset the names for unnamed indices
        names = df._raw_index.names
        names = [
            None if name.startswith("Unnamed") else name for name in names
        ]
        df._raw_index.names = names

    return df
Example 24
 def __getitem__(self, item):
     # type (Any) -> Any
     """Select a subset of self.
     Parameters
     ----------
     item : int, slice, or ndarray
         * int: The position in 'self' to get.
         * slice: A slice object, where 'start', 'stop', and 'step' are
           integers or None
         * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
     Returns
     -------
     item : scalar or ExtensionArray
     Notes
     -----
     For scalar ``item``, return a scalar value suitable for the array's
     type. This should be an instance of ``self.dtype.type``.
     For slice ``key``, return an instance of ``ExtensionArray``, even
     if the slice is length 0 or 1.
     For a boolean mask, return an instance of ``ExtensionArray``, filtered
     to the values where ``item`` is True.
     """
     # Workaround for Arrow bug that segfaults on empty slice.
     # This is fixed in Arrow master, will be released in 0.10
     if isinstance(item, slice):
         start = item.start or 0
         stop = item.stop if item.stop is not None else len(self.data)
         stop = min(stop, len(self.data))
         step = item.step if item.step is not None else 1
         # Arrow can't handle slices with steps other than 1
         # https://issues.apache.org/jira/browse/ARROW-2714
         if step != 1:
             arr = np.asarray(self)[item]
             # ARROW-2806: Inconsistent handling of np.nan requires adding a mask
             if pa.types.is_integer(self.dtype.arrow_dtype) or pa.types.is_floating(
                 self.dtype.arrow_dtype
             ):
                 mask = pd.isna(arr)
             else:
                 mask = None
             return type(self)(pa.array(arr, type=self.dtype.arrow_dtype, mask=mask))
         if stop - start == 0:
             return type(self)(pa.array([], type=self.data.type))
     elif isinstance(item, Iterable):
         if not is_array_like(item):
             item = np.array(item)
         if is_integer_dtype(item):
             return self.take(item)
         elif is_bool_dtype(item):
             indices = np.array(item)
             indices = np.argwhere(indices).flatten()
             return self.take(indices)
         else:
             raise IndexError(
                 "Only integers, slices and integer or boolean arrays are valid indices."
             )
     elif is_integer(item):
         if item < 0:
             item += len(self)
         if item >= len(self):
             return None
     value = self.data[item]
     if isinstance(value, pa.ChunkedArray):
         return type(self)(value)
     else:
         return value.as_py()
Example 25
def _df_filter(ranger,
               lasso,
               header=0,
               names=None,
               index_col=None,
               parse_cols=None,
               usecols=None,
               squeeze=False,
               dtype=None,
               engine=None,
               true_values=None,
               false_values=None,
               skiprows=None,
               nrows=None,
               na_values=None,
               keep_default_na=True,
               verbose=False,
               parse_dates=False,
               thousands=None,
               comment=None,
               skipfooter=0,
               convert_float=True,
               mangle_dupe_cols=True,
               **kwds):
    """
    Converts the captured values table into a pandas DataFrame.

    Most args are copied from :func:`pandas.io.read_excel()` except:

        sheet_name, skip_footer, converters, date_parser

    Note that ``skip_footer`` has been deprecated in favor of ``skipfooter``.
    """
    data = lasso.values

    # Copied & adapted from `pandas.io.excel.py` v0.24.2+ (Jun 2019)
    #    https://github.com/pandas-dev/pandas/blob/d47fc0c/pandas/io/excel/_base.py#L368

    _validate_header_arg(header)

    invalid_args = (set("skip_footer chunksize date_parser converted".split())
                    & kwds.keys())
    if bool(invalid_args):
        raise NotImplementedError("Cannot implement args: %s" % invalid_args)

    if not data:
        return pd.DataFrame()

    usecols = _maybe_convert_usecols(usecols)

    if is_list_like(header) and len(header) == 1:
        header = header[0]

    # forward fill and pull out names for MultiIndex column
    header_names = None
    if header is not None and is_list_like(header):
        header_names = []
        control_row = [True for _ in data[0]]
        for row in header:
            if is_integer(skiprows):
                row += skiprows
            try:
                data[row], control_row = _fill_mi_header(
                    data[row], control_row)
            except TypeError:
                ## Arg `control_row` introduced in pandas-v0.19.0 to fix
                #  https://github.com/pandas-dev/pandas/issues/12453
                #  https://github.com/pandas-dev/pandas/commit/67b72e3cbbaeb89a5b9c780b2fe1c8d5eaa9c505
                data[row] = _fill_mi_header(data[row])

            if index_col is not None:
                header_name, data[row] = _pop_header_name(data[row], index_col)
                header_names.append(header_name)

    if is_list_like(index_col):
        # forward fill values for MultiIndex index
        if not is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        # Check if we have an empty dataset
        # before trying to collect data.
        if offset < len(data):
            for col in index_col:
                last = data[offset][col]

                for row in range(offset + 1, len(data)):
                    if data[row][col] == "" or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

    has_index_names = is_list_like(header) and len(header) > 1

    # Pandas expects '' instead of `None`!
    data = [["" if c is None else c for c in r] for r in data]

    # GH 12292 : error when read one empty column from excel file
    try:
        parser = pdparsers.TextParser(data,
                                      names=names,
                                      header=header,
                                      index_col=index_col,
                                      has_index_names=has_index_names,
                                      squeeze=squeeze,
                                      dtype=dtype,
                                      true_values=true_values,
                                      false_values=false_values,
                                      skiprows=skiprows,
                                      nrows=nrows,
                                      na_values=na_values,
                                      parse_dates=parse_dates,
                                      thousands=thousands,
                                      comment=comment,
                                      skipfooter=skipfooter,
                                      usecols=usecols,
                                      mangle_dupe_cols=mangle_dupe_cols,
                                      **kwds)

        output = parser.read()

        if not squeeze or isinstance(output, pd.DataFrame):
            if header_names:
                output.columns = output.columns.set_names(header_names)
    except EmptyDataError:
        # No Data, return an empty DataFrame
        output = pd.DataFrame()

    lasso = lasso._replace(values=output)

    return lasso
Example 26
def create_table(ctx, infile, table_name, col_spacing, varchar_factor, sql,
                 encoding, separator):
    """
    Display SQL table create command from a CSV file.
    """

    ordered_columns = OrderedDict()

    if infile.endswith(".xls") or infile.endswith(".xlsx"):
        print("Loading Excel file...")
        df = pd.ExcelFile(infile).parse()
    else:
        print("Loading CSV file...")
        df = pd.read_csv(infile, encoding=encoding, sep=separator)

    count = 0
    for column in df.columns:
        #print(df[column].dtype)
        count += 1
        sys.stdout.write(f"{str(count):3} ")

        # The entire column is empty.  No rows have values.
        if df[column].isna().all():
            ordered_columns[column] = {'type': None, 'length': None}
            print("{:{col_spacing}}: {}".format("No values",
                                                column,
                                                col_spacing=col_spacing))
            continue

        # Handling of numeric fields
        if is_numeric_dtype(df[column]):
            # Find the max value
            maxVal = None
            validVals = [i for i in df[column].dropna()]
            if validVals:
                maxVal = max(validVals)

            if is_float_dtype(df[column]):
                # Pandas stores numerical columns with null values as floats.  We
                # need to do some extra work to determine if the column is an int
                allIntegers = all(i.is_integer() for i in df[column].dropna())

                if allIntegers:
                    # this is an Integer column
                    ordered_columns[column] = {
                        'type': get_int_type(maxVal),
                        'length': maxVal
                    }
                    print(
                        f"int, {str(maxVal):{col_spacing-5}}: {column} : ({df[column].dtype})"
                    )
                    #df[df[column].fillna(0) != 0.0][column].astype(int)
                else:
                    # this is a Float column
                    ordered_columns[column] = {
                        'type': get_float_type(maxVal),
                        'length': maxVal
                    }
                    print(
                        f"{df[column].dtype}, {str(maxVal):{col_spacing-5}}: {column}"
                    )
            else:
                # These types were detected as integers during loading of the file.
                if is_int64_dtype(df[column]) or is_integer(df[column]):
                    ordered_columns[column] = {
                        'type': get_int_type(maxVal),
                        'length': maxVal
                    }
                    print(f"int, {str(maxVal):{col_spacing-5}}: {column}")
                else:
                    unknown = "???"
                    print(f"{unknown:{col_spacing}}: {column}")
        # Handling of Strings
        else:
            # Look for values that look like dates in 2018/01/01 or 01/01/2018 form
            patterns = [
                re.compile(r'^\d{1,2}[-/]\d{1,2}[-/]20\d\d$'),
                # re.compile(r'^\d{1,2}[-/]\d{1,2}[-/]\d{1,4}$'),
                re.compile(r'^20\d\d[-/]\d{1,2}[-/]\d{1,2}$')
                # re.compile(r'^\d{1,4}[-/]\d{1,2}[-/]\d{1,2}$')
            ]

            foundDate = False
            for pattern in patterns:
                if any(i == True for i in df[column].str.contains(pattern)):
                    foundDate = True

            foundBool = False
            try:
                maxVal = str(int(df[column].dropna().str.len().max()))
            except:
                # Could be boolean?
                # if "otc" in column:
                #     import pdb; pdb.set_trace()
                # if all(i.lower == "false" or i.lower() == "true" for i in df[column].dropna()):
                if any(type(i) == bool for i in df[column].dropna()):
                    maxVal = 0
                else:
                    maxVal = 0

            if foundDate:
                ordered_columns[column] = {'type': "DATE", 'length': maxVal}
                print(f"Date, {maxVal:{col_spacing-6}}: {column}")
            # elif foundBool:
            #     ordered_columns[column] = {'type': "BOOL", 'length': maxVal}
            #     print(f"Bool, {maxVal:{col_spacing-6}}: {column}")
            else:
                ordered_columns[column] = {
                    'type': f"VARCHAR({int(maxVal)*varchar_factor})",
                    'length': maxVal
                }
                print(f"String, {maxVal:{col_spacing-8}}: {column}")

    print("-------------------------------------")
    print(f"Total columns are: {len(df.columns)}")
    print("-------------------------------------")

    if sql:
        create_table_sql(ordered_columns, table_name)
Example 27
    def __setitem__(self, key: Union[int, slice, np.ndarray],
                    value: Any) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.

        Returns
        -------
        None
        """
        key = check_array_indexer(self, key)

        if is_integer(key):
            key = cast(int, key)

            if not is_scalar(value):
                raise ValueError("Must pass scalars with scalar indexer")
            elif isna(value):
                value = None
            elif not isinstance(value, str):
                raise ValueError("Scalar must be NA or str")

            # Slice data and insert in-between
            new_data = [
                *self._data[0:key].chunks,
                pa.array([value], type=pa.string()),
                *self._data[(key + 1):].chunks,
            ]
            self._data = pa.chunked_array(new_data)
        else:
            # Convert to integer indices and iteratively assign.
            # TODO: Make a faster variant of this in Arrow upstream.
            #       This is probably extremely slow.

            # Convert all possible input key types to an array of integers
            if isinstance(key, slice):
                key_array = np.array(range(len(self))[key])
            elif is_bool_dtype(key):
                # TODO(ARROW-9430): Directly support setitem(booleans)
                key_array = np.argwhere(key).flatten()
            else:
                # TODO(ARROW-9431): Directly support setitem(integers)
                key_array = np.asanyarray(key)

            if is_scalar(value):
                value = np.broadcast_to(value, len(key_array))
            else:
                value = np.asarray(value)

            if len(key_array) != len(value):
                raise ValueError("Length of indexer and values mismatch")

            for k, v in zip(key_array, value):
                self[k] = v
Example 28
def _df_filter(ranger, lasso, header=0, skiprows=None, names=None,
               skip_footer=0, index_col=None, has_index_names=None,
               parse_cols=None, parse_dates=False, date_parser=None,
               na_values=None, thousands=None, convert_float=True,
               verbose=False, squeeze=False, **kwds):
    """
    Converts the captured values table into a pandas DataFrame.

    Doc below copied from :func:`pandas.io.read_excel()`:

    header : int, list of ints, default 0
        Row (0-indexed) to use for the column labels of the parsed
        DataFrame. If a list of integers is passed those row positions will
        be combined into a ``MultiIndex``
    skiprows : list-like
        Rows to skip at the beginning (0-indexed)
    skip_footer : int, default 0
        Rows at the end to skip (0-indexed)
    index_col : int, list of ints, default None
        Column (0-indexed) to use as the row labels of the DataFrame.
        Pass None if there is no such column.  If a list is passed,
        those columns will be combined into a ``MultiIndex``
    names : array-like, default None
        List of column names to use. If file contains no header row,
        then you should explicitly pass header=None
    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the Excel cell content, and return the transformed
        content.
    parse_cols : int or list, default None
        * If None then parse all columns,
        * If int then indicates last column to be parsed
        * If list of ints then indicates list of column numbers to be parsed
        * If string then indicates comma separated list of column names and
          column ranges (e.g. "A:E" or "A,C,E:F")
    squeeze : boolean, default False
        If the parsed data only contains one column then return a Series
    na_values : list-like, default None
        List of additional strings to recognize as NA/NaN
    thousands : str, default None
        Thousands separator for parsing string columns to numeric.  Note that
        this parameter is only necessary for columns stored as TEXT in Excel,
        any numeric columns will automatically be parsed, regardless of display
        format.
    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to
    verbose : boolean, default False
        Indicate number of NA values placed in non-numeric columns
    engine: string, default None
        If io is not a buffer or path, this must be set to identify io.
        Acceptable values are None or xlrd
    convert_float : boolean, default True
        convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
        data will be read in as floats: Excel stores all numbers as floats
        internally
    has_index_names : boolean, default None
        DEPRECATED: for version 0.17+ index names will be automatically
        inferred based on index_col.  To read Excel output from 0.16.2 and
        prior that had saved index names, use True.
    """
    data = lasso.values

    # Copied & adapted from `pandas.io.excel.py` v0.18.1
    #    https://github.com/pydata/pandas/releases/tag/v0.18.1

    skipfooter = kwds.pop('skipfooter', None)
    if skipfooter is not None:
        skip_footer = skipfooter

    _validate_header_arg(header)
    if has_index_names is not None:

        log.warning("\nThe has_index_names argument is deprecated; index names "
                    "will be automatically inferred based on index_col.\n"
                    "This argument is still necessary if reading Excel output "
                    "from 0.16.2 or prior with index names.")

    if 'chunksize' in kwds:
        raise NotImplementedError("chunksize keyword of read_excel "
                                  "is not implemented")
    if parse_dates:
        raise NotImplementedError("parse_dates keyword of read_excel "
                                  "is not implemented")

    if date_parser is not None:
        raise NotImplementedError("date_parser keyword of read_excel "
                                  "is not implemented")

    if not data:
        return pd.DataFrame()

    if pdtypes.is_list_like(header) and len(header) == 1:
        header = header[0]

    # forward fill and pull out names for MultiIndex column
    header_names = None
    if header is not None:
        if pdtypes.is_list_like(header):
            header_names = []
            control_row = [True for _ in data[0]]
            for row in header:
                if pdtypes.is_integer(skiprows):
                    row += skiprows
                try:
                    data[row], control_row = pdexcel._fill_mi_header(data[row], control_row)
                except TypeError:
                    ## Arg `control_row` introduced in pandas-v0.19.0 to fix
                    #  https://github.com/pandas-dev/pandas/issues/12453
                    #  https://github.com/pandas-dev/pandas/commit/67b72e3cbbaeb89a5b9c780b2fe1c8d5eaa9c505
                    data[row] = pdexcel._fill_mi_header(data[row])

                header_name, data[row] = pdexcel._pop_header_name(
                    data[row], index_col)
                header_names.append(header_name)
        else:
            data[header] = pdexcel._trim_excel_header(data[header])

    if pdtypes.is_list_like(index_col):
        # forward fill values for MultiIndex index
        if not pdtypes.is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        for col in index_col:
            last = data[offset][col]
            for row in range(offset + 1, len(data)):
                if data[row][col] == '' or data[row][col] is None:
                    data[row][col] = last
                else:
                    last = data[row][col]

    if pdtypes.is_list_like(header) and len(header) > 1:
        has_index_names = True

    # Pandas expects '' instead of `None`!
    data = [['' if c is None else c for c in r] for r in data]

    # GH 12292 : error when read one empty column from excel file
    try:
        parser = pdparsers.TextParser(data, header=header, index_col=index_col,
                                      has_index_names=has_index_names,
                                      na_values=na_values,
                                      thousands=thousands,
                                      parse_dates=parse_dates,
                                      date_parser=date_parser,
                                      skiprows=skiprows,
                                      skip_footer=skip_footer,
                                      squeeze=squeeze,
                                      **kwds)

        output = parser.read()
        if names is not None:
            output.columns = names
        if not squeeze or isinstance(output, pd.DataFrame):
            output.columns = output.columns.set_names(header_names)
    except pdiocom.EmptyDataError:
        # No Data, return an empty DataFrame
        output = pd.DataFrame()

    lasso = lasso._replace(values=output)

    return lasso
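A self-contained sketch of the index_col forward-fill loop above, using a tiny invented list-of-lists table to show how blank index cells inherit the last seen value:

# Blank cells in the index column are filled with the previous non-blank value,
# so repeated index labels survive the round-trip into a MultiIndex.
data = [
    ["grp", "val"],
    ["a", 1],
    ["",  2],
    ["b", 3],
    ["",  4],
]
offset, index_col = 1, [0]
for col in index_col:
    last = data[offset][col]
    for row in range(offset + 1, len(data)):
        if data[row][col] == "" or data[row][col] is None:
            data[row][col] = last
        else:
            last = data[row][col]
print(data)  # [['grp', 'val'], ['a', 1], ['a', 2], ['b', 3], ['b', 4]]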
Example no. 32
def test_pandas_agreement(obj):
    assert types.is_categorical_dtype(obj) == ptypes.is_categorical_dtype(obj)
    assert types.is_numeric_dtype(obj) == ptypes.is_numeric_dtype(obj)
    assert types.is_integer_dtype(obj) == ptypes.is_integer_dtype(obj)
    assert types.is_integer(obj) == ptypes.is_integer(obj)
    assert types.is_string_dtype(obj) == ptypes.is_string_dtype(obj)
Example no. 33
def qcut(x, q, labels=None, retbins=False, precision=3, duplicate='raise'):
    """
    Quantile-based discretization function.

    Discretize variable into equal-sized buckets based on rank or based
    on sample quantiles. For example 1000 values for 10 quantiles would
    produce a Categorical object indicating quantile membership for each data point.

    Parameters
    ----------
    x : 1d tensor or Series
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicate : {'raise', 'drop'}, default 'raise'
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or tensor of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : tensor of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> import mars.dataframe as md
    >>> md.qcut(range(5), 4).execute()
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> md.qcut(range(5), 3, labels=["good", "medium", "bad"]).execute()
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> md.qcut(range(5), 4, labels=False).execute()
    array([0, 0, 1, 2, 3])
    """
    if is_integer(q):
        q = np.linspace(0, 1, q + 1)

    if isinstance(x, (DATAFRAME_TYPE, SERIES_TYPE, pd.DataFrame, pd.Series)):
        x = DataFrame(x) if x.ndim == 2 else Series(x)
        bins = x.quantile(q)
    else:
        x = astensor(x)
        if isinstance(q, ENTITY_TYPE):
            q = q * 100
        else:
            q = [iq * 100 for iq in q]
        bins = percentile(x, q)

    return cut(x,
               bins,
               labels=labels,
               retbins=retbins,
               precision=precision,
               include_lowest=True,
               duplicates=duplicate)
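A quick sketch of the integer-q expansion at the top of qcut, assuming only numpy and pandas: an integer q becomes q + 1 evenly spaced quantile probabilities, which plain pandas.qcut accepts directly:

import numpy as np
import pandas as pd

# An integer q is expanded to q + 1 evenly spaced probabilities in [0, 1];
# passing that array to pandas.qcut gives the same bins as passing q itself.
q = 4
probs = np.linspace(0, 1, q + 1)
print(probs)                                   # [0.   0.25 0.5  0.75 1.  ]
print(pd.qcut(range(5), probs, labels=False))  # [0 0 1 2 3]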
Example no. 34
    def __setitem__(self, key, value):
        # type: (Union[int, np.ndarray], Any) -> None
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.

        Returns
        -------
        None
        """
        # Convert all possible input key types to an array of integers
        if is_bool_dtype(key):
            key = np.argwhere(key).flatten()
        elif isinstance(key, slice):
            key = np.array(range(len(self))[key])
        elif is_integer(key):
            key = np.array([key])
        else:
            key = np.asanyarray(key)

        if pd.api.types.is_scalar(value):
            value = np.broadcast_to(value, len(key))
        else:
            value = np.asarray(value)

        if len(key) != len(value):
            raise ValueError("Length mismatch between index and value.")

        affected_chunks_index = self._get_chunk_indexer(key)
        affected_chunks_unique = np.unique(affected_chunks_index)

        all_chunks = list(self.data.iterchunks())

        for ix, offset in zip(
            affected_chunks_unique, self.offsets[affected_chunks_unique]
        ):
            chunk = all_chunks[ix]

            # Translate the array-wide indices to indices of the chunk
            key_chunk_indices = np.argwhere(affected_chunks_index == ix).flatten()
            array_chunk_indices = key[key_chunk_indices] - offset

            if pa.types.is_date64(self.dtype.arrow_dtype):
                # ARROW-2741: pa.array from np.datetime64[D] and type=pa.date64 produces invalid results
                arr = np.array(chunk.to_pylist())
                arr[array_chunk_indices] = np.array(value)[key_chunk_indices]
                pa_arr = pa.array(arr, self.dtype.arrow_dtype)
            else:
                arr = chunk.to_pandas()
                # When the Arrow-to-pandas conversion is zero-copy, the
                # resulting arrays are read-only.
                if not arr.flags.writeable:
                    arr = arr.copy()
                arr[array_chunk_indices] = value[key_chunk_indices]

                mask = None
                # ARROW-2806: Inconsistent handling of np.nan requires adding a mask
                if (
                    pa.types.is_integer(self.dtype.arrow_dtype)
                    or pa.types.is_floating(self.dtype.arrow_dtype)
                    or pa.types.is_boolean(self.dtype.arrow_dtype)
                ):
                    nan_values = pd.isna(value[key_chunk_indices])
                    if any(nan_values):
                        # Mark the chunk-local positions that received NaN as
                        # null (pyarrow masks use True for null, False for valid).
                        mask = np.zeros(len(arr), dtype=bool)
                        mask[array_chunk_indices[nan_values]] = True
                pa_arr = pa.array(arr, self.dtype.arrow_dtype, mask=mask)
            all_chunks[ix] = pa_arr

        self.data = pa.chunked_array(all_chunks)
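A standalone sketch of the key-normalization step at the top of this method, assuming data of length 5: boolean masks, slices, and scalar ints all reduce to plain integer position arrays:

import numpy as np

# Each supported key type is reduced to an array of integer positions,
# mirroring the branching above (boolean mask, slice, scalar int, array).
n = 5
for key in (np.array([True, False, True, False, False]), slice(1, 4), 2):
    if isinstance(key, np.ndarray) and key.dtype == bool:
        idx = np.argwhere(key).flatten()
    elif isinstance(key, slice):
        idx = np.array(range(n)[key])
    elif np.isscalar(key):
        idx = np.array([key])
    else:
        idx = np.asanyarray(key)
    print(idx)  # [0 2], then [1 2 3], then [2]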