Ejemplo n.º 1
0
    def parse(self,
              sheet_name=0,
              header=0,
              names=None,
              index_col=None,
              usecols=None,
              squeeze=False,
              dtype=None,
              true_values=None,
              false_values=None,
              skiprows=None,
              nrows=None,
              na_values=None,
              verbose=False,
              parse_dates=False,
              date_parser=None,
              thousands=None,
              comment=None,
              skipfooter=0,
              convert_float=True,
              mangle_dupe_cols=True,
              **kwds):
        """
        Parse the requested sheet(s) and return the parsed result(s).

        Parameters
        ----------
        sheet_name : str, int, list or None, default 0
            A string selects a sheet by name; any other scalar is treated as
            a sheet index.  A list selects several sheets and ``None``
            selects every entry of ``self.sheet_names``.
        header : int, list of int or None, default 0
            Row(s) holding the column labels.  A one-element list is
            unwrapped to its single element.  ``None`` means there is no
            header row.
        index_col : int, list of int or None, default None
            Column(s) to use as the index.  When list-like, blank cells
            (``''`` or ``None``) in those columns are filled with the last
            value seen above them before the data is handed to the parser.
        skiprows : optional
            Forwarded to ``TextParser``.  When it is an integer it is also
            added to every header row number while the multi-row header is
            being prepared.
        verbose : bool, default False
            Print the name of every sheet as it is read.
        convert_float : bool, default True
            Passed to ``self.get_sheet_data``.
        names, usecols, squeeze, dtype, true_values, false_values, nrows,
        na_values, parse_dates, date_parser, thousands, comment, skipfooter,
        mangle_dupe_cols, **kwds
            Forwarded to ``TextParser`` (``usecols`` after being passed
            through ``_maybe_convert_usecols``).

        Returns
        -------
        object or OrderedDict
            For a scalar ``sheet_name``, whatever ``parser.read`` returned
            for that sheet (an empty ``DataFrame`` when the sheet has no
            rows or the parser raised ``EmptyDataError``).  For a list or
            ``None``, an ``OrderedDict`` mapping each requested sheet to its
            result.
        """
        _validate_header_arg(header)

        # A list of sheets, or ``None`` for "all sheets", yields a dict of
        # results; any other value selects exactly one sheet.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]
            ret_dict = False

        # Drop repeated requests for the same sheet, keeping first-seen order.
        sheets = list(OrderedDict.fromkeys(sheets))

        output = OrderedDict()

        for asheetname in sheets:
            if verbose:
                print("Reading sheet {sheet}".format(sheet=asheetname))

            if isinstance(asheetname, compat.string_types):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = _maybe_convert_usecols(usecols)

            if sheet.nrows == 0:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(data[row],
                                                             control_row)

                    if index_col is not None:
                        header_name, _ = _pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index, starting at the
                # first row below the header.
                if header is None:
                    # No header row at all: the data starts at row 0.
                    # (``1 + None`` used to raise TypeError here.)
                    offset = 0
                elif not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == '' or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(data,
                                    names=names,
                                    header=header,
                                    index_col=index_col,
                                    has_index_names=has_index_names,
                                    squeeze=squeeze,
                                    dtype=dtype,
                                    true_values=true_values,
                                    false_values=false_values,
                                    skiprows=skiprows,
                                    nrows=nrows,
                                    na_values=na_values,
                                    parse_dates=parse_dates,
                                    date_parser=date_parser,
                                    thousands=thousands,
                                    comment=comment,
                                    skipfooter=skipfooter,
                                    usecols=usecols,
                                    mangle_dupe_cols=mangle_dupe_cols,
                                    **kwds)

                output[asheetname] = parser.read(nrows=nrows)

                # Column post-processing only applies when the result still
                # has columns, i.e. it was not squeezed to something else.
                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)
                    elif compat.PY2:
                        output[asheetname].columns = _maybe_convert_to_string(
                            output[asheetname].columns)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            # Single-sheet request: ``asheetname`` is the only sheet read.
            return output[asheetname]
Ejemplo n.º 2
0
    def parse(
        self,
        sheet_name=0,
        header=0,
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        dtype=None,
        true_values=None,
        false_values=None,
        skiprows=None,
        nrows=None,
        na_values=None,
        verbose=False,
        parse_dates=False,
        date_parser=None,
        thousands=None,
        comment=None,
        skipfooter=0,
        convert_float=True,
        mangle_dupe_cols=True,
        **kwds,
    ):
        """
        Parse the requested sheet(s) and return the parsed result(s).

        Parameters
        ----------
        sheet_name : str, int, list or None, default 0
            A string selects a sheet by name; any other scalar is treated as
            a sheet index.  A list selects several sheets and ``None``
            selects every entry of ``self.sheet_names``.
        header : int, list of int or None, default 0
            Row(s) holding the column labels.  A one-element list is
            unwrapped to its single element.  ``None`` means there is no
            header row.
        index_col : int, list of int or None, default None
            Column(s) to use as the index.  When list-like, blank cells
            (``""`` or ``None``) in those columns are filled with the last
            value seen above them before the data is handed to the parser.
        skiprows : optional
            Forwarded to ``TextParser``.  When it is an integer it is also
            added to every header row number while the multi-row header is
            being prepared.
        verbose : bool, default False
            Print the name of every sheet as it is read.
        convert_float : bool, default True
            Passed to ``self.get_sheet_data``.
        names, usecols, squeeze, dtype, true_values, false_values, nrows,
        na_values, parse_dates, date_parser, thousands, comment, skipfooter,
        mangle_dupe_cols, **kwds
            Forwarded to ``TextParser`` (``usecols`` after being passed
            through ``_maybe_convert_usecols``).

        Returns
        -------
        object or dict
            For a scalar ``sheet_name``, whatever ``parser.read`` returned
            for that sheet (an empty ``DataFrame`` when the sheet yielded no
            data or the parser raised ``EmptyDataError``).  For a list or
            ``None``, a dict mapping each requested sheet to its result.
        """
        validate_header_arg(header)

        # A list of sheets, or ``None`` for "all sheets", yields a dict of
        # results; any other value selects exactly one sheet.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]
            ret_dict = False

        # Drop repeated requests for the same sheet, keeping first-seen order.
        sheets = list(dict.fromkeys(sheets))

        output = {}

        for asheetname in sheets:
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = _maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(
                        data[row], control_row)

                    if index_col is not None:
                        header_name, _ = _pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index, starting at the
                # first row below the header.
                if header is None:
                    # No header row at all: the data starts at row 0.
                    # (``1 + None`` used to raise TypeError here.)
                    offset = 0
                elif not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    squeeze=squeeze,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    thousands=thousands,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    mangle_dupe_cols=mangle_dupe_cols,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                # Column names can only be set when the result still has
                # columns, i.e. it was not squeezed to something else.
                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            # Single-sheet request: ``asheetname`` is the only sheet read.
            return output[asheetname]
Ejemplo n.º 3
0
def _df_filter(ranger,
               lasso,
               header=0,
               names=None,
               index_col=None,
               parse_cols=None,
               usecols=None,
               squeeze=False,
               dtype=None,
               engine=None,
               true_values=None,
               false_values=None,
               skiprows=None,
               nrows=None,
               na_values=None,
               keep_default_na=True,
               verbose=False,
               parse_dates=False,
               thousands=None,
               comment=None,
               skipfooter=0,
               convert_float=True,
               mangle_dupe_cols=True,
               **kwds):
    """
    Converts captured values table as pandas DataFrame

    Most args copied from :func:`pandas.io.read_excel()` except:

        sheet_name, skip_footer, converters, date_parser

    Note that ``skip_footer`` has been deprecated by ``skipfooter``.

    :param ranger:
        accepted but not used by this function
    :param lasso:
        object whose ``values`` attribute holds the captured rows and whose
        ``_replace(values=...)`` builds the result
    :param header:
        row(s) holding the column labels; a one-element list is unwrapped,
        and ``None`` means there is no header row
    :param index_col:
        column(s) for the index; when list-like, blank cells (``""`` or
        ``None``) in those columns are forward-filled before parsing
    :param skiprows:
        forwarded to the parser; when an integer it is also added to every
        header row number while the multi-row header is being prepared
    :param keep_default_na:
        forwarded to the parser
    :param parse_cols, engine, verbose, convert_float:
        accepted but not used by this function
    :param kwds:
        forwarded to the parser, unless they contain a rejected name
    :return:
        `lasso` with its ``values`` replaced by what the parser read (an
        empty DataFrame if the parser raised ``EmptyDataError``); note that
        when ``lasso.values`` is falsy a bare empty DataFrame is returned
        instead of a lasso
    :raises NotImplementedError:
        if `kwds` contains any of the rejected argument names
    """
    data = lasso.values

    # Copied & adapted from `pandas.io.excel.py` v0.24.2+ (Jun 2019)
    #    https://github.com/pandas-dev/pandas/blob/d47fc0c/pandas/io/excel/_base.py#L368

    _validate_header_arg(header)

    # NOTE(review): the docstring lists ``converters`` as unsupported, yet
    # the name rejected here is ``converted`` -- possibly a typo; left as-is
    # so callers passing ``converters`` through `kwds` keep working.
    invalid_args = (set("skip_footer chunksize date_parser converted".split())
                    & kwds.keys())
    if invalid_args:
        raise NotImplementedError("Cannot implement args: %s" % invalid_args)

    if not data:
        # NOTE(review): this path returns a DataFrame, not a lasso like the
        # end of the function does -- confirm which one callers expect.
        return pd.DataFrame()

    usecols = _maybe_convert_usecols(usecols)

    if is_list_like(header) and len(header) == 1:
        header = header[0]

    # forward fill and pull out names for MultiIndex column
    header_names = None
    if header is not None and is_list_like(header):
        header_names = []
        control_row = [True for _ in data[0]]
        for row in header:
            if is_integer(skiprows):
                row += skiprows
            try:
                data[row], control_row = _fill_mi_header(
                    data[row], control_row)
            except TypeError:
                ## Arg `control_row` introduced in pandas-v0.19.0 to fix
                #  https://github.com/pandas-dev/pandas/issues/12453
                #  https://github.com/pandas-dev/pandas/commit/67b72e3cbbaeb89a5b9c780b2fe1c8d5eaa9c505
                data[row] = _fill_mi_header(data[row])

            if index_col is not None:
                header_name, data[row] = _pop_header_name(data[row], index_col)
                header_names.append(header_name)

    if is_list_like(index_col):
        # forward fill values for MultiIndex index, starting at the first
        # row below the header
        if header is None:
            # No header row at all: the data starts at row 0.
            # (``1 + None`` used to raise TypeError here.)
            offset = 0
        elif not is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        # Check if we have an empty dataset
        # before trying to collect data.
        if offset < len(data):
            for col in index_col:
                last = data[offset][col]

                for row in range(offset + 1, len(data)):
                    if data[row][col] == "" or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

    has_index_names = is_list_like(header) and len(header) > 1

    # Pandas expects '' instead of `None`!
    data = [["" if c is None else c for c in r] for r in data]

    # GH 12292 : error when read one empty column from excel file
    try:
        parser = pdparsers.TextParser(data,
                                      names=names,
                                      header=header,
                                      index_col=index_col,
                                      has_index_names=has_index_names,
                                      squeeze=squeeze,
                                      dtype=dtype,
                                      true_values=true_values,
                                      false_values=false_values,
                                      skiprows=skiprows,
                                      nrows=nrows,
                                      na_values=na_values,
                                      # Previously accepted but silently
                                      # dropped; honour the caller's choice.
                                      keep_default_na=keep_default_na,
                                      parse_dates=parse_dates,
                                      thousands=thousands,
                                      comment=comment,
                                      skipfooter=skipfooter,
                                      usecols=usecols,
                                      mangle_dupe_cols=mangle_dupe_cols,
                                      **kwds)

        output = parser.read()

        # Column names can only be set when the result still has columns,
        # i.e. it was not squeezed to something else.
        if not squeeze or isinstance(output, pd.DataFrame):
            if header_names:
                output.columns = output.columns.set_names(header_names)
    except EmptyDataError:
        # No Data, return an empty DataFrame
        output = pd.DataFrame()

    lasso = lasso._replace(values=output)

    return lasso