Beispiel #1
0
    def fetch_data(self, headers=True, fmt='df'):
        """Download this tab's cell values and return them in the requested shape.

        Every returned row has the same number of entries. With ``headers=True``
        that number is the length of the header row; with ``headers=False`` it is
        the length of the longest row. Rows that are too short are padded with
        None and rows that are too long are cut off, so three headers always
        yield three-entry rows no matter how many cells were populated.

        Args:
            headers (bool): When True, the first row supplies the column names.
                When False, columns are labelled with a 0-based integer range.

            fmt (str): Shape of the return value; one of 'df', 'dict', 'list'.

        Returns:
            fmt='df' --> pandas.DataFrame

            fmt='dict' --> list of ordered dicts, one per data row, e.g.::

                [{header1: row1cell1, header2: row1cell2},
                 {header1: row2cell1, header2: row2cell2},
                 ...]

            fmt='list' --> 2-tuple of (header names, list of row lists), e.g.::

                ([header1, header2, ...],
                 [[row1cell1, row1cell2, ...], [row2cell1, row2cell2, ...], ...])

            When no non-empty row is found, the result is an empty DataFrame,
            ``[]`` or ``([], [])`` respectively.

        Raises:
            ValueError: If ``fmt`` is not one of the accepted values.
        """
        accepted_formats = ('df', 'dict', 'list')
        if fmt not in accepted_formats:
            message = ("Unexpected value '{}' for parameter `fmt`. "
                       "Accepted values are 'df', 'dict', and 'list'")
            raise ValueError(message.format(fmt))

        # Request only each cell's effective value and its number-format type.
        response = self.sheets_svc.get(
            spreadsheetId=self.workbook.file_id,
            ranges=self.tabname,
            includeGridData=True,
            fields='sheets/data/rowData/values(effectiveValue,effectiveFormat/numberFormat/type)',
        ).execute()
        rows = self._process_rows(response)

        last_row_idx = helpers._find_max_nonempty_row(rows)
        if last_row_idx is None:
            # Nothing usable in the tab: return the empty form of each format.
            if fmt == 'df':
                return pd.DataFrame([])
            if fmt == 'dict':
                return []
            return ([], [])

        # Keep rows up to the last non-empty one, stripping each row's
        # trailing Nones so that row lengths reflect populated cells only.
        rows = [
            helpers._remove_trailing_nones(row)
            for row in rows[:last_row_idx + 1]
        ]

        if headers:
            # The first row names the columns and fixes the row width.
            header_names, rows = rows[0], rows[1:]
            width = len(header_names)
        else:
            # No header row: the widest row decides how many columns exist.
            width = max(len(row) for row in rows)
            header_names = list(range(width))

        # Pad or truncate every row to exactly `width` entries.
        rows = [helpers._resize_row(row, width) for row in rows]

        if fmt == 'df':
            return pd.DataFrame(data=rows, columns=header_names)
        if fmt == 'dict':
            return [OrderedDict(zip(header_names, row)) for row in rows]
        return header_names, rows
Beispiel #2
0
def test_find_max_nonempty_row(data, expected):
    """Check that helpers._find_max_nonempty_row(data) equals ``expected``.

    ``data`` and ``expected`` are supplied from outside this function
    (presumably via pytest parametrization or fixtures -- not visible here).
    """
    actual = helpers._find_max_nonempty_row(data)
    assert expected == actual