def fetch_data(self, headers=True, fmt='df'):
    """Retrieve the data within this tab.

    Efforts are taken to ensure that returned rows are always the same
    length. If headers=True, this length will be equal to the length of the
    headers. If headers=False, this length will be equal to the longest row.

    In either case, shorter rows will be padded with Nones and longer rows
    will be truncated (i.e. if there are 3 headers then all rows will have 3
    entries regardless of the amount of populated cells they have).

    Args:
        headers (bool): If True, the first row will be used as the column
            names for the pandas.DataFrame. Otherwise, a 0-indexed range
            will be used instead
        fmt (str): The format in which to return the data. Accepted values:
            'df', 'dict', 'list'

    Returns:
        When fmt='df' --> pandas.DataFrame

        When fmt='dict' --> list of dicts, e.g.::

            [{header1: row1cell1, header2: row1cell2},
             {header1: row2cell1, header2: row2cell2},
             ...]

        When fmt='list' --> tuple of header names, list of lists with row
        data, e.g.::

            ([header1, header2, ...],
             [[row1cell1, row1cell2, ...], [row2cell1, row2cell2, ...], ...])

        If ``helpers._find_max_nonempty_row`` returns None for the fetched
        rows, the result is an empty ``pandas.DataFrame``, ``[]`` or
        ``([], [])`` respectively.

    Raises:
        ValueError: If ``fmt`` is not one of 'df', 'dict' or 'list'.
    """
    # Validate before issuing the API request so bad input fails fast.
    if fmt not in ('df', 'dict', 'list'):
        raise ValueError(
            "Unexpected value '{}' for parameter `fmt`. "
            "Accepted values are 'df', 'dict', and 'list'".format(fmt))

    # Restrict the response to cell values and their number-format type.
    fields = 'sheets/data/rowData/values(effectiveValue,effectiveFormat/numberFormat/type)'
    raw_data = self.sheets_svc.get(spreadsheetId=self.workbook.file_id,
                                   ranges=self.tabname,
                                   includeGridData=True,
                                   fields=fields).execute()
    processed_rows = self._process_rows(raw_data)

    # filter out empty rows
    max_idx = helpers._find_max_nonempty_row(processed_rows)
    if max_idx is None:
        # Nothing to return: hand back the empty value matching ``fmt``.
        if fmt == 'df':
            return pd.DataFrame([])
        if fmt == 'dict':
            return []
        return ([], [])

    # Drop rows past the last non-empty one, then remove trailing Nones
    # on each remaining row.
    processed_rows = [
        helpers._remove_trailing_nones(row)
        for row in processed_rows[:max_idx + 1]
    ]

    if headers:
        # The first row supplies the column names and is not data.
        header_names = processed_rows[0]
        processed_rows = processed_rows[1:]
        max_width = len(header_names)
    else:
        # Iterate through rows to find widest one
        max_width = max(len(row) for row in processed_rows)
        header_names = list(range(max_width))

    # resize the rows to match the number of column headers
    processed_rows = [
        helpers._resize_row(row, max_width) for row in processed_rows
    ]

    if fmt == 'df':
        return pd.DataFrame(data=processed_rows, columns=header_names)
    if fmt == 'dict':
        # OrderedDict keeps the columns in header order for every row.
        return [OrderedDict(zip(header_names, row)) for row in processed_rows]
    return header_names, processed_rows
def test_find_max_nonempty_row(data, expected):
    """Check ``helpers._find_max_nonempty_row`` against the expected result.

    Args:
        data: Input passed straight to ``helpers._find_max_nonempty_row``.
        expected: Value the helper is expected to return for ``data``.
    """
    actual = helpers._find_max_nonempty_row(data)
    assert expected == actual