def _search(self, sheet: Sheet, column: str, target: str, only_one=False):
    """
    Find the rows of a sheet whose value in one column equals a target.

    Every row index in ``range(sheet.nrows)`` is examined (row 0 included)
    and the cell is compared to ``target`` with ``==``.

    :param sheet: A Sheet XLRD object in which to search
    :param column: Name of the column whose cells are compared
    :param target: Value to find
    :param only_one: (boolean) stop after the first match (true) or collect
        every match (false)
    :return list: one dictionary per matching row, mapping every column name
        to that row's cell value; empty when nothing matches
    """
    column_names = pyrvtools.pyrvtools.PyRvtools.get_columns_names(sheet)

    matching_rows = []
    for row_number in range(sheet.nrows):
        if sheet.cell_value(row_number, column_names[column]) != target:
            continue
        matching_rows.append(row_number)
        if only_one:
            break

    return [
        {
            col_name: sheet.cell_value(row_number, col_number)
            for col_name, col_number in column_names.items()
        }
        for row_number in matching_rows
    ]
def do_sheet(self, elem):
    """
    Register the sheet described by ``elem`` on the workbook ``self.bk``.

    Reads the relationship id, ``sheetId`` and ``name`` attributes of
    ``elem``. When the relationship type is not ``'worksheet'`` the element
    is ignored. Otherwise a ``Sheet`` is created and appended to the book
    together with its name and visibility code, ``bk.nsheets`` is
    incremented, and the relationship target and sheet id are appended to
    ``self.sheet_targets`` / ``self.sheetIds``.
    """
    book = self.bk
    sheet_index = book.nsheets
    rel_id = elem.get(U_ODREL + 'id')
    sheet_id = int(elem.get('sheetId'))
    sheet_name = unescape(ensure_unicode(elem.get('name')))
    rel_type = self.relid2reltype[rel_id]
    target_path = self.relid2path[rel_id]
    if self.verbosity >= 2:
        self.dumpout(
            'sheetx=%d sheetId=%r rid=%r type=%r name=%r',
            sheet_index, sheet_id, rel_id, rel_type, sheet_name)
    if rel_type != 'worksheet':
        if self.verbosity >= 2:
            self.dumpout('Ignoring sheet of type %r (name=%r)', rel_type, sheet_name)
        return

    # An absent "state" attribute (None) maps to the same code as 'visible'.
    visibility_codes = {None: 0, 'visible': 0, 'hidden': 1, 'veryHidden': 2}
    # Appended before the Sheet is built, exactly as in the original order.
    book._sheet_visibility.append(visibility_codes[elem.get('state')])

    new_sheet = Sheet(book, position=None, name=sheet_name, number=sheet_index)
    new_sheet.utter_max_rows = X12_MAX_ROWS
    new_sheet.utter_max_cols = X12_MAX_COLS

    book._sheet_list.append(new_sheet)
    book._sheet_names.append(sheet_name)
    book.nsheets += 1
    self.sheet_targets.append(target_path)
    self.sheetIds.append(sheet_id)
def __bypass_for_spec_col(self, sheet: Sheet, row_num: int) -> None:
    """
    Store one row in ``self.out_data`` using the configured field mapping.

    The record key is the row's value in column ``self._comparison_col``.
    When ``self.out_data`` already holds a truthy record for that key, the
    row is counted in ``self._number_count`` (the key is printed first when
    ``self._info_name`` is set) and nothing is stored. Otherwise a record
    is created with one entry per item of ``self._dict_col`` (field name ->
    column index); category fields are added afterwards when
    ``self._dict_col_category`` is non-empty.

    :param sheet: Sheet to read from
    :param row_num: Index of the row to process
    :return: None
    """
    key_col = self._comparison_col
    record_key = sheet.row_values(row_num, key_col, key_col + 1)[0]

    if not self.out_data.get(record_key):
        record = self.out_data[record_key] = {}
        for field_name, col_index in self._dict_col.items():
            record[field_name] = sheet.row_values(row_num, col_index, col_index + 1)[0]
        if self._dict_col_category:
            self.__get_category(record_key, sheet, row_num)
        return

    # Key already recorded: report it if requested and count the duplicate.
    if self._info_name:
        print(record_key)
    self._number_count += 1
def __bypass_default(self, sheet: Sheet, row_num: int) -> None:
    """
    Store one complete row in ``self.out_data``, keyed by the comparison column.

    Used when no dictionary of specific fields is configured. The record key
    is the row's value in column ``self._comparison_col``. When
    ``self.out_data`` already holds a truthy record for that key, the row is
    counted in ``self._number_count`` (the key is printed first when
    ``self._info_name`` is set) and nothing is stored. Otherwise the record
    maps each column position (0, 1, 2, ...) to that cell's value.

    :param sheet: Sheet to read from
    :param row_num: Index of the row to process
    :return: None
    """
    key_col = self._comparison_col
    # Extract the single cell value first and only then look it up. The
    # previous code passed the list returned by row_values() to dict.get(),
    # which raises TypeError because lists are unhashable.
    record_key = sheet.row_values(row_num, key_col, key_col + 1)[0]

    if self.out_data.get(record_key):
        if self._info_name:
            print(record_key)
        self._number_count += 1
        return

    self.out_data[record_key] = dict(enumerate(sheet.row_values(row_num)))
def do_sheet(self, elem):
    """
    Add the sheet described by the element ``elem`` to ``self.bk``.

    Elements whose relationship type is anything other than ``"worksheet"``
    are skipped (with a diagnostic line when ``self.verbosity >= 2``). For a
    worksheet, the visibility code, a new ``Sheet`` and the sheet name are
    appended to the book, the book's sheet count is incremented, and the
    relationship target path and sheet id are recorded on ``self``.
    """
    bk = self.bk
    verbose = self.verbosity >= 2
    number = bk.nsheets

    rid = elem.get(U_ODREL + "id")
    sheet_id = int(elem.get("sheetId"))
    name = unescape(ensure_unicode(elem.get("name")))
    reltype = self.relid2reltype[rid]
    target = self.relid2path[rid]

    if verbose:
        self.dumpout("sheetx=%d sheetId=%r rid=%r type=%r name=%r", number, sheet_id, rid, reltype, name)

    if reltype == "worksheet":
        # None (attribute absent) shares code 0 with "visible".
        state_to_code = {None: 0, "visible": 0, "hidden": 1, "veryHidden": 2}
        bk._sheet_visibility.append(state_to_code[elem.get("state")])

        worksheet = Sheet(bk, position=None, name=name, number=number)
        worksheet.utter_max_rows = X12_MAX_ROWS
        worksheet.utter_max_cols = X12_MAX_COLS

        bk._sheet_list.append(worksheet)
        bk._sheet_names.append(name)
        bk.nsheets += 1
        self.sheet_targets.append(target)
        self.sheetIds.append(sheet_id)
    elif verbose:
        self.dumpout("Ignoring sheet of type %r (name=%r)", reltype, name)
def xlrd_sheet_to_list_of_dict(sheet: Sheet) -> List[Dict]:
    """Convert an xlrd sheet into a list of dicts.

    The values of row 0 are used as keys. Each following row becomes one
    dict mapping those keys to the row's cell values, in row order.
    """
    keys = [sheet.cell(0, col_index).value for col_index in range(sheet.ncols)]
    return [
        {
            key: sheet.cell(row_index, col_index).value
            for col_index, key in enumerate(keys)
        }
        for row_index in range(1, sheet.nrows)
    ]
def Get_Excel_Row_Values(filepath,sheetName,uniqueValue):
    """Return the values of the first row that contains ``uniqueValue``.

    Opens the workbook at ``filepath``, selects the sheet named
    ``sheetName`` and scans it row by row, left to right. The first row
    holding a cell equal (``==``) to ``uniqueValue`` is returned as the list
    produced by ``row_values``. Returns ``None`` when no cell matches.
    """
    workbook = xlrd.open_workbook(filepath)
    worksheet = workbook.sheet_by_name(sheetName)
    total_rows = worksheet.nrows
    total_cols = worksheet.ncols

    for row_index in range(total_rows):
        row_has_value = any(
            worksheet.cell_value(row_index, col_index) == uniqueValue
            for col_index in range(total_cols)
        )
        if row_has_value:
            return worksheet.row_values(row_index, 0)
    return None
def make_client_map(client_list: Sheet) -> Dict[str, str]:
    """Build a client -> salesperson mapping from a sheet.

    Clients are read from column 1 and salespeople from column 3, both
    starting at row 1. A falsy salesperson is logged and replaced by
    '其他'. The first salesperson seen for a client is kept; a later,
    different one is only logged.
    """
    clients = client_list.col_values(1, 1)
    sales = client_list.col_values(3, 1)

    client_map = {}
    for client, salesperson in zip(clients, sales):
        if not salesperson:
            log('“{}”的业务员为空,归为“其他”'.format(client))
            salesperson = '其他'
        kept = client_map.setdefault(client, salesperson)
        if kept != salesperson:
            log('“{}”同时属于“{}”和“{}”,自动归为“{}”'.format(client, kept, salesperson, kept))
    return client_map
def make_sheet(rows,book=None,name='test sheet',number=0):
    """Build a ``Sheet`` populated from ``rows`` and add it to ``book``.

    A ``DummyBook`` is created when ``book`` is None. Each entry of a row is
    either a ``(cell_type, value)`` tuple or a bare value, which is stored
    with type ``XL_CELL_TEXT``. Returns the new sheet.
    """
    if book is None:
        book = DummyBook()
    # One visibility entry is appended per sheet, before the Sheet is built.
    book._sheet_visibility.append(0)
    sheet = Sheet(book,0,name,number)
    book.add(sheet)

    for rowx, row in enumerate(rows):
        for colx, entry in enumerate(row):
            if isinstance(entry,tuple):
                cell_type, cell_value = entry
            else:
                cell_type, cell_value = XL_CELL_TEXT, entry
            sheet.put_cell(rowx,colx,cell_type,cell_value,0)
    return sheet
def make_sheet(rows, book=None, name='test sheet', number=0):
    """Create a ``Sheet`` filled with ``rows`` and register it on ``book``.

    When ``book`` is None a ``DummyBook`` is used. Row entries given as
    tuples are unpacked as ``(cell_type, value)``; any other entry is
    written as an ``XL_CELL_TEXT`` cell. The populated sheet is returned.
    """
    if book is None:
        book = DummyBook()
    # Appended for every sheet, ahead of constructing the Sheet itself.
    book._sheet_visibility.append(0)
    sheet = Sheet(book, 0, name, number)
    book.add(sheet)

    for rowx, row in enumerate(rows):
        for colx, item in enumerate(row):
            cell_type, value = item if isinstance(item, tuple) else (XL_CELL_TEXT, item)
            sheet.put_cell(rowx, colx, cell_type, value, 0)
    return sheet
def process_row(self, row_index:int, sheet:Sheet):
    """Handle one sheet row.

    The row's values are passed through ``self.fix_floats``. A row made up
    only of empty strings finishes the current table (``self.add_table()``
    followed by ``self.state_process()``); any other row is appended to
    ``self.current_table``.
    """
    row = self.fix_floats(sheet.row_values(row_index))
    blank_row = [''] * len(row)

    if row != blank_row:
        self.current_table.add_row(row)
        return

    self.add_table()
    self.state_process()
def _assert_sheet_content(sheet_name: str, actual_worksheet: Sheet, expected_worksheet: Sheet):
    """Assert that two worksheets have the same dimensions and cells.

    Row and column counts are compared first. Then every cell of
    ``actual_worksheet`` is compared, by ``ctype`` and by ``value``, with
    the cell at the same position in ``expected_worksheet``.
    ``sheet_name`` is only used in the assertion messages.
    """
    same_row_count = actual_worksheet.nrows == expected_worksheet.nrows
    assert same_row_count, f"Different number of rows in {sheet_name} sheet"

    same_col_count = actual_worksheet.ncols == expected_worksheet.ncols
    assert same_col_count, f"Different number of columns in {sheet_name} sheet"

    for row_index, actual_row in enumerate(actual_worksheet.get_rows()):
        expected_row = expected_worksheet.row(row_index)
        for cell_index, actual_cell in enumerate(actual_row):
            expected_cell = expected_row[cell_index]

            same_type = actual_cell.ctype == expected_cell.ctype
            assert same_type, f"Different cell type in row {row_index}, col {cell_index} in {sheet_name} sheet"

            same_value = actual_cell.value == expected_cell.value
            assert same_value, f"Different cell content in row {row_index}, col {cell_index} in {sheet_name} sheet"
def get_rows_with_headers(
        sheet: Sheet) -> Tuple[List[Cell], Generator[List[Cell], None, None]]:
    """
    Split a sheet into its header values and an iterator over the other rows.

    The first row produced by ``sheet.get_rows()`` is consumed and the
    ``.value`` of each of its cells becomes one header. Note that the
    headers are therefore plain cell values, not ``Cell`` objects.

    For a sheet with no rows the headers are an empty list and the returned
    iterator is exhausted, instead of ``StopIteration`` escaping to the
    caller.

    :param sheet: Sheet to read
    :return: ``(headers, row_iterator)``; the iterator yields the remaining
        rows as produced by ``sheet.get_rows()``
    """
    row_iterator = sheet.get_rows()
    # A default for next() keeps an empty sheet from raising StopIteration.
    first_row = next(row_iterator, [])
    headers = [cell.value for cell in first_row]
    return headers, row_iterator
def _find_column_index(sheet: Sheet, column_name: str) -> int:
    """Return the index of the first column whose row-0 cell equals
    ``column_name``, or -1 when no column matches."""
    matching_columns = (
        index
        for index in range(sheet.ncols)
        if sheet.cell_value(0, index) == column_name
    )
    return next(matching_columns, -1)
def get_columns_names(sheet: Sheet):
    """
    Map each value found in row 0 to its column index.

    When the same value appears in several columns, the last column wins.

    :param sheet: a Sheet object
    :return: a dictionary COLUMN_NAME:ID_COLUMN
    """
    return {
        sheet.cell_value(0, col_index): col_index
        for col_index in range(sheet.ncols)
    }
def get_column_values(sheet: Sheet, column_name: str) -> List:
    """Return the values of the named column, excluding row 0.

    The column is located with ``_find_column_index``.

    :raises Exception: when that lookup returns -1 (column not found)
    """
    column_index = _find_column_index(sheet, column_name)
    if column_index == -1:
        raise Exception(f"Sheet does not contain column {column_name}")
    return [
        sheet.cell_value(row_index, column_index)
        for row_index in range(1, sheet.nrows)
    ]
def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
    """Copy every cell value of ``sheet`` into a ``Table``.

    The cells are collected as a list of rows (each a list of values) and
    the table's metadata carries ``sheet_id`` as ``page_id``.
    """
    row_ids = range(sheet.nrows)
    col_ids = range(sheet.ncols)
    cells = [
        [sheet.cell_value(rowx=row_id, colx=col_id) for col_id in col_ids]
        for row_id in row_ids
    ]
    return Table(cells=cells, metadata=TableMetadata(page_id=sheet_id))
def get_headers(sheet: Sheet):
    """Return the normalised header names of ``sheet`` as a tuple.

    Rows are scanned for a marker row: its first cell is a string starting
    with '*' and every other cell is an empty string. The header names are
    read from the row that follows the first marker row. Each name has '.'
    removed, '/' and ' ' replaced by '_', and is lower-cased.

    If no marker row exists, the row index past the last scanned row is
    used, so ``sheet.row`` is asked for a row beyond the scanned ones.
    """
    header_row_idx = 0
    for header_row_idx, row in enumerate(sheet.get_rows(), start=1):
        first_value = row[0].value
        is_marker_row = (
            type(first_value) is str
            and first_value.startswith('*')
            and all(
                type(cell.value) is str and len(cell.value) == 0
                for cell in row[1:]
            )
        )
        if is_marker_row:
            # header_row_idx already points at the row after the marker.
            break

    return tuple(
        cell.value.replace('.', '').replace('/', '_').replace(' ', '_').lower()
        for cell in sheet.row(header_row_idx)
    )
def __init__(self, route_table: Sheet):
    """Index the schools and routes listed in ``route_table``.

    Column 0 is read as the route, column 1 as the school and column 2 as
    the abbreviation. A row with any of the three values falsy is logged
    and dropped. For every other row the school is stored in
    ``self.schools`` as ``school -> (route, abbr)``, and routes and schools
    receive increasing indexes in ``self.rt_idx`` / ``self.sc_idx`` in
    order of first appearance.
    """
    self.rt_idx = {}
    self.sc_idx = {}
    self.schools = {}  # school: (route, abbr)

    routes = route_table.col_values(0)
    schools = route_table.col_values(1)
    assert len(schools) == len(routes)
    abbrs = route_table.col_values(2)
    assert len(abbrs) == len(schools)

    records = zip(schools, routes, abbrs)
    for line_no, (school, route, abbr) in enumerate(records, start=1):
        if school and route and abbr:
            self.schools[school] = (route, abbr)
            self.rt_idx.setdefault(route, len(self.rt_idx))
            self.sc_idx.setdefault(school, len(self.sc_idx))
            continue
        # Incomplete record: report it with its 1-based row number.
        stm = '不完整的记录:第{}行:“{} {} {}”,已丢弃'
        log(stm.format(line_no, school, route, abbr))
def commit_from_sheet(ws: Sheet, model: db.Model, **kwargs):
    """Initialize DB table data from XLRD Worksheet.

    Initialize table data from source data associated with corresponding
    data model. The first worksheet row is the header; every following row
    is turned into a dict keyed by the header, passed to ``model(**row)``
    and added to ``db.session``.

    When ``model`` is ``Data``, the ``survey``, ``indicator`` and
    ``characteristic`` keyword arguments are required; they are used (via
    ``.get``) to translate the row's ``survey_code``, ``indicator_code``,
    ``char1_code`` and ``char2_code`` into ``survey_id``, ``indicator_id``,
    ``char1_id`` and ``char2_id``.

    Args:
        ws (xlrd.sheet.Sheet): XLRD worksheet object.
        model (class): SqlAlchemy model class.
        **kwargs: Lookup objects, see above.

    Raises:
        PmaApiDbInteractionError: If constructing a model instance fails.
            The message names the worksheet, row number and cell values,
            and the original exception is chained as the cause.
    """
    is_data_model = model == Data
    survey, indicator, characteristic = '', '', ''
    if is_data_model:
        survey = kwargs['survey']
        indicator = kwargs['indicator']
        characteristic = kwargs['characteristic']

    header = None
    for i, row in enumerate(ws.get_rows()):
        row = [cell.value for cell in row]
        if i == 0:
            header = row
            continue

        row_dict = dict(zip(header, row))
        if is_data_model:
            row_dict['survey_id'] = survey.get(row_dict.get('survey_code'))
            row_dict['indicator_id'] = \
                indicator.get(row_dict.get('indicator_code'))
            row_dict['char1_id'] = \
                characteristic.get(row_dict.get('char1_code'))
            row_dict['char2_id'] = \
                characteristic.get(row_dict.get('char2_code'))

        try:
            record = model(**row_dict)
        except Exception as err:
            # Fill in the template BEFORE appending the error text: the
            # error text may itself contain '{' or '}', which would make
            # str.format() fail or mangle the message.
            msg = 'Error when processing data import.\n' \
                  '- Worksheet name: {}\n' \
                  '- Row number: {}\n' \
                  '- Cell values: {}\n\n' \
                  '- Original Error:\n'.format(ws.name, i + 1, row)
            msg += type(err).__name__ + ': ' + str(err)
            logging.error(msg)
            raise PmaApiDbInteractionError(msg) from err

        db.session.add(record)
def __get_category(self, key: str, sheet: Sheet, row_num: int) -> None:
    """
    Add the category fields of one row to the record ``self.out_data[key]``.

    ``self._dict_col_category`` maps a field name to a column index.

    * When ``self._join`` is set, the values of all category columns are
      combined, in mapping order, into one value stored under
      ``'category'``. ``self._delimiter`` is inserted before a value only
      when the text accumulated so far is non-empty.
    * Otherwise each category column is stored under its own field name.

    :param key: Key of the record to extend
    :param sheet: Sheet to read from
    :param row_num: Index of the row to read
    :return: None
    """
    if self._join:
        cat = ''
        for col_index in self._dict_col_category.values():
            part = sheet.row_values(row_num, col_index, col_index + 1)[0]
            if cat:
                cat += f'{self._delimiter}{part}'
            else:
                cat = part
        self.out_data[key]['category'] = cat
    else:
        # .items() is required here: iterating the dict itself yields only
        # the keys, which cannot be unpacked into (name, column index).
        for field_name, col_index in self._dict_col_category.items():
            value = sheet.row_values(row_num, col_index, col_index + 1)[0]
            self.out_data[key][field_name] = value
def filter_data(
        self, sheet: Sheet,
        datetime_handler: Dict[int, str] = None) -> List[Dict[str, object]]:
    '''
    Filter the data of the given sheet and return it.

    The row at index 1 supplies the keys and every row from index 2 onwards
    becomes one dict. Each row is converted by ``self.filter_row_data``,
    which (per the original notes) is meant to:

    1. keep integers from turning into decimals when read
    2. keep dates/times from turning into decimals when read
       (default format: '%Y-%m-%d %H:%M:%S')
    3. apply a specific date/time format to chosen columns according to
       ``datetime_handler``, falling back to the default when none is given
    4. convert boolean cells to true and false

    :param sheet: the Sheet object to read
    :param datetime_handler: dict with the column number as key (starting
        at 0) and the date format as value
        (e.g. {0:'%Y-%m-%d %H:%M:%S',2:'%Y-%m-%d'} formats the first and
        the third column with the given formats)
    :return: data of the sheet as
        list=[row1{param1:value1,param2:value2...},
        row2{param1:value1,param2:value2...},....]

    ctype values, for reference:
    XL_CELL_EMPTY: 'empty',0
    XL_CELL_TEXT: 'text',1
    XL_CELL_NUMBER: 'number',2
    XL_CELL_DATE: 'xldate',3
    XL_CELL_BOOLEAN: 'bool',4
    XL_CELL_ERROR: 'error',5
    XL_CELL_BLANK: 'blank',6
    '''
    total_rows = sheet.nrows
    header = self.filter_row_data(sheet.row_slice(1), datetime_handler)
    return [
        dict(zip(header,
                 self.filter_row_data(sheet.row_slice(row_index),
                                      datetime_handler)))
        for row_index in range(2, total_rows)
    ]
def get_merged_cells_value(sheet: Sheet, row_index, col_index):
    """
    Return the content of the merged cell that contains the given position.

    Each range returned by ``get_merged_cells(sheet)`` is unpacked as
    ``(rlow, rhigh, clow, chigh)`` and treated as half-open: the position
    belongs to it when ``rlow <= row_index < rhigh`` and
    ``clow <= col_index < chigh``. The value of the first matching range is
    read from its top-left cell ``(rlow, clow)``.

    :param sheet: Sheet to read from
    :param row_index: Row index of the cell to look up
    :param col_index: Column index of the cell to look up
    :return: the merged cell's value, or None when the position is not part
        of any merged range
    """
    for rlow, rhigh, clow, chigh in get_merged_cells(sheet):
        if rlow <= row_index < rhigh and clow <= col_index < chigh:
            return sheet.cell_value(rlow, clow)
    return None
def get_boundaries(sheet: Sheet) -> Tuple[int, int]:
    """Locate the rows enclosed by the first two boundary rows.

    A boundary row is one in which every cell holds a string starting with
    '*' (a row without cells also counts). With ``first`` and ``second``
    being the indexes of the first two boundary rows, the result is
    ``(first + 1, second - 1)``. A boundary that is not found counts as
    index -1, so a sheet without boundaries gives ``(0, -2)``.
    """
    start_row_num = -1
    end_row_num = -1
    for row_idx, row in enumerate(sheet.get_rows()):
        is_boundary = all(
            type(cell.value) is str and cell.value.startswith('*')
            for cell in row
        )
        if not is_boundary:
            continue
        if start_row_num < 0:
            start_row_num = row_idx
        elif end_row_num < 0:
            end_row_num = row_idx
    return start_row_num + 1, end_row_num - 1
def get_dividend_rows(sheet: Sheet) -> List[Dict[str, Any]]:
    """Return the rows whose narration contains 'ACH' or 'DIV'.

    The row range comes from ``get_boundaries`` and the keys from
    ``get_headers``. Every row in the range becomes a dict of header name
    -> cell value plus an ``'index'`` entry holding its 1-based position
    within the range; only rows whose ``'narration'`` contains 'ACH' or
    'DIV' are returned.
    """
    start_row_num, end_row_num = get_boundaries(sheet)
    header_names = get_headers(sheet)

    dividend_rows = []
    data_rows = range(start_row_num, end_row_num)
    for position, row_idx in enumerate(data_rows, start=1):
        cell_values = (cell.value for cell in sheet.row(row_idx))
        record = dict(zip(header_names, cell_values))
        record['index'] = position
        narration = record['narration']
        if narration.find('ACH') >= 0 or narration.find('DIV') >= 0:
            dividend_rows.append(record)
    return dividend_rows
def __init__(self, sheet: Sheet, keys: Iterable[str] = ..., key_row=0):
    """Load selected columns of ``sheet`` into an in-memory SQLite table.

    A table named ``TEMP`` is created and filled with the rows that follow
    ``key_row``. Rows that have more than one cell and a falsy second cell
    (index 1) are skipped.

    :param sheet: Sheet to read the records from
    :param keys: Column specifications, each a string of three
        whitespace-separated parts: table column name, SQL type and sheet
        column index. When left at the default (``...``), ``self.ARGS`` is
        used and the extra 'kind', 'meal' and 'name' orders are registered
        after loading.
    :param key_row: Index of the header row; data starts on the next row
    """
    self.conn = sqlite3.connect(':memory:')
    self.cur = self.conn.cursor()
    self.where = None
    self.orders = {}
    use_default_keys = keys is ...

    # create table
    column_defs = []
    col_keys = []  # (sheet column index, table column name)
    for spec in (self.ARGS if use_default_keys else keys):
        key, ktype, col = spec.split()
        column_defs.append('{} {}'.format(key, ktype))
        col_keys.append((int(col), key))
    self.cur.execute('CREATE TABLE TEMP({});'.format(','.join(column_defs)))

    # add records
    # Cell values are bound through '?' placeholders rather than pasted
    # into the statement with repr(), so quotes, backslashes or other
    # special characters in the spreadsheet cannot break or alter the SQL.
    insert_stm = 'INSERT INTO TEMP({}) VALUES({})'.format(
        ','.join(name for _, name in col_keys),
        ','.join('?' for _ in col_keys))
    for i in range(key_row + 1, sheet.nrows):
        row = sheet.row_values(i)
        if (len(row) > 1) and (not row[1]):
            continue
        self.cur.execute(insert_stm, [row[col] for col, _ in col_keys])
    self.conn.commit()

    # add extra order
    if not use_default_keys:
        return
    self.add_order('kind', ['肉', '菜', '油料干货', '调料制品', '杂货类'])
    self.add_order('meal', ['营养餐', '非营养餐', '幼儿餐', '教师餐'])
    # Order the names by the rank of their kind (unknown kinds last),
    # then by name.
    kind_order = self.orders['kind']
    cur = self.select('DISTINCT NAME, KIND')
    names = sorted((kind_order.get(k, len(kind_order)), n) for n, k in cur)
    self.add_order('name', [name for _, name in names])
def read_student_info(sheet: Sheet, file_name: str) -> dict:
    """Read one student's id, name and score.

    The id is ``file_name`` with 'student_' and '.xlsx' removed. The name
    is the value of cell (0, 4) and the score is the value of cell (15, 4)
    rounded to one decimal. The three values are printed and returned as a
    dict with the keys 'id', 'name' and 'score'.
    """
    student_id = file_name.replace('student_', '').replace('.xlsx', '')
    name_cell = sheet.cell(0, 4)
    student_name = name_cell.value
    score_cell = sheet.cell(15, 4)
    student_score = round(score_cell.value, 1)

    summary = ' : '.join((student_id, student_name, str(student_score)))
    print('[read_student_info]\t' + summary)

    return {'id': student_id, 'name': student_name, 'score': student_score}
def process_schema(self, row_index:int, sheet:Sheet):
    """Store the value of the row's first cell (column 0) as ``self.schema``."""
    schema_cell = sheet.cell(row_index, 0)
    self.schema = schema_cell.value
def process_table(self, row_index:int, sheet:Sheet):
    """Start a new current table named after the row's first cell.

    The value of cell (row_index, 0) is stored as
    ``self.current_table_name`` and a ``Table`` is created from
    ``self.schema``, that name and two empty lists.
    """
    name_cell = sheet.cell(row_index, 0)
    self.current_table_name = name_cell.value
    new_table = Table(self.schema, self.current_table_name, [], [])
    self.current_table = new_table
def extract_header(ws: Sheet):
    """Return the values of cells (1, 1) and (1, 6) as a 2-tuple."""
    first_value = ws.cell_value(1, 1)
    second_value = ws.cell_value(1, 6)
    return first_value, second_value
def process_cols(self, row_index:int, sheet:Sheet):
    """Use the values of the given row as the current table's ``cols``."""
    column_values = sheet.row_values(row_index)
    self.current_table.cols = column_values