def split_table_by_number(xlsTable, row_number, output,
                          sheetName=None, sheetIndex=None):
    """
    Split a table by row number

    Given a number of rows, this method will split an input table
    in several tables with a number of rows equal to row_number.
    Each resulting table is written as one sheet of a single output
    workbook; every sheet starts with the columns titles.

    Parameters:
    * xlsTable - input table, forwarded to col_name and tbl_to_obj;
    * row_number - number of data rows written to each output sheet;
    * output - destination given to the save method of the xlwt Workbook;
    * sheetName - name of the input sheet; it is also the prefix of the
    output sheets names ('data' is used when it is not given);
    * sheetIndex - index of the input sheet; when it is given (0
    included) it is the value used to read the data.

    TODO: Do it with Pandas
    """

    import xlwt
    from gasp.fm import tbl_to_obj
    from gasp.pyt.xls.fld import col_name

    COLUMNS_ORDER = col_name(
        xlsTable, sheet_name=sheetName, sheet_index=sheetIndex
    )

    # Compare against None: index 0 (first sheet) is a valid choice and
    # must not be discarded just because it is falsy
    DATA = tbl_to_obj(
        xlsTable,
        sheet=sheetIndex if sheetIndex is not None else sheetName,
        output='array'
    )

    # Create output
    out_xls = xlwt.Workbook()

    base = sheetName if sheetName else 'data'

    l = 1  # next row to write in the current sheet (row 0 has the titles)
    s = 1  # ordinal of the current output sheet
    for row in DATA:
        if l == 1:
            # First row of a chunk: open a new sheet and write the titles
            sheet = out_xls.add_sheet('{}_{}'.format(base, s))

            for col, title in enumerate(COLUMNS_ORDER):
                sheet.write(0, col, title)

        for col, title in enumerate(COLUMNS_ORDER):
            sheet.write(l, col, row[title])

        l += 1

        # Current sheet is full: the next row starts a new sheet
        if l == row_number + 1:
            l = 1
            s += 1

    # Save result
    out_xls.save(output)
def export_cells_not_in(inTable, noTable, outTable, inSheet, noSheet, inFID,
                        noFID):
    """
    Export to a new file the rows of inTable that are not in noTable

    A row of inTable (sheet inSheet) is written to outTable only if its
    value in the column inFID is not one of the unique values found in
    the column noFID of noTable (sheet noSheet).

    The output workbook has a single sheet, named as inSheet, with the
    same columns as the input sheet. Returns outTable.
    """

    import xlrd
    import xlwt
    from gasp.fm import tbl_to_obj
    from gasp.pyt.xls.fld import col_name, get_columns_position
    from gasp.pyt.xls.summ import list_unique_values_column

    # TODO: check if tables are xls

    # Get Data
    inData = tbl_to_obj(inTable, sheet=inSheet, output='array')
    COLUMNS = col_name(inTable, sheet_name=inSheet)

    # From noDATA, get IDS that will not be in the outTable
    noXls = xlrd.open_workbook(noTable)
    _noSheet = noXls.sheet_by_name(noSheet)
    colsPosition = get_columns_position(_noSheet, noFID)
    # Built once as a set: it is tested for membership once per input row
    noFIDS = set(list_unique_values_column(_noSheet, colsPosition[noFID]))

    # Create Output
    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(inSheet)

    # Write columns titles
    for c, title in enumerate(COLUMNS):
        new_sheet.write(0, c, title)

    # Write data not in noData
    l = 1  # next output row (row 0 has the titles)
    for row in inData:
        if row[inFID] in noFIDS:
            continue

        for c, col in enumerate(COLUMNS):
            new_sheet.write(l, c, row[col])

        l += 1

    out_xls.save(outTable)

    return outTable
def cols_name(ff, sheetName=None, sheetIdx=None):
    """
    Return the columns name of a file, whatever the type of file

    Only files for which fprop reports the format '.xlsx' or '.xls' are
    handled; a ValueError is raised for any other format.
    """

    from gasp.pyt.oss import fprop

    file_format = fprop(ff, 'ff')

    # Reject unsupported formats first
    if file_format not in ('.xlsx', '.xls'):
        raise ValueError(
            'File format is not valid!'
        )

    from gasp.pyt.xls.fld import col_name

    return col_name(ff, sheet_name=sheetName, sheet_index=sheetIdx)
def join_xls_table(main_table, fid_main, join_table, fid_join, copy_fields, out_table, main_sheet=None, join_sheet=None): """ Join tables using a commum attribute Relations: - 1 to 1 - N to 1 TODO: Use Pandas Instead """ import xlwt from gasp.fm import tbl_to_obj from gasp.pyt.xls.fld import col_name copy_fields = [copy_fields] if type(copy_fields) == str else \ copy_fields if type(copy_fields) == list else None if not copy_fields: raise ValueError('copy_fields should be a list or a string') # main_table to dict mainData = tbl_to_obj(main_table, sheet=main_sheet, useFirstColAsIndex=True, output='dict') # join table to dict joinData = tbl_to_obj(join_table, sheet=join_sheet, useFirstColAsIndex=True, output='dict') # write output data out_sheet_name = 'data' if not main_sheet and not join_sheet else join_sheet \ if join_sheet and not main_sheet else main_sheet out_xls = xlwt.Workbook() new_sheet = out_xls.add_sheet(out_sheet_name) # Write tiles COLUMNS_ORDER = col_name(main_table, sheet_name=main_sheet) TITLES = COLUMNS_ORDER + copy_fields for i in range(len(TITLES)): new_sheet.write(0, i, TITLES[i]) # parse data l = 1 for fid in mainData: new_sheet.write(l, 0, fid) c = 1 for col in COLUMNS_ORDER[1:]: new_sheet.write(l, c, mainData[fid][col]) c += 1 for col in copy_fields: if fid in joinData: new_sheet.write(l, c, joinData[fid][col]) c += 1 l += 1 out_xls.save(out_table)
interest_columns = [interest_columns] if type(interest_columns) == str else \ interest_columns if type(interest_columns) == list else None if not interest_columns: raise ValueError( 'interest_columns should be a list or a string' ) # XLS data to dict data = tbl_to_obj( xls_path, sheet_name=sheet, useFirstColAsIndex=True, output='dict' ) # Get Order Values COLUMNS_BY_ORDER = col_name(xls_path, sheet_name=sheet) # Store and map changes changes = {} # Replace values for fid in data: for col in interest_columns: if charToReplace in data[fid][col]: repObj = data[fid][col].replace(charToReplace, _replacement) data[fid][col] = repObj if fid not in changes: changes[fid] = {col : data[fid][col]} else: changes[fid][col].update({col: data[fid][col]})
def xlstimedelta_to_pddf(inXls, timecol, sheet_name=None, sheet_index=None,
                         columnsToMantain=None):
    """
    Convert a table with a column with timedelta values to a valid
    Pandas DataFrame

    Parameters:
    * inXls - path to the workbook, opened with xlrd;
    * timecol - column with the timedelta values, given by name or by
    position (int);
    * sheet_name / sheet_index - sheet to read (see get_sheet_obj);
    * columnsToMantain - columns to keep in the DataFrame; all columns of
    the sheet are kept when it is not given. When timecol is a name it is
    always added to the kept columns.

    The first row of the sheet is not read as data (data starts in the
    second row). The values of the time column are taken as Excel serial
    values (a number of days, fractions included) and converted to
    datetime.timedelta rounded to the second.

    Returns a pandas.DataFrame whose columns follow the order of the kept
    columns names.
    """

    import datetime
    import pandas
    import xlrd
    from gasp.pyt import obj_to_lst
    from gasp.pyt.xls.sheet import get_sheet_obj
    from gasp.pyt.xls.fld import col_name, get_columns_position

    workbook = xlrd.open_workbook(inXls)
    sheet = get_sheet_obj(workbook, name=sheet_name, index=sheet_index)

    # Get Cols name
    # Copied into a new list so that appending timecol below can never
    # change a list owned by the caller
    COLS_NAME = list(
        col_name(sheet) if not columnsToMantain
        else obj_to_lst(columnsToMantain)
    )

    time_by_position = isinstance(timecol, int)

    if not time_by_position and timecol not in COLS_NAME:
        COLS_NAME.append(timecol)

    # Get Cols position
    COLS_POS = get_columns_position(sheet, COLS_NAME)

    COL_TIME_POSITION = timecol if time_by_position else COLS_POS[timecol]

    # Positions listed in the same order as COLS_NAME, so every value
    # ends under its own column title in the DataFrame
    positions = [COLS_POS[name] for name in COLS_NAME]

    data = []
    for row in range(1, sheet.nrows):
        l_col = []
        for col in positions:
            value = sheet.cell(row, col).value

            if col == COL_TIME_POSITION:
                # Serial value is a number of days; rounding to the second
                # matches the resolution xldate_as_tuple works with
                value = datetime.timedelta(
                    seconds=int(round(value * 86400.0))
                )

            l_col.append(value)

        data.append(l_col)

    return pandas.DataFrame(data, columns=COLS_NAME)