def output_to_excel_pivot(dataobj):
    wb = openworkbook(dataobj.sourcefile)
    datasheet = dataobj.datasheet
    ws_pvt = wb[dataobj.pivotsheet]
    ws_data = wb[datasheet]
    #clear data sheet and put in new data
    wb.remove(ws_data)
    wb.create_sheet(datasheet)
    ws_data = wb[datasheet]
    #header = ['id']
    #header.extend(dataobj.data.columns)
    #appending the raw column labels failed, most likely because the labels are
    #pandas/numpy objects openpyxl cannot serialize; the old no-op
    #replace('i', 'i') trick worked by coercing each label to a plain Python
    #str, which str() now does explicitly
    header = [str(w) for w in dataobj.data.columns]
    ws_data.append(header)
    for r in dataframe_to_rows(dataobj.data, index=False, header=False):
        ws_data.append(r)
    #update pivot data area
    pivot = ws_pvt._pivots[0]
    pivot.cache.cacheSource.worksheetSource.ref = (
        f'A1:{get_column_letter(len(dataobj.data.columns))}'
        f'{len(dataobj.data.index) + 1}')
    pivot.cache.refreshOnLoad = True  #openpyxl attribute is refreshOnLoad, capital L
    try:
        wb.save(filename=dataobj.outputfile)
    except Exception:
        print('excel file in use. No output file made')
        logging.warning(f'file in use {dataobj.outputfile}')

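#Usage sketch (illustrative only; the file names are hypothetical): drive
#output_to_excel_pivot with a PivotOutput object, the same way generatefiles()
#does below. The source workbook must already hold a pivot table on the pivot
#sheet that reads from the data sheet.
def _demo_output_to_excel_pivot():
    demo = PivotOutput()
    demo.sourcefile = r'C:\tmp\pivottemplate.xlsx'  #template with existing pivot
    demo.outputfile = r'C:\tmp\evaldata_demo.xlsx'
    demo.datasheet = 'data'
    demo.pivotsheet = 'pvt_data'
    demo.data = pd.DataFrame({'PA': ['PGE', 'SCE'], 'kWh': [1.0, 2.0]})
    output_to_excel_pivot(demo)
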
def get_df_from_driver(filepath: str, sheet: str, querystr: str = None):
    """
    Get a dataframe of a sheet from the driver spreadsheet
    Pass in:
        filepath: Path to the driver file
        sheet: Sheet with the list
        querystr: Optional query string to limit the list, e.g. 'active == "y"'
    Assumes the table starts on the first row; if not, pass a header row to
    convertwstodf instead.
    Returns a dataframe, or False if the workbook cannot be read
    """
    filelist = None
    try:
        sfsession = ShareFileSession(SHAREFILE_OPTIONS)
        readlist_item = sfsession.get_io_version(filepath)
        control_wb = wf.openworkbook(readlist_item.io_data)
        if not control_wb:
            logging.warning('cannot open read template file list workbook')
            return False
        ws = control_wb[sheet]
        filelist = wf.convertwstodf(ws)
        if querystr:
            filelist = filelist.query(querystr)
        control_wb.close()
    except Exception as e:
        msg = f'Problem in get_df_from_driver: {e}'
        logging.critical(msg)
        print(msg)
        return False
    readlist_item = None
    sfsession = None
    return filelist

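#Usage sketch: pull the active rows from a driver workbook's sheet. The path
#here is hypothetical; querystr uses pandas DataFrame.query syntax.
def _demo_get_df_from_driver():
    df = get_df_from_driver('/Shared/drivers/control.xlsx', 'spec',
                            'active == "y"')
    if df is False:
        print('driver sheet could not be loaded')
    else:
        print(df.head())
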
def generatefiles(session):
    #get field lists
    xreffile = params.D0_FIELD_CROSS_REF_FILE
    wb = openworkbook(xreffile)
    ws_map = wb['RptOutput']
    #convert to dataframe
    df_map = convertwstodf(ws_map, 1)
    ordercol = 'FieldOrder'
    claimcol = 'claimfields'
    clmfullcol = 'claimfull'
    dbrvwcol = 'dbreviewfields'
    #dbextcol = 'dbrvwextra'
    evalcol = 'evalfields'
    atrcol = 'atrfields'
    stepcol = 'stepeqn'
    rawcol = 'rawextra'
    #get sample frame
    df_frame = pd.read_csv(params.D0_DATA_PATH + '\\cust_trkg_data_2017.csv')
    #df_frame = df_frame.rename(columns={'Unamed: 0': 'excelcounter'})
    #df_frame = df_frame.set_index('ClaimID')
    cols = ['ClaimID', 'SBW.ProjID']
    df_frame_short = df_frame[cols]
    print(f'short type is {type(df_frame_short)}')
    #get claim data
    df_claim_full = pd.read_csv(params.D0_DATA_FILE, low_memory=False)
    df_claim_full = df_claim_full.merge(df_frame_short, on='ClaimID')
    # Exclude replaced measurements
    df_claim = df_claim_full[df_claim_full.Replaced != 'Yes']
    # Pick only sampled rows
    df_claim = df_claim[df_claim.sampled == 'Y']
    #limit to claim fields
    df_map_claimfields = df_map[df_map[claimcol].notnull()]
    df_map_claimfields = df_map_claimfields[[ordercol, claimcol]]
    df_claim = df_claim[df_map_claimfields[claimcol].tolist()]
    claimdict = df_map_claimfields.set_index(claimcol)[ordercol].to_dict()
    df_claim = df_claim.rename(columns=claimdict)
    #print(f'claim shape is {df_claim.shape} with cols {df_claim.columns}')
    #get ATR claim data
    df_atr = pd.read_csv(params.D0_LOCAL_ATR_OUTPUT_FILE, low_memory=False)
    df_map_atrfields = df_map[df_map[atrcol].notnull()]
    df_map_atrfields = df_map_atrfields[[ordercol, atrcol]]
    df_atr = df_atr[df_map_atrfields[atrcol].tolist()]
    atrdict = df_map_atrfields.set_index(atrcol)[ordercol].to_dict()
    df_atr = df_atr.rename(columns=atrdict)
    #get eval data
    measures_without_sample_id = pd.read_sql(
        session.query(Measure).statement, session.bind)
    #drop project fields
    smplfields = [
        'RvwInstallDate', 'RvwAppVsInstallDate', 'RvwPaidIncentive', 'RvwPermit'
    ]
    measures_without_sample_id.drop(smplfields, axis=1, inplace=True)
    samples = pd.read_sql(session.query(Sample).statement, session.bind)
    measures1 = measures_without_sample_id.merge(samples, on='SBW_ProjID')
    df_map_evalfields = df_map[df_map[evalcol].notnull()]
    df_map_evalfields = df_map_evalfields[[ordercol, evalcol]]
    df_dups = df_map_evalfields[df_map_evalfields.duplicated(evalcol)]
    print('dup is {}'.format(df_dups))
    df_map_evalfields.drop(df_dups[ordercol].tolist(), axis=0, inplace=True)
    evaldict = df_map_evalfields.set_index(evalcol)[ordercol].to_dict()
    measures = measures1[df_map_evalfields[evalcol].tolist()]
    measures = measures.rename(columns=evaldict)
    measures = measures.assign(SampledProject=1)
    if len(df_dups.index) > 0:
        #only works if there are not multiples of the same dup
        #add the dup fields
        evaldict = df_dups.set_index(evalcol)[ordercol].to_dict()
        tmp = measures1.rename(columns=evaldict)
        measures = measures.join(tmp[df_dups[ordercol].tolist()])
        del tmp
    df_eval = measures.merge(df_claim, on='ClaimID')
    df_eval = df_eval.merge(df_atr, on='ClaimID', suffixes=('eval', ''))
    print(f'df_eval shape with claim and atr is {df_eval.shape}')
    #Get data from alldata
    df_raw_extra = pd.read_csv(params.D0_ALL_DATA_FILE, low_memory=False)
    df_raw_extra = remapdata(df_map, rawcol, ordercol, df_raw_extra)
    #shouldn't be any overlap, but just in case we want to keep a clean set
    #(see _demo_merge_suffixes below)
    df_eval = df_eval.merge(df_raw_extra, on='ClaimID', suffixes=('_old', ''))
    #get db review data
    df_dbreview = pd.read_csv(params.D0_DATABASE_REVIEW_FILE)
    #df_dbreview_ext = pd.read_csv(params.D0_DATABASE_REVIEW_EXTENDED_FILE)
    #drop passthru records
    df_dbreview = df_dbreview.query(
        'EvalStdReportGroup == "2017_Savings_Review"')
    df_map_dbrvwfields = df_map[df_map[dbrvwcol].notnull()]
    df_map_dbrvwfields = df_map_dbrvwfields[[ordercol, dbrvwcol]]
    df_dbreview = df_dbreview[df_map_dbrvwfields[dbrvwcol].tolist()]
    dbrwvdict = df_map_dbrvwfields.set_index(dbrvwcol)[ordercol].to_dict()
    df_dbreview = df_dbreview.rename(columns=dbrwvdict)
    '''
    #add extra fields
    df_map_dbextfields = df_map[df_map[dbextcol].notnull()]
    df_map_dbextfields = df_map_dbextfields[[ordercol, dbextcol]]
    df_dbreview_ext = df_dbreview_ext[df_map_dbextfields[dbextcol].tolist()]
    dbrwvdict = df_map_dbextfields.set_index(dbextcol)[ordercol].to_dict()
    df_dbreview_ext = df_dbreview_ext.rename(columns=dbrwvdict)
    df_dbreview = df_dbreview.merge(df_dbreview_ext, on='ClaimID')
    '''
    #add claimfull
    df_map_clmfullfields = df_map[df_map[clmfullcol].notnull()]
    df_map_clmfullfields = df_map_clmfullfields[[ordercol, clmfullcol]]
    df_claim_ext = df_claim_full[df_map_clmfullfields[clmfullcol].tolist()]
    cfulldict = df_map_clmfullfields.set_index(clmfullcol)[ordercol].to_dict()
    df_claim_ext = df_claim_ext.rename(columns=cfulldict)
    df_dbreview = df_dbreview.merge(df_claim_ext, on='ClaimID')
    #add source flag
    df_dbreview = df_dbreview.assign(ClaimsDatabase=1)
    #append would stack dbreview rows below the sample rows: total rows = dbreview + sample
    #df_evalallrows = df_eval.append(df_dbreview)
    #the merge below creates extra fields where they overlap, matched on
    #ClaimID: total rows = dbreview rows
    df_eval = df_eval.merge(df_dbreview,
                            on='ClaimID',
                            how='outer',
                            suffixes=('', '_cdr'))
    #now add in the atr fields for the full set
    df_eval = df_eval.merge(df_atr,
                            on='ClaimID',
                            how='left',
                            suffixes=('', '_atr'))
    #Add in the steps for table 5
    df_steps = createSteps(df_eval)
    df_steps = df_steps.reset_index()
    df_steps = remapdata(df_map, stepcol, ordercol, df_steps)
    #shouldn't be any overlap, but just in case we want to keep a clean set
    df_eval = df_eval.merge(df_steps, on='ClaimID', suffixes=('_old', ''))
    #just a diagnostic section, I think: count records set to zero as ineligible
    ws_map = wb['Eligibility']
    #convert to dataframe
    df_map_elig = convertwstodf(ws_map, 1)
    anycol = 'noteligfieldsany'
    allcol = 'noteligfieldsall'
    df_map_fields = df_map_elig[df_map_elig[anycol].notnull()]
    anyfields = df_map_fields[anycol].tolist()
    df_map_fields = df_map_elig[df_map_elig[allcol].notnull()]
    allfields = df_map_fields[allcol].tolist()
    tmp = ((df_eval[anyfields] == 'No').any(axis=1)
           | (df_eval[allfields] == 'No').all(axis=1))
    print(f'ineligible list length {tmp.sum()}')
    t6cnts = (df_eval[anyfields] == 'No').sum()
    csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\settozerocnts.csv'
    t6cnts.to_csv(csvfile)
    #print(f't6? {t6cnts}')
    #end of diagnostic section
    print(f'df_eval shape after dbreview merge is {df_eval.shape}')
    #print(f'df_evalallrows shape after dbreview append is {df_evalallrows.shape}')
    #flag for changed NTG and EUL IDs
    #df_eval = df_eval.assign(NTGIDChanged=df_eval['cdrNTG_ID'] != df_eval['EvalNTG_ID'])
    df_eval = df_eval.assign(EvalEUL_ID_trim=df_eval['EvalEUL_ID'].str.replace(
        '_Any', '').str.replace('_Gro', ''))
    #df_eval = df_eval.assign(EULIDChanged=df_eval['cdrEUL_ID'] != df_eval['EvalEUL_ID_trim'])
    df_eval = df_eval.assign(MeasDescChanged=df_eval['EvalMeasDescription'] !=
                             df_eval['MeasDescription'])
    #new fields to indicate savings changed
    df_eval = df_eval.assign(
        kWhChanged=df_eval['EvalExPostLifecycleNetkWh_atr'] !=
        df_eval['ExAnteLifecycleNetkWh'])
    df_eval = df_eval.assign(
        kWhPctChange=((df_eval['EvalExPostLifecycleNetkWh_atr'] -
                       df_eval['ExAnteLifecycleNetkWh']) /
                      df_eval['ExAnteLifecycleNetkWh']))
    df_eval = df_eval.assign(
        thmChanged=df_eval['EvalExPostLifecycleNetTherm_atr'] !=
        df_eval['ExAnteLifecycleNetTherm'])
    df_eval = df_eval.assign(
        thmPctChange=((df_eval['EvalExPostLifecycleNetTherm_atr'] -
                       df_eval['ExAnteLifecycleNetTherm']) /
                      df_eval['ExAnteLifecycleNetTherm']))
    #set final column order (the selection is currently commented out, so
    #df_final passes through unchanged)
    df_map_fields = df_map[df_map[ordercol].notnull()]
    #fieldorder = df_map_fields[ordercol].tolist()
    #df_final = df_eval[fieldorder]
    df_final = df_eval
    try:
        csvfile = params.D0_REPORT_DATA_FILE
        df_final.to_csv(csvfile, index=False)
        logging.info(f'wrote csv for {csvfile}')
    except Exception:
        logging.info('file in use, writing to backup')
        df_final.to_csv(params.SAMPLED_SITE_REVIEW_PATH + '\\evaldata2.csv',
                        index=False)
    #generate xl file
    exportdata = PivotOutput()
    exportdata.sourcefile = params.SAMPLED_SITE_REVIEW_PATH + '\\pivottemplate.xlsx'
    exportdata.outputfile = params.SAMPLED_SITE_REVIEW_PATH + '\\evaldata.xlsx'
    exportdata.datasheet = 'data'
    exportdata.pivotsheet = 'pvt_data'
    exportdata.data = df_final
    output_to_excel_pivot(exportdata)
    print('data exported')
    logging.info('Done generating report data files')

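#Note on the suffixes=('_old', '') pattern used repeatedly in generatefiles():
#pandas applies the first suffix to overlapping columns from the left frame and
#the second to the right frame, so the incoming data keeps the clean column
#name. Minimal toy demonstration:
def _demo_merge_suffixes():
    left = pd.DataFrame({'ClaimID': [1, 2], 'val': [10, 20]})
    right = pd.DataFrame({'ClaimID': [1, 2], 'val': [11, 21]})
    merged = left.merge(right, on='ClaimID', suffixes=('_old', ''))
    print(merged.columns.tolist())  #['ClaimID', 'val_old', 'val']
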
def gettablespecs(sfsession, filepath, statusfilter='Test'):
    """
    Load the various table specs from the passed file as defined on the Captions sheet
    Inputs - Filepath or workbook, statusfilter to control which tables
    Returns a two-element list of dicts, {'tables': tablespecs} and
    {'figures': figurespecs}
    """
    #open workbook
    #assumes there is a Captions sheet which will drive it
    sheet = 'Captions'
    srcsheet = 'SourceDefs'
    #TODO bring in def as a dataframe instead of messing with it as a sheet.
    if isinstance(filepath, str):
        file_item = sfsession.get_io_version(filepath)
        if file_item:
            filepath = file_item.io_data
    try:
        wb = openworkbook(filepath, values=True)
    except Exception:
        wb = filepath  #already a workbook object
    if not wb:
        print('problem loading workbook. Quitting')
        return False
    try:
        ws = wb[sheet]
    except KeyError:
        print('Captions sheet missing')
        return False
    try:
        #ws_src = None
        ws_src = wb[srcsheet]
    except KeyError:
        print('Source defs sheet missing')
        return False
    #cycle through the captions listed if type is table
    #for row in ws.iter_rows(row_offset=1):
    #get critical col numbers
    for i in range(1, ws.max_column + 1):
        if ws.cell(1, i).value is None:
            continue
        #print(f'text is {ws.cell(1, i)}')
        if ws.cell(1, i).value.lower() == 'type':
            typecol = i
        elif ws.cell(1, i).value.lower() == 'text':  #for old version
            captioncol = i
        elif ws.cell(1, i).value.lower() == 'caption':  #for new version
            captioncol = i
        elif ws.cell(1, i).value.lower() == 'sheet':
            sheetcol = i
        elif ws.cell(1, i).value.lower() == 'status':
            statuscol = i
        elif ws.cell(1, i).value.lower() == 'source':
            sourcecol = i
        elif ws.cell(1, i).value.lower() == 'destination':
            destcol = i
        elif ws.cell(1, i).value.lower() == 'style':
            stylecol = i
        elif ws.cell(1, i).value.lower() == 'fit':
            fitcol = i
    parameters = []
    #not using this section after D0 (I don't think)
    '''
    if 'sheetcol' in locals():
        #to pull filter column data. PA in the D0 example
        for i in range(sheetcol + 2, ws.max_column + 1):  #to skip extra column
            param = Pfield(i, ws.cell(1, i).value)
            #parameters.append({pcol: i, field: ws.cell(1, i)})
            parameters.append(param)
    '''
    tablespecs = {}
    figurespecs = {}
    for i in range(1, ws.max_row + 1):
        #print(f'processing row {i} as sheet {ws.cell(i, sheetcol).value}')
        #test below breaks on D0 so no longer compatible :(
        if (ws.cell(i, destcol).value is not None
                and ws.cell(i, destcol).value.lower() == 'report workbook'
                and ws.cell(i, statuscol).value is not None
                and statusfilter.lower() in ws.cell(i, statuscol).value.lower()):
            ws_object = wb[ws.cell(i, sheetcol).value]
            tblspec = TableSpec()
            tblspec.getdataspec(ws_object)
            tblspec.source = SourceDef().get_details(
                sfsession,
                wb=ws.parent,
                sheet='SourceDefs',
                srccol='sourcename',
                srcname='sourcedef',
                loccol='location',
                shtcol='sheet',
                tablename=ws.cell(i, sourcecol).value)
            tblspec.name = ws.cell(i, captioncol).value
            tblspec.get_unique_fields()
            tablespecs[tblspec.name] = tblspec
        elif (ws.cell(i, destcol).value is not None
              and ws.cell(i, typecol).value.lower() == 'table'
              and ws.cell(i, statuscol).value is not None
              and statusfilter.lower() in ws.cell(i, statuscol).value.lower()):
            lkpsheet = ws.cell(i, sheetcol).value
            if lkpsheet not in wb.sheetnames:
                print(f'yo! sheet missing: {lkpsheet}')
                continue
            ws_object = wb[ws.cell(i, sheetcol).value]
            #ws_object = wb[ws.cell(i, captioncol).hyperlink.location.split('!')[0].replace("'", '')]
            sfilter = ''
            for item in parameters:
                if ws.cell(i, item.column).value:
                    if ws.cell(i, item.column).value.isnumeric():
                        sfilter += f'{item.field} == {ws.cell(i, item.column).value} and '
                    else:
                        sfilter += f'{item.field} == \'{ws.cell(i, item.column).value}\' and '
            if sfilter == '':
                sfilter = None
            elif sfilter[-4:] == 'and ':
                sfilter = sfilter[:-5]
            tblspec = createtablespec(ws_object, sfilter)
            if not tblspec:
                print(f'problem getting tablespec for {ws_object.title}')
                continue
            tblspec.source = SourceDef().get_details(
                sfsession,
                wb=ws.parent,
                sheet='SourceDefs',
                srccol='sourcename',
                srcname='sourcedef',
                loccol='location',
                shtcol='sheet',
                tablename=ws.cell(i, sourcecol).value)
            tblspec.name = ws.cell(i, captioncol).value
            tblspec.style = ws.cell(i, stylecol).value
            tblspec.autofit = ws.cell(i, fitcol).value
            tblspec.get_unique_fields()
            tablespecs[tblspec.name] = tblspec
        elif (ws.cell(i, destcol).value is not None
              and ws.cell(i, typecol).value.lower() == 'figure'):
            ws_object = wb[ws.cell(i, captioncol).hyperlink.location.split(
                '!')[0].replace("'", '')]
            #this call currently doesn't do anything because the procedure is just a pass
            figurespecs[ws.cell(i, captioncol).value] = createfigurespec(ws_object)
    #possibly only temporary processing of figures
    #TODO change to io version
    df_captions = pd.read_excel(filepath, sheet)
    df_figures = df_captions[
        df_captions['Type'].str.contains('Plot')
        & df_captions['Status'].str.contains(statusfilter)].groupby('Source')
    for source, data in df_figures:
        figurespec = TableSpec()
        figurespec.captions = [x for x in data['Caption']]
        figurespec.source = SourceDef().get_details(
            sfsession,
            wb=ws.parent,
            sheet='SourceDefs',
            srccol='sourcename',
            srcname='sourcedef',
            loccol='location',
            shtcol='sheet',
            tablename=source)
        figurespecs[source] = figurespec
    specs = []
    specs.append({'tables': tablespecs})
    specs.append({'figures': figurespecs})
    return specs

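#Usage sketch: gettablespecs returns a two-element list of dicts, unpacked
#positionally. The session setup and path here are hypothetical.
def _demo_gettablespecs():
    sfsession = ShareFileSession(SHAREFILE_OPTIONS)
    specs = gettablespecs(sfsession, '/Shared/report/captions.xlsx',
                          statusfilter='Final')
    if not specs:
        return
    tablespecs = specs[0]['tables']
    figurespecs = specs[1]['figures']
    print(f'{len(tablespecs)} tables, {len(figurespecs)} figures')
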
def db_to_excel(filepath):
    """
    Move data from the db (or other source) to excel files as defined in the passed excel filename
    The passed workbook must have a sheet named spec and columns named
    active, srctype, tablename, srcindex, srcfields, destindex, destfile,
    sheet, headerrow, fields (plus srcsheet and srcheaderrow when srctype
    is 'excel')
    """
    sfsession = ShareFileSession(SHAREFILE_OPTIONS)
    #open workbook
    control_item = sfsession.get_io_version(filepath)
    if not control_item:
        logging.warning(f'driver file {filepath} not found for db_to_excel')
        return False
    control_wb = wf.openworkbook(control_item.io_data)
    ws = control_wb['spec']
    filelist = wf.convertwstodf(ws)
    filelist = filelist.query('active == "y"')
    control_wb.close()
    control_item = None
    #gather needed parts
    #TODO sort list so all changes to a workbook can be done and that workbook
    #saved once, rather than opening and closing for each write
    for filerow in filelist.itertuples():
        excelpath = filerow.destfile
        localname = os.path.basename(excelpath)
        headerrow = filerow.headerrow
        sheet = filerow.sheet
        fields = filerow.fields
        table = filerow.tablename
        srcindex = filerow.srcindex
        dstindex = filerow.destindex
        wkb_item = sfsession.get_io_version(excelpath)
        hasvba = '.xlsm' in excelpath
        srcfields = filerow.srcfields
        srctype = filerow.srctype
        if srctype == 'db':
            #get data from db
            data = pd.read_sql(table, engine)
        elif srctype == 'csv':
            src_item = sfsession.get_io_version(table)
            if not src_item:
                logging.warning(f'file {table} not found for db_to_excel')
                continue
            src_item.io_data.seek(0)
            data = pd.read_csv(src_item.io_data)
        elif srctype == 'excel':
            src_item = sfsession.get_io_version(table)
            if not src_item:
                logging.warning(f'file {table} not found for db_to_excel')
                continue
            srcsheet = filerow.srcsheet
            srcrow = filerow.srcheaderrow
            #pandas read_excel has no start_row argument; header (0-based)
            #is the supported way to say where the header row sits
            data = pd.read_excel(src_item.io_data,
                                 sheet_name=srcsheet,
                                 header=srcrow - 1)
        srcfieldlist = srcfields.split(',')
        srcfieldlist.append(srcindex)
        #copy so renaming the columns doesn't warn about writing to a slice
        data_fields = data[srcfieldlist].copy()
        dstfieldlist = fields.split(',')
        dstfieldlist.append(dstindex)
        #remap src field names to dst field names
        data_fields.columns = dstfieldlist
        datalist = data_fields.to_dict('records')
        #this one is used to figure out where data should go, but since it
        #loads values it can't be saved
        ws_vals = getSampleControlFile(
            filepath=wkb_item.io_data,
            wks=sheet,
            headerrow=headerrow,  #not used but passing it anyway
            asdf=False,
            usevalues=True)
        #this is the one the values actually get written to
        ws_out = getSampleControlFile(
            filepath=wkb_item.io_data,
            wks=sheet,
            headerrow=headerrow,  #not used but passing it anyway
            asdf=False,
            usevalues=False,
            usevba=hasvba)
        print(f'starting to write to excel {fields} {str(datetime.now())}')
        ws_results = writetoexcel(ws_vals=ws_vals,
                                  ws_out=ws_out,
                                  header_row=headerrow,
                                  datalist=datalist,
                                  key=dstindex)
        print(f'done writing to excel {str(datetime.now())}')
        if not ws_results:
            print('bad results')
        else:
            tracker_io = BytesIO()
            ws_results.parent.save(tracker_io)
            #upload to sf
            wkb_folderID = wkb_item.data['Parent']['Id']
            sfsession.upload_file(wkb_folderID, localname, tracker_io)
            print(f'done uploading to sharefile {str(datetime.now())}')

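#Sketch of one 'spec' row that db_to_excel() consumes (values are hypothetical;
#the column names match the filerow attributes read above):
#  active: y
#  srctype: csv                       (one of db / csv / excel)
#  tablename: /Shared/data/atr.csv    (db table name, or source file path)
#  srcindex: ClaimId                  srcfields: kwh,therms
#  destindex: ClaimId                 fields: EvalkWh,EvalTherms
#  destfile: /Shared/trackers/tracker.xlsm   sheet: data   headerrow: 1
#  srcsheet / srcheaderrow: only required when srctype == 'excel'
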
def createSteps(data=None):
    if data is None:
        df_data = pd.read_csv(params.D0_REPORT_DATA_FILE)
    else:
        df_data = data
    xreffile = params.D0_FIELD_CROSS_REF_FILE
    wb = openworkbook(xreffile)
    ws_map = wb['stepeqn']
    #convert to dataframe
    df_map = convertwstodf(ws_map, 1)
    fieldcol = 'fieldname'
    eqncol = ['eqn_true', 'eqn_false', 'eqn_condition']
    eqnheaders = [fieldcol]
    eqnheaders.extend(eqncol)
    #some constants for the calculated stuff
    operators = '*/()+-><!==AND&OR|.'
    stringops = 'ANDOR&|'
    dfname = 'df_all_fields'
    df_all_fields = df_data.set_index('ClaimID')
    #pull in conditional
    df_map_eqnfields = df_map[df_map[eqncol].notnull().any(axis=1)]
    df_map_eqnfields = df_map_eqnfields[eqnheaders]
    print(f'all fields column count to start is {len(df_all_fields.columns)}')
    for _, row in df_map_eqnfields.iterrows():
        parts_true = row[eqncol[0]].split()
        try:
            parts_false = row[eqncol[1]].split()
            parts_cond = row[eqncol[2]].split()
            eqnonly = False
        except Exception:
            eqnonly = True
        nojoin = False
        for p in parts_true:
            if len(parts_true) == 1:
                eqn_true = p
                nojoin = True
            elif (p not in operators and '==' not in p and '.' not in p
                  and not is_number(p)):
                parts_true[parts_true.index(p)] = "{}['{}']".format(dfname, p)
        if not nojoin:
            eqn_true = ''.join(parts_true)
        nojoin = False
        if not eqnonly:
            for p in parts_false:
                if len(parts_false) == 1:
                    eqn_false = p
                    nojoin = True
                elif p not in operators and '==' not in p and not is_number(p):
                    parts_false[parts_false.index(p)] = "{}['{}']".format(
                        dfname, p)
            if not nojoin:
                eqn_false = ''.join(parts_false)
            nojoin = False
            for p in parts_cond:
                if len(parts_cond) == 1:
                    eqn_cond = p
                    nojoin = True
                elif (p not in operators and 'No' not in p and '==' not in p
                      and '.' not in p and not is_number(p)):
                    #maybe change to startswith in operators if we can use a list
                    parts_cond[parts_cond.index(p)] = "{}['{}']".format(
                        dfname, p)
                elif p in operators:
                    parts_cond[parts_cond.index(p)] = ' ' + p + ' '
            if not nojoin:
                eqn_cond = ''.join(parts_cond)
        #print('eqn true:{}'.format(eqn_true))
        #print('eqn false:{}'.format(eqn_false))
        #print('eqn cond:{}'.format(eqn_cond))
        #pattern: df['d'] = df['b'].where(df['b'] < 0, df['c'])
        if eqnonly:
            myargs = eqn_true
        else:
            myargs = eqn_true + '.where(' + eqn_cond + ', ' + eqn_false + ')'
        #dftmp is an alias of df_all_fields, not a copy, which is why the
        #assignment below also adds the column to df_all_fields
        dftmp = df_all_fields
        #dftmp.set_index('ClaimID', inplace=True)
        #print(f'all fields column count before eval is {len(df_all_fields.columns)}')
        print(f'processing {row[fieldcol]}')  #', my args is {myargs}')
        dftmp[row[fieldcol]] = eval(myargs)
        #dftmp = dftmp[row[fieldcol]].reset_index().set_index('ClaimID')
        #print(f'all fields column count after eval is {len(df_all_fields.columns)}')
        #df_all_fields = df_all_fields.join(dftmp)
        #print(f'all fields column count after join is {len(df_all_fields.columns)}')
        #df_all_fields = df_all_fields.merge(dftmp, on='ClaimID')
    print(f'all field shape is {df_all_fields.shape}')
    answerfields = [
        'ExAnteLifecycleNetkW', 'ExAnteLifecycleNetkWh',
        'ExAnteLifecycleNetTherm', 'cdrdatekw', 'cdrdatekwh', 'cdrdatethm',
        'cdrntgeligkw', 'cdrntgeligkwh', 'cdrntgeligthm', 'cdrulntgeligkw',
        'cdrulntgeligkwh', 'cdrulntgeligthm'
    ]
    countfields = [
        'cdrdateineligibleflagkw', 'cdrdateineligibleflagkwh',
        'cdrdateineligibleflagthm', 'cdrdatentgineligibleflagkw',
        'cdrdatentgineligibleflagkwh', 'cdrdatentgineligibleflagthm',
        'cdrdatentgulineligibleflagkw', 'cdrdatentgulineligibleflagkwh',
        'cdrdatentgulineligibleflagthm'
    ]
    #copy rather than alias so adding 'PA' doesn't mutate answerfields
    answerfieldsPA = answerfields + ['PA']
    df_all_fieldsShort = df_all_fields[answerfieldsPA]
    summarytable = df_all_fieldsShort.groupby(['PA']).sum()
    csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\steps_dbr_SumsbyPA.csv'
    summarytable.to_csv(csvfile)
    answerfieldsPA = countfields + ['PA']
    df_all_fieldsShort = df_all_fields[answerfieldsPA]
    summarytable = df_all_fieldsShort.groupby(['PA']).sum()
    csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\steps_dbr_CountsbyPA.csv'
    summarytable.to_csv(csvfile)
    '''
    pd.options.display.float_format = '{:20,.0f}'.format
    print(f'step 1 summary {summarytable}')
    df_output = reshapestepssummary(summarytable)
    csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\steps_dbr_SumsAll.csv'
    df_output.to_csv(csvfile)
    summarytable = df_all_fields[countfields].agg('sum')
    print(f'step 1 count summary {summarytable}')
    df_output = reshapestepssummary(summarytable)
    csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\steps_dbr_CountsAll.csv'
    df_output.to_csv(csvfile)
    '''
    #sample only version
    answerfields = [
        'ExAnteLifecycleNetkW', 'ExAnteLifecycleNetkWh',
        'ExAnteLifecycleNetTherm', 'evaleligkw', 'evaleligkwh', 'evaleligthm',
        'evalsvgsEligkw', 'evalsvgsEligkwh', 'evalsvgsEligthm',
        'evalULsvgeligkw', 'evalULsvgeligkwh', 'evalULsvgeligthm',
        'evalNTGULsvgeligkw', 'evalNTGULsvgeligkwh', 'evalNTGULsvgeligthm'
    ]
    countfields = [
        'evalineligibleflagkw', 'evalineligibleflagkwh',
        'evalineligibleflagthm', 'evalsvgschangeflagkw',
        'evalsvgschangeflagkwh', 'evalsvgschangeflagthm',
        'evalULchangeflagkw', 'evalULchangeflagkwh', 'evalULchangeflagthm',
        'evalNTGchangeflagkw', 'evalNTGchangeflagkwh', 'evalNTGchangeflagthm'
    ]
    summarytable = df_all_fields.query(
        'SampledProject == 1')[answerfields].agg('sum')
    #print(f'step 2x summary {summarytable}')
    summarytable = df_all_fields.query(
        'SampledProject == 1')[countfields].agg('sum')
    #print(f'step 2x count summary {summarytable}')
    #ATR version
    answerfields = [
        'ExAnteLifecycleNetkW', 'ExAnteLifecycleNetkWh',
        'ExAnteLifecycleNetTherm', 'atr_eligkw', 'atr_eligkwh', 'atr_eligthm',
        'atr_svgsEligkw', 'atr_svgsEligkwh', 'atr_svgsEligthm',
        'atr_NTGsvgeligkw', 'atr_NTGsvgeligkwh', 'atr_NTGsvgeligthm',
        'atr_NTGULsvgeligkw', 'atr_NTGULsvgeligkwh', 'atr_NTGULsvgeligthm'
    ]
    countfields = [
        'atr_ineligibleflagkw', 'atr_ineligibleflagkwh',
        'atr_ineligibleflagthm', 'atr_svgschangeflagkw',
        'atr_svgschangeflagkwh', 'atr_svgschangeflagthm',
        'atr_ULchangeflagkw', 'atr_ULchangeflagkwh', 'atr_ULchangeflagthm',
        'atr_NTGchangeflagkw', 'atr_NTGchangeflagkwh', 'atr_NTGchangeflagthm'
    ]
    answerfieldsPA = answerfields + ['PA']
    df_all_fieldsShort = df_all_fields[answerfieldsPA]
    summarytable = df_all_fieldsShort.groupby(['PA']).sum()
    csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\steps_DS_SumsbyPA.csv'
    summarytable.to_csv(csvfile)
    answerfieldsPA = countfields + ['PA']
    df_all_fieldsShort = df_all_fields[answerfieldsPA]
    summarytable = df_all_fieldsShort.groupby(['PA']).sum()
    csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\steps_DS_CountsbyPA.csv'
    summarytable.to_csv(csvfile)
    csvfile = params.D0_REPORT_STEPTABLE_DATA_FILE
    try:
        df_all_fields.to_csv(csvfile, index=True)
    except Exception:
        print('drat, step file in use. no file created.')
    if data is not None:
        return df_all_fields

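#Sketch of the eval()/where() mechanism createSteps() builds from the stepeqn
#sheet: tokens that are not operators or literals become df_all_fields['col']
#references, and the assembled string evaluates to
#eqn_true.where(eqn_condition, eqn_false). Toy equivalent with made-up columns:
def _demo_step_eqn():
    df_all_fields = pd.DataFrame({'kwh': [5.0, -3.0], 'fallback': [0.0, 0.0]})
    #sheet row: eqn_true='kwh', eqn_condition='kwh > 0', eqn_false='fallback'
    myargs = ("df_all_fields['kwh'].where("
              "df_all_fields['kwh'] > 0, df_all_fields['fallback'])")
    df_all_fields['step1'] = eval(myargs)
    print(df_all_fields)  #step1 is 5.0 where kwh > 0, else 0.0
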
def createEvalResults(session, WriteFinalFile=False):
    #create sample file
    data = pd.read_csv(params.D0_DATA_FILE)
    # Exclude replaced measurements
    not_replaced = data[data.Replaced != 'Yes']
    # Pick only sampled rows
    sample_list = not_replaced[not_replaced.sampled == 'Y']
    # Group by SampleID and SBW_ProjID
    sample_groups = sample_list.groupby(
        ['SampleID', 'SBW_ProjID']).size().reset_index().rename(
            columns={0: 'msrcount'})
    sample_groups = sample_groups.astype({'SampleID': int})
    #read data from database
    df_control = getSampleControlFile()
    df_control = df_control[df_control['ProjectStatus'] == 'Complete']
    #join together so we have complete status and project ids
    mylist = df_control.set_index('SampleID').join(
        sample_groups.set_index('SampleID'), how='left', lsuffix='_msr')
    #print('MYLIST TYPE IS {}'.format(mylist))
    #print('list cols:{}'.format(mylist.columns))
    msrheaders = ['SBW_ProjID', 'ProjectStatus', 'msrcount']
    mylist = mylist[msrheaders]
    print('complete list shape: {}'.format(mylist.shape))
    #print(mylist)
    #s.query(User).filter(User.name == 'Mariana').one()
    #msrs = session.query(Measure).all()
    #print('msr type is {}'.format(type(msrs)))
    #df_msrs = pd.DataFrame(session.query(Measure).all())
    df_msrs = pd.read_sql(session.query(Measure).statement, session.bind)
    #print('msr type: {}'.format(type(df_msrs)))
    #mylist.set_index('SBW_ProjID', inplace=True)
    #df_msrs.set_index('SBW_ProjID', inplace=True)
    df_complete_msrs = mylist.set_index('SBW_ProjID').join(
        df_msrs.set_index('SBW_ProjID'), how='left', lsuffix='_msr')
    #df_complete_msrs = mylist.join(df_msrs, how='left', lsuffix='_msr')
    #print('msr shape:{}'.format(df_msrs.shape))
    print('msr complete shape:{}'.format(df_complete_msrs.shape))
    #print(df_complete_msrs)
    #Make adjustments
    #read cross-reference file
    xreffile = params.D0_FIELD_CROSS_REF_FILE
    wb = openworkbook(xreffile)
    ws_map = wb['Eligibility']
    #convert to dataframe
    df_map = convertwstodf(ws_map, 1)
    anycol = 'noteligfieldsany'
    allcol = 'noteligfieldsall'
    df_map_fields = df_map[df_map[anycol].notnull()]
    anyfields = df_map_fields[anycol].tolist()
    df_map_fields = df_map[df_map[allcol].notnull()]
    allfields = df_map_fields[allcol].tolist()
    #Measures not eligible
    savingsfields = [
        'EvalBase1kWhSvgs', 'EvalBase1kWSvgs', 'EvalBase1ThermSvgs',
        'EvalBase2kWhSvgs', 'EvalBase2kWSvgs', 'EvalBase2ThermSvgs'
    ]
    df_complete_msrs.reset_index(inplace=True)
    df_complete_msrs['EvalIneligiblekw'] = False
    df_complete_msrs['EvalIneligiblekwh'] = False
    df_complete_msrs['EvalIneligiblethm'] = False
    for field in savingsfields:
        if 'kWSvgs' in field:
            engtype = 'kw'
        if 'kWhSvgs' in field:
            engtype = 'kwh'
        if 'ThermSvgs' in field:
            engtype = 'thm'
        #create Orig field to hold original value
        df_complete_msrs[field + '_Orig'] = df_complete_msrs[field]
        df_complete_msrs[field + '_ChangeReason'] = 'NA'
        df_complete_msrs[field] = df_complete_msrs[field].where(
            (df_complete_msrs[anyfields] != 'No').all(axis=1), 0)
        #df_complete_msrs[field] = df_complete_msrs[field].where((df_complete_msrs[anyfields] != 'No').all(1)
        #    & (df_complete_msrs[allfields] != 'No').any(1), 0)
        df_complete_msrs[field + '_ChangeReason'] = df_complete_msrs[
            field + '_ChangeReason'].where(
                (df_complete_msrs[anyfields] != 'No').all(axis=1), 'Ineligible')
        #df_complete_msrs[field + '_ChangeReason'] = df_complete_msrs[field + '_ChangeReason'].where((df_complete_msrs[anyfields] != 'No').all(1)
        #    & (df_complete_msrs[allfields] != 'No').any(1), 'Ineligible')
        df_complete_msrs['EvalIneligible' + engtype] = df_complete_msrs[
            'EvalIneligible' + engtype].where(
                (df_complete_msrs[anyfields] != 'No').all(axis=1), True)
        #df_complete_msrs['EvalIneligible' + engtype] = df_complete_msrs['EvalIneligible' + engtype].where((df_complete_msrs[anyfields] != 'No').all(1)
        #    & (df_complete_msrs[allfields] != 'No').any(1), True)
    #ntgr to zero
    ntgrfields = ['EvalNTG_kWH', 'EvalNTG_therms']  #, 'EvalNTGRTherm', 'EvalNTGRCost']
    df_complete_msrs['ProgInfluenceFlag'] = (
        df_complete_msrs[allfields] == 'No').all(axis=1)
    for field in ntgrfields:
        df_complete_msrs[field] = df_complete_msrs[field].where(
            (df_complete_msrs[allfields] != 'No').any(axis=1), 0)
    df_project = pd.read_sql(session.query(Sample).statement, session.bind)
    print('prj shape:{}'.format(df_project.shape))
    #produces cartesian join, ouch
    #df_join = pd.DataFrame(session.query(Measure, Sample).all())
    #print('join shape:{}'.format(df_join.shape))
    #add calculated/lookup fields
    #set column names
    ws_map = wb['mapping']
    #convert to dataframe
    df_map = convertwstodf(ws_map, 1)
    #print('map shape is {}'.format(df_map.shape))
    #print(df_map)
    #df_all_msrs[df_all_msrs[testfields_MeasAppType].notnull().any(1)
    atrfinalcol = 'ATRFieldlist'
    atrcol = 'InternalFields'
    claimcol = 'FD_SampleFieldName'
    wkbcol = 'workbookFieldName'
    dbcol = 'databasefieldname'
    srccol = 'atraccess'
    constcol = 'Constant'
    calccol = 'Calculation'
    calc2col = 'DependantCalc'
    rndcol = 'roundto'
    eqncol = ['eqn_true', 'eqn_false', 'eqn_condition']
    claimheaders = [atrcol, claimcol]
    #wkbheaders = [atrcol, wkbcol]
    dbheaders = [atrcol, dbcol]
    srcheaders = [atrcol, srccol]
    calcheaders = [atrcol, calccol]
    calc2headers = [atrcol, calc2col]
    constheaders = [atrcol, constcol]
    eqnheaders = [atrcol]
    eqnheaders.extend(eqncol)
    rndheaders = [atrfinalcol, rndcol]
    #Bring in Faith's claim fields
    df_map_claimfields = df_map[df_map[claimcol].notnull()]
    df_map_claimfields = df_map_claimfields[claimheaders]
    df_dups = df_map_claimfields[df_map_claimfields.duplicated(claimcol)]
    print('dup is {}'.format(df_dups))
    df_map_claimfields.drop(df_dups[atrcol].tolist(), axis=0, inplace=True)
    claimdict = df_map_claimfields.set_index(claimcol)[atrcol].to_dict()
    df_atr = sample_list[df_map_claimfields[claimcol].tolist()]
    df_atr = df_atr.rename(columns=claimdict)
    #if df_dups is not None:
    if len(df_dups.index) > 0:
        #only works if there are not multiples of the same dup
        #add the dup fields
        claimdict = df_dups.set_index(claimcol)[atrcol].to_dict()
        tmp = sample_list.rename(columns=claimdict)
        df_atr = df_atr.join(tmp[df_dups[atrcol].tolist()])
        del tmp
    #print('atr shape : {}, cols: {}'.format(df_atr.shape, df_atr.columns))
    #Add fields from db column
    df_map_dbfields = df_map[df_map[dbcol].notnull()]
    df_map_dbfields = df_map_dbfields[dbheaders]
    df_dups = df_map_dbfields[df_map_dbfields.duplicated(dbcol)]
    print('dup is {}'.format(df_dups))
    #works even if no dups
    df_map_dbfields.drop(df_dups[atrcol].tolist(), axis=0, inplace=True)
    dbdict = df_map_dbfields.set_index(dbcol)[atrcol].to_dict()
    df_eval_msrs = df_complete_msrs[df_map_dbfields[dbcol].tolist()]
    df_eval_msrs = df_eval_msrs.rename(columns=dbdict)
    #uses the lowercase ClaimId spelling because the fields have been renamed
    #to their ATR versions
    df_atr = df_atr.set_index('ClaimId').join(df_eval_msrs.set_index('ClaimId'))
    #df_atr = sample_list[df_map_dbfields[claimcol].tolist()]
    #df_atr = df_atr.rename(columns=claimdict)
    if len(df_dups.index) > 0:
        #only works if there are not multiples of the same dup
        #add the dup fields
        dbdictdup = df_dups.set_index(dbcol)[atrcol].to_dict()
        tmp = df_complete_msrs.rename(columns=dbdictdup)
        if df_atr.index.name != 'ClaimId':
            df_atr.set_index('ClaimId', inplace=True)
        #df_complete_msrs still uses the ClaimID spelling
        df_atr = df_atr.join(tmp.set_index('ClaimID')[df_dups[atrcol].tolist()])
        del tmp
    #print('atr shape : {}, cols: {}'.format(df_atr.shape, df_atr.columns))
    #print('atr shape with db fields : {}'.format(df_atr.shape))
    # Add in atraccess fields
    df_map_srcfields = df_map[df_map[srccol].notnull()]
    df_map_srcfields = df_map_srcfields[srcheaders]
    df_src = pd.read_csv(params.D0_ATR_SOURCE_FILE)
    df_src = df_src[df_map_srcfields[srccol].tolist()]
    df_atr = df_atr.join(df_src.set_index('ClaimId'))
    #print('atr shape with atr source fields : {}'.format(df_atr.shape))
    # At this point df_atr has all the fields in columns B:F from the cross
    # ref. Just need to add the calculated fields
    #pull in constants
    df_map_constants = df_map[df_map[constcol].notnull()]
    df_map_constants = df_map_constants[constheaders]
    dbdict = df_map_constants.set_index(atrcol)[constcol].to_dict()
    for i in dbdict:
        if isinstance(dbdict[i],
                      (int, float, complex)) or dbdict[i].lower() != 'null':
            df_atr[i] = dbdict[i]
        else:
            df_atr[i] = None
    #print('atr shape with constants added : {}'.format(df_atr.shape))
    #some constants for the calculated stuff
    operators = '*/()+-><!==AND&OR|.'
    stringops = 'ANDOR&|'
    dfname = 'df_all_fields'
    #pull in conditional
    df_map_eqnfields = df_map[df_map[eqncol].notnull().any(axis=1)]
    df_map_eqnfields = df_map_eqnfields[eqnheaders]
    for _, row in df_map_eqnfields.iterrows():
        #df_all_fields is the namespace the eval() strings below refer to;
        #rebuilt each pass so earlier results are visible
        df_all_fields = df_atr.join(df_complete_msrs.set_index('ClaimID'),
                                    how='left',
                                    rsuffix='_msr')
        #print('type of row is {}, fieldname is {}, value is {}'.format(type(row), row[eqncol], row))
        parts_true = row[eqncol[0]].split()
        parts_false = row[eqncol[1]].split()
        parts_cond = row[eqncol[2]].split()
        nojoin = False
        for p in parts_true:
            if len(parts_true) == 1:
                eqn_true = p
                nojoin = True
            elif p not in operators and '==' not in p and not is_number(p):
                parts_true[parts_true.index(p)] = "{}['{}']".format(dfname, p)
        if not nojoin:
            eqn_true = ''.join(parts_true)
        nojoin = False
        for p in parts_false:
            if len(parts_false) == 1:
                eqn_false = p
                nojoin = True
            elif p not in operators and '==' not in p and not is_number(p):
                parts_false[parts_false.index(p)] = "{}['{}']".format(dfname, p)
        if not nojoin:
            eqn_false = ''.join(parts_false)
        nojoin = False
        for p in parts_cond:
            if len(parts_cond) == 1:
                eqn_cond = p
                nojoin = True
            elif (p not in operators and '==' not in p and '.' not in p
                  and not is_number(p)):
                #maybe change to startswith in operators if we can use a list
                parts_cond[parts_cond.index(p)] = "{}['{}']".format(dfname, p)
        if not nojoin:
            eqn_cond = ''.join(parts_cond)
        #print('eqn true:{}'.format(eqn_true))
        #print('eqn false:{}'.format(eqn_false))
        #print('eqn cond:{}'.format(eqn_cond))
        #pattern: df['d'] = df['b'].where(df['b'] < 0, df['c'])
        myargs = eqn_true + '.where(' + eqn_cond + ', ' + eqn_false + ')'
        dftmp = df_complete_msrs.set_index('ClaimID')
        dftmp[row[atrcol]] = eval(myargs)
        dftmp = dftmp[row[atrcol]]
        df_atr = df_atr.join(dftmp)
    #print('atr shape with conditionals added : {}'.format(df_atr.shape))
    ###
    #pull in the calculated fields
    df_map_calcfields = df_map[df_map[calccol].notnull()]
    df_map_calcfields = df_map_calcfields[calcheaders]
    #append dependent calcs
    df_map_calc2fields = df_map[df_map[calc2col].notnull()]
    df_map_calc2fields = df_map_calc2fields[calc2headers]
    df_map_calc2fields.columns = calcheaders  #.rename({calc2col: calccol}, inplace=True)
    df_map_calcfields = pd.concat([df_map_calcfields, df_map_calc2fields])
    #loop through
    for _, row in df_map_calcfields.iterrows():
        df_all_fields = df_atr.join(df_complete_msrs.set_index('ClaimID'),
                                    how='left',
                                    rsuffix='_msr')
        parts = row[calccol].split()
        nojoin = False
        for p in parts:
            if len(parts) == 1:
                eqn = p
                nojoin = True
            elif (p not in operators and '==' not in p and '.' not in p
                  and not is_number(p)):
                parts[parts.index(p)] = "{}['{}']".format(dfname, p)
            elif p in stringops:
                parts[parts.index(p)] = ' %s ' % p
        if not nojoin:
            eqn = ''.join(parts)
        nojoin = False
        myargs = eqn
        dftmp = df_complete_msrs.set_index('ClaimID')
        dftmp[row[atrcol]] = eval(myargs)
        dftmp = dftmp[row[atrcol]]
        df_atr = df_atr.join(dftmp)
    #New: appends the last field; delete if trouble
    df_all_fields = df_atr.join(df_complete_msrs.set_index('ClaimID'),
                                how='left',
                                rsuffix='_msr')
    #above ok?
    print('atr shape with everything added : {}'.format(df_atr.shape))
    ## Drop passthru from atr; the dbreview record will stay in place
    #df[df.name != 'Tina']
    df_atr = df_atr[df_atr.EvalNetPassThru != True]
    print('atr shape after drop EvalNetPassThru : {}'.format(df_atr.shape))
    # Pull in db review file
    df_dbrvw = pd.read_csv(params.D0_DATABASE_REVIEW_FILE)
    # Hack to fix string number column. Not needed anymore:
    #df_dbrvw["ExAnteLifecycleNetkWh"] = df_dbrvw["ExAnteLifecycleNetkWh"].str.replace('-', '0')
    #df_dbrvw["ExAnteLifecycleNetkWh"] = df_dbrvw["ExAnteLifecycleNetkWh"].str.replace(',', '').astype(float)
    #Combine with atr
    #dropping all sampled records, then join
    ''' Didn't work
    #df_dbrvwnonsampled = (df_dbrvw.merge(df_atr, on=['ClaimID', 'ClaimId'], how='left', indicator=True)
    df_dbrvwnonsampled = (df_dbrvw.merge(df_atr, how='left', indicator=True)
                          .query('_merge == "left_only"')
                          .drop('_merge', 1))
    '''
    df_dbrvwnonsampled = df_dbrvw[~df_dbrvw.ClaimId.isin(df_atr.index)]
    #print('original shape is {}, after drop it is {}'.format(df_dbrvw.shape, df_dbrvwnonsampled.shape))
    #df_atr.reset_index(inplace=True)
    df_fullatr = pd.concat([df_dbrvwnonsampled.set_index('ClaimId'), df_atr],
                           sort=False)
    try:
        csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\allfieldsatrdata.csv'
        df_fullatr.to_csv(csvfile)
        logging.info(f'wrote csv for {csvfile}')
    except Exception:
        logging.info('file in use, writing to backup')
        df_fullatr.to_csv(params.SAMPLED_SITE_REVIEW_PATH +
                          '\\allfieldsatrdata2.csv')
    #get cols
    df_map_atrfields = df_map[df_map[atrfinalcol].notnull()]
    atrcols = df_map_atrfields[atrfinalcol].tolist()
    #print('final cols are {}'.format(atrcols))
    df_fullatr.index.name = 'ClaimId'
    df_fullatr = df_fullatr.reset_index()
    #df_fullatr.rename(columns={'ClaimID': 'ClaimId'}, inplace=True)
    df_fullatr = df_fullatr[atrcols]
    df_fullatr.set_index('ClaimId', inplace=True)
    #Final data clean up
    #df_fullatr.fillna(0, inplace=True)  #don't turn back on unless we deal with marketeffectsbenefits nulls
    df_fullatr.replace({True: '1', False: '0'}, inplace=True)
    # round output for passthru = 0
    rounding = False
    if rounding:
        df_map_roundfields = df_map[df_map[rndcol].notnull()]
        df_map_roundfields = df_map_roundfields[rndheaders]
        df_map_roundfields.set_index(atrfinalcol, inplace=True)
        df_map_roundfields = df_map_roundfields.astype({rndcol: int})
        df_map_roundfields['types'] = 'float'
        df_fullatr = df_fullatr.astype(df_map_roundfields['types'])
        print('fullatr shape before round: {}'.format(df_fullatr.shape))
        df_fullatr[df_fullatr['EvalNetPassThru'] == 1].round(
            df_map_roundfields[rndcol])
        print('fullatr shape after round: {}'.format(df_fullatr.shape))
        #df_fullatr = sigfigs(df_fullatr)
    print('alldata shape: {}'.format(df_all_fields.shape))
    logging.info('printing atr files')
    try:
        csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\atrdata.csv'
        df_atr.to_csv(csvfile)
        logging.info(f'wrote csv for {csvfile}')
    except Exception:
        logging.info('file in use, writing to backup')
        df_atr.to_csv(params.SAMPLED_SITE_REVIEW_PATH + '\\tmpatr2.csv')
    try:
        csvfile = params.D0_ALL_DATA_FILE
        #csvfile = params.SAMPLED_SITE_REVIEW_PATH + '\\alldata.csv'
        df_all_fields.to_csv(csvfile)
        logging.info(f'wrote csv for {csvfile}')
    except Exception:
        logging.info('file in use, writing to backup')
        df_all_fields.to_csv(params.SAMPLED_SITE_REVIEW_PATH + '\\alldata2.csv')
    try:
        csvfile = params.D0_LOCAL_ATR_OUTPUT_FILE
        df_fullatr.to_csv(csvfile)
        logging.info(f'wrote csv for {csvfile}')
    except Exception:
        logging.info('file in use, writing to backup')
        df_fullatr.to_csv(params.D0_ESPI_PATH + '\\finalatr_backup.csv')
    if WriteFinalFile:
        try:
            csvfile = params.D0_ATR_OUTPUT_FILE
            df_fullatr.to_csv(csvfile)
            logging.info(f'wrote csv for {csvfile}')
        except Exception:
            logging.info('file in use, writing to backup')
            df_fullatr.to_csv(params.D0_ESPI_PATH + '\\actualatr_backup.csv')
    print('yup')

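#Sketch of the eligibility zeroing pattern used in createEvalResults():
#savings keep their value only where every 'any' eligibility field is not
#'No'. The column names here are made up for illustration.
def _demo_eligibility_zeroing():
    df = pd.DataFrame({'svgs': [10.0, 20.0],
                       'elig1': ['Yes', 'No'],
                       'elig2': ['Yes', 'Yes']})
    anyfields = ['elig1', 'elig2']
    df['svgs'] = df['svgs'].where((df[anyfields] != 'No').all(axis=1), 0)
    print(df['svgs'].tolist())  #[10.0, 0.0]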