def combine_data(self, df):
    """Combine the date columns, apply the PRE rules, then combine floats.

    The steps run in a fixed order: ``combining_data`` over
    ``vmc.datadatecol``, a date cast, ``utl.apply_rules`` with ``utl.PRE``,
    ``combining_data`` over ``vmc.datafloatcol`` and a final float/date cast.

    :param df: raw vendor dataframe
    :returns: the dataframe after both combine passes and type casts
    """
    with_dates = combining_data(df, self.key, vmc.datadatecol, **self.p)
    with_dates = utl.data_to_type(with_dates, date_col=vmc.datadatecol)
    with_rules = utl.apply_rules(with_dates, self.vm_rules, utl.PRE,
                                 **self.p)
    with_floats = combining_data(with_rules, self.key, vmc.datafloatcol,
                                 **self.p)
    return utl.data_to_type(with_floats, vmc.datafloatcol, vmc.datadatecol)
def strip_dict(tdf, col, data_dict):
    """Remove every 'Strip' value listed in ``tdf`` from ``data_dict[col]``.

    :param tdf: translation rules; rows whose function column equals
        'Strip' supply the values to remove
    :param col: column of ``data_dict`` to clean
    :param data_dict: dataframe whose ``col`` is rewritten
    :returns: ``data_dict`` with each strip value replaced by ''
    """
    rules = utl.data_to_type(tdf.copy(), str_col=[dctc.DICT_COL_FNC])
    is_strip = rules[dctc.DICT_COL_FNC] == 'Strip'
    strip_values = rules[is_strip][dctc.DICT_COL_VALUE].unique()
    data_dict = utl.data_to_type(data_dict, str_col=[col])
    for strip_value in strip_values:
        data_dict[col] = data_dict[col].str.replace(strip_value, '')
    return data_dict
def select_translation(tdf, col, data_dict, fnc_type='Select'):
    """Apply conditional translation rules of ``fnc_type`` to ``data_dict``.

    Rules are the rows of ``tdf`` whose function column contains
    ``fnc_type``; the text after '::' in that column names the selector
    column in ``data_dict``.  Rows where the selector column (as text)
    equals the rule's selection value are updated in ``col``:

    * 'Select' - only where ``col`` equals the rule value; set new value
    * 'Set'    - set the new value unconditionally
    * 'Append' - append the new value unless ``col`` already ends with it

    :param tdf: translation rules dataframe
    :param col: column of ``data_dict`` being translated
    :param data_dict: dataframe updated in place and returned
    :param fnc_type: 'Select', 'Set' or 'Append'
    :returns: ``data_dict``
    """
    if dctc.DICT_COL_SEL not in tdf.columns:
        return data_dict
    rules = utl.data_to_type(tdf.copy(), str_col=[dctc.DICT_COL_FNC])
    is_rule = rules[dctc.DICT_COL_FNC].str.contains(fnc_type, na=False)
    rules = rules[is_rule].copy()
    # 'Select::<column>' -> '<column>'
    rules[dctc.DICT_COL_FNC] = rules[dctc.DICT_COL_FNC].str.split('::').str[1]
    rule_fields = [dctc.DICT_COL_FNC, dctc.DICT_COL_SEL,
                   dctc.DICT_COL_VALUE, dctc.DICT_COL_NVALUE]
    for rule in rules[rule_fields].to_dict(orient='index').values():
        sel_col = rule[dctc.DICT_COL_FNC]
        sel_val = rule[dctc.DICT_COL_SEL]
        old_val = rule[dctc.DICT_COL_VALUE]
        new_val = rule[dctc.DICT_COL_NVALUE]
        if sel_col not in data_dict.columns:
            continue
        if fnc_type == 'Select':
            hits = ((data_dict[sel_col].astype('U') == sel_val) &
                    (data_dict[col] == old_val))
            data_dict.loc[hits, col] = new_val
        elif fnc_type == 'Set':
            hits = data_dict[sel_col].astype('U') == sel_val
            data_dict.loc[hits, col] = new_val
        elif fnc_type == 'Append':
            hits = ((data_dict[sel_col].astype('U') == sel_val) &
                    (data_dict[col].str[-len(new_val):] != new_val))
            data_dict.loc[hits, col] = (data_dict.loc[hits, col] + new_val)
    return data_dict
def clean_df(self, df):
    """Flatten the per-id metric columns of ``df`` into one long dataframe.

    The ``jsonmet`` and ``jsonseg`` columns are dropped and ``colcid``
    becomes the index.  Each remaining column is passed through
    ``self.clean_data``, expanded with ``pd.Series``, unstacked into
    (``coldate``, ``colcid``, value) rows and outer-merged into the result.
    ``colspend`` is then divided by 1,000,000 and ``coldate`` values are
    mapped through ``self.dates``.

    :param df: raw response dataframe; returned untouched when empty
    :returns: dataframe keyed by ``coldate`` and ``colcid``
    """
    if df.empty:
        return df
    df = df.drop([jsonmet, jsonseg], axis=1).set_index(colcid)
    ndf = pd.DataFrame(columns=[coldate, colcid])
    ndf = utl.data_to_type(ndf, str_col=[colcid], int_col=[coldate])
    for col in df.columns:
        tdf = df[col].apply(self.clean_data).apply(pd.Series)
        tdf = tdf.unstack().reset_index()
        tdf = tdf.rename(columns={0: col, 'level_0': coldate})
        tdf = utl.data_to_type(tdf, str_col=[colcid], int_col=[coldate])
        ndf = pd.merge(ndf, tdf, on=[coldate, colcid], how='outer')
    df = ndf
    df[colspend] /= 1000000
    # Assign the result back: an in-place replace on ``df[coldate]`` mutates
    # a temporary and is silently lost under pandas Copy-on-Write.
    df[coldate] = df[coldate].replace(self.dates)
    return df
def read(self):
    """Load the backing csv into ``self.df``, creating it first if absent.

    When ``self.full_file_path`` is not a file, a header-only csv with
    ``self.columns`` is written there.  The file is then read through
    ``utl.import_read_csv`` and the key column is cast with
    ``utl.data_to_type(str_col=...)``.
    """
    file_exists = os.path.isfile(self.full_file_path)
    if not file_exists:
        logging.info('Creating {}'.format(self.filename))
        header_only = pd.DataFrame(columns=self.columns, index=None)
        header_only.to_csv(self.full_file_path, index=False,
                           encoding='utf-8')
    self.df = utl.import_read_csv(self.filename, self.csvpath)
    key_cols = [self.key]
    self.df = utl.data_to_type(self.df, str_col=key_cols)
def import_plan_data(key, df, plan_omit_list, **kwargs):
    """Build the plan data dictionary from the placement columns of ``df``.

    Vendor keys in ``plan_omit_list`` are filtered out, the full placement
    name is created from the columns in ``kwargs[vmc.fullplacename]``, an
    error report is generated against the dictionary file, and the
    dictionary is left-merged with the plan rows on their shared columns.

    :param key: vendor key passed to ``full_placement_creation``
    :param df: plan dataframe; ``None`` or empty yields an empty frame
        with the placement columns plus the vendor key column
    :param plan_omit_list: vendor keys to exclude
    :param kwargs: vendor matrix parameters (placement columns, dictionary
        and error file names)
    :returns: the merged dictionary dataframe
    """
    place_cols = kwargs[vmc.fullplacename]
    if df is None or df.empty:
        df = pd.DataFrame(columns=place_cols + [vmc.vendorkey])
    omitted = df[vmc.vendorkey].isin(plan_omit_list)
    df = df.loc[~omitted]
    df = df.loc[:, place_cols]
    df = full_placement_creation(df, key, dctc.FPN, place_cols)
    df = df.drop_duplicates()
    dic = dct.Dict(kwargs[vmc.filenamedict])
    placement_names = pd.DataFrame(df[dctc.FPN])
    er.ErrorReport(placement_names, dic, None, kwargs[vmc.filenameerror])
    shared_cols = list(set(dic.data_dict.columns).intersection(df.columns))
    dic.data_dict = utl.data_to_type(dic.data_dict, str_col=shared_cols)
    dic.data_dict = dic.data_dict.merge(df, on=shared_cols, how='left')
    dic.apply_functions()
    dic.data_dict = utl.data_to_type(dic.data_dict,
                                     date_col=vmc.datadatecol)
    return dic.data_dict
def get_new_values(self, keys_list):
    """Return the keys in ``keys_list`` that are not yet in ``self.df``.

    :param keys_list: dataframe containing the ``self.key`` column
    :returns: single-column dataframe of keys found only in ``keys_list``
    """
    candidates = utl.data_to_type(keys_list, str_col=keys_list.columns)
    known_keys = pd.DataFrame(self.df[self.key])
    # indicator=True adds '_merge', marking rows with no match as left_only
    flagged = candidates.merge(known_keys, on=self.key, how='left',
                               indicator=True)
    unseen = flagged[flagged['_merge'] == 'left_only']
    return pd.DataFrame(unseen[self.key])
def data_to_df(r):
    """Convert the first metric of a JSON response into a dataframe.

    The payload is read from ``r.json()['metrics'][0]``: its 'columns'
    list becomes the 'Date' column and every entry of 'dataset'
    contributes one column named by 'seriesName' holding its 'set' values.

    :param r: response object exposing ``.json()``
    :returns: dataframe with 'Date' as ``datetime.date`` values plus one
        column per series
    """
    df = pd.DataFrame()
    # Parse the body once instead of on every access.
    metric = r.json()['metrics'][0]
    df['Date'] = metric['columns']
    for series in metric['dataset']:
        df[series['seriesName']] = series['set']
    df = utl.data_to_type(df, date_col=['Date'])
    df['Date'] = df['Date'].dt.date
    return df
def add_key_values(self, data_dict):
    """Add keys from ``data_dict`` that are new to ``self.df`` and save.

    Distinct non-null keys are reduced to the unseen ones via
    ``self.get_new_values``, passed through ``self.auto_split``,
    outer-merged into ``self.df`` and written with ``self.write``.

    :param data_dict: dataframe containing the ``self.key`` column
    """
    distinct_keys = pd.DataFrame(data_dict[self.key]).drop_duplicates()
    distinct_keys = distinct_keys.dropna(subset=[self.key])
    new_keys = self.get_new_values(distinct_keys)
    new_keys = self.auto_split(new_keys)
    self.df = utl.data_to_type(self.df, str_col=new_keys.columns)
    self.df = self.df.merge(new_keys, how='outer').reset_index(drop=True)
    self.df = self.df.dropna(subset=[self.key])
    self.write(self.df)
def net_plan_comp(df, p_col=dctc.PFPN, n_cost=vmc.cost, p_cost=dctc.PNC):
    """Attach planned-versus-actual net cost totals per plan placement.

    Rows whose ``dctc.UNC`` flag is not ``True`` are summed by ``p_col``;
    groups with a planned cost above zero get a difference column
    (``n_cost - p_cost``), are relabelled to ``[p_col] + DIF_COL`` and
    left-merged back onto ``df``.

    :param df: dataframe with the plan name, cost and planned cost columns
    :param p_col: column to group by
    :param n_cost: actual net cost column
    :param p_cost: planned net cost column
    :returns: ``df`` with the comparison columns merged in
    """
    df = utl.data_to_type(df, float_col=[p_cost])
    df[p_cost] = df[p_cost].fillna(0)
    # Element-wise comparison on a Series; ``is not True`` would be wrong.
    nc_pnc = df[df[dctc.UNC] != True]  # noqa: E712
    # Select columns with a list: tuple-style ``[a, b]`` selection on a
    # GroupBy is no longer supported by current pandas.
    nc_pnc = nc_pnc.groupby(p_col)[[p_cost, n_cost]].sum()
    # Copy so the column assignment below does not target a slice.
    nc_pnc = nc_pnc[nc_pnc[p_cost] > 0].copy()
    nc_pnc[DIF_NC_PNC] = nc_pnc[n_cost] - nc_pnc[p_cost]
    nc_pnc = nc_pnc.reset_index()
    nc_pnc.columns = [p_col] + DIF_COL
    df = df.merge(nc_pnc, on=p_col, how='left')
    return df
def agency_fees_calculation(df):
    """Add the agency fee column as fee rate times net cost.

    When both ``df`` and the threshold file are non-empty, the fee rate is
    first scaled by ``(total - threshold) / total``, where ``total`` is the
    sum of the net cost column and ``threshold`` is the first value of the
    ``AGENCY_THRESH`` column (nulls read as 0).

    :param df: dataframe with the net cost and, ideally, fee rate columns
    :returns: ``df`` with ``AGENCY_FEES`` added, or unchanged when the fee
        rate column is missing
    """
    logging.info('Calculating Agency Fees')
    if dctc.AGF not in df.columns:
        logging.warning('Agency Fee Rates not in dict. '
                        'Update dict and run again to calculate agency fees.')
        return df
    threshold_df = utl.import_read_csv(agency_fee_file, utl.config_path)
    df = utl.data_to_type(df, float_col=[NCF, dctc.AGF])
    if not (df.empty or threshold_df.empty):
        limit = threshold_df[AGENCY_THRESH].fillna(0).astype(float).values[0]
        total_net = df[NCF].sum()
        rate_scale = (total_net - limit) / total_net
        df[dctc.AGF] = df[dctc.AGF] * rate_scale
    df[AGENCY_FEES] = df[dctc.AGF] * df[NCF]
    return df
def apply_to_dict(self, data_dict):
    """Merge this object's stored values onto ``data_dict`` by key.

    The backing file is read, unseen keys are added, and ``self.df`` is
    left-merged on ``self.key``.  For each column in ``self.dependents``
    that collided in the merge, the merged-in ('_y') value replaces the
    existing one.

    :param data_dict: dataframe to enrich; returned as-is when it has no
        ``self.key`` column
    :returns: the enriched, reordered dataframe
    """
    if self.key not in data_dict.columns:
        return data_dict
    self.read()
    self.add_key_values(data_dict)
    data_dict = utl.data_to_type(data_dict, str_col=[self.key])
    data_dict = data_dict.merge(self.df, on=self.key, how='left')
    for dependent in self.dependents:
        left_name = '{}_x'.format(dependent)
        right_name = '{}_y'.format(dependent)
        if right_name not in data_dict.columns:
            continue
        data_dict[dependent] = data_dict[right_name]
        data_dict = data_dict.drop([left_name, right_name], axis=1)
    # NOTE(review): the return value is discarded, so this presumably
    # renames in place - confirm against rename_y_columns.
    self.rename_y_columns(data_dict)
    return self.reorder_columns(data_dict)
def total_cost_calculation(df):
    """Add the total cost column: net cost plus fees and optional costs.

    The total starts as net cost plus agency fees; the ad serving, DCM
    service fee, reporting and verification cost columns are added when
    present.

    :param df: dataframe with the net cost and agency fee columns
    :returns: ``df`` with ``TOTAL_COST`` added, or unchanged when the
        agency fee column is missing
    """
    logging.info('Calculating Total Cost')
    if AGENCY_FEES not in df.columns:
        logging.warning('Agency Fees not in dataframe. '
                        'Update dict and run again to calculate total cost.')
        return df
    optional_costs = [vmc.AD_COST, vmc.dcm_service_fee, vmc.REP_COST,
                      vmc.VER_COST]
    df = utl.data_to_type(df, float_col=[NCF, AGENCY_FEES] + optional_costs)
    df[TOTAL_COST] = df[NCF] + df[AGENCY_FEES]
    for cost_col in optional_costs:
        if cost_col not in df.columns:
            continue
        df[TOTAL_COST] += df[cost_col]
    return df
def vm_loop(self):
    """Load every vendor in ``self.vl`` and stack the results.

    Each vendor frame from ``self.vendor_get`` is appended to ``self.df``
    (columns unioned and sorted), the plan full placement name is created,
    an empty error-report directory is removed, and the float and date
    columns are cast.

    :returns: the combined dataframe, also stored on ``self.df``
    """
    logging.info('Initializing Vendor Matrix Loop')
    self.df = pd.DataFrame(columns=[vmc.date, dctc.FPN, dctc.PN, dctc.BM])
    self.sort_vendor_list()
    for vk in self.vl:
        self.tdf = self.vendor_get(vk)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        # self.df is refreshed every iteration in case vendor_get reads it.
        self.df = pd.concat([self.df, self.tdf], ignore_index=True,
                            sort=True)
    self.df = full_placement_creation(self.df, plan_key, dctc.PFPN,
                                      self.vm[vmc.fullplacename][plan_key])
    # Check the directory exists before listing it; os.listdir raises when
    # the path is missing.
    if os.path.isdir(er.csvpath) and not os.listdir(er.csvpath):
        logging.info('All placements defined. Deleting Error report'
                     ' directory.')
        os.rmdir(er.csvpath)
    self.df = utl.data_to_type(self.df, vmc.datafloatcol, vmc.datadatecol)
    return self.df
def nested_dicts_to_cols(self, nd_col):
    """Expand the nested dictionaries in ``self.df[nd_col]`` into columns.

    Every cell is passed through ``self.convert_dictionary`` and expanded
    with ``pd.Series``.  Each resulting column is expanded again, and the
    frames that carry an 'action_type' column are folded into one frame by
    ``self.clean_nested_df``.  That frame is placed in front of
    ``self.df``.

    :param nd_col: name of the column holding the nested data
    """
    self.df[nd_col] = (
        self.df[nd_col].apply(lambda x: self.convert_dictionary(x)))
    dict_df = self.df[nd_col].apply(pd.Series).fillna(0)
    column_list = dict_df.columns.values.tolist()
    column_list = [
        l for l in column_list if l not in ['action_type', 'value']
    ]
    clean_df = pd.DataFrame()
    # 'action_type' was filtered out above; re-add it last when present.
    if 'action_type' in dict_df.columns:
        column_list += ['action_type']
    for col in column_list:
        dirty_df = dict_df[col].apply(pd.Series).fillna(0)
        # clean_nested_df pivots on 'action_type', so only frames that
        # carry that column are folded in.
        if 'action_type' in dirty_df.columns:
            dirty_df = utl.data_to_type(dirty_df, str_col=['action_type'])
            clean_df = self.clean_nested_df(dirty_df, clean_df)
    self.df = pd.concat([clean_df, self.df], axis=1)  # type: pd.DataFrame
    # NOTE(review): drops the module-level ``nested_dict_col``, not the
    # ``nd_col`` argument - confirm that this is intended.
    self.df = self.df.drop(nested_dict_col, axis=1)  # type: pd.DataFrame
def vm_parse(self):
    """Read the vendor matrix and index it by vendor key.

    After reading, columns whose name starts with '|' are removed, the
    date and bar-split columns are cast, ``self.vl`` receives the vendor
    keys in file order, and ``self.vm`` becomes a
    ``{column: {vendor_key: value}}`` dictionary.  Values of the bar-split
    columns are split on '|' into lists.
    """
    self.vm_df = pd.DataFrame(columns=vmc.datacol)
    self.vm_df = self.read()
    self.vm = self.vm_df.copy()
    self.plan_net_check()
    all_columns = self.vm.columns.values.tolist()
    drop = [name for name in all_columns if name[0] == '|']
    self.vm = utl.col_removal(self.vm, 'vm', drop)
    self.vm = utl.data_to_type(self.vm, [], vmc.datecol, vmc.barsplitcol)
    self.vl = self.vm[vmc.vendorkey].tolist()
    self.vm = self.vm.set_index(vmc.vendorkey).to_dict()
    for split_col in vmc.barsplitcol:
        by_vendor = self.vm[split_col]
        self.vm[split_col] = {vendor: raw.split('|')
                              for vendor, raw in by_vendor.items()}
def full_placement_creation(df, key, full_col, full_place_cols):
    """Build ``df[full_col]`` by joining the placement columns with '_'.

    A column listed with a leading '::' has underscores removed from its
    values before joining.  Columns missing from ``df`` are skipped with a
    warning.  Only the column at position 0 starts the name, so if that
    column is missing the name begins with '_'.

    :param df: dataframe to extend
    :param key: identifier used in the warning message
    :param full_col: name of the column to create
    :param full_place_cols: ordered column names, optionally '::'-prefixed
    :returns: ``df`` with ``full_col`` populated
    """
    logging.debug('Creating Full Placement Name')
    df[full_col] = ''
    df = utl.data_to_type(
        df, str_col=[x[2:] if x[:2] == '::' else x for x in full_place_cols])
    for idx, col in enumerate(full_place_cols):
        strip_underscores = col[:2] == '::'
        if strip_underscores:
            col = col[2:]
        # Check existence before touching the column so that a missing
        # '::' column is reported instead of raising KeyError.
        if col not in df:
            logging.warning('{} was not in {}. It was not included in '
                            'Full Placement Name. For reference column names'
                            ' are as follows: \n {}'.format(
                                col, key, df.columns.values.tolist()))
            continue
        if strip_underscores:
            df[col] = df[col].str.replace('_', '', regex=True)
        if idx == 0:
            df[full_col] = df[col]
        else:
            df[full_col] = (df[full_col] + '_' + df[col])
    return df
def clean_nested_df(dirty_df, clean_df):
    """Pivot ``dirty_df`` on 'action_type' and fold it into ``clean_df``.

    All columns other than 'action_type' are cast to float, summed per
    row and action type, and flattened to '<value> - <action_type>' names
    (just the action type when the value column is named 'value').  The
    result is concatenated to ``clean_df`` and columns sharing a name are
    summed.

    :param dirty_df: expanded nested data with an 'action_type' column
    :param clean_df: accumulator of previously cleaned columns
    :returns: the updated accumulator
    """
    value_cols = [name for name in dirty_df.columns if name != 'action_type']
    dirty_df = utl.data_to_type(dirty_df, float_col=value_cols)
    pivoted = pd.pivot_table(dirty_df, columns='action_type',
                             values=value_cols, index=dirty_df.index,
                             aggfunc='sum', fill_value=0)
    if type(pivoted.columns) is pd.MultiIndex:
        flat_names = []
        for pair in pivoted.columns:
            if pair[0] != 'value':
                flat_names.append(' - '.join([str(part) for part in pair]))
            else:
                flat_names.append(pair[1])
        pivoted.columns = flat_names
    leftovers = [name for name in [0.0, 'action_type']
                 if name in pivoted.columns]
    for name in leftovers:
        pivoted = pivoted.drop(name, axis=1)
    pivoted = pivoted.apply(pd.to_numeric)
    clean_df = pd.concat([clean_df, pivoted], axis=1)
    # Sum columns that ended up with the same label.
    clean_df = clean_df.groupby(clean_df.columns, axis=1).sum()  # type: pd.DataFrame
    return clean_df
def combining_data(df, key, columns, **kwargs):
    """Fill each target column from the source columns mapped to it.

    ``kwargs[target]`` lists the source columns for ``target``; the list
    ``['nan']`` means the target has no mapping.  For mapped targets,
    float columns (``vmc.datafloatcol``) accumulate the sum of their
    sources and other columns take the value of the last available
    source.  Unmapped targets that already exist in ``df`` (or are the
    date column) are set to 0.

    :param df: dataframe to modify
    :param key: identifier used in the warning message
    :param columns: target column names to process
    :param kwargs: mapping of target column -> list of source columns
    :returns: the updated dataframe
    """
    logging.debug('Combining Data.')
    mapped, unmapped = [], []
    for name in columns:
        if kwargs[name] != ['nan']:
            mapped.append(name)
        else:
            unmapped.append(name)
    for col in mapped:
        sources = kwargs[col]
        # Reset a pre-existing target unless it is one of its own sources.
        if col in df.columns and col not in sources:
            df[col] = 0
        for item in sources:
            if col == item:
                continue
            if item not in df:
                logging.warning('{} is not in {}. It was not '
                                'put in {}'.format(item, key, col))
                continue
            if col not in df.columns:
                df[col] = 0
            if col in vmc.datafloatcol:
                df = utl.data_to_type(df, float_col=[col, item])
                df[col] += df[item]
            else:
                df[col] = df[item]
    for col in unmapped:
        if col in df.columns or col == vmc.date:
            df[col] = 0
    return df
def df_single_transform(df, transform):
    """Apply one '::'-delimited transform instruction to ``df``.

    Supported instructions (the first field selects the transform):

    * ``MixedDateColumn::<mixed_col>::<date_col>`` - copy ``mixed_col``
      into ``date_col``, cast to dates, forward-fill, and keep only the
      rows whose own value was not a date.
    * ``Pivot::<pivot_col>::<val1|val2>`` - sum-pivot on ``pivot_col``
      with every other non-value column as index.
    * ``Merge::<file>::<left_col>::<right_col>`` - merge with a csv via
      ``er.ErrorReport`` after normalising both key columns to text.
    * ``DateSplit::<start>::<end>[::<exempt1|exempt2>]`` - spread each row
      evenly across its date range, one row per day.
    * ``Stack::<header_col>::<hold_col>`` - stack repeated column groups.
    * ``Melt::<header>::<var1|var2>`` - melt the variable columns.
    * ``RawTranslate`` - apply the translation config.
    * ``AddColumn::<name>::<value>`` - add a constant column.

    :param df: dataframe to transform
    :param transform: instruction string, or a NaN-like value for no-op
    :returns: the transformed dataframe
    """
    if str(transform) == 'nan':
        return df
    transform = transform.split('::')
    transform_type = transform[0]
    if transform_type == 'MixedDateColumn':
        mixed_col = transform[1]
        date_col = transform[2]
        df[date_col] = df[mixed_col]
        df = utl.data_to_type(df, date_col=[date_col])
        df['temp'] = df[date_col]
        # .ffill() replaces the deprecated fillna(method='ffill').
        df[date_col] = df[date_col].ffill()
        df = df[df['temp'].isnull()].reset_index(drop=True)
        df.drop('temp', axis=1, inplace=True)
    elif transform_type == 'Pivot':
        pivot_col = transform[1]
        val_col = transform[2].split('|')
        df = df.fillna(0)
        index_cols = [x for x in df.columns if x not in val_col + [pivot_col]]
        df = pd.pivot_table(df, index=index_cols, columns=[pivot_col],
                            aggfunc='sum')
        if len(val_col) != 1:
            df.columns = df.columns.map('_'.join)
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = [' - '.join([str(y) for y in x]) for x in df.columns]
        df = df.reset_index()
    elif transform_type == 'Merge':
        merge_file = transform[1]
        left_merge = transform[2]
        right_merge = transform[3]
        merge_df = pd.read_csv(merge_file)
        # Pair each frame with its key explicitly: a dict keyed by column
        # name would collapse to one entry when both keys share a name.
        for frame, col in ((df, left_merge), (merge_df, right_merge)):
            if frame[col].dtype == 'float64':
                frame[col] = frame[col].fillna(0).astype('int')
            frame[col] = frame[col].astype('U')
            # Drop only a trailing '.0'; str.strip('.0') would also eat
            # leading/trailing zeros and turn '100' into '1'.
            frame[col] = frame[col].str.replace(r'\.0$', '', regex=True)
        filename = 'Merge-{}-{}.csv'.format(left_merge, right_merge)
        err = er.ErrorReport(df, merge_df, None, filename,
                             merge_col=[left_merge, right_merge])
        df = err.merge_df
        df = df.drop('_merge', axis=1)
    elif transform_type == 'DateSplit':
        start_date = transform[1]
        end_date = transform[2]
        if len(transform) == 4:
            exempt_col = transform[3].split('|')
        else:
            exempt_col = []
        df = utl.data_to_type(df, date_col=[end_date, start_date])
        df['days'] = (df[end_date] - df[start_date]).dt.days + 1
        n_cols = [
            x for x in df.columns
            if df[x].dtype in ['int64', 'float64'] and
            x not in exempt_col + ['days']
        ]
        df[n_cols] = df[n_cols].div(df['days'], axis=0)
        # Repeat each row once per day, then number the days per source row.
        df = df.loc[df.index.repeat(df['days'])]
        df[start_date] = (df.groupby(level=0)[start_date].transform(
            lambda x: pd.date_range(start=x.iat[0], periods=len(x))))
        df = df.drop('days', axis=1)
        df = df.reset_index(drop=True)  # type: pd.DataFrame
    elif transform_type == 'Stack':
        header_col_name = transform[1]
        hold_col_name = transform[2]
        df.columns = [
            df.columns[idx - 1] if 'Unnamed' in x else x
            for idx, x in enumerate(df.columns)
        ]
        hdf = pd.DataFrame(df[hold_col_name])
        stacked = []
        # dict.fromkeys de-duplicates like set() but keeps column order, so
        # the output row order is the same on every run.
        for x in dict.fromkeys(y for y in df.columns if y != hold_col_name):
            tdf = df[x]
            tdf.columns = tdf.loc[0]
            tdf = tdf.iloc[1:].copy()
            tdf[header_col_name] = x
            stacked.append(tdf)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        ndf = pd.concat(stacked) if stacked else pd.DataFrame()
        df = pd.concat([ndf, hdf], axis=1, join='inner')
        df = df.reset_index(drop=True)  # type: pd.DataFrame
    elif transform_type == 'Melt':
        header_col_name = transform[1]
        variable_cols = transform[2].split('|')
        df = df.melt(id_vars=[x for x in df.columns if x not in variable_cols],
                     value_vars=[x for x in variable_cols if x in df.columns],
                     var_name='{}-variable'.format(header_col_name),
                     value_name='{}-value'.format(header_col_name))
        df = df.reset_index(drop=True)
    elif transform_type == 'RawTranslate':
        tc = dct.DictTranslationConfig()
        tc.read(dctc.filename_tran_config)
        df = tc.apply_translation_to_dict(df)
    elif transform_type == 'AddColumn':
        col_name = transform[1]
        col_val = transform[2]
        df[col_name] = col_val
    return df
def clean(self):
    """Cast the dictionary columns and drop duplicate placement names.

    ``self.data_dict`` is cast with the float, date and string column
    lists from ``dctc``; when the full placement name column exists,
    duplicate names are dropped.  The index is then renumbered.
    """
    typed = utl.data_to_type(self.data_dict, dctc.floatcol, dctc.datecol,
                             dctc.strcol)
    self.data_dict = typed
    has_placement_names = dctc.FPN in self.data_dict.columns
    if has_placement_names:
        self.data_dict = self.data_dict.drop_duplicates(dctc.FPN)
    self.data_dict = self.data_dict.reset_index(drop=True)
def merge_df_cleaning(df, first_row, last_row, date_col, start_date, end_date):
    """Trim, date-cast and date-filter a dataframe before merging.

    Runs ``utl.first_last_adj``, then casts ``date_col`` to dates, then
    calls ``utl.date_removal`` on the first date column with the given
    start and end dates.

    :param df: dataframe to clean
    :param first_row: passed to ``utl.first_last_adj``
    :param last_row: passed to ``utl.first_last_adj``
    :param date_col: list of date column names; the first drives the
        date filter
    :param start_date: passed to ``utl.date_removal``
    :param end_date: passed to ``utl.date_removal``
    :returns: the cleaned dataframe
    """
    trimmed = utl.first_last_adj(df, first_row, last_row)
    dated = utl.data_to_type(trimmed, date_col=date_col)
    primary_date_col = date_col[0]
    return utl.date_removal(dated, primary_date_col, start_date, end_date)