# Driver script: run a full gDayF automated analysis on the weather dataset,
# prune models, and set pandas display options for the prediction phase.
if __name__ == "__main__":
    from gdayf.core.controller import Controller
    from gdayf.common.constants import *
    from pandas import set_option
    from gdayf.common.dataload import DataLoad

    # Analysis phase: load the train/test frames and launch an automated
    # model search targeting the 'Weather_Temperature' column.
    controller = Controller()
    if controller.config_checks():
        data_train, data_test = DataLoad().dm()
        status, recomendations = controller.exec_analysis(
            datapath=data_train,
            objective_column='Weather_Temperature',
            amode=POC,           # analysis mode constant from gdayf.common.constants
            metric='test_rmse',  # metric used to rank candidate models
            deep_impact=5)
        # store=True — presumably persists the reconstructed execution tree;
        # TODO(review): confirm against Controller.reconstruct_execution_tree.
        controller.reconstruct_execution_tree(arlist=None, metric='test_rmse', store=True)
        # Keep only the best model per group (EACH_BEST policy).
        controller.remove_models(recomendations, mode=EACH_BEST)

        # Widen pandas display limits so result tables print unclipped.
        set_option('display.max_rows', 500)
        set_option('display.max_columns', 50)
        set_option('display.max_colwidth', 100)
        set_option('display.precision', 4)
        set_option('display.width', 1024)

        # Prediction phase (continues past this chunk).
        print('Starting Prediction\'s Phase')
def train_workflow(self, datapath, wkey, workflow, prefix='main', remove_models=EACH_BEST):
    """Execute the training stage of a workflow definition.

    Runs an automated analysis (``controller.exec_analysis``) over the
    dataset at *datapath*, either once or per distinct value of a
    "for_each" column, stores performance/prediction summaries, and
    recurses into the workflow's ``Next`` step with the prediction frame.

    :param datapath: path or DataFrame accepted by ``self.check_path``
    :param wkey: workflow key; prediction column is renamed to this name
    :param workflow: workflow definition (dict-like with "data",
        "parameters" and "Next" entries)
    :param prefix: prefix used to build output file names
    :param remove_models: model-pruning policy passed to
        ``controller.remove_models`` (default ``EACH_BEST``)
    :return: error code from ``self.check_path`` when the dataset could
        not be loaded; otherwise None
    """
    set_option('display.max_rows', 500)
    set_option('display.max_columns', 500)
    set_option('display.width', 1000)
    wf = workflow
    pfix = prefix
    error, dataset = self.check_path(datapath)
    if dataset is None:
        return error
    controller = Controller(e_c=self._ec)
    if controller.config_checks():
        variables = dataset.columns.tolist()
        # Drop columns the workflow asks to filter out of the analysis.
        if wf["data"]["filtered_columns"] is not None:
            for delete in wf["data"]["filtered_columns"]:
                try:
                    variables.remove(delete)
                except Exception:
                    self._logging.log_info('gDayF', "Workflow", self._labels["failed_var"], delete)
        self._logging.log_info('gDayF', "Workflow", self._labels["variables_desc"], variables)

        if wf["data"]["for_each"] is not None:
            # Per-group mode: run one analysis per distinct value of the
            # "for_each" column (minus explicit exclusions).
            fe_column = wf["data"]["for_each"]
            fe_data_exclusions = wf["data"]["for_each_exclusions"]
            fe_filtered_data = wf["data"]["filtered_data"]
            fe_parameters = wf["parameters"]
            fe_next = wf["Next"]
            # NOTE(review): eval() on workflow-supplied column names is a
            # code-injection risk if the workflow definition is untrusted.
            for each in eval('dataset.' + fe_column + '.unique()'):
                if fe_data_exclusions is None or each not in fe_data_exclusions:
                    aux_dataset = eval('dataset[dataset.' + fe_column + '== each]')
                    pfix = xstr(prefix + '_' + str(each))
                    if fe_filtered_data is not None:
                        # Optional quantile cut on one column of the group.
                        qcolumn = fe_filtered_data["column"]
                        quantile = aux_dataset[qcolumn].quantile(q=fe_filtered_data["quantile"])
                        aux_dataset = eval('aux_dataset.loc[aux_dataset.'
                                           + qcolumn + '<= ' + str(quantile) + ']')
                        pfix = xstr(pfix + '_' + str(fe_filtered_data["quantile"]))
                    if fe_parameters is not None:
                        # Build the exec_analysis call as source text so the
                        # workflow's parameter dict maps 1:1 onto kwargs.
                        source_parameters = list()
                        source_parameters.append('controller.exec_analysis(')
                        source_parameters.append('datapath=aux_dataset.loc[:, variables]')
                        for ikey, ivalue in fe_parameters.items():
                            source_parameters.append(',')
                            source_parameters.append(ikey)
                            source_parameters.append('=')
                            # Strings are quoted, except "amode" which names
                            # a constant that must be evaluated.
                            if isinstance(ivalue, str) and ikey != "amode":
                                source_parameters.append('\'')
                                source_parameters.append(ivalue)
                                source_parameters.append('\'')
                            else:
                                source_parameters.append(str(ivalue))
                        source_parameters.append(')')
                        self._logging.log_info('gDayF', "Workflow", self._labels["desc_operation"],
                                               ''.join(source_parameters))
                        status, recomendations = eval(''.join(source_parameters))
                        controller.remove_models(recomendations, mode=remove_models)
                        controller.reconstruct_execution_tree(recomendations,
                                                              metric=fe_parameters['metric'],
                                                              store=True)
                        table_model_list = controller.table_model_list(
                            ar_list=recomendations, metric=eval(fe_parameters['metric']))
                        self._logging.log_info('gDayF', 'workflow', self._labels["results"] + '\n',
                                               table_model_list.to_string(justify="left"))
                        if self._config['common']['workflow_summary_enabled']:
                            filename = self.storage_path('train',
                                                         str(pfix) + '_' + 'train_performance',
                                                         'xls')
                            table_model_list.to_excel(filename, index=False,
                                                      sheet_name='performance')
                            self.replicate_file('train', filename=filename)
                        # Predict with the top-ranked model.
                        prediction_frame = controller.exec_prediction(
                            datapath=aux_dataset,
                            model_file=recomendations[0]['json_path'][0]['value'])
                        try:
                            if 'predict' in prediction_frame.columns.values:
                                prediction_frame.rename(columns={"predict": wkey}, inplace=True)
                            elif 'prediction' in prediction_frame.columns.values:
                                prediction_frame.rename(columns={"prediction": wkey}, inplace=True)
                            self._logging.log_info('gDayF', 'workflow',
                                                   self._labels["results"] + '\n',
                                                   prediction_frame.to_string(index_names=False,
                                                                              justify="left"))
                            if self._config['common']['workflow_summary_enabled']:
                                filename = self.storage_path('train',
                                                             str(pfix) + '_' + 'prediction', 'xls')
                                prediction_frame.to_excel(filename, index=False,
                                                          sheet_name='train_prediction')
                                self.replicate_file('train', filename=filename)
                        except AttributeError as oexecution_error:
                            # Prediction result was not a DataFrame.
                            self._logging.log_info('gDayF', "Workflow", self._labels["failed_model"],
                                                   str(repr(oexecution_error)))
                        try:
                            if fe_next is not None and prediction_frame is not None:
                                self.workflow(prediction_frame, fe_next, pfix,
                                              remove_models=remove_models)
                        except Exception as oexecution_error:
                            self._logging.log_critical('gDayF', "Workflow",
                                                       self._labels["failed_wf"], str(fe_next))
        else:
            # Single-run mode over the whole dataset.
            aux_dataset = dataset
            if wf["data"]["filtered_data"] is not None:
                qcolumn = wf["data"]["filtered_data"]["column"]
                # BUGFIX: was aux_dataset[[qcolumn]].quatile([...]) — misspelled
                # method (AttributeError) and a non-scalar result that breaks
                # the query() string below. Use the same scalar quantile as
                # the for_each branch.
                quantile = aux_dataset[qcolumn].quantile(q=wf["data"]["filtered_data"]["quantile"])
                aux_dataset = aux_dataset.query('%s <= %s' % (qcolumn, quantile))
            if wf['parameters'] is not None:
                source_parameters = list()
                source_parameters.append('controller.exec_analysis(')
                source_parameters.append('datapath=aux_dataset.loc[:, variables]')
                for ikey, ivalue in wf['parameters'].items():
                    source_parameters.append(',')
                    source_parameters.append(ikey)
                    source_parameters.append('=')
                    if isinstance(ivalue, str) and ikey != "amode":
                        source_parameters.append('\'')
                        source_parameters.append(ivalue)
                        source_parameters.append('\'')
                    else:
                        source_parameters.append(str(ivalue))
                source_parameters.append(')')
                self._logging.log_info('gDayF', "Workflow", self._labels["desc_operation"],
                                       ''.join(source_parameters))
                # NOTE(review): eval() on workflow-built source — injection
                # risk with untrusted workflow definitions.
                status, recomendations = eval(''.join(source_parameters))
                controller.remove_models(recomendations, mode=remove_models)
                controller.reconstruct_execution_tree(recomendations,
                                                      metric=wf['parameters']['metric'],
                                                      store=True)
                table_model_list = controller.table_model_list(
                    ar_list=recomendations, metric=eval(wf['parameters']['metric']))
                self._logging.log_info('gDayF', 'workflow', self._labels["results"] + '\n',
                                       table_model_list.to_string(justify="left"))
                if self._config['common']['workflow_summary_enabled']:
                    filename = self.storage_path('train',
                                                 str(pfix) + '_' + 'train_performance', 'xls')
                    # BUGFIX: sheet_name was "performace" (typo, inconsistent
                    # with the for_each branch).
                    table_model_list.to_excel(filename, index=False, sheet_name="performance")
                    self.replicate_file('train', filename=filename)
                prediction_frame = controller.exec_prediction(
                    datapath=aux_dataset,
                    model_file=recomendations[0]['json_path'][0]['value'])
                try:
                    if 'predict' in prediction_frame.columns.values:
                        prediction_frame.rename(columns={"predict": wkey}, inplace=True)
                    elif 'prediction' in prediction_frame.columns.values:
                        prediction_frame.rename(columns={"prediction": wkey}, inplace=True)
                    self._logging.log_info('gDayF', 'workflow', self._labels["results"] + '\n',
                                           prediction_frame.to_string(index_names=False,
                                                                      justify="left"))
                    if self._config['common']['workflow_summary_enabled']:
                        filename = self.storage_path('train',
                                                     str(pfix) + '_' + 'prediction', 'xls')
                        prediction_frame.to_excel(filename, index=False,
                                                  sheet_name="train_prediction")
                        self.replicate_file('train', filename=filename)
                except AttributeError as oexecution_error:
                    self._logging.log_info('gDayF', "Workflow", self._labels["failed_model"],
                                           str(repr(oexecution_error)))
                if wf['Next'] is not None and prediction_frame is not None:
                    try:
                        self.workflow(datapath=prediction_frame,
                                      workflow=wf['Next'],
                                      prefix=pfix,
                                      remove_models=remove_models)
                    except Exception as oexecution_error:
                        self._logging.log_critical('gDayF', "Workflow",
                                                   self._labels["failed_wf"], str(wf['Next']))
                        self._logging.log_critical('gDayF', "Workflow",
                                                   self._labels["failed_wf"],
                                                   repr(oexecution_error))
    controller.clean_handlers()
    del controller
def predict_workflow(self, datapath, wkey, workflow, prefix='main', workflow_id='default',
                     remove_models=EACH_BEST):
    """Execute the prediction stage of a workflow definition.

    Applies the workflow's pre-trained model(s) (``wf["model"]``) to the
    dataset at *datapath* — one model per "for_each" group, or a single
    model over the whole dataset — writes prediction summaries, and
    recurses into the workflow's ``Next`` step.

    :param datapath: path or DataFrame accepted by ``self.check_path``
    :param wkey: workflow key; prediction column is renamed to this name
    :param workflow: workflow definition — either a dict-like object or
        a path to a JSON file
    :param prefix: prefix used to build output file names
    :param workflow_id: workflow identifier (kept for interface
        compatibility; not used in this method)
    :param remove_models: pruning policy forwarded to nested workflows
    :return: error code from ``self.check_path`` when the dataset could
        not be loaded; otherwise None
    """
    # BUGFIX: dropped set_option('display.height', 1000) — the option was
    # removed from pandas (raises OptionError on pandas >= 0.20) and
    # train_workflow never sets it.
    set_option('display.max_rows', 500)
    set_option('display.max_columns', 500)
    set_option('display.width', 1000)
    error, dataset = self.check_path(datapath)
    if dataset is None:
        return error
    if isinstance(workflow, str):
        # BUGFIX: the file handle was opened without ever being closed;
        # use a context manager so it is released deterministically.
        with open(workflow, 'r') as file:
            wf = load(file, object_hook=OrderedDict)
    else:
        wf = workflow
    pfix = xstr(prefix)
    controller = Controller(e_c=self._ec)
    if controller.config_checks():
        variables = dataset.columns.tolist()
        if wf["model"] is not None and \
                (isinstance(wf["model"], str) or isinstance(wf["model"], dict)):
            # Drop columns the workflow asks to filter out.
            if wf["data"]["filtered_columns"] is not None:
                for delete in wf["data"]["filtered_columns"]:
                    try:
                        variables.remove(delete)
                    except Exception:
                        self._logging.log_info('gDayF', "Workflow",
                                               self._labels["failed_var"], delete)
            self._logging.log_info('gDayF', "Workflow", self._labels["variables_desc"], variables)

            if wf["data"]["for_each"] is not None:
                # Per-group mode: one model per distinct value of the
                # "for_each" column (wf["model"] is a dict keyed by value).
                fe_column = wf["data"]["for_each"]
                fe_data_exclusions = wf["data"]["for_each_exclusions"]
                fe_filtered_data = wf["data"]["filtered_data"]
                fe_next = wf["Next"]
                # NOTE(review): eval() on workflow-supplied column names is
                # a code-injection risk with untrusted workflow definitions.
                for each in eval('dataset.' + fe_column + '.unique()'):
                    if fe_data_exclusions is None or each not in fe_data_exclusions:
                        aux_dataset = eval('dataset[dataset.' + fe_column + '== each]')
                        pfix = xstr(prefix + '_' + str(each))
                        if fe_filtered_data is not None:
                            # Optional quantile cut on one column.
                            qcolumn = fe_filtered_data["column"]
                            quantile = aux_dataset[qcolumn].quantile(
                                q=fe_filtered_data["quantile"])
                            aux_dataset = eval('aux_dataset.loc[aux_dataset.'
                                               + qcolumn + '<= ' + str(quantile) + ']')
                            pfix = xstr(pfix + '_' + str(fe_filtered_data["quantile"]))
                        prediction_frame = controller.exec_prediction(
                            datapath=aux_dataset, model_file=wf["model"][str(each)])
                        try:
                            if 'predict' in prediction_frame.columns.values:
                                prediction_frame.rename(columns={"predict": wkey}, inplace=True)
                            elif 'prediction' in prediction_frame.columns.values:
                                prediction_frame.rename(columns={"prediction": wkey},
                                                        inplace=True)
                        except AttributeError:
                            # Result had no .columns (not a DataFrame).
                            self._logging.log_info('gDayF', "Workflow",
                                                   self._labels["anomalies_operation"])
                        self._logging.log_info('gDayF', 'workflow',
                                               self._labels["results"] + '\n',
                                               prediction_frame.to_string(index_names=False,
                                                                          justify="left"))
                        try:
                            if isinstance(prediction_frame, DataFrame) \
                                    and self._config['common']['workflow_summary_enabled']:
                                filename = self.storage_path(
                                    'predict',
                                    str(pfix) + '_' + str(self.timestamp) + '_' + 'prediction',
                                    'xls')
                                prediction_frame.to_excel(filename, index=False,
                                                          sheet_name="prediction")
                                self.replicate_file('predict', filename=filename)
                            elif self._config['common']['workflow_summary_enabled']:
                                # Non-DataFrame result: decode each column
                                # payload and dump per-column spreadsheets.
                                for ikey, ivalue in prediction_frame['columns'].items():
                                    ppDF = decode_ordered_dict_to_dataframe(ivalue)
                                    if isinstance(ppDF, DataFrame):
                                        filename = self.storage_path(
                                            'predict',
                                            str(pfix) + '_' + str(self.timestamp) + '_'
                                            + 'prediction_' + ikey,
                                            'xls')
                                        ppDF.to_excel(filename, index=False,
                                                      sheet_name="prediction")
                                        self.replicate_file('predict', filename=filename)
                                filename = self.storage_path(
                                    'predict',
                                    str(pfix) + '_' + str(self.timestamp) + '_' + '_prediction',
                                    'json')
                                with open(filename, 'w') as f:
                                    f.write(dumps(prediction_frame['global_mse']))
                                self.replicate_file('predict', filename=filename)
                        except AttributeError:
                            self._logging.log_info('gDayF', "Workflow",
                                                   self._labels["anomalies_operation"],
                                                   prediction_frame)
                        try:
                            if fe_next is not None and prediction_frame is not None:
                                self.workflow(prediction_frame, fe_next, pfix,
                                              remove_models=remove_models)
                        except Exception as oexecution_error:
                            self._logging.log_critical('gDayF', "Workflow",
                                                       self._labels["failed_wf"], str(fe_next))
                            self._logging.log_critical('gDayF', "Workflow",
                                                       self._labels["failed_wf"],
                                                       repr(oexecution_error))
            else:
                # Single-model mode over the whole dataset.
                aux_dataset = dataset
                prediction_frame = controller.exec_prediction(datapath=aux_dataset,
                                                              model_file=wf["model"])
                if 'predict' in prediction_frame.columns.values:
                    prediction_frame.rename(columns={"predict": wkey}, inplace=True)
                elif 'prediction' in prediction_frame.columns.values:
                    prediction_frame.rename(columns={"prediction": wkey}, inplace=True)
                self._logging.log_info('gDayF', 'workflow', self._labels["results"] + '\n',
                                       prediction_frame.to_string(index_names=False,
                                                                  justify="left"))
                if isinstance(prediction_frame, DataFrame) \
                        and self._config['common']['workflow_summary_enabled']:
                    filename = self.storage_path(
                        'predict',
                        str(pfix) + str(self.timestamp) + '_' + '_prediction',
                        'xls')
                    prediction_frame.to_excel(filename, index=False, sheet_name="prediction")
                    self.replicate_file('predict', filename=filename)
                elif self._config['common']['workflow_summary_enabled']:
                    for ikey, ivalue in prediction_frame['columns'].items():
                        ppDF = decode_ordered_dict_to_dataframe(ivalue)
                        if isinstance(ppDF, DataFrame):
                            filename = self.storage_path(
                                'predict',
                                str(pfix) + '_' + str(self.timestamp) + '_'
                                + 'prediction_' + ikey,
                                'xls')
                            ppDF.to_excel(filename, index=False, sheet_name="prediction")
                            self.replicate_file('predict', filename=filename)
                    filename = self.storage_path(
                        'predict',
                        str(pfix) + '_' + str(self.timestamp) + '_' + '_prediction',
                        'json')
                    with open(filename, 'w') as f:
                        f.write(dumps(prediction_frame))
                    self.replicate_file('predict', filename=filename)
                if wf['Next'] is not None and prediction_frame is not None:
                    try:
                        self.workflow(datapath=prediction_frame,
                                      workflow=wf['Next'],
                                      prefix=pfix,
                                      remove_models=remove_models)
                    except Exception as oexecution_error:
                        self._logging.log_critical('gDayF', "Workflow",
                                                   self._labels["failed_wf"], str(wf['Next']))
                        self._logging.log_critical('gDayF', "Workflow",
                                                   self._labels["failed_wf"],
                                                   repr(oexecution_error))
    controller.clean_handlers()
    del controller
'/Data/gdayf-v1/experiments/Crulogic-r2/CRULOGIC-avg-fuel-speed-predicted_1537911231.396293/' ) source_1_data.append('summary/predict/') source_1_data.append('Avg-speed_a2_p_prediction.xls') model_1_data = read_excel(io=''.join(source_1_data)) source_2_data = list() source_2_data.append( '/Data/gdayf-v1/experiments/Crulogic-r2/CRULOGIC-avg-fuel-speed-predicted_1537911231.396293/' ) source_2_data.append('summary/predict/') source_2_data.append('Avg-speed_a4_p_prediction.xls') model_2_data = read_excel(io=''.join(source_2_data)) #Analysis controller = Controller(user_id='Crulogic-r2') if controller.config_checks(): model_1 = '/Data/gdayf-v1/experiments/Crulogic-r2/CRULOGIC-avg-fuel-speed-predicted_1537911231.396293/Crulogic-r2_Dataframe_77256_6438_12_1537911811.7863178/h2o/train/1537911811.7874267/json/H2ORandomForestEstimator_1537912248.5985134.json' model_2 = '/Data/gdayf-v1/experiments/Crulogic-r2/CRULOGIC-avg-fuel-speed-predicted_1537911231.396293/Crulogic-r2_Dataframe_77256_6438_12_1537913921.5431554/h2o/train/1537913921.544499/json/H2OGradientBoostingEstimator_1537914152.3058493.json' dataframe_dict = OrderedDict() objective_column = 'avg-fuel' dataframe_dict[objective_column] = model_data[objective_column] prediction_frame = controller.exec_prediction(datapath=model_1_data, model_file=model_1) model_data['model_1'] = prediction_frame['predict'] prediction_frame = controller.exec_prediction(datapath=model_2_data, model_file=model_2) model_data['model_2'] = prediction_frame['predict'] columns = model_data.columns.values.tolist()