def save_models(self, arlist, mode=BEST, metric='accuracy'):
    """Persist trained models selected from *arlist* according to *mode*.

    :param arlist: list of ArMetadata model descriptors, assumed ordered
        best-first (BEST / BEST_3 slice from the head).
    :param mode: selection policy: BEST (top model), BEST_3 (top three),
        EACH_BEST (best model per distinct framework/model-type/normalization
        combination), ALL, or NONE.
    :param metric: accepted for interface compatibility; currently unused.
    """
    if mode == BEST:
        model_list = [arlist[0]]
    elif mode == BEST_3:
        model_list = arlist[0:3]
    elif mode == EACH_BEST:
        # Keep only the first (best-ranked) model for each distinct
        # (framework, model type, normalization set) combination.
        exclusion = list()
        model_list = list()
        for model in arlist:
            signature = (get_model_fw(model),
                         model['model_parameters'][get_model_fw(model)]['model'],
                         model['normalizations_set'])
            if signature not in exclusion:
                model_list.append(model)
                exclusion.append(signature)
    elif mode == ALL:
        model_list = arlist
    else:
        # NONE or any unrecognized mode: store nothing. Previously an
        # unknown mode left model_list unbound and raised NameError below.
        model_list = list()
    # Each framework handler stores only the selected models it owns.
    for fw in self._config['frameworks'].keys():
        self.init_handler(fw)
        for each_model in model_list:
            if fw in each_model['model_parameters'].keys():
                self.model_handler[fw]['handler'].store_model(
                    each_model, user=self._ec.get_id_user())
        self.clean_handler(fw)
def base_specific(self, dataframe_metadata, list_ar_metadata):
    """Seed the first round of a resumed ('specific') analysis.

    For each stored ArMetadata record, builds a fresh ArMetadata structure
    for the current dataset. When the stored record was trained on the same
    dataset (hash match) the new structure continues that experiment
    (predecessor = stored model id, round incremented); otherwise it starts
    a new lineage from 'root'.

    :param dataframe_metadata: DFMetada description of the current dataset.
    :param list_ar_metadata: previously stored ArMetadata records to resume.
    """
    version = self._ec.config.get_config()['common']['version']
    for ar_metadata in list_ar_metadata:
        ar_structure = ArMetadata()
        if ar_metadata['dataset_hash_value'] == self.hash_dataframe:
            # Same dataset: reuse the stored experiment's analysis id and
            # chain the new model to the stored one.
            self._ec.set_id_analysis(ar_metadata['model_id'])
            ar_structure['predecessor'] = ar_metadata['model_parameters'][get_model_fw(ar_metadata)] \
                ['parameters']['model_id']['value']
            ar_structure['round'] = int(ar_metadata['round']) + 1
        else:
            # Different dataset: no usable lineage, start from the root.
            ar_structure['predecessor'] = 'root'
        ar_structure['model_id'] = self._ec.get_id_analysis()
        ar_structure['version'] = version
        ar_structure['user_id'] = self._ec.get_id_user()
        ar_structure['workflow_id'] = ar_metadata['workflow_id']
        ar_structure['objective_column'] = ar_metadata['objective_column']
        ar_structure['timestamp'] = self.timestamp
        ar_structure['normalizations_set'] = ar_metadata['normalizations_set']
        ar_structure['dataset'] = self.dataframe_name
        ar_structure['dataset_hash_value'] = self.hash_dataframe
        ar_structure['data_initial'] = dataframe_metadata
        ar_structure['data_normalized'] = None
        ar_structure['model_parameters'] = ar_metadata['model_parameters']
        ar_structure['ignored_parameters'] = None
        # presumably -1 marks "not yet executed" — TODO confirm against the
        # status values used elsewhere ('Executed', failed_op label).
        ar_structure['status'] = -1
        self.next_analysis_list.append(ar_structure)
        # Register the model's signature so duplicates are not re-analyzed.
        self.analyzed_models.append(
            self.generate_vectors(ar_structure, ar_metadata['normalizations_set']))
def get_external_model(self, armetadata, type='pojo'):
    """Export a trained model (e.g. POJO) via its framework handler.

    :param armetadata: ArMetadata descriptor of the trained model.
    :param type: export format understood by the handler (default 'pojo').
    :return: whatever the framework handler's get_external_model returns.
    """
    framework = get_model_fw(armetadata)
    self.init_handler(framework)
    handler = self.model_handler[framework]['handler']
    exported = handler.get_external_model(armetadata, type)
    self.clean_handler(framework)
    return exported
def remove_models(self, arlist, mode=ALL):
    """Delete stored models from *arlist*, keeping the complement of the
    save_models selection for the same mode.

    :param arlist: list of ArMetadata model descriptors, ordered best-first.
    :param mode: BEST (remove all but the top model), BEST_3 (all but the
        top three), EACH_BEST (all but the best of each framework/model-type/
        normalization combination), ALL, or NONE.
    """
    if mode == BEST:
        model_list = arlist[1:]
    elif mode == BEST_3:
        model_list = arlist[3:]
    elif mode == EACH_BEST:
        # Remove every model except the first (best-ranked) of each distinct
        # (framework, model type, normalization set) combination.
        exclusion = list()
        model_list = list()
        for model in arlist:
            signature = (get_model_fw(model),
                         model['model_parameters'][get_model_fw(model)]['model'],
                         model['normalizations_set'])
            if signature not in exclusion:
                exclusion.append(signature)
            else:
                model_list.append(model)
    elif mode == ALL:
        model_list = arlist
    else:
        # NONE or any unrecognized mode: remove nothing. Previously an
        # unknown mode left model_list unbound and raised NameError below.
        model_list = list()
    # Collect the frameworks involved, preserving first-seen order.
    fw_list = list()
    for models in model_list:
        if get_model_fw(models) not in fw_list:
            fw_list.append(get_model_fw(models))
    for fw in fw_list:
        self.init_handler(fw)
        self.model_handler[fw]['handler'].remove_models(model_list)
        self.clean_handler(fw)
def optimize_models(self, armetadata):
    """Generate optimized variants of a model via its framework's optimizer.

    Dynamically imports the framework-specific optimization module declared
    in configuration and delegates candidate generation to it.

    :param armetadata: executed model descriptor (ArMetadata).
    :return: deduplicated list of candidate models, or None when the model's
        framework is not configured. NOTE: when the framework optimizer
        returns None, the iteration below raises TypeError — callers rely on
        catching that to detect "no new candidates", so it must propagate.
    """
    # getattr replaces the previous eval('self.get_' + metric + '(armetadata)'):
    # identical dispatch without dynamic code evaluation.
    metric_value, _, objective = getattr(self, 'get_' + self.metric)(armetadata)
    engine = get_model_fw(armetadata)
    # Direct membership test replaces the previous key-by-key identity loop.
    if engine in self._frameworks:
        optimizer_engine = importlib.import_module(
            self._frameworks[engine]['conf']['optimization_method'])
        model_list = optimizer_engine.Optimizer(self._ec).optimize_models(
            armetadata=armetadata,
            metric_value=metric_value,
            objective=objective,
            deepness=self.deepness,
            deep_impact=self.deep_impact)
        optimized_model_list = list()
        for model in model_list:
            self.safe_append(optimized_model_list, model)
        return optimized_model_list
    return None
def generate_vectors(self, model, normalization_set):
    """Build a comparable signature for a model configuration.

    :param model: ArMetadata-like descriptor holding model_parameters.
    :param normalization_set: list of normalization specs, or [None].
    :return: tuple (framework, model type, parameter-value vector,
        normalization-hash vector) usable for duplicate detection.
    """
    framework = get_model_fw(model)
    params = model['model_parameters'][framework]['parameters']
    # Collect parameter values, skipping model_id and non-OrderedDict leaves.
    param_vector = [
        leaf['value'] for name, leaf in params.items()
        if isinstance(leaf, OrderedDict) and name != 'model_id'
    ]
    if normalization_set == [None]:
        norm_vector = normalization_set
    else:
        # Hash each normalization spec so sets compare cheaply.
        norm_vector = [
            md5(dumps(norm).encode('utf8')).hexdigest()
            for norm in normalization_set
        ]
    return framework, model['model_parameters'][framework]['model'], param_vector, norm_vector
def copy_template(self, increment=1):
    """Clone this ArMetadata as the template for a subsequent round.

    Execution counters are reset, the round is advanced by *increment*, and
    the clone's predecessor is set to this record's own model id. Field
    insertion order is preserved exactly, as it may be significant for
    serialization of the underlying ordered structure.

    :param increment: amount added to the current round (default 1).
    :return: the new ArMetadata clone.
    """
    clone = ArMetadata()
    for field in ('model_id', 'version', 'workflow_id', 'user_id', 'type',
                  'objective_column', 'timestamp'):
        clone[field] = deepcopy(self[field])
    clone['round'] = self['round'] + increment
    clone['execution_seconds'] = 0.0
    clone['tolerance'] = 0.0
    # The next round descends from this record's trained model.
    clone['predecessor'] = \
        self['model_parameters'][get_model_fw(self)]['parameters']['model_id']['value']
    for field in ('normalizations_set', 'dataset', 'dataset_hash_value',
                  'data_initial', 'data_normalized', 'model_parameters',
                  'ignored_parameters'):
        clone[field] = deepcopy(self[field])
    return clone
def analysis_specific(self, dataframe_metadata, list_ar_metadata):
    """Plan the next round of a resumed ('specific') analysis.

    Round 1 seeds the plan from previously stored ArMetadata records
    (base_specific); later rounds expand the best executed model of each
    model type through optimize_models, until deep_impact is exceeded.

    :param dataframe_metadata: DFMetada description of the input dataset.
    :param list_ar_metadata: stored ArMetadata records to resume from.
    :return: tuple (analysis id, next_analysis_list); next_analysis_list is
        None when the search is exhausted.
    """
    self.next_analysis_list.clear()
    if self.deepness == 1:
        #Check_dataframe_metadata compatibility
        self.base_specific(dataframe_metadata, list_ar_metadata)
    # Added 22/09/1974
    elif self.deepness > self.deep_impact:
        # Depth budget exhausted: signal the caller to stop iterating.
        self.next_analysis_list = None
    elif self.next_analysis_list is not None:
        fw_model_list = list()
        # Added 31/08/2017
        best_models = list()
        # End - Added 31/08/2017
        aux_loop_controller = len(self.analysis_recommendation_order)
        for indexer in range(0, aux_loop_controller):
            try:
                # Modified 31/08/2017
                model = self.analysis_recommendation_order[indexer]
                if model['status'] == 'Executed':
                    model_type = model['model_parameters'][get_model_fw(model)]['model']
                    # Expand only the best-ranked model of each model type.
                    if model_type not in best_models:
                        fw_model_list.extend(
                            self.optimize_models(
                                self.analysis_recommendation_order[indexer]))
                        #print("Trace:%s-%s" % (model_type, best_models))
                        best_models.append(model_type)
                # End - Modified 31/08/2017
            except TypeError:
                # optimize_models returns None when it yields no candidates;
                # extend(None) raises TypeError, which we deliberately skip.
                ''' If all optimize_models doesn't return new models
                pass and look for next best model on this type'''
                pass
        # if fw_model_list is not None:
        self.next_analysis_list.extend(fw_model_list)
        if len(self.next_analysis_list) == 0:
            self.next_analysis_list = None
    self.deepness += 1
    return self._ec.get_id_analysis(), self.next_analysis_list
def optimize_models(self, armetadata, metric_value, objective, deepness, deep_impact):
    """Generate next-round candidate models for a Spark model.

    Starting from an executed model's ArMetadata, emits copies with tweaked
    hyperparameters chosen per model type. Width-expanding sweeps (regParam,
    stepSize, link, initMode, ...) only run at deepness == 2; growth tweaks
    (maxIter, maxDepth, trees, ...) run every round.

    :param armetadata: executed model descriptor (ArMetadata).
    :param metric_value: achieved metric value for the model.
    :param objective: target metric value; no optimization once reached.
    :param deepness: current search depth.
    :param deep_impact: maximum search depth (not used in this body).
    :return: list of new ArMetadata candidates, or None when none apply.
    """
    model_list = list()
    model = armetadata['model_parameters'][get_model_fw(armetadata)]
    config = self._config
    # Only optimize spark models that have not met the objective and did
    # not fail their execution.
    if get_model_fw(armetadata) == 'spark' and metric_value != objective \
            and armetadata['status'] != self._labels['failed_op']:
        try:
            scoring_metric = decode_ordered_dict_to_dataframe(armetadata['metrics']['scoring'])
        except ValueError:
            # NOTE(review): 'model' is a dict, so this concatenation raises
            # TypeError; and scoring_metric stays unbound, so later uses in
            # this call would raise NameError. Needs a real fallback.
            print("TRACE: Not scoring: " + model)
        # Tuning knobs taken from the optimizer configuration.
        min_rows_limit = config['min_rows_limit']
        min_rows_increment = config['min_rows_increment']
        max_interactions_increment = config['max_interactions_increment']
        interactions_increment = config['interactions_increment']
        max_depth_increment = config['max_depth_increment']
        ntrees_increment = config['ntrees_increment']
        stepSize = config['stepSize']
        aggregationDepth_increment = config['aggregationDepth_increment']
        regParam = config['regParam']
        elastic_variation = config['elastic_variation']
        nv_smoothing = config['nv_smoothing']
        nv_improvement = config['nv_improvement']
        nv_divisor = config['nv_divisor']
        clustering_increment = config['clustering_increment']
        # NOTE(review): initstep_increment is loaded but never used below.
        initstep_increment = config['initstep_increment']
        if model['model'] == 'LinearSVC':
            # Width sweep over regularization values, round 2 only.
            if deepness == 2 and len(regParam) != 0:
                for elastic in regParam:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['regParam']['value'] = elastic['value']
                    model_list.append(new_armetadata)
            # Grow maxIter while the solver exhausted its iteration budget.
            try:
                if model['parameters']['maxIter']['value'] \
                        >= scoring_metric['totalIterations'][0] and \
                        scoring_metric['totalIterations'][0] <= max_interactions_increment:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxIter']['value'] *= interactions_increment
                    model_list.append(new_armetadata)
            except KeyError:
                # No totalIterations in scoring: grow maxIter up to the cap.
                if model['parameters']['maxIter']['value'] \
                        <= max_interactions_increment:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxIter']['value'] *= interactions_increment
                    model_list.append(new_armetadata)
            # Always try a deeper treeAggregate.
            new_armetadata = armetadata.copy_template()
            model_aux = new_armetadata['model_parameters']['spark']
            model_aux['parameters']['aggregationDepth']['value'] *= aggregationDepth_increment
            model_list.append(new_armetadata)
        elif model['model'] == 'LogisticRegression' or model['model'] == 'LinearRegression':
            # Width sweep over regularization values, round 2 only.
            if deepness == 2 and len(regParam) != 0:
                for elastic in regParam:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['regParam']['value'] = elastic['value']
                    model_list.append(new_armetadata)
            # Nudge elasticNetParam up while it stays within [0, 1].
            if model['parameters']['elasticNetParam']['value'] \
                    * (1 + elastic_variation) <= 1.0:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['elasticNetParam']['value'] = \
                    model_aux['parameters']['elasticNetParam']['value'] * (1 + elastic_variation)
                model_list.append(new_armetadata)
            if model['parameters']['elasticNetParam']['value'] \
                    * (1 - elastic_variation) >= 0.0:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                # NOTE(review): guard checks (1 - elastic_variation) but the
                # assignment multiplies by (1 + elastic_variation) — a
                # downward nudge was almost certainly intended here.
                model_aux['parameters']['elasticNetParam']['value'] = \
                    model_aux['parameters']['elasticNetParam']['value'] * (1 + elastic_variation)
                model_list.append(new_armetadata)
            # Grow maxIter while the solver exhausted its iteration budget.
            try:
                if model['parameters']['maxIter']['value'] \
                        >= scoring_metric['totalIterations'][0] and \
                        scoring_metric['totalIterations'][0] <= max_interactions_increment:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxIter']['value'] *= interactions_increment
                    model_list.append(new_armetadata)
            except KeyError:
                if model['parameters']['maxIter']['value'] \
                        <= max_interactions_increment:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['maxIter']['value'] *= interactions_increment
                    model_list.append(new_armetadata)
            # Always try a deeper treeAggregate.
            new_armetadata = armetadata.copy_template()
            model_aux = new_armetadata['model_parameters']['spark']
            model_aux['parameters']['aggregationDepth']['value'] *= aggregationDepth_increment
            model_list.append(new_armetadata)
        elif model['model'] == 'DecisionTreeClassifier' or model['model'] == 'DecisionTreeRegressor':
            # Loosen the leaf-size constraint while above the floor.
            if model['parameters']['minInstancesPerNode']['value'] > (min_rows_limit / 2):
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['minInstancesPerNode']['value'] = round(
                    model_aux['parameters']['minInstancesPerNode']['value'] / min_rows_increment, 0)
                model_list.append(new_armetadata)
            # Deepen the tree while it actually used its full depth.
            if scoring_metric['max_depth'][0] >= model['parameters']['maxDepth']['value']:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['maxDepth']['value'] = \
                    model_aux['parameters']['maxDepth']['value'] * max_depth_increment
                model_list.append(new_armetadata)
        elif model['model'] == 'GBTRegressor':
            # Round-2 width sweeps over stepSize and/or lossType.
            # NOTE(review): eval() on configuration-provided 'type' strings —
            # acceptable only if configuration is trusted.
            if deepness == 2 and len(stepSize) != 0 and len(eval(model['parameters']['lossType']['type'])) != 0:
                for stepsize in stepSize:
                    for element in eval(model['parameters']['lossType']['type']):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['lossType']['value'] = element
                        model_aux['parameters']['stepSize']['value'] = stepsize['learn']
                        model_list.append(new_armetadata)
            elif deepness == 2 and len(stepSize) != 0:
                for stepsize in stepSize:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['stepSize']['value'] = stepsize['learn']
                    model_list.append(new_armetadata)
            elif deepness == 2 and len(eval(model['parameters']['lossType']['type'])) != 0:
                for element in eval(model['parameters']['lossType']['type']):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    # NOTE(review): iterates lossType options but assigns
                    # them to 'impurity' — lossType was likely intended.
                    model_aux['parameters']['impurity']['value'] = element
                    model_list.append(new_armetadata)
            # Loosen the leaf-size constraint while above the floor.
            if model['parameters']['minInstancesPerNode']['value'] > (min_rows_limit / 2):
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['minInstancesPerNode']['value'] = round(
                    model_aux['parameters']['minInstancesPerNode']['value'] / min_rows_increment, 0)
                model_list.append(new_armetadata)
            # 05/07/2018. Included platform base restriction maxDepth <=30
            if scoring_metric['max_depth'][0] >= model['parameters']['maxDepth']['value'] \
                    and model['parameters']['maxDepth']['value'] != 30:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                if model_aux['parameters']['maxDepth']['value'] * max_depth_increment > 30:
                    model_aux['parameters']['maxDepth']['value'] = 30
                else:
                    model_aux['parameters']['maxDepth']['value'] *= max_depth_increment
                model_list.append(new_armetadata)
            # Add boosting iterations while the ensemble used them all.
            if scoring_metric['trees'][0] >= model['parameters']['maxIter']['value']:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['maxIter']['value'] *= ntrees_increment
                model_list.append(new_armetadata)
        elif model['model'] == 'GBTClassifier':
            # Round-2 width sweeps over stepSize and/or lossType.
            if deepness == 2 and len(stepSize) != 0 and len(eval(model['parameters']['lossType']['type'])) != 0:
                for stepsize in stepSize:
                    for element in eval(model['parameters']['lossType']['type']):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['lossType']['value'] = element
                        model_aux['parameters']['stepSize']['value'] = stepsize['learn']
                        model_list.append(new_armetadata)
            elif deepness == 2 and len(stepSize) != 0:
                for stepsize in stepSize:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['stepSize']['value'] = stepsize['learn']
                    model_list.append(new_armetadata)
            # NOTE(review): the guard inspects 'impurity' options but the
            # loop iterates 'lossType' options — one of them is wrong.
            elif deepness == 2 and len(eval(model['parameters']['impurity']['type'])) != 0:
                for element in eval(model['parameters']['lossType']['type']):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['lossType']['value'] = element
                    model_list.append(new_armetadata)
            # Loosen the leaf-size constraint while above the floor.
            if model['parameters']['minInstancesPerNode']['value'] > (min_rows_limit / 2):
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['minInstancesPerNode']['value'] = round(
                    model_aux['parameters']['minInstancesPerNode']['value'] / min_rows_increment, 0)
                model_list.append(new_armetadata)
            # Deepen the trees while they used their full depth.
            # NOTE(review): unlike GBTRegressor, no maxDepth <= 30 cap here.
            if scoring_metric['max_depth'][0] >= model['parameters']['maxDepth']['value']:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['maxDepth']['value'] *= max_depth_increment
                model_list.append(new_armetadata)
            # Add boosting iterations while the ensemble used them all.
            if scoring_metric['trees'][0] >= model['parameters']['maxIter']['value']:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['maxIter']['value'] *= ntrees_increment
                model_list.append(new_armetadata)
        elif model['model'] == 'RandomForestClassifier' or model['model'] == 'RandomForestRegressor':
            # Round-2 width sweeps over featureSubsetStrategy and/or impurity.
            if deepness == 2 and len(eval(model['parameters']['featureSubsetStrategy']['type'])) != 0 \
                    and len(eval(model['parameters']['impurity']['type'])) != 0:
                for featuresubsetstrategy in eval(model['parameters']['featureSubsetStrategy']['type']):
                    for element in eval(model['parameters']['impurity']['type']):
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters']['spark']
                        model_aux['parameters']['impurity']['value'] = element
                        model_aux['parameters']['featureSubsetStrategy']['value'] = featuresubsetstrategy
                        model_list.append(new_armetadata)
            elif deepness == 2 and len(eval(model['parameters']['featureSubsetStrategy']['type'])) != 0:
                for featuresubsetstrategy in eval(model['parameters']['featureSubsetStrategy']['type']):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['featureSubsetStrategy']['value'] = featuresubsetstrategy
                    model_list.append(new_armetadata)
            elif deepness == 2 and len(eval(model['parameters']['impurity']['type'])) != 0:
                # NOTE(review): missing eval() here — this iterates the raw
                # 'type' string character by character, unlike the guard.
                for element in model['parameters']['impurity']['type']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['impurity']['value'] = element
                    model_list.append(new_armetadata)
            # Loosen the leaf-size constraint while above the floor.
            if model['parameters']['minInstancesPerNode']['value'] > (min_rows_limit / 2):
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['minInstancesPerNode']['value'] = round(
                    model_aux['parameters']['minInstancesPerNode']['value'] / min_rows_increment, 0)
                model_list.append(new_armetadata)
            # Deepen the trees while they used their full depth.
            if scoring_metric['max_depth'][0] >= model['parameters']['maxDepth']['value']:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['maxDepth']['value'] *= max_depth_increment
                model_list.append(new_armetadata)
            # Add trees while the forest used its full allowance.
            if scoring_metric['trees'][0] >= model['parameters']['numTrees']['value']:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['numTrees']['value'] *= ntrees_increment
                model_list.append(new_armetadata)
        elif model['model'] == 'GeneralizedLinearRegression':
            # Width sweep over regularization values, round 2 only.
            if deepness == 2 and len(regParam) != 0:
                for elastic in regParam:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['regParam']['value'] = elastic['value']
                    model_list.append(new_armetadata)
            # Round-2 sweep over link functions compatible with the family.
            if deepness == 2:
                if model['parameters']['family']['value'] in ['gaussian', 'gamma']:
                    linklist = ['log', 'inverse']
                elif model['parameters']['family']['value'] in ['poisson']:
                    linklist = ['log', 'sqrt']
                # NOTE(review): 'poisson' is unreachable here (caught above),
                # and any other family (e.g. 'binomial') leaves linklist
                # unbound, raising NameError in the loop below.
                elif model['parameters']['family']['value'] in ['poisson', 'tweedie']:
                    linklist = []
                for linkin in linklist:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['link']['value'] = linkin
                    model_list.append(new_armetadata)
            # Grow maxIter up to the configured cap.
            if model['parameters']['maxIter']['value'] \
                    <= max_interactions_increment:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                model_aux['parameters']['maxIter']['value'] *= interactions_increment
                model_list.append(new_armetadata)
        elif model['model'] == 'NaiveBayes':
            # Width sweep over smoothing seeds, round 2 only.
            if deepness == 2 and len(nv_smoothing) != 0:
                for elastic in nv_smoothing:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['smoothing']['value'] = elastic['value']
                    model_list.append(new_armetadata)
            # Always propose one increased and one decreased smoothing.
            for adjusting in ['improvement', 'decrement']:
                new_armetadata = armetadata.copy_template()
                model_aux = new_armetadata['model_parameters']['spark']
                if adjusting == 'improvement':
                    model_aux['parameters']['smoothing']['value'] = model_aux['parameters']['smoothing'][
                        'value'] * (1 + nv_improvement)
                else:
                    # presumably nv_divisor is a fraction in (0, 1); its name
                    # suggests division was intended — TODO confirm.
                    model_aux['parameters']['smoothing']['value'] = model_aux['parameters']['smoothing'][
                        'value'] * (1 - nv_divisor)
                model_list.append(new_armetadata)
        elif model['model'] == 'BisectingKMeans':
            # Only growth of the iteration budget is explored.
            new_armetadata = armetadata.copy_template()
            model_aux = new_armetadata['model_parameters']['spark']
            model_aux['parameters']['maxIter']['value'] = \
                int(model_aux['parameters']['maxIter']['value'] * clustering_increment)
            model_list.append(new_armetadata)
        elif model['model'] == 'KMeans':
            # Round-2 width sweep over initialization modes.
            if deepness == 2 and len(eval(model['parameters']['initMode']['type'])) != 0:
                for element in eval(model['parameters']['initMode']['type']):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['spark']
                    model_aux['parameters']['initMode']['value'] = element
                    model_list.append(new_armetadata)
            # Always grow the iteration budget.
            new_armetadata = armetadata.copy_template()
            model_aux = new_armetadata['model_parameters']['spark']
            model_aux['parameters']['maxIter']['value'] = \
                int(model_aux['parameters']['maxIter']['value'] * clustering_increment)
            model_list.append(new_armetadata)
        else:
            # Unknown model type: nothing to optimize.
            return None
    # None (rather than an empty list) tells callers no candidates exist.
    if len(model_list) == 0:
        return None
    else:
        return model_list
def reconstruct_execution_tree(self, arlist=None, metric='combined', store=True):
    """Rebuild the experiment's execution tree from model metadata.

    Models are ranked by *metric*, placed at their 'round' level, and linked
    to their predecessor at a lower level (the synthetic 'root' sits at
    level 0). Optionally stores the tree as JSON on the primary storage path
    (unless that path is mongoDB).

    :param arlist: ArMetadata list; may be None when the experiment can be
        recovered from mongoDB for the current analysis id.
    :param metric: ordering metric passed to priorize_list.
    :param store: when True, persist the tree JSON on the primary path.
    :return: the root OrderedDict of the tree, or None on failure.
    """
    if (arlist is None or len(arlist) == 0) and self._ec.get_id_analysis() is None:
        self._logging.log_critical('gDayF', 'controller', self._labels["failed_model"])
        return None
    elif self._ec.get_id_analysis() is not None and self._ec.get_id_user() != 'guest':
        new_arlist = PersistenceHandler(self._ec).recover_experiment_mongoDB()
    else:
        new_arlist = arlist
    ordered_list = self.priorize_list(arlist=new_arlist, metric=metric)
    # Synthetic root node at level 0.
    root = OrderedDict()
    root['data'] = None
    root['ranking'] = 0
    root['successors'] = OrderedDict()
    variable_dict = OrderedDict()
    variable_dict[0] = {'root': root}
    # Index every model by level (its 'round') and model id, keeping rank.
    ranking = 1
    for new_tree_structure in ordered_list:
        new_model = deep_ordered_copy(new_tree_structure)
        model_id = new_tree_structure['model_parameters'][get_model_fw(new_tree_structure)] \
            ['parameters']['model_id']['value']
        level = new_tree_structure['round']
        if level not in variable_dict.keys():
            variable_dict[level] = OrderedDict()
        new_tree_structure = OrderedDict()
        new_tree_structure['ranking'] = ranking
        new_tree_structure['data'] = new_model
        new_tree_structure['successors'] = OrderedDict()
        variable_dict[level][model_id] = new_tree_structure
        ranking += 1
    # Link each node to its predecessor, searching downward level by level.
    max_level = max(variable_dict.keys())
    for level in range(1, max_level + 1):
        for model_id, new_tree_structure in variable_dict[level].items():
            predecessor = new_tree_structure['data']['predecessor']
            found = False
            counter = 1
            # Fixed: the previous guard (`while not found or (level - counter)
            # == 0`) walked past level 0 and raised KeyError on missing
            # predecessors instead of reaching the log_debug below. Also
            # replaced an eval() with the equivalent direct dict access.
            while not found and (level - counter) >= 0:
                if predecessor in variable_dict[level - counter].keys():
                    container = variable_dict[level - counter][predecessor]
                    container['successors'][model_id] = new_tree_structure
                    found = True
                counter += 1
            if not found:
                self._logging.log_debug(self._ec.get_id_analysis(), 'controller',
                                        self._labels['fail_reconstruct'], model_id)
    #Store_json on primary path
    if store and self._config['storage']['primary_path'] != 'mongoDB':
        primary_path = self._config['storage']['primary_path']
        fstype = self._config['storage'][primary_path]['type']
        datafile = list()
        datafile.append(self._config['storage'][primary_path]['value'])
        datafile.append('/')
        datafile.append(self._ec.get_id_user())
        datafile.append('/')
        datafile.append(self._ec.get_id_workflow())
        datafile.append('/')
        datafile.append(self._config['common']['execution_tree_dir'])
        datafile.append('/')
        datafile.append(self._ec.get_id_analysis())
        datafile.append('.json')
        if self._config['persistence']['compress_json']:
            datafile.append('.gz')
        storage = StorageMetadata(self._ec)
        storage.append(value=''.join(datafile), fstype=fstype)
        PersistenceHandler(self._ec).store_json(storage, root)
    return root
def analysisnormal(self, dataframe_metadata, objective_column, amode):
    """Plan the next round of a normal (from scratch) analysis.

    Round 1 seeds candidates via base_iteration; round 2 expands up to
    adviser_L2_wide best model types; later rounds expand up to
    adviser_normal_wide; rounds beyond deep_impact stop the search.

    :param dataframe_metadata: DFMetada description of the input dataset.
    :param objective_column: target column name (None for unsupervised).
    :param amode: analysis mode forwarded to base_iteration.
    :return: tuple (analysis id, next_analysis_list); next_analysis_list is
        None when the search is exhausted.
    """
    self.next_analysis_list.clear()
    if self.deepness == 1:
        self.base_iteration(amode, dataframe_metadata, objective_column)
    elif self.deepness > self.deep_impact:
        # Depth budget exhausted: signal the caller to stop iterating.
        self.next_analysis_list = None
    elif self.deepness == 2:
        # Round 2: widen the search over the best distinct model types.
        fw_model_list = list()
        # Added 31/08/2017
        best_models = list()
        # End - Added 31/08/2017
        aux_loop_controller = len(self.analysis_recommendation_order)
        for indexer in range(0, aux_loop_controller):
            try:
                model = self.analysis_recommendation_order[indexer]
                if model['status'] == 'Executed':
                    model_type = model['model_parameters'][get_model_fw(model)]['model']
                    if model_type not in best_models and len(
                            best_models) < self._config['adviser_L2_wide']:
                        fw_model_list.extend(
                            self.optimize_models(
                                self.analysis_recommendation_order[indexer]))
                        best_models.append(model_type)
            except TypeError:
                # optimize_models returned None (no new candidates).
                ''' If all optimize_models doesn't return new models
                register it as evaluated and seleted'''
                # NOTE(review): model_type may be unbound here when the
                # TypeError fires before its assignment (NameError risk).
                best_models.append(model_type)
        self.next_analysis_list.extend(fw_model_list)
        if len(self.next_analysis_list) == 0:
            self.next_analysis_list = None
    elif self.next_analysis_list is not None:
        # Later rounds: narrower expansion bounded by adviser_normal_wide.
        fw_model_list = list()
        # Added 31/08/2017
        best_models = list()
        # End - Added 31/08/2017
        aux_loop_controller = len(self.analysis_recommendation_order)
        for indexer in range(0, aux_loop_controller):
            try:
                model = self.analysis_recommendation_order[indexer]
                if model['status'] == 'Executed':
                    model_type = model['model_parameters'][get_model_fw(model)]['model']
                    if model_type not in best_models and len(
                            best_models) < self._config['adviser_normal_wide']:
                        fw_model_list.extend(
                            self.optimize_models(
                                self.analysis_recommendation_order[indexer]))
                        #print("Trace:%s-%s" % (model_type, best_models))
                        best_models.append(model_type)
            except TypeError:
                ''' If all optimize_models doesn't return new models
                register it as evaluated and seleted'''
                # NOTE(review): same unbound model_type risk as above.
                best_models.append(model_type)
        '''' Modified 20/09/2017
        # Get two most potential best models
        fw_model_list = list()
        for indexer in range(0, 2):
            try:
                fw_model_list.extend(self.optimize_models(self.analysis_recommendation_order[indexer]))
            except TypeError:
                pass
        #if fw_model_list is not None:'''
        self.next_analysis_list.extend(fw_model_list)
        if len(self.next_analysis_list) == 0:
            self.next_analysis_list = None
    self.deepness += 1
    return self._ec.get_id_analysis(), self.next_analysis_list
def table_model_list(self, ar_list, metric):
    """Build a pandas DataFrame summarizing models ranked by *metric*.

    Accuracy/regression metrics yield accuracy + RMSE/r2 columns (with a
    reduced record when test-side keys are missing, e.g. AutoEncoders);
    clustering metrics yield k / tot_withinss / betweenss columns.

    :param ar_list: list of ArMetadata model descriptors.
    :param metric: ranking metric; also selects which columns are emitted.
    :return: pandas DataFrame with one row per model.
    """
    dataframe = list()
    # NOTE(review): normal_cols/cluster_cols are never used below (and the
    # capitalized names do not match the record keys) — dead locals.
    normal_cols = [
        'Model', 'Train_accuracy', 'Test_accuracy', 'Combined_accuracy',
        'train_rmse', 'test_rmse'
    ]
    cluster_cols = ['Model', 'k', 'tot_withinss', 'betweenss']
    ordered_list = self.priorize_list(arlist=ar_list, metric=metric)
    for model in ordered_list:
        if metric in ACCURACY_METRICS or metric in REGRESSION_METRICS:
            try:
                dataframe.append({
                    'Model':
                    model['model_parameters'][get_model_fw(model)]
                    ['parameters']['model_id']['value'],
                    'Round':
                    model['round'],
                    'train_accuracy':
                    model['metrics']['accuracy']['train'],
                    'test_accuracy':
                    model['metrics']['accuracy']['test'],
                    'combined_accuracy':
                    model['metrics']['accuracy']['combined'],
                    'train_rmse':
                    model['metrics']['execution']['train']['RMSE'],
                    'test_rmse':
                    model['metrics']['execution']['test']['RMSE'],
                    'train_r2':
                    model['metrics']['execution']['train']['r2'],
                    'test_r2':
                    model['metrics']['execution']['test']['r2'],
                    'path':
                    model['json_path'][0]['value']
                })
            # AutoEncoders metrics
            except KeyError:
                # Reduced record when test/r2 keys are absent.
                dataframe.append({
                    'Model':
                    model['model_parameters'][get_model_fw(model)]
                    ['parameters']['model_id']['value'],
                    'Round':
                    model['round'],
                    'train_accuracy':
                    model['metrics']['accuracy']['train'],
                    'test_accuracy':
                    model['metrics']['accuracy']['test'],
                    'combined_accuracy':
                    model['metrics']['accuracy']['combined'],
                    'train_rmse':
                    model['metrics']['execution']['train']['RMSE'],
                    'path':
                    model['json_path'][0]['value']
                })
        if metric in CLUSTERING_METRICS:
            # Some clustering models do not report k; default to 0.
            try:
                aux = model['metrics']['execution']['train']['k']
            except KeyError:
                aux = 0
            dataframe.append({
                'Model':
                model['model_parameters'][get_model_fw(model)]
                ['parameters']['model_id']['value'],
                'Round':
                model['round'],
                'k':
                aux,
                'tot_withinss':
                model['metrics']['execution']['train']['tot_withinss'],
                'betweenss':
                model['metrics']['execution']['train']['betweenss'],
                'path':
                model['json_path'][0]['value']
            })
    return DataFrame(dataframe)
def log_model_list(self, ar_list, metric):
    """Log a ranked summary of the analyzed models.

    The top-ranked model is logged with the 'best_model' label and the rest
    with 'res_model'; each model's round, normalizations and (depending on
    *metric*) accuracy/RMSE/r2 or clustering figures follow.

    :param ar_list: list of ArMetadata model descriptors.
    :param metric: ranking metric; selects which metric lines are logged.
    """
    ranked = self.priorize_list(arlist=ar_list, metric=metric)
    analysis_id = self._ec.get_id_analysis()
    log = self._logging.log_info
    for position, model in enumerate(ranked):
        # First entry is the winner; subsequent entries are runners-up.
        header = self._labels["best_model"] if position == 0 else self._labels["res_model"]
        log(analysis_id, 'controller', header,
            model['model_parameters'][get_model_fw(model)]['parameters']['model_id']['value'])
        log(analysis_id, 'controller', self._labels["round_reach"], model['round'])
        norms = model["normalizations_set"]
        log(analysis_id, 'controller', self._labels["norm_app"],
            [] if norms is None else norms)
        if metric in ACCURACY_METRICS or metric in REGRESSION_METRICS:
            execution = model['metrics']['execution']
            log(analysis_id, 'controller', self._labels["ametric_order"],
                model['metrics']['accuracy'])
            log(analysis_id, 'controller', self._labels["pmetric_order"],
                execution['train']['RMSE'])
            log(analysis_id, 'controller', self._labels["pmetric_order"],
                execution['test']['RMSE'])
            log(analysis_id, 'controller', self._labels["rmetric_order"],
                execution['train']['r2'])
            log(analysis_id, 'controller', self._labels["rmetric_order"],
                execution['test']['r2'])
        if metric in CLUSTERING_METRICS:
            # k is not reported by every clustering model; log "0" then.
            try:
                log(analysis_id, 'controller', self._labels["ckmetric_order"],
                    model['metrics']['execution']['train']['k'])
            except KeyError:
                log(analysis_id, 'controller', self._labels["ckmetric_order"], "0")
            log(analysis_id, 'controller', self._labels["ctmetric_order"],
                model['metrics']['execution']['train']['tot_withinss'])
            log(analysis_id, 'controller', self._labels["cbmetric_order"],
                model['metrics']['execution']['train']['betweenss'])
def exec_analysis(self, datapath, objective_column, amode=POC,
                  metric='test_accuracy', deep_impact=3, **kwargs):
    """Run a full adviser-driven model-search analysis over a dataset.

    :param datapath: CSV file path (str) or a pandas DataFrame.
    :param objective_column: target column name, or None for unsupervised.
    :param amode: analysis mode (default POC).
    :param metric: metric that drives the adviser's recommendations.
    :param deep_impact: maximum adviser search depth.
    :param kwargs: optional knobs — 'k' (int) and 'estimate_k' (bool) for
        clustering models, 'atype' (must be one of ``atypes``) to force an
        analysis type.
    :return: (success label, ordered list of analyzed model metadata) on
        success; on input failure a single failure label (str branch) or
        (failure label, None) (unrecognized datapath type).
    """
    # Clustering variables
    k = None
    estimate_k = False
    # Force analysis variable
    atype = None
    hash_dataframe = ''
    # Validate and pick up the supported optional keyword arguments.
    for pname, pvalue in kwargs.items():
        if pname == 'k':
            assert isinstance(pvalue, int)
            k = pvalue
        elif pname == 'estimate_k':
            assert isinstance(pvalue, bool)
            estimate_k = pvalue
        elif pname == 'atype':
            assert pvalue in atypes
            atype = pvalue

    # NOTE(review): 'supervised' is computed but not used later in this
    # method as visible here — confirm whether it is dead.
    supervised = True
    if objective_column is None:
        supervised = False

    self._logging.log_info('gDayF', "Controller", self._labels["start"])
    self._logging.log_info('gDayF', "Controller",
                           self._labels["ana_param"], metric)
    self._logging.log_info('gDayF', "Controller",
                           self._labels["dep_param"], deep_impact)
    self._logging.log_info('gDayF', "Controller",
                           self._labels["ana_mode"], amode)

    # Load the dataset and derive an id and content hash for it.
    if isinstance(datapath, str):
        try:
            self._logging.log_info('gDayF', "Controller",
                                   self._labels["input_param"], datapath)
            pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
            id_datapath = Path(datapath).name
            hash_dataframe = hash_key('MD5', datapath)
        except IOError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"],
                                       datapath)
            return self._labels['failed_input']
        except OSError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"],
                                       datapath)
            return self._labels['failed_input']
        except JSONDecodeError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"],
                                       datapath)
            return self._labels['failed_input']
    elif isinstance(datapath, DataFrame):
        self._logging.log_info('gDayF', "Controller",
                               self._labels["input_param"],
                               str(datapath.shape))
        pd_dataset = datapath
        # Synthetic id built from the frame's size and shape.
        id_datapath = 'Dataframe' + \
                      '_' + str(pd_dataset.size) + \
                      '_' + str(pd_dataset.shape[0]) + \
                      '_' + str(pd_dataset.shape[1])
        #hash_dataframe = md5(datapath.to_msgpack()).hexdigest()
        hash_dataframe = md5(
            datapath.to_json().encode('utf-8')).hexdigest()
    else:
        self._logging.log_critical('gDayF', "Controller",
                                   self._labels["failed_input"], datapath)
        return self._labels['failed_input'], None

    pd_test_dataset = None
    '''
    Changed 05/04/2018
    if metric == 'combined_accuracy' or 'test_accuracy':'''
    # Split off a test frame only when the dataset is big enough and the
    # metric family requires a held-out evaluation.
    if self._config['common']['minimal_test_split'] <= len(pd_dataset.index) \
            and (metric in ACCURACY_METRICS or metric in REGRESSION_METRICS):
        pd_dataset, pd_test_dataset = pandas_split_data(
            pd_dataset,
            train_perc=self._config['common']['test_frame_ratio'])

    df = DFMetada().getDataFrameMetadata(pd_dataset, 'pandas')
    self._ec.set_id_analysis(self._ec.get_id_user() + '_' + id_datapath +
                             '_' + str(time()))
    adviser = self.adviser.AdviserAStar(e_c=self._ec, metric=metric,
                                        deep_impact=deep_impact,
                                        dataframe_name=id_datapath,
                                        hash_dataframe=hash_dataframe)

    adviser.set_recommendations(dataframe_metadata=df,
                                objective_column=objective_column,
                                amode=amode, atype=atype)

    # Train every recommended model, re-rank, and ask the adviser for the
    # next batch until it stops recommending (next_analysis_list is None).
    while adviser.next_analysis_list is not None:
        for each_model in adviser.next_analysis_list:
            fw = get_model_fw(each_model)
            if k is not None:
                # Inject the user-supplied clustering parameters when the
                # model exposes them; silently skip otherwise.
                try:
                    each_model["model_parameters"][fw]["parameters"]["k"][
                        "value"] = k
                    each_model["model_parameters"][fw]["parameters"]["k"][
                        "seleccionable"] = True
                    each_model["model_parameters"][fw]["parameters"][
                        "estimate_k"]["value"] = estimate_k
                    each_model["model_parameters"][fw]["parameters"][
                        "estimate_k"]["seleccionable"] = True
                except KeyError:
                    pass
            self.init_handler(fw)
            # Without a dedicated test frame, testing falls back to the
            # training frame itself.
            if pd_test_dataset is not None:
                _, analyzed_model = self.model_handler[fw][
                    'handler'].order_training(training_pframe=pd_dataset,
                                              base_ar=each_model,
                                              test_frame=pd_test_dataset,
                                              filtering='STANDARDIZE')
            else:
                _, analyzed_model = self.model_handler[fw][
                    'handler'].order_training(training_pframe=pd_dataset,
                                              base_ar=each_model,
                                              test_frame=pd_dataset,
                                              filtering='STANDARDIZE')
            if analyzed_model is not None:
                adviser.analysis_recommendation_order.append(
                    analyzed_model)
        adviser.next_analysis_list.clear()
        adviser.analysis_recommendation_order = adviser.priorize_models(
            model_list=adviser.analysis_recommendation_order)
        # NOTE(review): 'atype' is not forwarded on subsequent rounds,
        # unlike the initial call above — confirm this is intentional.
        adviser.set_recommendations(dataframe_metadata=df,
                                    objective_column=objective_column,
                                    amode=amode)

    self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                           self._labels["ana_models"],
                           str(len(adviser.analyzed_models)))
    self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                           self._labels["exc_models"],
                           str(len(adviser.excluded_models)))
    self._logging.log_exec(self._ec.get_id_analysis(), 'controller',
                           self._labels["end"])

    self.clean_handlers()
    adviser.analysis_recommendation_order = adviser.priorize_models(
        model_list=adviser.analysis_recommendation_order)
    return self._labels[
        'success_op'], adviser.analysis_recommendation_order
def exec_prediction(self, datapath, armetadata=None, model_file=None):
    """Execute a prediction over a dataset with a previously trained model.

    The model metadata comes either from ``armetadata`` (in-memory
    ArMetadata, takes precedence) or from ``model_file`` (persisted json
    location); at least one must be provided.

    :param datapath: CSV file path (str) or a pandas DataFrame.
    :param armetadata: in-memory model metadata structure.
    :param model_file: persisted model metadata path.
    :return: the prediction frame on success (None if the handler raised
        TypeError), or a failure label string on bad model/input.
    """
    self._logging.log_info('gDayF', "Controller", self._labels["ana_mode"],
                           'prediction')

    # Resolve base_ar (the model metadata) from one of the two sources.
    if armetadata is None and model_file is None:
        self._logging.log_critical('gDayF', "Controller",
                                   self._labels["failed_model"], datapath)
        return self._labels["failed_model"]
    elif armetadata is not None:
        try:
            assert isinstance(armetadata, ArMetadata)
            base_ar = deep_ordered_copy(armetadata)
        except AssertionError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_model"],
                                       armetadata)
            return self._labels["failed_model"]
    elif model_file is not None:
        try:
            persistence = PersistenceHandler(self._ec)
            invalid, base_ar = persistence.get_ar_from_engine(model_file)
            del persistence
            if invalid:
                self._logging.log_critical('gDayF', "Controller",
                                           self._labels["failed_model"],
                                           model_file)
                return self._labels["failed_model"]
        except IOError as iexecution_error:
            print(repr(iexecution_error))
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_model"],
                                       model_file)
            return self._labels["failed_model"]
        except OSError as oexecution_error:
            print(repr(oexecution_error))
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_model"],
                                       model_file)
            return self._labels["failed_model"]

    # Load the frame to predict on.
    if isinstance(datapath, str):
        try:
            self._logging.log_info('gDayF', "Controller",
                                   self._labels["input_param"], datapath)
            pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
        # BUGFIX: was "except [IOError, OSError, JSONDecodeError]:" — a
        # list is not a valid exception specification in Python 3, so any
        # error here raised TypeError instead of being handled. Exception
        # groups must be tuples.
        except (IOError, OSError, JSONDecodeError):
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"],
                                       datapath)
            return self._labels['failed_input']
    elif isinstance(datapath, DataFrame):
        pd_dataset = datapath
        self._logging.log_info('gDayF', "Controller",
                               self._labels["input_param"],
                               str(datapath.shape))
    else:
        self._logging.log_critical('gDayF', "Controller",
                                   self._labels["failed_input"], datapath)
        return self._labels['failed_input']

    fw = get_model_fw(base_ar)
    self.init_handler(fw)
    prediction_frame = None
    try:
        prediction_frame, _ = self.model_handler[fw]['handler'].predict(
            predict_frame=pd_dataset, base_ar=base_ar)
    except TypeError:
        # Handler rejected the frame/model combination; best-effort: log
        # and fall through returning None.
        self._logging.log_critical('gDayF', "Controller",
                                   self._labels["failed_model"],
                                   model_file)
    self.clean_handler(fw)
    self._logging.log_info('gDayF', 'controller', self._labels["pred_end"])
    return prediction_frame
def optimize_models(self, armetadata, metric_value, objective, deepness, deep_impact):
    """Generate candidate hyper-parameter variants of an analyzed h2o model.

    Starting from ``armetadata`` as a template, build copies with mutated
    parameters depending on the estimator type, the current search depth
    and the configured limits/increments. Only h2o models that have not
    reached ``objective`` and did not fail are optimized.

    :param armetadata: analyzed model metadata used as the template.
    :param metric_value: current metric value reached by the model.
    :param objective: target metric value; reaching it stops optimization.
    :param deepness: current adviser search depth (2 = first optimization
        round).
    :param deep_impact: maximum allowed search depth.
    :return: list of new candidate metadata structures, or None when
        nothing applies (non-h2o model, objective reached, failed model,
        missing metrics, or no variant generated).
    """
    model_list = list()
    model = armetadata['model_parameters'][get_model_fw(armetadata)]
    config = self._config
    if get_model_fw(armetadata) == 'h2o' and metric_value != objective \
            and armetadata['status'] != self._labels['failed_op']:
        try:
            model_metric = decode_ordered_dict_to_dataframe(
                armetadata['metrics']['model'])
            # Naive Bayes exposes no scoring history.
            if model['model'] not in ['H2ONaiveBayesEstimator']:
                scoring_metric = decode_ordered_dict_to_dataframe(
                    armetadata['metrics']['scoring'])
            # Optimization limits and increments from configuration.
            nfold_limit = config['nfold_limit']
            min_rows_limit = config['min_rows_limit']
            cols_breakdown = config['cols_breakdown']
            nfold_increment = config['nfold_increment']
            min_rows_increment = config['min_rows_increment']
            max_interactions_rows_breakdown = config[
                'max_interactions_rows_breakdown']
            max_interactions_increment = config[
                'max_interactions_increment']
            max_depth_increment = config['max_depth_increment']
            ntrees_increment = config['ntrees_increment']
            dpl_rcount_limit = config['dpl_rcount_limit']
            dpl_divisor = config['dpl_divisor']
            h_dropout_ratio = config['h_dropout_ratio']
            epochs_increment = config['epochs_increment']
            dpl_min_batch_size = config['dpl_min_batch_size']
            dpl_batch_reduced_divisor = config['dpl_batch_reduced_divisor']
            deeper_increment = config['deeper_increment']
            wider_increment = config['wider_increment']
            learning_conf = config['learning_conf']
            rho_conf = config['rho_conf']
            nv_laplace = config['nv_laplace']
            nv_min_prob = config['nv_min_prob']
            nv_min_sdev = config['nv_min_sdev']
            nv_improvement = config['nv_improvement']
            nv_divisor = config['nv_divisor']
            clustering_increment = config['clustering_increment']
            sample_rate = config['sample_rate']

            if model['model'] == 'H2OGradientBoostingEstimator':
                # First round, regression: try the tweedie distribution.
                if (deepness == 2
                        ) and model['types'][0]['type'] == 'regression':
                    for tweedie_power in [1.1, 1.5, 1.9]:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['distribution'][
                            'value'] = 'tweedie'
                        model_aux['parameters'][
                            'tweedie_power'] = ParameterMetadata()
                        model_aux['parameters']['tweedie_power'].set_value(
                            tweedie_power)
                        model_list.append(new_armetadata)
                if deepness == 2:
                    # Sweep the configured learn-rate/annealing pairs.
                    for learning in learning_conf:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['learn_rate'][
                            'value'] = learning['learn']
                        model_aux['parameters']['learn_rate_annealing'][
                            'value'] = learning['improvement']
                        model_list.append(new_armetadata)
                # Grow ntrees when the model used up all allowed trees.
                if model_metric['number_of_trees'][0] >= model[
                        'parameters']['ntrees']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['ntrees'][
                        'value'] *= ntrees_increment
                    model_list.append(new_armetadata)
                # Grow max_depth when the model reached the allowed depth.
                if model_metric['max_depth'][0] >= model['parameters'][
                        'max_depth']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['max_depth'][
                        'value'] *= max_depth_increment
                    model_list.append(new_armetadata)
                if model['parameters']['nfolds']['value'] < nfold_limit:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['nfolds'][
                        'value'] += nfold_increment
                    model_list.append(new_armetadata)
                if model['parameters']['min_rows'][
                        'value'] > min_rows_limit:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['min_rows']['value'] = round(
                        model_aux['parameters']['min_rows']['value'] /
                        min_rows_increment, 0)
                    model_list.append(new_armetadata)

            elif model['model'] == 'H2OGeneralizedLinearEstimator':
                # Scale max_iterations when the solver used them all up.
                if model_metric['number_of_iterations'][0] >= model[
                        'parameters']['max_iterations']['value']:
                    if deepness == 2:
                        max_iterations = model['parameters']['max_iterations']['value'] * \
                            max(round(
                                armetadata['data_initial']['rowcount'] /
                                max_interactions_rows_breakdown), 1)
                    else:
                        max_iterations = model['parameters'][
                            'max_iterations'][
                            'value'] * max_interactions_increment
                else:
                    max_iterations = model['parameters']['max_iterations'][
                        'value']
                # First round, regression: sweep tweedie variance powers.
                if (deepness == 2
                        ) and model['types'][0]['type'] == 'regression':
                    for tweedie_power in [1.0, 1.5, 2.0, 2.5, 3.0]:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['tweedie_variance_power'][
                            'value'] = tweedie_power
                        model_aux['parameters']['max_iterations'][
                            'value'] = max_iterations
                        model_list.append(new_armetadata)
                if deepness == 2:
                    # Try pure ridge (alpha=0) and pure lasso (alpha=1).
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['alpha']['value'] = 0.0
                    model_aux['parameters']['max_iterations'][
                        'value'] = max_iterations
                    model_list.append(new_armetadata)
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['alpha']['value'] = 1.0
                    model_aux['parameters']['max_iterations'][
                        'value'] = max_iterations
                    model_list.append(new_armetadata)
                    # Wide datasets: try the L-BFGS solver.
                    if armetadata['data_initial']['cols'] > cols_breakdown:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['solver'][
                            'value'] = 'L_BFGS'
                        model_aux['parameters']['max_iterations'][
                            'value'] = max_iterations
                        model_list.append(new_armetadata)
                if deepness == 2:
                    # Flip the class-balancing flag.
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['balance_classes']['value'] = \
                        not model_aux['parameters']['balance_classes']['value']
                    model_aux['parameters']['max_iterations'][
                        'value'] = max_iterations
                    model_list.append(new_armetadata)
                if model['parameters']['nfolds']['value'] < nfold_limit:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['nfolds'][
                        'value'] += nfold_increment
                    model_aux['parameters']['max_iterations'][
                        'value'] = max_iterations
                    model_list.append(new_armetadata)

            elif model['model'] == 'H2ODeepLearningEstimator':
                # Extend epochs when training ran out of them (or there is
                # no scoring history at all).
                if scoring_metric.shape[0] == 0 or \
                        (scoring_metric['epochs'].max() >= model['parameters']['epochs']['value']):
                    epochs = model['parameters']['epochs'][
                        'value'] * epochs_increment
                else:
                    epochs = model['parameters']['epochs']['value']
                if deepness == 2:
                    # Variant 1: resize the (single) hidden layer.
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    if armetadata['data_initial'][
                            'rowcount'] > dpl_rcount_limit:
                        model_aux['parameters']['hidden']['value'] = \
                            round(armetadata['data_initial']['rowcount'] /
                                  (dpl_divisor * 0.5))
                    else:
                        model_aux['parameters']['hidden']['value'][0] = \
                            round(model['parameters']['hidden']['value'][0]
                                  * wider_increment)
                    model_list.append(new_armetadata)
                    # Sweep rho/epsilon on top of the previous variant
                    # (note: copies the *new* template, increment=0).
                    for learning in rho_conf:
                        new_armetadata = new_armetadata.copy_template(
                            increment=0)
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['rho']['value'] = learning[
                            'learn']
                        model_aux['parameters']['epsilon'][
                            'value'] = learning['improvement']
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                    # Variant 2: two hidden layers with dropout.
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    if armetadata['data_initial'][
                            'rowcount'] > dpl_rcount_limit:
                        model_aux['parameters']['hidden']['value'] = \
                            [round(armetadata['data_initial']['rowcount'] /
                                   (dpl_divisor * 0.5)),
                             round(armetadata['data_initial']['rowcount'] /
                                   (dpl_divisor * deep_impact))]
                    else:
                        model_aux['parameters']['hidden']['value'] = [
                            model['parameters']['hidden']['value'][0],
                            round(
                                model['parameters']['hidden']['value'][0] /
                                wider_increment)
                        ]
                    model_aux['parameters']['hidden_dropout_ratios'][
                        'value'] = [h_dropout_ratio, h_dropout_ratio]
                    model_list.append(new_armetadata)
                    # rho/epsilon sweep over the two-layer variant too.
                    for learning in rho_conf:
                        new_armetadata = new_armetadata.copy_template(
                            increment=0)
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['rho']['value'] = learning[
                            'learn']
                        model_aux['parameters']['epsilon'][
                            'value'] = learning['improvement']
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                # Third round, regression: tweedie + tanh_with_dropout.
                if (deepness == 3
                        ) and model['types'][0]['type'] == 'regression':
                    for tweedie_power in [1.1, 1.5, 1.9]:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['distribution'][
                            'value'] = 'tweedie'
                        model_aux['parameters'][
                            'tweedie_power'] = ParameterMetadata()
                        model_aux['parameters']['tweedie_power'].set_value(
                            tweedie_power)
                        model_aux['parameters']['activation'][
                            'value'] = 'tanh_with_dropout'
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                if deepness == 3 and not model['parameters']['sparse'][
                        'value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['sparse'][
                        'value'] = not model_aux['parameters']['sparse'][
                            'value']
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)
                '''Eliminado 19/09/2017
                if deepness == 3 and model['parameters']['activation']['value'] == "rectifier_with_dropout":
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['activation']['value'] = 'tanh_with_dropout'
                    model_list.append(new_armetadata)'''
                # Toggle the weight-initialization distribution.
                if deepness == 3 and model['parameters'][
                        'initial_weight_distribution']['value'] == "normal":
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['initial_weight_distribution'][
                        'value'] = "uniform"
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)
                elif deepness == 3:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['initial_weight_distribution'][
                        'value'] = "normal"
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)
                if deepness > 2 and deepness <= deep_impact:
                    # "Deeper" variant: add/scale a layer, keeping the
                    # topology monotone (grow the larger end).
                    if len(armetadata['model_parameters']['h2o']
                           ['parameters']['hidden']['value']) < 4:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        if len(model_aux['parameters']['hidden']['value']) > 1 \
                                and model_aux['parameters']['hidden']['value'][0] > \
                                model_aux['parameters']['hidden']['value'][1]:
                            model_aux['parameters']['hidden'][
                                'value'].insert(
                                0, round(model_aux['parameters']['hidden']
                                         ['value'][0] * deeper_increment))
                            model_aux['parameters'][
                                'hidden_dropout_ratios']['value'].insert(
                                0, h_dropout_ratio)
                        elif len(model_aux['parameters']['hidden']['value']) > 1 \
                                and model_aux['parameters']['hidden']['value'][0] < \
                                model_aux['parameters']['hidden']['value'][1]:
                            model_aux['parameters']['hidden'][
                                'value'].append(
                                round(model_aux['parameters']['hidden']
                                      ['value'][-1] * deeper_increment))
                            model_aux['parameters'][
                                'hidden_dropout_ratios']['value'].append(
                                h_dropout_ratio)
                        elif len(model_aux['parameters']['hidden']
                                 ['value']) == 1:
                            model_aux['parameters']['hidden']['value'][0] = \
                                round(model_aux['parameters']['hidden']['value'][0]
                                      * deeper_increment)
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                    # "Wider" variant: scale every hidden layer.
                    # NOTE(review): placement relative to the `< 4` guard
                    # above reconstructed from mangled source — confirm.
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters'][
                        'h2o']
                    for iterador in range(
                            0, len(model_aux['parameters']['hidden']
                                   ['value'])):
                        model_aux['parameters']['hidden']['value'][iterador] = \
                            int(round(model_aux['parameters']['hidden']['value'][iterador])
                                * wider_increment)
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)
                # Shrink the mini-batch while above the configured floor.
                if model['parameters']['mini_batch_size'][
                        'value'] >= dpl_min_batch_size:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['mini_batch_size']['value'] = \
                        round(model_aux['parameters']['mini_batch_size']['value'] /
                              dpl_batch_reduced_divisor)
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)

            elif model['model'] == 'H2ORandomForestEstimator':
                if deepness == 2:
                    # Cartesian sweep of row and per-tree column sampling.
                    for size in sample_rate:
                        for size2 in sample_rate:
                            new_armetadata = armetadata.copy_template()
                            model_aux = new_armetadata['model_parameters'][
                                'h2o']
                            model_aux['parameters']['sample_rate'][
                                'value'] = size['size']
                            model_aux['parameters'][
                                'col_sample_rate_per_tree'][
                                'value'] = size2['size']
                            model_list.append(new_armetadata)
                if model_metric['number_of_trees'][0] == model[
                        'parameters']['ntrees']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['ntrees'][
                        'value'] *= ntrees_increment
                    model_list.append(new_armetadata)
                if model_metric['max_depth'][0] == model['parameters'][
                        'max_depth']['value']:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['max_depth'][
                        'value'] *= max_depth_increment
                    model_list.append(new_armetadata)
                if model['parameters']['nfolds']['value'] < nfold_limit:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['nfolds'][
                        'value'] += nfold_increment
                    model_list.append(new_armetadata)
                # Try the two canonical mtries heuristics if not used yet.
                if model['parameters']['mtries']['value'] not in [
                        round(armetadata['data_initial']['cols'] / 2),
                        round(armetadata['data_initial']['cols'] * 3 / 4)
                ]:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['mtries']['value'] = round(
                        armetadata['data_initial']['cols'] / 2)
                    model_list.append(new_armetadata)
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['mtries']['value'] = round(
                        armetadata['data_initial']['cols'] * 3 / 4)
                    model_list.append(new_armetadata)
                if model['parameters']['min_rows']['value'] > (
                        min_rows_limit / 2):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['min_rows']['value'] = round(
                        model_aux['parameters']['min_rows']['value'] /
                        min_rows_increment, 0)
                    model_list.append(new_armetadata)

            elif model['model'] == 'H2ONaiveBayesEstimator':
                if deepness == 2:
                    # Full grid over laplace / min_prob / min_sdev.
                    for laplace in nv_laplace:
                        for min_prob in nv_min_prob:
                            for min_sdev in nv_min_sdev:
                                new_armetadata = armetadata.copy_template()
                                model_aux = new_armetadata[
                                    'model_parameters']['h2o']
                                model_aux['parameters']['laplace'][
                                    'value'] = laplace
                                model_aux['parameters']['min_prob'][
                                    'value'] = min_prob
                                model_aux['parameters']['min_sdev'][
                                    'value'] = min_sdev
                                model_list.append(new_armetadata)
                elif deepness >= 2:
                    # Last round only: flip class balancing.
                    if deepness == deep_impact:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['balance_classes']['value'] = \
                            not model_aux['parameters']['balance_classes']['value']
                        model_list.append(new_armetadata)
                    if model['parameters']['nfolds']['value'] < nfold_limit:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['nfolds'][
                            'value'] += nfold_increment
                        model_list.append(new_armetadata)
                    # Nudge laplace smoothing up and down.
                    for laplace in ['improvement', 'decrement']:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        if laplace == 'improvement':
                            model_aux['parameters']['laplace'][
                                'value'] = model_aux['parameters'][
                                'laplace']['value'] * (1 + nv_improvement)
                        else:
                            model_aux['parameters']['laplace'][
                                'value'] = model_aux['parameters'][
                                'laplace']['value'] * (1 - nv_divisor)
                        model_list.append(new_armetadata)

            elif model['model'] == 'H2OAutoEncoderEstimator':
                # Extend epochs when training ran out of them.
                if scoring_metric.shape[0] == 0 or \
                        (scoring_metric['epochs'].max() >= model['parameters']['epochs']['value']):
                    epochs = model['parameters']['epochs'][
                        'value'] * epochs_increment
                else:
                    epochs = model['parameters']['epochs']['value']
                if deepness == 2:
                    for learning in rho_conf:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        model_aux['parameters']['rho']['value'] = learning[
                            'learn']
                        model_aux['parameters']['epsilon'][
                            'value'] = learning['improvement']
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                if deepness == 3:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['sparse'][
                        'value'] = not model_aux['parameters']['sparse'][
                            'value']
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)
                if deepness > 1 and model['parameters']['activation'][
                        'value'] == "rectifier_with_dropout":
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['activation'][
                        'value'] = 'tanh_with_dropout'
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)
                if deepness == 3 and model['parameters'][
                        'initial_weight_distribution']['value'] == "normal":
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['initial_weight_distribution'][
                        'value'] = "uniform"
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)
                elif deepness == 3:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['initial_weight_distribution'][
                        'value'] = "normal"
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)
                if deepness <= deep_impact:
                    # Widen every layer except the middle (bottleneck) one.
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    for iterador in range(
                            0, len(model_aux['parameters']['hidden']
                                   ['value'])):
                        if iterador != int((float(
                                len(model_aux['parameters']['hidden']
                                    ['value'])) / 2) - 0.5):
                            model_aux['parameters']['hidden']['value'][iterador] = \
                                int(round(model_aux['parameters']['hidden']['value'][iterador]
                                          * wider_increment, 0))
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)
                    # Grow symmetrically: mirror a new outer layer pair.
                    if len(model_aux['parameters']['hidden']['value']) < 5:
                        new_armetadata = armetadata.copy_template()
                        model_aux = new_armetadata['model_parameters'][
                            'h2o']
                        next_hidden = int(
                            round(
                                model_aux['parameters']['hidden']['value']
                                [0] * deeper_increment, 0))
                        model_aux['parameters']['hidden']['value'].insert(
                            0, next_hidden)
                        model_aux['parameters']['hidden_dropout_ratios'][
                            'value'].insert(0, h_dropout_ratio)
                        model_aux['parameters']['hidden']['value'].append(
                            next_hidden)
                        model_aux['parameters']['hidden_dropout_ratios'][
                            'value'].append(h_dropout_ratio)
                        model_aux['parameters']['epochs']['value'] = epochs
                        model_list.append(new_armetadata)
                if model['parameters']['mini_batch_size'][
                        'value'] >= dpl_min_batch_size:
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['mini_batch_size']['value'] = \
                        round(model_aux['parameters']['mini_batch_size']['value'] /
                              dpl_batch_reduced_divisor)
                    model_aux['parameters']['epochs']['value'] = epochs
                    model_list.append(new_armetadata)

            elif model['model'] == 'H2OKMeansEstimator':
                # NOTE(review): with a non-empty scoring history this
                # condition (count >= 0) looks always true — confirm the
                # intent was '> 0'.
                if scoring_metric.shape[0] == 0 or \
                        (int(scoring_metric['number_of_reassigned_observations'][-1:]) >= 0):
                    new_armetadata = armetadata.copy_template()
                    model_aux = new_armetadata['model_parameters']['h2o']
                    model_aux['parameters']['max_iterations']['value'] = \
                        int(model_aux['parameters']['max_iterations']['value']
                            * clustering_increment)
                    model_list.append(new_armetadata)
        except KeyError:
            # Expected metric/parameter keys missing: nothing to optimize.
            return None
    else:
        return None
    if len(model_list) == 0:
        return None
    else:
        return model_list
def generate_json_path(e_c, armetadata, json_type='json'):
    """Build and register storage paths for a model's json metadata.

    For every storage engine returned by the StorageMetadata getter
    matching ``json_type``, compose the destination path and append it to
    a fresh StorageMetadata object, which is finally stored under
    ``armetadata[json_type + '_path']``.

    :param e_c: execution context providing the configuration.
    :param armetadata: model metadata (read for ids, framework, timestamp).
    :param json_type: path family to generate ('json' by default).
    """
    config = e_c.config.get_config()
    fw = get_model_fw(armetadata)
    model_id = armetadata['model_parameters'][fw]['parameters']['model_id'][
        'value']
    compress = config['persistence']['compress_json']
    json_storage = StorageMetadata(e_c)
    # Dynamic dispatch to json_storage.get_<json_type>_path(). getattr
    # replaces the original eval() on a string-built command: identical
    # behavior without executing arbitrary constructed code.
    storage_types = getattr(json_storage, 'get_' + json_type + '_path')()
    for each_storage_type in storage_types:
        if each_storage_type['type'] in ['localfs', 'hdfs']:
            # Filesystem engines: <primary>/<user>/<workflow>/<model_id>/
            # <fw>/<type>/<timestamp>/<engine value>/<model>.json[.gz]
            primary_path = config['storage'][
                each_storage_type['type']]['value']
            source_data = [
                primary_path, '/',
                armetadata['user_id'], '/',
                armetadata['workflow_id'], '/',
                armetadata['model_id'], '/',
                fw, '/',
                armetadata['type'], '/',
                str(armetadata['timestamp']), '/',
            ]
            specific_data = [each_storage_type['value'], '/',
                             model_id, '.json']
            if compress:
                specific_data.append('.gz')
            json_path = ''.join(source_data) + ''.join(specific_data)
            json_storage.append(value=json_path,
                                fstype=each_storage_type['type'],
                                hash_type=each_storage_type['hash_type'])
        elif json_type == 'json':
            # Non-filesystem engines keep a logical key path.
            source_data = [
                '/',
                armetadata['user_id'], '/',
                armetadata['workflow_id'], '/',
                armetadata['model_id'], '/',
                model_id,
            ]
            json_storage.append(value=''.join(source_data),
                                fstype=each_storage_type['type'],
                                hash_type=each_storage_type['hash_type'])
        else:
            json_storage.append(value=each_storage_type['value'],
                                fstype=each_storage_type['type'],
                                hash_type=each_storage_type['hash_type'])
    armetadata[json_type + '_path'] = json_storage
def exec_sanalysis(self, datapath, list_ar_metadata,
                   metric='combined_accuracy', deep_impact=1, **kwargs):
    """Run a specific (re-)analysis over previously analyzed models.

    Unlike :meth:`exec_analysis`, the candidate models come from
    ``list_ar_metadata`` via the adviser's ``analysis_specific`` flow, and
    training is ordered with ``filtering='NONE'``.

    :param datapath: CSV file path (str) or a pandas DataFrame.
    :param list_ar_metadata: previously produced model metadata to re-run.
    :param metric: metric driving prioritization.
    :param deep_impact: maximum adviser search depth.
    :param kwargs: unused here (accepted for interface compatibility).
    :return: (success label, ordered list of analyzed model metadata) on
        success; on input failure a single failure label (str branch) or
        (failure label, None) (unrecognized datapath type).
    """
    self._logging.log_info('gDayF', "Controller", self._labels["start"])
    self._logging.log_info('gDayF', "Controller",
                           self._labels["ana_param"], metric)
    self._logging.log_info('gDayF', "Controller",
                           self._labels["dep_param"], deep_impact)

    # Load the dataset and derive an id and content hash for it.
    if isinstance(datapath, str):
        try:
            self._logging.log_info('gDayF', "Controller",
                                   self._labels["input_param"], datapath)
            pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
            id_datapath = Path(datapath).name
            hash_dataframe = hash_key('MD5', datapath)
        except IOError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"],
                                       datapath)
            return self._labels['failed_input']
        except OSError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"],
                                       datapath)
            return self._labels['failed_input']
        except JSONDecodeError:
            self._logging.log_critical('gDayF', "Controller",
                                       self._labels["failed_input"],
                                       datapath)
            return self._labels['failed_input']
    elif isinstance(datapath, DataFrame):
        hash_dataframe = None
        # NOTE(review): log_critical for an informational message looks
        # like it should be log_info (cf. exec_analysis) — confirm.
        self._logging.log_critical('gDayF', "Controller",
                                   self._labels["input_param"],
                                   str(datapath.shape))
        pd_dataset = datapath
        # Synthetic id built from the frame's size and shape.
        id_datapath = 'Dataframe' + \
                      '_' + str(pd_dataset.size) + \
                      '_' + str(pd_dataset.shape[0]) + \
                      '_' + str(pd_dataset.shape[1])
    else:
        self._logging.log_critical('gDayF', "Controller",
                                   self._labels["failed_input"], datapath)
        return self._labels['failed_input'], None

    pd_test_dataset = None
    # Split off a test frame only when the dataset is big enough and the
    # metric family requires a held-out evaluation.
    if self._config['common']['minimal_test_split'] <= len(pd_dataset.index) \
            and (metric in ACCURACY_METRICS or metric in REGRESSION_METRICS):
        pd_dataset, pd_test_dataset = pandas_split_data(
            pd_dataset,
            train_perc=self._config['common']['test_frame_ratio'])

    df = DFMetada().getDataFrameMetadata(pd_dataset, 'pandas')
    self._ec.set_id_analysis(self._ec.get_id_user() + '_' + id_datapath +
                             '_' + str(time()))
    adviser = self.adviser.AdviserAStar(e_c=self._ec, metric=metric,
                                        deep_impact=deep_impact,
                                        dataframe_name=id_datapath,
                                        hash_dataframe=hash_dataframe)

    adviser.analysis_specific(dataframe_metadata=df,
                              list_ar_metadata=list_ar_metadata)
    # Train every proposed model, re-rank, and feed the results back into
    # analysis_specific until the adviser stops proposing models.
    while adviser.next_analysis_list is not None:
        for each_model in adviser.next_analysis_list:
            fw = get_model_fw(each_model)
            self.init_handler(fw)
            # NOTE(review): the else branch passes no test_frame at all,
            # unlike exec_analysis which falls back to the training frame
            # — confirm this asymmetry is intentional.
            if pd_test_dataset is not None:
                _, analyzed_model = self.model_handler[fw][
                    'handler'].order_training(training_pframe=pd_dataset,
                                              base_ar=each_model,
                                              test_frame=pd_test_dataset,
                                              filtering='NONE')
            else:
                _, analyzed_model = self.model_handler[fw][
                    'handler'].order_training(training_pframe=pd_dataset,
                                              base_ar=each_model,
                                              filtering='NONE')
            if analyzed_model is not None:
                adviser.analysis_recommendation_order.append(
                    analyzed_model)
        adviser.next_analysis_list.clear()
        adviser.analysis_recommendation_order = adviser.priorize_models(
            model_list=adviser.analysis_recommendation_order)
        adviser.analysis_specific(
            dataframe_metadata=df,
            list_ar_metadata=adviser.analysis_recommendation_order)

    self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                           self._labels["ana_models"],
                           str(len(adviser.analyzed_models)))
    self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                           self._labels["exc_models"],
                           str(len(adviser.excluded_models)))
    self.log_model_list(adviser.analysis_recommendation_order, metric)
    self._logging.log_info(self._ec.get_id_analysis(), 'controller',
                           self._labels["end"])

    self.clean_handlers()
    adviser.analysis_recommendation_order = adviser.priorize_models(
        model_list=adviser.analysis_recommendation_order)
    return self._labels[
        'success_op'], adviser.analysis_recommendation_order