class AI():
    """AI managing agent for Penn AI.

    Responsible for:
    - checking for user requests for recommendations,
    - checking for new results from experiments,
    - calling the recommender system to generate experiment recommendations,
    - posting the recommendations to the API.
    - handling communication with the API.

    :param rec_class: ai.BaseRecommender - recommender class to use; when
        falsy, self.DEFAULT_REC_CLASS (RandomRecommender) is used instead
    :param api_path: string - path to the lab api server; when None it is
        built from the LAB_HOST and LAB_PORT environment variables
    :param extra_payload: dict - any additional payload that needs to be
        specified
    :param user: string - test user
    :param rec_score_file: file - pickled score file to keep persistent scores
        between sessions (save_state/load_state currently raise and are
        unsupported)
    :param verbose: Boolean - False: no printouts, True: printouts on updates
    :param warm_start: Boolean - if true, attempt to load the ai state from
        the file provided by rec_score_file (not yet supported; asserted off)
    :param n_recs: int - number of recommendations to make for each request;
        a value < 1 sets self.continuous and clamps self.n_recs to 1
    :param datasets: str or False - if not false, a comma separated list of
        datasets to turn the ai on for at startup (not yet supported;
        asserted off)
    :param use_knowledgebase: Boolean - if true, bootstrap the recommender
        from the pmlb knowledgebase at startup
    :param term_condition: string - default termination condition for AI
        requests: 'n_recs' (stop after self.n_recs), 'time' (stop after
        max_time), anything else means no termination value
    :param max_time: int - termination value used when term_condition=='time'
    """

    def __init__(self,
                 rec_class=None,
                 api_path=None,
                 extra_payload=dict(),  # NOTE(review): mutable default arg; safe only if never mutated — confirm
                 user='******',
                 rec_score_file='rec_state.obj',
                 verbose=True,
                 warm_start=False,
                 n_recs=1,
                 datasets=False,
                 use_knowledgebase=False,
                 term_condition='n_recs',
                 max_time=5):
        """Initializes AI managing agent."""
        # default supervised learning recommender settings
        self.DEFAULT_REC_CLASS = RandomRecommender
        self.DEFAULT_REC_ARGS = {'metric': 'accuracy'}
        # recommendation engines for different problem types
        # will be expanded as more types of problems are supported
        # (classification, regression, unsupervised, etc.)
        self.rec_engines = {"classification": None}

        # Request manager settings
        self.n_recs = n_recs if n_recs > 0 else 1
        self.continuous = n_recs < 1

        # api parameters, will be removed from self once the recommenders no longer
        # call the api directly.
        # See #98 <https://github.com/EpistasisLab/pennai/issues/98>
        # NOTE(review): `== None` — PEP 8 prefers `is None`
        if api_path == None:
            api_path = ('http://' + os.environ['LAB_HOST'] + ':' +
                        os.environ['LAB_PORT'])
        self.user = user
        self.api_path = api_path
        self.api_key = os.environ['APIKEY']
        self.verbose = verbose  #False: no printouts, True: printouts on updates

        # file name of stored scores for the recommender
        self.rec_score_file = rec_score_file
        # timestamp of the last time new experiments were processed
        self.last_update = 0

        # api
        self.labApi = api_utils.LabApi(api_path=self.api_path,
                                       user=self.user,
                                       api_key=self.api_key,
                                       extra_payload=extra_payload,
                                       verbose=self.verbose)

        self.load_options()  #loads algorithm parameters to self.ui_options
        self.initilize_recommenders(rec_class)  # set self.rec_engines

        # build dictionary of ml ids to names conversion
        self.ml_id_to_name = self.labApi.get_ml_id_dict()
        # print('ml_id_to_name:',self.ml_id_to_name)

        # dictionary of dataset threads, initialized and used by q_utils.
        # Keys are datasetIds, values are q_utils.DatasetThread instances.
        #WGL: this should get moved to the request manager
        self.dataset_threads = {}

        # local dataframe of datasets and their metafeatures
        self.dataset_mf_cache = pd.DataFrame()
        # store dataset_id to hash dictionary
        self.dataset_mf_cache_id_hash_lookup = {}

        if use_knowledgebase:
            self.load_knowledgebase()

        # set termination condition
        self.term_condition = term_condition
        if self.term_condition == 'n_recs':
            self.term_value = self.n_recs
        elif self.term_condition == 'time':
            self.term_value = max_time
        else:
            self.term_value = None

        # start the request manager
        self.requestManager = RequestManager(
            ai=self,
            defaultTermConditionStr=self.term_condition,
            defaultTermParam=self.term_value)

        # if there is a pickle file, load it as the recommender scores
        assert not (warm_start), "The `warm_start` option is not yet supported"

        # for comma-separated list of datasets in datasets, turn AI request on
        assert not (
            datasets
        ), "The `datasets` option is not yet supported: " + str(datasets)

    ##-----------------
    ## Init methods
    ##-----------------
    def initilize_recommenders(self, rec_class):
        """Initialize the classification recommender engine.

        Instantiates rec_class (or self.DEFAULT_REC_CLASS when rec_class is
        falsy) with self.DEFAULT_REC_ARGS, stores it under
        self.rec_engines["classification"], and attaches the registered ml
        parameter options fetched from the lab api.

        :param rec_class: recommender class to instantiate, or None/falsy
        """
        # Create supervised learning recommenders
        if (rec_class):
            self.rec_engines["classification"] = rec_class(
                **self.DEFAULT_REC_ARGS)
        else:
            self.rec_engines["classification"] = self.DEFAULT_REC_CLASS(
                **self.DEFAULT_REC_ARGS)

        # set the registered ml parameters in the recommenders
        ml_p = self.labApi.get_all_ml_p()
        assert ml_p is not None
        assert len(ml_p) > 0
        self.rec_engines["classification"].ml_p = ml_p
        # if hasattr(self.rec_engines["classification"],'mlp_combos'):
        #     self.rec_engines["classification"].mlp_combos = self.rec_engines["classification"].ml_p['algorithm']+'|'+self.rec_engines["classification"].ml_p['parameters']
        # ml_p.to_csv('ml_p_options.csv')
        logger.debug("recomendation engines initilized: ")
        for prob_type, rec in self.rec_engines.items():
            logger.debug(f'\tproblemType: {prob_type} - {rec}')
            logger.debug('\trec.ml_p:\n' + str(rec.ml_p.head()))

    def load_knowledgebase(self):
        """Bootstrap the recommenders with the knowledgebase.

        Loads the default knowledgebases, maps algorithm names to their api
        ids, seeds self.dataset_mf_cache with the metafeatures of datasets
        that have results, and feeds the results to the classification
        recommender.
        """
        logger.info('loading pmlb knowledgebase')
        kb = knowledgebase_loader.load_default_knowledgebases()

        # replace algorithm names with their ids
        self.ml_name_to_id = {v: k for k, v in self.ml_id_to_name.items()}
        kb['resultsData']['algorithm'] = kb['resultsData']['algorithm'].apply(
            lambda x: self.ml_name_to_id[x])

        all_df_mf = kb['metafeaturesData'].set_index('_id')
        # all_df_mf = pd.DataFrame.from_records(metafeatures).transpose()
        # use _id to index the metafeatures, and
        # keep only metafeatures with results
        self.dataset_mf_cache = all_df_mf.loc[kb['resultsData']
                                              ['_id'].unique()]
        # self.update_dataset_mf(kb['resultsData'])
        self.rec_engines["classification"].update(kb['resultsData'],
                                                  self.dataset_mf_cache,
                                                  source='knowledgebase')
        logger.info('pmlb knowledgebase loaded')

    def load_options(self):
        """Loads algorithm UI parameters and sets them to self.ui_options.

        If the api returns no projects, self.ui_options is left unset and a
        warning is logged.
        """
        logger.info(
            time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
            ': loading options...')
        responses = self.labApi.get_projects()
        if len(responses) > 0:
            self.ui_options = responses
        else:
            logger.warning("no algorithms found by load_options()")

    ##-----------------
    ## Utility methods
    ##-----------------
    def get_results_metafeatures(self, results_data):
        """Return a pandas dataframe of metafeatures associated with the
        datasets in results_data.

        Retrieves metafeatures from self.dataset_mf_cache if they exist,
        otherwise queries the api and updates the cache.

        :param results_data: experiment results with associated datasets;
            must contain a 'dataset_id' column
        :returns: dataframe of metafeatures indexed by dataset id, one row
            per entry in results_data['dataset_id'].unique()
        """
        logger.debug('results_data:' + str(results_data.columns))
        logger.debug('results_data:' + str(results_data.head()))
        dataset_metafeatures = []
        dataset_indicies = results_data['dataset_id'].unique()
        # add dataset metafeatures to the cache
        for d in dataset_indicies:
            if len(
                    self.dataset_mf_cache
            ) == 0 or d not in self.dataset_mf_cache_id_hash_lookup.keys():
                df = self.labApi.get_metafeatures(d)
                df['dataset'] = d
                dataset_metafeatures.append(df)
                # NOTE(review): this stores df['_id'] (a pandas Series, not a
                # scalar) as the lookup value; update_recommender reads it
                # back via apply — confirm downstream expects a Series
                self.dataset_mf_cache_id_hash_lookup.update({d: df['_id']})
        if dataset_metafeatures:
            df_mf = pd.concat(dataset_metafeatures).set_index('dataset')
            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # pd.concat([self.dataset_mf_cache, df_mf]) is the replacement
            self.dataset_mf_cache = self.dataset_mf_cache.append(df_mf)

        logger.info(f'mf count:\n {len(self.dataset_mf_cache.index.values)}')
        #logger.info(f'mf:\n {list(self.dataset_mf_cache.index.values)}')
        logger.info(f'indicies: \n\n {dataset_indicies}')

        new_mf = self.dataset_mf_cache.loc[dataset_indicies, :]
        assert len(new_mf) == len(dataset_indicies)
        logger.info(f"new_mf: {new_mf}")
        return new_mf

    ##-----------------
    ## Loop methods
    ##-----------------
    def check_results(self):
        """Checks to see if new experiment results have been posted since the
        previous time step. If so, set them to self.new_data and return True.

        :returns: Boolean - True if new results were found
        """
        logger.info(
            time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
            ': checking results...')
        newResults = self.labApi.get_new_experiments_as_dataframe(
            last_update=self.last_update)
        if len(newResults) > 0:
            logger.info(
                time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) + ': ' +
                str(len(newResults)) + ' new results!')
            self.last_update = int(time.time()) * 1000  # update timestamp
            self.new_data = newResults
            return True
        return False

    def update_recommender(self):
        """Update recommender models based on new experiment results in
        self.new_data, and then clear self.new_data.

        No-op when self.new_data is unset or empty.
        """
        if (hasattr(self, 'new_data') and len(self.new_data) >= 1):
            new_mf = self.get_results_metafeatures(self.new_data)
            # map each result's dataset_id to its cached metafeature _id
            self.new_data['_id'] = self.new_data['dataset_id'].apply(
                lambda x: self.dataset_mf_cache_id_hash_lookup[x])
            self.rec_engines["classification"].update(self.new_data, new_mf)
            logger.info(
                time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
                ': recommender updated')
            # reset new data
            self.new_data = pd.DataFrame()

    def check_requests(self):
        """Check to see if any new AI requests have been submitted. If so, add
        them to self.request_queue.

        :returns: Boolean - True (always; new on-requests are forwarded to the
            request manager and off-requests are terminated as a side effect)
        """
        logger.info(
            time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
            ': checking requests...')

        # get all datasets that have an ai 'requested' status
        # and initialize a new request
        dsFilter = {'ai': [AI_STATUS.REQUESTED.value, 'dummy']}
        aiOnRequests = self.labApi.get_filtered_datasets(dsFilter)
        if len(aiOnRequests) > 0:
            logger.info(
                time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
                ': new ai request for:' +
                ';'.join([r['name'] for r in aiOnRequests]))
            # set AI flag to 'on' to acknowledge requests received
            for r in aiOnRequests:
                self.labApi.set_ai_status(datasetId=r['_id'], aiStatus='on')
                self.requestManager.add_request(datasetId=r['_id'],
                                                datasetName=r['name'])
            time.sleep(.1)

        # get all datasets that have a manual 'off' status
        # and terminate their ai requests
        dsFilter = {'ai': [AI_STATUS.OFF.value, 'dummy']}
        aiOffRequests = self.labApi.get_filtered_datasets(dsFilter)
        if len(aiOffRequests) > 0:
            logger.info(
                time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
                ': ai termination request for:' +
                ';'.join([r['name'] for r in aiOffRequests]))
            for r in aiOffRequests:
                self.requestManager.terminate_request(datasetId=r['_id'])

        return True

    def process_rec(self):
        """Delegate pending AI request processing to the request manager."""
        self.requestManager.process_requests()

    ##-----------------
    ## Syncronous actions an AI request can take
    ##-----------------
    def generate_recommendations(self, datasetId, numOfRecs,
                                 predictionType="classification"):
        """Generate ml recommendation payloads for the given dataset.

        :param datasetId: dataset to generate recommendations for
        :param numOfRecs: number of recommendations to request from the
            recommender
        :param predictionType: key into self.rec_engines selecting the
            recommender engine (default "classification")

        :returns list of maps that represent request payload objects
        """
        logger.info("generate_recommendations({},{})".format(
            datasetId, numOfRecs))
        recommendations = []
        metafeatures = self.labApi.get_metafeatures(datasetId)
        ml, p, ai_scores = self.rec_engines[predictionType].recommend(
            dataset_id=metafeatures['_id'].values[0],
            n_recs=numOfRecs,
            dataset_mf=metafeatures)
        for alg, params, score in zip(ml, p, ai_scores):
            # TODO: just return dictionaries of parameters from rec
            # modified_params = eval(params)  # turn params into a dictionary
            recommendations.append({
                'dataset_id': datasetId,
                'algorithm_id': alg,
                'username': self.user,
                'parameters': params,
                'ai_score': score,
            })
        return recommendations

    def transfer_rec(self, rec_payload):
        """Attempt to send a recommendation to the lab server. If any error
        other then a no capacity error occurs, throw an exception.

        :param rec_payload: dictionary - the payload describing the experiment
        :return bool - true if successfully sent, false if no machine capacity
            available or the dataset's AI status is no longer 'on'
        :raises RuntimeError: on any submission error other than
            'No machine capacity available'
        """
        logger.info(f"transfer_rec({rec_payload})")

        # re-check the AI status: the user may have switched it off since the
        # recommendation was generated
        aiStatus = self.labApi.get_dataset_ai_status(rec_payload['dataset_id'])
        if not (aiStatus == AI_STATUS.ON.value):
            logger.debug("AI status is not on; not submitting experiment")
            return False

        submitstatus = self.labApi.launch_experiment(
            algorithmId=rec_payload['algorithm_id'], payload=rec_payload)
        logger.debug(f"transfer_rec() submitstatus: {submitstatus}")

        if 'error' in submitstatus:
            if ('No machine capacity available' in submitstatus['error']):
                # recoverable: caller may retry later
                logger.debug(f"Waiting for capacity: {submitstatus['error']}")
                return False
            else:
                msg = 'Unrecoverable error during transfer_rec : ' + str(
                    submitstatus)
                logger.error(msg)
                raise RuntimeError(msg)
        return True

    ##-----------------
    ## Save/load ai state
    ##-----------------
    def save_state(self):
        """Save ML+P scores in pickle or to DB

        TODO: test that this still works
        """
        raise RuntimeError("save_state is not currently supported")
        # everything below is unreachable legacy code kept for reference
        out = open(self.rec_score_file, 'wb')
        state = {}
        if (hasattr(self.rec_engines["classification"], 'scores')):
            #TODO: make this a more generic. Maybe just save the
            # AI or rec object itself.
            # state['trained_dataset_models'] = self.rec_engines["classification"].trained_dataset_models
            state['scores'] = self.rec_engines["classification"].scores
        state['last_update'] = self.last_update
        pickle.dump(state, out)

    def load_state(self):
        """Loads pickled score file and recommender model.

        TODO: test that this still works
        """
        raise RuntimeError("load_state is not currently supported")
        # everything below is unreachable legacy code kept for reference
        if os.stat(self.rec_score_file).st_size != 0:
            filehandler = open(self.rec_score_file, 'rb')
            state = pickle.load(filehandler)
            if (hasattr(self.rec_engines["classification"], 'scores')):
                self.rec_engines["classification"].scores = state['scores']
                # self.rec_engines["classification"].trained_dataset_models = state['trained_dataset_models']
                self.last_update = state['last_update']
                logger.info('loaded previous state from ' + self.last_update)
# NOTE(review): this is a second, OLDER definition of `AI` (single `self.rec`
# recommender, string status literals instead of AI_STATUS). Because it appears
# later in the module it shadows the class defined above — this looks like a
# merge/concatenation artifact and should probably be removed; confirm with
# the team before deleting.
class AI():
    """AI managing agent for Penn AI.

    Responsible for:
    - checking for user requests for recommendations,
    - checking for new results from experiments,
    - calling the recommender system to generate experiment recommendations,
    - posting the recommendations to the API.
    - handling communication with the API.

    :param rec: ai.BaseRecommender - recommender instance to use; when None a
        RandomRecommender is created
    :param api_path: string - path to the lab api server; when None it is
        built from the LAB_HOST and LAB_PORT environment variables
    :param extra_payload: dict - any additional payload that needs to be
        specified
    :param user: string - test user
    :param rec_score_file: file - pickled score file to keep persistent scores
        between sessions (save_state/load_state currently raise and are
        unsupported)
    :param verbose: Boolean - False: no printouts, True: printouts on updates
    :param warm_start: Boolean - if true, attempt to load the ai state from
        the file provided by rec_score_file (not yet supported; asserted off)
    :param n_recs: int - number of recommendations to make for each request;
        a value < 1 sets self.continous and clamps self.n_recs to 1
    :param datasets: str or False - if not false, a comma separated list of
        datasets to turn the ai on for at startup (not yet supported;
        asserted off)
    :param use_knowledgebase: Boolean - if true, bootstrap the recommender
        from the pmlb knowledgebase at startup
    :param term_condition: string - default termination condition for AI
        requests: 'n_recs', 'time', or anything else for no termination value
    :param max_time: int - termination value used when term_condition=='time'
    """

    def __init__(self,
                 rec=None,
                 api_path=None,
                 extra_payload=dict(),  # NOTE(review): mutable default arg; safe only if never mutated — confirm
                 user='******',
                 rec_score_file='rec_state.obj',
                 verbose=True,
                 warm_start=False,
                 n_recs=1,
                 datasets=False,
                 use_knowledgebase=False,
                 term_condition='n_recs',
                 max_time=5):
        """initializes AI managing agent."""
        # recommender settings
        # NOTE(review): `== None` — PEP 8 prefers `is None`
        if api_path == None:
            api_path = ('http://' + os.environ['LAB_HOST'] + ':' +
                        os.environ['LAB_PORT'])
        self.rec = rec
        self.n_recs = n_recs if n_recs > 0 else 1
        # NOTE(review): attribute name is misspelled ('continous'); renaming
        # would break any external readers, so it is left as-is
        self.continous = n_recs < 1

        # api parameters, will be removed from self once the recommenders no longer
        # call the api directly.
        # See #98 <https://github.com/EpistasisLab/pennai/issues/98>
        self.user = user
        self.api_path = api_path
        self.api_key = os.environ['APIKEY']
        self.verbose = verbose  #False: no printouts, True: printouts on updates

        # file name of stored scores for the recommender
        self.rec_score_file = rec_score_file
        # timestamp of the last time new experiments were processed
        self.last_update = 0

        # api
        self.labApi = api_utils.LabApi(api_path=self.api_path,
                                       user=self.user,
                                       api_key=self.api_key,
                                       extra_payload=extra_payload,
                                       verbose=self.verbose)

        self.load_options()  #loads algorithm parameters to self.ui_options

        # create a default recommender if not set
        if (rec):
            self.rec = rec
        else:
            self.rec = RandomRecommender(ml_p=self.labApi.get_all_ml_p())

        # set the registered ml parameters in the recommender
        ml_p = self.labApi.get_all_ml_p()
        assert ml_p is not None
        assert len(ml_p) > 0
        self.rec.ml_p = ml_p
        # if hasattr(self.rec,'mlp_combos'):
        #     self.rec.mlp_combos = self.rec.ml_p['algorithm']+'|'+self.rec.ml_p['parameters']
        logger.debug('self.rec.ml_p:\n' + str(self.rec.ml_p.head()))
        # tmp = self.labApi.get_all_ml_p()
        # tmp.to_csv('ml_p_options.csv')

        # build dictionary of ml ids to names conversion
        self.ml_id_to_name = self.labApi.get_ml_id_dict()
        # print('ml_id_to_name:',self.ml_id_to_name)

        # dictionary of dataset threads, initialized and used by q_utils.
        # Keys are datasetIds, values are q_utils.DatasetThread instances.
        #WGL: this should get moved to the request manager
        self.dataset_threads = {}

        # local dataframe of datasets and their metafeatures
        self.dataset_mf = pd.DataFrame()

        if use_knowledgebase:
            self.load_knowledgebase()

        # set termination condition
        self.term_condition = term_condition
        if self.term_condition == 'n_recs':
            self.term_value = self.n_recs
        elif self.term_condition == 'time':
            self.term_value = max_time
        else:
            self.term_value = None

        # start the request manager
        self.requestManager = RequestManager(
            ai=self,
            defaultTermConditionStr=self.term_condition,
            defaultTermParam=self.term_value)

        # if there is a pickle file, load it as the recommender scores
        assert not (warm_start), "The `warm_start` option is not yet supported"

        # for comma-separated list of datasets in datasets, turn AI request on
        assert not (
            datasets
        ), "The `datasets` option is not yet supported: " + str(datasets)

    ##-----------------
    ## Init methods
    ##-----------------
    def load_knowledgebase(self):
        """ Bootstrap the recommenders with the knowledgebase

        Loads the pmlb knowledgebase, maps algorithm and dataset names to
        their api ids where the user owns a matching dataset, seeds
        self.dataset_mf with the metafeatures of datasets that have results,
        and feeds the results to self.rec.
        """
        logger.info('loading pmlb knowledgebase')
        kb = knowledgebase_loader.load_pmlb_knowledgebase()

        # replace algorithm names with their ids
        self.ml_name_to_id = {v: k for k, v in self.ml_id_to_name.items()}
        kb['resultsData']['algorithm'] = kb['resultsData']['algorithm'].apply(
            lambda x: self.ml_name_to_id[x])

        #TODO: Verify that conversion from name to id is needed....
        # WGL: yes at the moment we need this until hash is implemented.
        # we can add a check at dataset upload to prevent repeat dataset names in
        # the mean time.
        self.user_datasets = self.labApi.get_user_datasets(self.user)
        self.dataset_name_to_id = {v: k for k, v in self.user_datasets.items()}
        # names without a matching user dataset are passed through unchanged
        kb['resultsData']['dataset'] = kb['resultsData']['dataset'].apply(
            lambda x: self.dataset_name_to_id[x]
            if x in self.dataset_name_to_id.keys() else x)

        # re-key the metafeatures by dataset id where a mapping exists
        metafeatures = {}
        for k, v in kb['metafeaturesData'].items():
            if k in self.dataset_name_to_id.keys():
                metafeatures[self.dataset_name_to_id[k]] = v
            else:
                metafeatures[k] = v

        # all_df_mf = pd.DataFrame.from_records(kb['metafeaturesData']).transpose()
        all_df_mf = pd.DataFrame.from_records(metafeatures).transpose()
        # keep only metafeatures with results
        self.dataset_mf = all_df_mf.reindex(kb['resultsData'].dataset.unique())
        # self.update_dataset_mf(kb['resultsData'])
        self.rec.update(kb['resultsData'], self.dataset_mf,
                        source='knowledgebase')
        logger.info('pmlb knowledgebase loaded')

    def load_options(self):
        """Loads algorithm UI parameters and sets them to self.ui_options.

        If the api returns no projects, self.ui_options is left unset and a
        warning is logged.
        """
        logger.info(
            time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
            ': loading options...')
        responses = self.labApi.get_projects()
        if len(responses) > 0:
            self.ui_options = responses
        else:
            logger.warning("no algorithms found by load_options()")

    ##-----------------
    ## Utility methods
    ##-----------------
    def update_dataset_mf(self, results_data):
        """Grabs metafeatures of datasets in results_data and concatenates
        them to self.dataset_mf (caching; datasets already present are not
        re-fetched).

        :param results_data: experiment results with associated datasets;
            must contain a 'dataset' column
        """
        logger.debug('results_data:' + str(results_data.columns))
        logger.debug('results_data:' + str(results_data.head()))
        dataset_metafeatures = []
        for d in results_data['dataset'].unique():
            if len(self.dataset_mf) == 0 or d not in self.dataset_mf.index:
                # fetch metafeatures from server for dataset and append
                df = self.labApi.get_metafeatures(d)
                # df['dataset'] = d
                # print('metafeatures:',df)
                dataset_metafeatures.append(df)
        if dataset_metafeatures:
            df_mf = pd.concat(dataset_metafeatures).set_index('dataset')
            # print('df_mf:',df_mf['dataset'], df_mf)
            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # pd.concat([self.dataset_mf, df_mf]) is the replacement
            self.dataset_mf = self.dataset_mf.append(df_mf)
            # print('self.dataset_mf:\n',self.dataset_mf)

    ##-----------------
    ## Loop methods
    ##-----------------
    def check_results(self):
        """Checks to see if new experiment results have been posted since the
        previous time step. If so, set them to self.new_data and return True.

        :returns: Boolean - True if new results were found
        """
        logger.info(
            time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
            ': checking results...')
        newResults = self.labApi.get_new_experiments_as_dataframe(
            last_update=self.last_update)
        if len(newResults) > 0:
            logger.info(
                time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) + ': ' +
                str(len(newResults)) + ' new results!')
            self.last_update = int(time.time()) * 1000  # update timestamp
            self.new_data = newResults
            return True
        return False

    def update_recommender(self):
        """Update recommender models based on new experiment results in
        self.new_data, and then clear self.new_data.

        No-op when self.new_data is unset or empty.
        """
        if (hasattr(self, 'new_data') and len(self.new_data) >= 1):
            self.update_dataset_mf(self.new_data)
            self.rec.update(self.new_data, self.dataset_mf)
            logger.info(
                time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
                ': recommender updated')
            # reset new data
            self.new_data = pd.DataFrame()

    def check_requests(self):
        """Check to see if any new AI requests have been submitted. If so, add
        them to self.request_queue.

        :returns: Boolean - True (always; new on-requests are forwarded to the
            request manager and off-requests are terminated as a side effect)
        """
        logger.info(
            time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
            ': checking requests...')

        # get all datasets that have an ai 'requested' status
        # and initialize a new request
        dsFilter = {'ai': ['requested', 'dummy']}
        aiOnRequests = self.labApi.get_filtered_datasets(dsFilter)
        if len(aiOnRequests) > 0:
            logger.info(
                time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
                ': new ai request for:' +
                ';'.join([r['name'] for r in aiOnRequests]))
            # set AI flag to 'on' to acknowledge requests received
            for r in aiOnRequests:
                self.labApi.set_ai_status(datasetId=r['_id'], aiStatus='on')
                self.requestManager.add_request(datasetId=r['_id'],
                                                datasetName=r['name'])
            time.sleep(.1)

        # get all datasets that have a manual 'off' status
        # and terminate their ai requests
        dsFilter = {'ai': ['off', 'dummy']}
        aiOffRequests = self.labApi.get_filtered_datasets(dsFilter)
        if len(aiOffRequests) > 0:
            logger.info(
                time.strftime("%Y %I:%M:%S %p %Z", time.localtime()) +
                ': ai termination request for:' +
                ';'.join([r['name'] for r in aiOffRequests]))
            for r in aiOffRequests:
                self.requestManager.terminate_request(datasetId=r['_id'])

        return True

    def process_rec(self):
        """Delegate pending AI request processing to the request manager."""
        self.requestManager.process_requests()

    ##-----------------
    ## Syncronous actions an AI request can take
    ##-----------------
    def generate_recommendations(self, datasetId, numOfRecs):
        """Generate ml recommendation payloads for the given dataset.

        :param datasetId: dataset to generate recommendations for
        :param numOfRecs: number of recommendations to request from the
            recommender

        :returns list of maps that represent request payload objects
        """
        logger.info("generate_recommendations({},{})".format(
            datasetId, numOfRecs))
        recommendations = []
        metafeatures = self.labApi.get_metafeatures(datasetId)
        ml, p, ai_scores = self.rec.recommend(dataset_id=datasetId,
                                              n_recs=numOfRecs,
                                              dataset_mf=metafeatures)
        for alg, params, score in zip(ml, p, ai_scores):
            # TODO: just return dictionaries of parameters from rec
            # modified_params = eval(params)  # turn params into a dictionary
            recommendations.append({
                'dataset_id': datasetId,
                'algorithm_id': alg,
                'username': self.user,
                'parameters': params,
                'ai_score': score,
            })
        return recommendations

    def transfer_rec(self, rec_payload):
        """Attempt to send a recommendation to the lab server. Continues until
        recommendation is successfully submitted or an unexpected error
        occurs.

        Blocks, retrying every 3 seconds, while the server reports
        'No machine capacity available'.

        :param rec_payload: dictionary - the payload describing the experiment
        :raises RuntimeError: on any submission error other than
            'No machine capacity available'
        """
        logger.info("transfer_rec(" + str(rec_payload) + ")")
        submitstatus = self.labApi.launch_experiment(
            algorithmId=rec_payload['algorithm_id'], payload=rec_payload)

        logger.debug("transfer_rec() starting loop, submitstatus: " +
                     str(submitstatus))
        while ('error' in submitstatus and
               submitstatus['error'] == 'No machine capacity available'):
            logger.debug("Waiting for server capacity: {}".format(
                submitstatus['error']))
            # NOTE(review): bare `sleep` — presumably `from time import sleep`
            # at module level (not visible in this chunk); confirm
            sleep(3)
            submitstatus = self.labApi.launch_experiment(
                rec_payload['algorithm_id'], rec_payload)
        logger.debug("transfer_rec() exiting loop, submitstatus: " +
                     str(submitstatus))

        if 'error' in submitstatus:
            msg = 'Unrecoverable error during transfer_rec : ' + str(
                submitstatus)
            logger.error(msg)
            raise RuntimeError(msg)
        #pdb.set_trace()

    ##-----------------
    ## Save/load ai state
    ##-----------------
    def save_state(self):
        """Save ML+P scores in pickle or to DB

        TODO: test that this still works
        """
        raise RuntimeError("save_state is not currently supported")
        # everything below is unreachable legacy code kept for reference
        out = open(self.rec_score_file, 'wb')
        state = {}
        if (hasattr(self.rec, 'scores')):
            #TODO: make this a more generic. Maybe just save the
            # AI or rec object itself.
            # state['trained_dataset_models'] = self.rec.trained_dataset_models
            state['scores'] = self.rec.scores
        state['last_update'] = self.last_update
        pickle.dump(state, out)

    def load_state(self):
        """Loads pickled score file and recommender model.

        TODO: test that this still works
        """
        raise RuntimeError("load_state is not currently supported")
        # everything below is unreachable legacy code kept for reference
        if os.stat(self.rec_score_file).st_size != 0:
            filehandler = open(self.rec_score_file, 'rb')
            state = pickle.load(filehandler)
            if (hasattr(self.rec, 'scores')):
                self.rec.scores = state['scores']
                # self.rec.trained_dataset_models = state['trained_dataset_models']
                self.last_update = state['last_update']
                # NOTE(review): concatenates a str with self.last_update (an
                # int) — would raise TypeError if this dead code were revived
                logger.info('loaded previous state from ' + self.last_update)