def load_logs(self, list_of_contents, list_of_names):
    # global aggregate_dataframe
    if list_of_contents is not None:
        for content, filename in zip(list_of_contents, list_of_names):
            content_type, content_string = content.split(",")
            decoded = base64.b64decode(content_string).decode("utf-8")
            log_lines = decoded.splitlines()
            report = GamaReport(log_lines=log_lines, name=filename)
            self.reports[filename] = report
            eval_copy = report.evaluations.copy()
            eval_copy["search_method"] = report.search_method
            # if aggregate_dataframe is None:
            #     eval_copy["log_no"] = 0
            #     eval_copy["filename"] = filename
            #     aggregate_dataframe = eval_copy
            # else:
            #     eval_copy["log_no"] = len(aggregate_dataframe["log_no"].unique())
            #     eval_copy["filename"] = filename
            #     aggregate_dataframe = pd.concat([aggregate_dataframe, eval_copy])
        return [{"label": logname, "value": logname} for logname in self.reports]
    return []

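# A minimal, runnable sketch of the upload round-trip performed above, assuming
# the "data:<mime>;base64,<payload>" content format that Dash's dcc.Upload
# produces (the log text here is a placeholder, not a real GAMA log):
import base64

log_text = "example log line 1\nexample log line 2\n"
content = "data:text/plain;base64," + base64.b64encode(log_text.encode("utf-8")).decode("utf-8")
content_type, content_string = content.split(",")
assert base64.b64decode(content_string).decode("utf-8").splitlines() == log_text.splitlines()
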
def test_gamareport_asyncEA_from_log():
    """ GamaReport can be constructed from a log that recorded AsyncEA. """
    log_dir = "tests/data/AsyncEA"
    report = GamaReport(log_dir)
    assert report.name == "AsyncEA"
    assert "AsyncEA" == report.search_method
    assert 3 == len(report.phases)
    assert ["preprocessing", "search", "postprocess"] == list(map(lambda t: t[0], report.phases))
    assert ["default", "AsyncEA", "NoPostProcessing"] == list(map(lambda t: t[1], report.phases))

def test_gamareport_asyncEA_from_log():
    """ Test that a GamaReport can be constructed from a log. """
    log_file = 'tests/data/amazon_asyncEA.log'
    report = GamaReport(logfile=log_file, name=None)
    assert report.name == log_file
    assert 'AsyncEA()' == report.search_method
    assert 3 == len(report.phases)
    assert (['preprocessing', 'search', 'postprocess']
            == list(map(lambda t: t[0], report.phases)))
    assert (['default', 'AsyncEA', 'NoPostProcessing']
            == list(map(lambda t: t[1], report.phases)))
    assert report.method_data is None, "AsyncEA has no method data associated."

def test_gamareport_asha_from_log():
    """ Test that a GamaReport can be constructed from a log
    and retrieve ASHA-specific information. """
    log_file = 'tests/data/asha.log'
    report = GamaReport(logfile=log_file, name=None)
    assert report.name == log_file
    assert 'AsynchronousSuccessiveHalving()' == report.search_method
    assert 3 == len(report.phases)
    assert (['preprocessing', 'search', 'postprocess']
            == list(map(lambda t: t[0], report.phases)))
    assert (['default', 'AsynchronousSuccessiveHalving', 'NoPostProcessing']
            == list(map(lambda t: t[1], report.phases)))
    assert report.method_data is not None, "ASHA has method data associated."

def log_to_df(path, classification, regression):
    '''
    Converts multiple log files to dataframes depending on their learning task.

    Parameters:
    -----------
    path: string
        Path (glob pattern) to where the log files are stored.
    classification: list
        Dataset ids that belong to classification learning tasks.
    regression: list
        Dataset ids that belong to regression learning tasks.

    Returns:
    --------
    pd.DataFrame, pd.DataFrame
        A pd.DataFrame combining all classification datasets,
        and a pd.DataFrame combining all regression datasets.
    '''
    df_class = pd.DataFrame()
    df_regr = pd.DataFrame()
    for log_file in glob.glob(path):
        # File name of the current log file.
        file_name = os.path.basename(log_file)
        dataset_id = mixed_string_to_int(file_name)
        # Create a dataframe from the GAMA report.
        report = GamaReport(logfile=log_file)
        report_df = report.evaluations
        # Set k and the dataset id in the dataframe.
        report_df['k'] = return_k(file_name)
        report_df['id'] = dataset_id
        report_df = report_df.replace([np.inf, -np.inf], np.nan).dropna()
        if dataset_id in classification:
            drop_columns = ['start', 'accuracy_cummax', 'length_cummax', 'relative_end']
            report_df = report_df.drop(drop_columns, axis=1)
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
            df_class = pd.concat([df_class, report_df])
        else:
            drop_columns = ['start', 'r2_cummax', 'length_cummax', 'relative_end']
            report_df = report_df.drop(drop_columns, axis=1)
            df_regr = pd.concat([df_regr, report_df])
    return df_class, df_regr

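# mixed_string_to_int and return_k are helpers defined elsewhere in this
# codebase. A purely hypothetical sketch of what the filename-to-id extraction
# could look like (the filename pattern is an assumption, not the real helper):
import re

def mixed_string_to_int_sketch(file_name):
    # Pull the first run of digits out of a name such as "dataset_31.log".
    match = re.search(r"\d+", file_name)
    return int(match.group()) if match else -1

assert mixed_string_to_int_sketch("dataset_31.log") == 31
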
def test_gamareport_asyncEA_from_log():
    """ GamaReport can be constructed from a log that recorded AsyncEA. """
    log_file = "tests/data/amazon_asyncEA.log"
    report = GamaReport(logfile=log_file, name=None)
    assert report.name == log_file
    assert "AsyncEA()" == report.search_method
    assert 3 == len(report.phases)
    assert ["preprocessing", "search", "postprocess"] == list(
        map(lambda t: t[0], report.phases)
    )
    assert ["default", "AsyncEA", "NoPostProcessing"] == list(
        map(lambda t: t[1], report.phases)
    )
    assert report.method_data is None, "AsyncEA has no method data associated."

def log_to_df_file(log_file):
    '''
    Converts a GAMA log file to a pd.DataFrame.

    Parameters:
    -----------
    log_file: str
        The name of the file or the path to the file.

    Returns:
    --------
    pd.DataFrame
        The evaluations of that specific log as a pd.DataFrame.
    '''
    report = GamaReport(logfile=log_file)
    return report.evaluations

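# A hedged usage sketch: aggregate several logs with the helper above.
# The "logs/*.log" glob pattern is an assumption about where logs live.
import glob
import pandas as pd

frames = [log_to_df_file(f) for f in glob.glob("logs/*.log")]
all_evaluations = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
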
def test_gamareport_from_log():
    """ Test that a GamaReport can be constructed from a log. """
    # We refer to a static log. This makes the test independent of other unit
    # tests, but also independent of actual changes to gama logging. That is
    # bad in the sense that if logging changes and the static file is not
    # updated, we won't catch it. It is good in the sense that it achieves
    # unit test independence, and backwards-incompatible changes to GamaReport
    # are caught immediately when tested against the old log first.
    # Perhaps we can/should link to the log file used in the documentation.
    log_file = 'tests/data/random_search.log'
    report = GamaReport(logfile=log_file, name=None)
    assert report.name == log_file
    assert 'RandomSearch()' == report.search_method
    assert 3 == len(report.phases)
    assert (['preprocessing', 'search', 'postprocess']
            == list(map(lambda t: t[0], report.phases)))
    assert (['default', 'RandomSearch', 'NoPostProcessing']
            == list(map(lambda t: t[1], report.phases)))
    assert report.method_data is None, "Random Search has no method data associated."

def test_gamareport_from_log():
    """ GamaReport can be constructed from a log that recorded RandomSearch. """
    # We refer to a static log, which makes the test independent of other unit
    # tests, but also independent of the actual changes in gama logging.
    # Cons:
    #  - when logging changes and the static file is not updated, we won't catch it.
    # Pros:
    #  + unit test independence
    #  + backwards compatibility test for GamaReport
    # Perhaps we can/should link to the log file used in the documentation.
    log_dir = "tests/data/RandomSearch"
    report = GamaReport(log_dir)
    assert report.name == "RandomSearch"
    assert "RandomSearch" == report.search_method
    assert 3 == len(report.phases)
    assert ["preprocessing", "search", "postprocess"] == list(map(lambda t: t[0], report.phases))
    assert ["default", "RandomSearch", "NoPostProcessing"] == list(map(lambda t: t[1], report.phases))

def load_logs(self, list_of_contents, list_of_names):
    # global aggregate_dataframe
    if list_of_contents is not None:
        tmp_dir = f"tmp_{str(uuid.uuid4())}"
        os.makedirs(tmp_dir)
        for content, filename in zip(list_of_contents, list_of_names):
            content_type, content_string = content.split(",")
            decoded = base64.b64decode(content_string).decode("utf-8")
            with open(os.path.join(tmp_dir, filename), "w") as fh:
                fh.write(decoded)
        report = GamaReport(tmp_dir)
        report_name = report.search_method
        for i in itertools.count():
            if f"{report_name}_{i}" not in self.reports:
                break
        self.reports[f"{report_name}_{i}"] = report
        shutil.rmtree(tmp_dir)
        return [{"label": logname, "value": logname} for logname in self.reports]
    return []

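# The unique-naming loop above, isolated and runnable: find the first unused
# "<name>_<i>" key (the existing keys here are placeholders):
import itertools

existing = {"AsyncEA_0", "AsyncEA_1"}
for i in itertools.count():
    if f"AsyncEA_{i}" not in existing:
        break
assert f"AsyncEA_{i}" == "AsyncEA_2"
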
def load_logs(list_of_contents, list_of_names):
    global aggregate_dataframe
    if list_of_contents is not None:
        for content, filename in zip(list_of_contents, list_of_names):
            content_type, content_string = content.split(',')
            decoded = base64.b64decode(content_string).decode('utf-8')
            log_lines = decoded.splitlines()
            report = GamaReport(log_lines=log_lines, name=filename)
            reports[filename] = report
            eval_copy = report.evaluations.copy()
            eval_copy['search_method'] = report.search_method
            if aggregate_dataframe is None:
                eval_copy['log_no'] = 0
                eval_copy['filename'] = filename
                aggregate_dataframe = eval_copy
            else:
                eval_copy['log_no'] = len(aggregate_dataframe['log_no'].unique())
                eval_copy['filename'] = filename
                aggregate_dataframe = pd.concat([aggregate_dataframe, eval_copy])
        return [{'label': logname, 'value': logname} for logname in reports]
    return []

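# The aggregation above, isolated: tag each evaluations frame with a log
# number and filename, then concatenate. The frames here are placeholders;
# for sequential loads the unique-count bookkeeping reduces to a running index.
import pandas as pd

aggregate = None
for log_no, (name, frame) in enumerate([("a.log", pd.DataFrame({"score": [0.1]})),
                                        ("b.log", pd.DataFrame({"score": [0.2]}))]):
    frame = frame.copy()
    frame["log_no"] = log_no
    frame["filename"] = name
    aggregate = frame if aggregate is None else pd.concat([aggregate, frame])
assert list(aggregate["log_no"]) == [0, 1]
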
def update_page(self, _, page_store):
    if self.report is None:
        if self.log_file is None or not os.path.exists(self.log_file):
            raise PreventUpdate  # report does not exist
        else:
            self.report = GamaReport(self.log_file)
    elif not self.report.update() and not self.need_update:
        raise PreventUpdate  # report is not updated
    start_update = time.time()
    selected_pipeline = page_store.get("selected_pipeline", None)
    with pd.option_context("mode.use_inf_as_na", True):
        evaluations = self.report.evaluations.dropna()
    self.need_update = False
    scatters = self.scatter_plot(evaluations, self.report.metrics, selected_pipeline)
    metric_one, metric_two = self.report.metrics
    metric_one_text = metric_one.replace("_", " ")
    figure = {
        "data": scatters,
        "layout": dict(
            hovermode="closest",
            clickmode="event+select",
            title=f"{metric_one_text} vs {metric_two}",
            xaxis=dict(title=metric_one_text),
            # tickformat forces integer ticks for length
            yaxis=dict(title=metric_two, tickformat=",d"),
            uirevision="never_reset_zoom",
        ),
    }
    pl_table_data = [
        {
            "pl": self.report.individuals[id_].short_name(" > "),
            "id": id_,
            "score": score,
        }
        for id_, score in zip(evaluations.id, evaluations[metric_one])
    ]
    row_id = [i for i, id_ in enumerate(evaluations.id) if id_ == selected_pipeline]

    def format_pipeline(ind):
        pipeline_elements = []
        for primitive_node in reversed(ind.primitives):
            pipeline_elements.append(html.B(str(primitive_node._primitive)))
            pipeline_elements.append(html.Br())
            for terminal in primitive_node._terminals:
                pipeline_elements.append(f" {terminal}")
                pipeline_elements.append(html.Br())
        return pipeline_elements

    if selected_pipeline is None:
        pl_viz_data = None
    else:
        pl_viz_data = format_pipeline(self.report.individuals[selected_pipeline])
    print("Update complete in ", time.time() - start_update)
    return figure, pl_table_data, pl_viz_data, row_id, None, None

class RunningPage(BasePage):
    def __init__(self):
        super().__init__(name="Running", alignment=1, starts_hidden=True)
        self.cli = None
        self.id = "running-page"
        self.report = None
        self.log_file = None

    def build_page(self, app, controller):
        self.cli = CLIWindow("cli", app)
        plot_area = self.plot_area()
        pl_viz = self.pipeline_viz()
        pl_list = self.pipeline_list()
        ticker = dcc.Interval(id="ticker", interval=5000)
        self._content = html.Div(
            id=self.id,
            children=[
                dbc.Row([dbc.Col(plot_area, width=8), dbc.Col(self.cli.html)]),
                dbc.Row([dbc.Col(pl_viz, width=4), dbc.Col(pl_list)]),
                ticker,
            ],
        )

        app.callback(
            [
                Output("evaluation-graph", "figure"),
                Output("pipeline-table", "data"),
                Output("pl-viz", "children"),
                Output("pipeline-table", "selected_rows"),
                Output("pipeline-table", "selected_row_ids"),
                Output("evaluation-graph", "clickData"),
            ],
            [Input("ticker", "n_intervals"), Input("running-page-store", "data")],
        )(self.update_page)

        app.callback(
            [Output("running-page-store", "data")],
            [
                Input("evaluation-graph", "clickData"),
                Input("pipeline-table", "selected_row_ids"),
            ],
            [State("running-page-store", "data")],
        )(self.update_selection)

        return self._content

    def update_selection(self, click_data, selected_row_ids, page_store):
        cell_selected = None if selected_row_ids is None else selected_row_ids[0]
        if click_data is None:
            click_selected = None
        else:
            click_selected = click_data["points"][0]["customdata"]
        selected = click_selected if click_selected is not None else cell_selected
        # Selected row ids and click data are always set back to None.
        # The value that is not None is the new value.
        if selected is not None:
            self.need_update = True
            page_store["selected_pipeline"] = selected
            return [page_store]
        # First call or sync call.
        raise PreventUpdate

    def update_page(self, _, page_store):
        if self.report is None:
            if self.log_file is None or not os.path.exists(self.log_file):
                raise PreventUpdate  # report does not exist
            else:
                self.report = GamaReport(self.log_file)
        elif not self.report.update() and not self.need_update:
            raise PreventUpdate  # report is not updated
        start_update = time.time()
        selected_pipeline = page_store.get("selected_pipeline", None)
        with pd.option_context("mode.use_inf_as_na", True):
            evaluations = self.report.evaluations.dropna()
        self.need_update = False
        scatters = self.scatter_plot(evaluations, self.report.metrics, selected_pipeline)
        metric_one, metric_two = self.report.metrics
        metric_one_text = metric_one.replace("_", " ")
        figure = {
            "data": scatters,
            "layout": dict(
                hovermode="closest",
                clickmode="event+select",
                title=f"{metric_one_text} vs {metric_two}",
                xaxis=dict(title=metric_one_text),
                # tickformat forces integer ticks for length
                yaxis=dict(title=metric_two, tickformat=",d"),
                uirevision="never_reset_zoom",
            ),
        }
        pl_table_data = [
            {
                "pl": self.report.individuals[id_].short_name(" > "),
                "id": id_,
                "score": score,
            }
            for id_, score in zip(evaluations.id, evaluations[metric_one])
        ]
        row_id = [i for i, id_ in enumerate(evaluations.id) if id_ == selected_pipeline]

        def format_pipeline(ind):
            pipeline_elements = []
            for primitive_node in reversed(ind.primitives):
                pipeline_elements.append(html.B(str(primitive_node._primitive)))
                pipeline_elements.append(html.Br())
                for terminal in primitive_node._terminals:
                    pipeline_elements.append(f" {terminal}")
                    pipeline_elements.append(html.Br())
            return pipeline_elements

        if selected_pipeline is None:
            pl_viz_data = None
        else:
            pl_viz_data = format_pipeline(self.report.individuals[selected_pipeline])
        print("Update complete in ", time.time() - start_update)
        return figure, pl_table_data, pl_viz_data, row_id, None, None

    def scatter_plot(self, evaluations, metrics, selected_pipeline: str = None):
        metric_one, metric_two = metrics
        # Marker size indicates recency of the evaluations;
        # recent evaluations are bigger.
        biggest_size = 25
        smallest_size = 5
        selected_size = 30
        d_size_min_max = biggest_size - smallest_size
        sizes = list(range(smallest_size, biggest_size))[-len(evaluations):]
        if len(evaluations) > d_size_min_max:
            sizes = [smallest_size] * (len(evaluations) - d_size_min_max) + sizes
        if selected_pipeline is not None:
            sizes = [
                size if id_ != selected_pipeline else selected_size
                for size, id_ in zip(sizes, evaluations.id)
            ]
        default_color = "#301cc9"
        selected_color = "#c81818"
        colors = [
            default_color if id_ != selected_pipeline else selected_color
            for id_ in evaluations.id
        ]
        all_scatter = go.Scatter(
            x=evaluations[metric_one],
            y=-evaluations[metric_two],
            mode="markers",
            marker={"color": colors, "size": sizes},
            name="all evaluations",
            text=[self.report.individuals[id_].short_name() for id_ in evaluations.id],
            customdata=evaluations.id,
        )
        return [all_scatter]

    def gama_started(self, process, log_file):
        self.cli.monitor(process)
        self.log_file = log_file

    def plot_area(self):
        scatter = dcc.Graph(
            id="evaluation-graph",
            figure={
                "data": [],
                "layout": dict(
                    hovermode="closest",
                    transition={"duration": 500},
                ),
            },
        )
        return html.Div(
            scatter,
            style={
                "height": "100%",
                "box-shadow": "1px 1px 1px black",
                "padding": "2%",
            },
        )

    def pipeline_list(self):
        ta = dash_table.DataTable(
            id="pipeline-table",
            columns=[
                {"name": "Pipeline", "id": "pl"},
                {"name": "Score", "id": "score"},
            ],
            data=[],
            style_table={"maxHeight": "300px", "overflowY": "scroll"},
            filter_action="native",
            sort_action="native",
            row_selectable="single",
            persistence_type="session",
            persistence=True,
        )
        return html.Div(
            ta,
            style={
                "height": "100%",
                "box-shadow": "1px 1px 1px black",
                "padding": "2%",
            },
        )

    def pipeline_viz(self):
        return html.Div(
            id="pl-viz",
            style={
                "height": "100%",
                "box-shadow": "1px 1px 1px black",
                "padding": "2%",
                "whiteSpace": "pre-wrap",
            },
        )

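# The recency-based marker sizing used in scatter_plot, isolated and runnable
# (the evaluation count is a placeholder):
smallest_size, biggest_size = 5, 25
n_evaluations = 30
d_size_min_max = biggest_size - smallest_size  # 20 distinct growing sizes
sizes = list(range(smallest_size, biggest_size))[-n_evaluations:]
if n_evaluations > d_size_min_max:
    # Older evaluations beyond the ramp all get the smallest marker.
    sizes = [smallest_size] * (n_evaluations - d_size_min_max) + sizes
assert len(sizes) == n_evaluations and sizes[-1] == biggest_size - 1
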
def update_page(self, _, page_store):
    start_update = time.time()
    selected_pipeline = page_store.get('selected_pipeline', None)
    if ((self.report is None
         and (self.log_file is None or not os.path.exists(self.log_file)))
            or (self.report is not None
                and not self.report.update()
                and not self.need_update)):
        # The report does not exist, or exists but nothing is updated.
        raise PreventUpdate
    elif self.report is None:
        self.report = GamaReport(self.log_file)
    with pd.option_context('mode.use_inf_as_na', True):
        evaluations = self.report.evaluations.dropna()
    self.need_update = False
    scatters = self.scatter_plot(evaluations, self.report.metrics, selected_pipeline)
    metric_one, metric_two = self.report.metrics
    metric_one_text = metric_one.replace('_', ' ')
    figure = {
        'data': scatters,
        'layout': dict(
            hovermode='closest',
            clickmode='event+select',
            title=f'{metric_one_text} vs {metric_two}',
            xaxis=dict(title=metric_one_text),
            # tickformat forces integer ticks for length
            yaxis=dict(title=metric_two, tickformat=',d'),
            uirevision='never_reset_zoom',
        ),
    }
    pl_table_data = [
        {
            'pl': self.report.individuals[id_].short_name(' > '),
            'id': id_,
            'score': score,
        }
        for id_, score in zip(evaluations.id, evaluations[metric_one])
    ]
    row_id = [i for i, id_ in enumerate(evaluations.id) if id_ == selected_pipeline]

    def format_pipeline(ind):
        pipeline_elements = []
        for primitive_node in reversed(ind.primitives):
            pipeline_elements.append(html.B(str(primitive_node._primitive)))
            pipeline_elements.append(html.Br())
            for terminal in primitive_node._terminals:
                pipeline_elements.append(f' {terminal}')
                pipeline_elements.append(html.Br())
        return pipeline_elements

    pl_viz_data = (None if selected_pipeline is None
                   else format_pipeline(self.report.individuals[selected_pipeline]))
    print('Update complete in ', time.time() - start_update)
    return figure, pl_table_data, pl_viz_data, row_id, None, None

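# What the option_context + dropna above accomplishes: +/-inf values are
# treated as missing and their rows removed. An equivalent, explicit form,
# also used by log_to_df above (mode.use_inf_as_na is deprecated in recent pandas):
import numpy as np
import pandas as pd

evaluations = pd.DataFrame({"score": [0.9, np.inf, 0.7], "length": [3, 2, np.nan]})
cleaned = evaluations.replace([np.inf, -np.inf], np.nan).dropna()
assert list(cleaned["score"]) == [0.9]
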