Beispiel #1
0
    def load_logs(self, list_of_contents, list_of_names):
        """Turn uploaded log files into reports and list every known report.

        Parameters
        ----------
        list_of_contents: list of str, or None
            Upload payloads, each of the form "<content type>,<base64 data>".
            The base64 data must decode to UTF-8 text.
        list_of_names: list of str
            File names, matched to ``list_of_contents`` by position.

        Returns
        -------
        list of dict
            One ``{"label": name, "value": name}`` entry for each key in
            ``self.reports``, or an empty list if ``list_of_contents`` is None.
        """
        if list_of_contents is None:
            return []

        for content, filename in zip(list_of_contents, list_of_names):
            # Only the part after the comma is the encoded file content.
            _, content_string = content.split(",")
            decoded = base64.b64decode(content_string).decode("utf-8")
            # A report stored under an already used file name is replaced.
            self.reports[filename] = GamaReport(
                log_lines=decoded.splitlines(), name=filename)

        return [{"label": logname, "value": logname}
                for logname in self.reports]
Beispiel #2
0
def test_gamareport_asyncEA_from_log():
    """ A log directory written by an AsyncEA run loads into a GamaReport. """
    report = GamaReport("tests/data/AsyncEA")

    assert "AsyncEA" == report.name
    assert report.search_method == "AsyncEA"
    assert len(report.phases) == 3

    # Each phase entry holds the phase name first and the method used second.
    phase_names = [phase[0] for phase in report.phases]
    assert phase_names == ["preprocessing", "search", "postprocess"]
    phase_methods = [phase[1] for phase in report.phases]
    assert phase_methods == ["default", "AsyncEA", "NoPostProcessing"]
Beispiel #3
0
def test_gamareport_asyncEA_from_log():
    """ A GamaReport built from the AsyncEA log exposes the expected metadata. """
    log_file = 'tests/data/amazon_asyncEA.log'
    report = GamaReport(logfile=log_file, name=None)

    # With name=None the report is expected to be named after its log file.
    assert log_file == report.name
    assert report.search_method == 'AsyncEA()'
    assert len(report.phases) == 3

    # Each phase entry holds the phase name first and the method used second.
    phase_names = [phase[0] for phase in report.phases]
    assert phase_names == ['preprocessing', 'search', 'postprocess']
    phase_methods = [phase[1] for phase in report.phases]
    assert phase_methods == ['default', 'AsyncEA', 'NoPostProcessing']

    assert report.method_data is None, "AsyncEA has no method data associated."
Beispiel #4
0
def test_gamareport_asha_from_log():
    """ A GamaReport built from an ASHA log also carries method specific data. """
    log_file = 'tests/data/asha.log'
    report = GamaReport(logfile=log_file, name=None)

    # With name=None the report is expected to be named after its log file.
    assert log_file == report.name
    assert report.search_method == 'AsynchronousSuccessiveHalving()'
    assert len(report.phases) == 3

    # Each phase entry holds the phase name first and the method used second.
    phase_names = [phase[0] for phase in report.phases]
    assert phase_names == ['preprocessing', 'search', 'postprocess']
    phase_methods = [phase[1] for phase in report.phases]
    assert phase_methods == [
        'default', 'AsynchronousSuccessiveHalving', 'NoPostProcessing'
    ]

    assert report.method_data is not None, "ASHA has method data associated."
Beispiel #5
0
def log_to_df(path, classification, regression):
    ''' Converts multiple log files to dataframes depending on their learning
            task.

    Every file matched by `path` is loaded as a GamaReport. Its evaluations
    get a `k` column (from `return_k`) and an `id` column (the dataset id
    parsed from the file name), rows containing NaN or +/-inf are removed and
    bookkeeping columns are dropped.

    Parameters:
    -----------
    path: string
        Glob pattern (passed to `glob.glob`) that selects the log files.
    classification: list
        Contains an overview of classification learning task dataset ids.
    regression: list
        Contains an overview of regression learning task dataset ids.
        NOTE: currently not consulted; every dataset id that is not listed in
        `classification` is treated as a regression task.

    Returns:
    --------
    pd.DataFrame, pd.DataFrame
        The combined evaluations of all classification logs, followed by the
        combined evaluations of all regression logs. A frame is empty when no
        log of that task was found.

    '''
    class_frames = []
    regr_frames = []

    for log_file in glob.glob(path):
        file_name = os.path.basename(log_file)
        dataset_id = mixed_string_to_int(file_name)

        report_df = GamaReport(logfile=log_file).evaluations
        report_df['k'] = return_k(file_name)
        report_df['id'] = dataset_id
        report_df = report_df.replace([np.inf, -np.inf], np.nan).dropna()

        # The tasks only differ in the name of their metric's cummax column.
        if dataset_id in classification:
            metric_cummax, frames = 'accuracy_cummax', class_frames
        else:
            metric_cummax, frames = 'r2_cummax', regr_frames
        drop_columns = [
            'start', metric_cummax, 'length_cummax', 'relative_end'
        ]
        frames.append(report_df.drop(drop_columns, axis=1))

    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas 2 and growing a frame inside the loop copies it every iteration.
    # pd.concat rejects an empty list, hence the explicit empty frames.
    df_class = pd.concat(class_frames) if class_frames else pd.DataFrame()
    df_regr = pd.concat(regr_frames) if regr_frames else pd.DataFrame()
    return df_class, df_regr
def test_gamareport_asyncEA_from_log():
    """ A log file that recorded an AsyncEA search loads into a GamaReport. """
    log_file = "tests/data/amazon_asyncEA.log"
    report = GamaReport(logfile=log_file, name=None)

    # With name=None the report is expected to be named after its log file.
    assert log_file == report.name
    assert report.search_method == "AsyncEA()"
    assert len(report.phases) == 3

    # Each phase entry holds the phase name first and the method used second.
    phase_names = [phase[0] for phase in report.phases]
    assert phase_names == ["preprocessing", "search", "postprocess"]
    phase_methods = [phase[1] for phase in report.phases]
    assert phase_methods == ["default", "AsyncEA", "NoPostProcessing"]

    assert report.method_data is None, "AsyncEA has no method data associated."
Beispiel #7
0
def log_to_df_file(log_file):
    ''' Loads one GAMA log and returns its evaluations.

    Parameters:
    -----------
    log_file: str
        File name of, or path to, the log file to load.

    Returns:
    --------
    pd.DataFrame
        The `evaluations` of the GamaReport created from `log_file`.
    '''
    return GamaReport(logfile=log_file).evaluations
Beispiel #8
0
def test_gamareport_from_log():
    """ A GamaReport built from the random search log exposes the expected metadata. """
    # The log is a static file checked in with the tests.
    # Downside: if gama's logging changes and the file is not regenerated,
    # this test will not notice.
    # Upside: the test does not depend on other unit tests, and a change to
    # GamaReport that breaks reading old logs is caught right away.
    # Perhaps we can/should link to the log file used in the documentation.
    log_file = 'tests/data/random_search.log'
    report = GamaReport(logfile=log_file, name=None)

    # With name=None the report is expected to be named after its log file.
    assert log_file == report.name
    assert report.search_method == 'RandomSearch()'
    assert len(report.phases) == 3

    # Each phase entry holds the phase name first and the method used second.
    phase_names = [phase[0] for phase in report.phases]
    assert phase_names == ['preprocessing', 'search', 'postprocess']
    phase_methods = [phase[1] for phase in report.phases]
    assert phase_methods == ['default', 'RandomSearch', 'NoPostProcessing']

    assert report.method_data is None, "Random Search has no method data associated."
Beispiel #9
0
def test_gamareport_from_log():
    """ A log directory written by a RandomSearch run loads into a GamaReport. """
    # The log directory is static test data.
    # Cons:
    #   - a change in gama's logging goes unnoticed until the data is refreshed.
    # Pros:
    #   + the test is independent of other unit tests
    #   + it doubles as a backwards compatibility test for GamaReport
    # Perhaps we can/should link to the log file used in the documentation.
    report = GamaReport("tests/data/RandomSearch")

    assert "RandomSearch" == report.name
    assert report.search_method == "RandomSearch"
    assert len(report.phases) == 3

    # Each phase entry holds the phase name first and the method used second.
    phase_names = [phase[0] for phase in report.phases]
    assert phase_names == ["preprocessing", "search", "postprocess"]
    phase_methods = [phase[1] for phase in report.phases]
    assert phase_methods == ["default", "RandomSearch", "NoPostProcessing"]
Beispiel #10
0
    def load_logs(self, list_of_contents, list_of_names):
        """Build one report from a set of uploaded log files.

        The uploads are written to a temporary directory, a single GamaReport
        is created from that directory, and the directory is removed again.
        The report is stored in ``self.reports`` under
        ``"<search method>_<i>"``, with ``i`` the smallest non-negative
        integer that is not in use for that search method yet.

        Parameters
        ----------
        list_of_contents: list of str, or None
            Upload payloads, each of the form "<content type>,<base64 data>".
            The base64 data must decode to UTF-8 text.
        list_of_names: list of str
            File names, matched to ``list_of_contents`` by position.

        Returns
        -------
        list of dict
            One ``{"label": name, "value": name}`` entry for each key in
            ``self.reports``, or an empty list if ``list_of_contents`` is None.
        """
        if list_of_contents is None:
            return []

        tmp_dir = f"tmp_{uuid.uuid4()}"
        os.makedirs(tmp_dir)
        try:
            for content, filename in zip(list_of_contents, list_of_names):
                # Only the part after the comma is the encoded file content.
                _, content_string = content.split(",")
                decoded = base64.b64decode(content_string).decode("utf-8")
                # The name is supplied by the uploader: keep only its last
                # path component so the file cannot land outside tmp_dir.
                safe_name = os.path.basename(filename)
                with open(os.path.join(tmp_dir, safe_name), "w") as fh:
                    fh.write(decoded)

            report = GamaReport(tmp_dir)
            report_name = report.search_method
        finally:
            # Remove the directory even when decoding or parsing failed.
            shutil.rmtree(tmp_dir)

        report_key = next(
            f"{report_name}_{i}" for i in itertools.count()
            if f"{report_name}_{i}" not in self.reports)
        self.reports[report_key] = report

        return [{"label": logname, "value": logname}
                for logname in self.reports]
Beispiel #11
0
def load_logs(list_of_contents, list_of_names):
    """Parse uploaded logs, record them globally and list all known logs.

    Each upload ("<content type>,<base64 data>") is decoded to UTF-8 text and
    turned into a GamaReport stored in the module-level ``reports`` under its
    file name. A copy of its evaluations, extended with the columns
    ``search_method``, ``log_no`` and ``filename``, is added to the
    module-level ``aggregate_dataframe``.

    Returns a list with one ``{'label': name, 'value': name}`` dict per entry
    in ``reports``, or an empty list if ``list_of_contents`` is None.
    """
    global aggregate_dataframe
    if list_of_contents is None:
        return []

    for content, filename in zip(list_of_contents, list_of_names):
        _, encoded = content.split(',')
        text = base64.b64decode(encoded).decode('utf-8')
        report = GamaReport(log_lines=text.splitlines(), name=filename)
        reports[filename] = report

        evaluations = report.evaluations.copy()
        evaluations['search_method'] = report.search_method

        # The first log gets number 0, later logs get the count of distinct
        # log numbers aggregated so far.
        is_first_log = aggregate_dataframe is None
        if is_first_log:
            log_no = 0
        else:
            log_no = len(aggregate_dataframe['log_no'].unique())
        evaluations['log_no'] = log_no
        evaluations['filename'] = filename

        if is_first_log:
            aggregate_dataframe = evaluations
        else:
            aggregate_dataframe = pd.concat(
                [aggregate_dataframe, evaluations])

    return [{'label': logname, 'value': logname} for logname in reports]
Beispiel #12
0
    def update_page(self, _, page_store):
        """Recompute the evaluation figure, pipeline table and pipeline details.

        Parameters
        ----------
        _:
            Ignored; only the fact that the callback fired matters.
        page_store: dict, or None
            Page state; the key "selected_pipeline" (optional) holds the id of
            the pipeline to highlight. None is treated as an empty store.

        Returns
        -------
        tuple
            ``(figure, pl_table_data, pl_viz_data, row_id, None, None)`` where
            ``figure`` is a dict with "data" and "layout", ``pl_table_data``
            is a list of row dicts with keys "pl", "id" and "score",
            ``pl_viz_data`` is a list of elements describing the selected
            pipeline (None without a selection) and ``row_id`` lists the
            positions of the rows whose id equals the selected pipeline.

        Raises
        ------
        PreventUpdate
            If there is no report and no existing log file to create one
            from, or if the report has no new data and no redraw is pending.
        """
        if self.report is None:
            if self.log_file is None or not os.path.exists(self.log_file):
                raise PreventUpdate  # report does not exist
            self.report = GamaReport(self.log_file)
        elif (not self.report.update()
              and not getattr(self, "need_update", False)):
            # need_update may not have been assigned yet, hence the default.
            raise PreventUpdate  # report is not updated

        start_update = time.time()
        page_store = {} if page_store is None else page_store
        selected_pipeline = page_store.get("selected_pipeline", None)

        # Keep only evaluations without NaN or +/-inf values. The pandas
        # option "mode.use_inf_as_na" used for this before is deprecated and
        # absent from newer pandas releases, so infinities are masked
        # explicitly.
        all_evaluations = self.report.evaluations
        has_inf = all_evaluations.isin(
            [float("inf"), float("-inf")]).any(axis=1)
        evaluations = all_evaluations[~has_inf].dropna()

        self.need_update = False
        scatters = self.scatter_plot(evaluations, self.report.metrics,
                                     selected_pipeline)
        metric_one, metric_two = self.report.metrics
        metric_one_text = metric_one.replace("_", " ")

        figure = {
            "data":
            scatters,
            "layout":
            dict(
                hovermode="closest",
                clickmode="event+select",
                title=f"{metric_one_text} vs {metric_two}",
                xaxis=dict(title=metric_one_text),
                yaxis=dict(title=metric_two, tickformat=",d"
                           ),  # tickformat forces integer ticks for length,
                uirevision="never_reset_zoom",
            ),
        }

        pl_table_data = [{
            "pl": self.report.individuals[id_].short_name(" > "),
            "id": id_,
            "score": score,
        } for id_, score in zip(evaluations.id, evaluations[metric_one])]
        row_id = [
            i for i, id_ in enumerate(evaluations.id)
            if id_ == selected_pipeline
        ]

        def format_pipeline(ind):
            # One bold line per primitive (in reversed order of
            # ind.primitives), each followed by its indented terminals.
            pipeline_elements = []
            for primitive_node in reversed(ind.primitives):
                pipeline_elements.append(html.B(str(
                    primitive_node._primitive)))
                pipeline_elements.append(html.Br())
                for terminal in primitive_node._terminals:
                    pipeline_elements.append(f"    {terminal}")
                    pipeline_elements.append(html.Br())
            return pipeline_elements

        if selected_pipeline is None:
            pl_viz_data = None
        else:
            pl_viz_data = format_pipeline(
                self.report.individuals[selected_pipeline])

        print("Update complete in ", time.time() - start_update)
        return figure, pl_table_data, pl_viz_data, row_id, None, None
Beispiel #13
0
class RunningPage(BasePage):
    """Dashboard page that visualises a GAMA run from its log file.

    The page shows a scatter plot of all evaluations, the output of the
    running process, a table of evaluated pipelines and the details of the
    selected pipeline. A ticker re-triggers `update_page` every five seconds.
    """

    def __init__(self):
        super().__init__(name="Running", alignment=1, starts_hidden=True)
        self.cli = None
        self.id = "running-page"
        self.report = None
        self.log_file = None
        # Set by update_selection to force a redraw even if the log did not
        # change. Initialised here so update_page can always read it.
        self.need_update = False

    def build_page(self, app, controller):
        """Create the page layout, register its callbacks and return the layout.

        `controller` is not used by this page.
        """
        self.cli = CLIWindow("cli", app)
        plot_area = self.plot_area()
        pl_viz = self.pipeline_viz()
        pl_list = self.pipeline_list()
        ticker = dcc.Interval(id="ticker", interval=5000)
        self._content = html.Div(
            id=self.id,
            children=[
                dbc.Row([dbc.Col(plot_area, width=8),
                         dbc.Col(self.cli.html)]),
                dbc.Row([dbc.Col(pl_viz, width=4),
                         dbc.Col(pl_list)]),
                ticker,
            ],
        )

        # The order of the outputs matches the tuple update_page returns.
        app.callback(
            [
                Output("evaluation-graph", "figure"),
                Output("pipeline-table", "data"),
                Output("pl-viz", "children"),
                Output("pipeline-table", "selected_rows"),
                Output("pipeline-table", "selected_row_ids"),
                Output("evaluation-graph", "clickData"),
            ],
            [
                Input("ticker", "n_intervals"),
                Input("running-page-store", "data")
            ],
        )(self.update_page)

        app.callback(
            [Output("running-page-store", "data")],
            [
                Input("evaluation-graph", "clickData"),
                Input("pipeline-table", "selected_row_ids"),
            ],
            [State("running-page-store", "data")],
        )(self.update_selection)

        return self._content

    def update_selection(self, click_data, selected_row_ids, page_store):
        """Store the pipeline selected through the graph or the table.

        A click in the graph takes precedence over a selected table row.
        Returns the updated page store wrapped in a list, and raises
        PreventUpdate when neither input carries a selection.
        """
        # An absent or empty row selection carries no id.
        cell_selected = selected_row_ids[0] if selected_row_ids else None
        if click_data is None:
            click_selected = None
        else:
            click_selected = click_data["points"][0]["customdata"]
        selected = click_selected if click_selected is not None else cell_selected

        # Selected row ids and click data are always set back to None.
        # The value that is not None is the new value.
        if selected is not None:
            self.need_update = True
            # The store may not hold any data yet.
            page_store = {} if page_store is None else page_store
            page_store["selected_pipeline"] = selected
            return [page_store]
        # First call or sync call.
        raise PreventUpdate

    def update_page(self, _, page_store):
        """Recompute the evaluation figure, pipeline table and pipeline details.

        Parameters
        ----------
        _:
            Ticker count; ignored, only the trigger matters.
        page_store: dict, or None
            Page state; the key "selected_pipeline" (optional) holds the id of
            the pipeline to highlight. None is treated as an empty store.

        Returns
        -------
        tuple
            ``(figure, pl_table_data, pl_viz_data, row_id, None, None)``, in
            the order of the outputs registered in `build_page`. The last two
            values reset the table's selected row ids and the graph's click
            data.

        Raises
        ------
        PreventUpdate
            If there is no report and no existing log file to create one
            from, or if the report has no new data and no redraw is pending.
        """
        if self.report is None:
            if self.log_file is None or not os.path.exists(self.log_file):
                raise PreventUpdate  # report does not exist
            self.report = GamaReport(self.log_file)
        elif not self.report.update() and not self.need_update:
            raise PreventUpdate  # report is not updated

        start_update = time.time()
        page_store = {} if page_store is None else page_store
        selected_pipeline = page_store.get("selected_pipeline", None)

        # Keep only evaluations without NaN or +/-inf values. The pandas
        # option "mode.use_inf_as_na" used for this before is deprecated and
        # absent from newer pandas releases, so infinities are masked
        # explicitly.
        all_evaluations = self.report.evaluations
        has_inf = all_evaluations.isin(
            [float("inf"), float("-inf")]).any(axis=1)
        evaluations = all_evaluations[~has_inf].dropna()

        self.need_update = False
        scatters = self.scatter_plot(evaluations, self.report.metrics,
                                     selected_pipeline)
        metric_one, metric_two = self.report.metrics
        metric_one_text = metric_one.replace("_", " ")

        figure = {
            "data":
            scatters,
            "layout":
            dict(
                hovermode="closest",
                clickmode="event+select",
                title=f"{metric_one_text} vs {metric_two}",
                xaxis=dict(title=metric_one_text),
                yaxis=dict(title=metric_two, tickformat=",d"
                           ),  # tickformat forces integer ticks for length,
                uirevision="never_reset_zoom",
            ),
        }

        pl_table_data = [{
            "pl": self.report.individuals[id_].short_name(" > "),
            "id": id_,
            "score": score,
        } for id_, score in zip(evaluations.id, evaluations[metric_one])]
        row_id = [
            i for i, id_ in enumerate(evaluations.id)
            if id_ == selected_pipeline
        ]

        def format_pipeline(ind):
            # One bold line per primitive (in reversed order of
            # ind.primitives), each followed by its indented terminals.
            pipeline_elements = []
            for primitive_node in reversed(ind.primitives):
                pipeline_elements.append(html.B(str(
                    primitive_node._primitive)))
                pipeline_elements.append(html.Br())
                for terminal in primitive_node._terminals:
                    pipeline_elements.append(f"    {terminal}")
                    pipeline_elements.append(html.Br())
            return pipeline_elements

        if selected_pipeline is None:
            pl_viz_data = None
        else:
            pl_viz_data = format_pipeline(
                self.report.individuals[selected_pipeline])

        print("Update complete in ", time.time() - start_update)
        return figure, pl_table_data, pl_viz_data, row_id, None, None

    def scatter_plot(self,
                     evaluations,
                     metrics,
                     selected_pipeline: str = None):
        """Return a one-element list with the scatter trace of all evaluations.

        The first metric is plotted on the x-axis and the negated second
        metric on the y-axis. The selected pipeline, if any, is drawn larger
        and in a different colour.
        """
        metric_one, metric_two = metrics

        # Marker size indicates recency of the evaluations,
        # recent evaluations are bigger.
        biggest_size = 25
        smallest_size = 5
        selected_size = 30

        # The last evaluations get the growing sizes, all earlier ones the
        # smallest size. The explicit start index also yields an empty list
        # for zero evaluations (a `[-0:]` slice would return every size).
        size_ramp = list(range(smallest_size, biggest_size))
        n_evaluations = len(evaluations)
        if n_evaluations >= len(size_ramp):
            n_oldest = n_evaluations - len(size_ramp)
            sizes = [smallest_size] * n_oldest + size_ramp
        else:
            sizes = size_ramp[len(size_ramp) - n_evaluations:]
        if selected_pipeline is not None:
            sizes = [
                size if id_ != selected_pipeline else selected_size
                for size, id_ in zip(sizes, evaluations.id)
            ]

        default_color = "#301cc9"
        selected_color = "#c81818"

        colors = [
            default_color if id_ != selected_pipeline else selected_color
            for id_ in evaluations.id
        ]

        all_scatter = go.Scatter(
            x=evaluations[metric_one],
            y=-evaluations[metric_two],
            mode="markers",
            marker={
                "color": colors,
                "size": sizes
            },
            name="all evaluations",
            text=[
                self.report.individuals[id_].short_name()
                for id_ in evaluations.id
            ],
            # update_selection reads the clicked pipeline id from customdata.
            customdata=evaluations.id,
        )
        return [all_scatter]

    def gama_started(self, process, log_file):
        """Start monitoring `process` in the CLI window and remember its log file."""
        self.cli.monitor(process)
        self.log_file = log_file

    def plot_area(self):
        """Return the container with the (initially empty) evaluation graph."""
        scatter = dcc.Graph(
            id="evaluation-graph",
            figure={
                "data": [],
                "layout":
                dict(
                    hovermode="closest",
                    transition={"duration": 500},
                ),
            },
        )
        return html.Div(
            scatter,
            style={
                "height": "100%",
                "box-shadow": "1px 1px 1px black",
                "padding": "2%",
            },
        )

    def pipeline_list(self):
        """Return the container with the (initially empty) pipeline table."""
        ta = dash_table.DataTable(
            id="pipeline-table",
            columns=[
                {
                    "name": "Pipeline",
                    "id": "pl"
                },
                {
                    "name": "Score",
                    "id": "score"
                },
            ],
            data=[],
            style_table={
                "maxHeight": "300px",
                "overflowY": "scroll"
            },
            filter_action="native",
            sort_action="native",
            row_selectable="single",
            persistence_type="session",
            persistence=True,
        )

        return html.Div(
            ta,
            style={
                "height": "100%",
                "box-shadow": "1px 1px 1px black",
                "padding": "2%",
            },
        )

    def pipeline_viz(self):
        """Return the empty container that update_page fills with pipeline details."""
        return html.Div(
            id="pl-viz",
            style={
                "height": "100%",
                "box-shadow": "1px 1px 1px black",
                "padding": "2%",
                # Keeps the leading spaces that indent the terminals.
                "whiteSpace": "pre-wrap",
            },
        )
Beispiel #14
0
    def update_page(self, _, page_store):
        """Recompute the evaluation figure, pipeline table and pipeline details.

        Parameters
        ----------
        _:
            Ignored; only the fact that the callback fired matters.
        page_store: dict, or None
            Page state; the key 'selected_pipeline' (optional) holds the id of
            the pipeline to highlight. None is treated as an empty store.

        Returns
        -------
        tuple
            ``(figure, pl_table_data, pl_viz_data, row_id, None, None)`` where
            ``figure`` is a dict with 'data' and 'layout', ``pl_table_data``
            is a list of row dicts with keys 'pl', 'id' and 'score',
            ``pl_viz_data`` is a list of elements describing the selected
            pipeline (None without a selection) and ``row_id`` lists the
            positions of the rows whose id equals the selected pipeline.

        Raises
        ------
        PreventUpdate
            If there is no report and no existing log file to create one
            from, or if the report has no new data and no redraw is pending.
        """
        start_update = time.time()
        page_store = {} if page_store is None else page_store
        selected_pipeline = page_store.get('selected_pipeline', None)

        if self.report is None:
            if self.log_file is None or not os.path.exists(self.log_file):
                raise PreventUpdate  # The report does not exist.
            self.report = GamaReport(self.log_file)
        elif (not self.report.update()
              and not getattr(self, 'need_update', False)):
            # need_update may not have been assigned yet, hence the default.
            raise PreventUpdate  # The report exists but nothing is updated.

        # Keep only evaluations without NaN or +/-inf values. The pandas
        # option 'mode.use_inf_as_na' used for this before is deprecated and
        # absent from newer pandas releases, so infinities are masked
        # explicitly.
        all_evaluations = self.report.evaluations
        has_inf = all_evaluations.isin(
            [float('inf'), float('-inf')]).any(axis=1)
        evaluations = all_evaluations[~has_inf].dropna()

        self.need_update = False
        scatters = self.scatter_plot(evaluations, self.report.metrics,
                                     selected_pipeline)
        metric_one, metric_two = self.report.metrics
        metric_one_text = metric_one.replace('_', ' ')

        figure = {
            'data':
            scatters,
            'layout':
            dict(
                hovermode='closest',
                clickmode='event+select',
                title=f'{metric_one_text} vs {metric_two}',
                xaxis=dict(title=metric_one_text),
                yaxis=dict(title=metric_two, tickformat=',d'
                           ),  # tickformat forces integer ticks for length,
                uirevision='never_reset_zoom')
        }

        pl_table_data = [{
            'pl': self.report.individuals[id_].short_name(' > '),
            'id': id_,
            'score': score
        } for id_, score in zip(evaluations.id, evaluations[metric_one])]
        row_id = [
            i for i, id_ in enumerate(evaluations.id)
            if id_ == selected_pipeline
        ]

        def format_pipeline(ind):
            # One bold line per primitive (in reversed order of
            # ind.primitives), each followed by its indented terminals.
            pipeline_elements = []
            for primitive_node in reversed(ind.primitives):
                pipeline_elements.append(html.B(str(
                    primitive_node._primitive)))
                pipeline_elements.append(html.Br())
                for terminal in primitive_node._terminals:
                    pipeline_elements.append(f'    {terminal}')
                    pipeline_elements.append(html.Br())
            return pipeline_elements

        pl_viz_data = None if selected_pipeline is None else format_pipeline(
            self.report.individuals[selected_pipeline])

        print('Update complete in ', time.time() - start_update)
        return figure, pl_table_data, pl_viz_data, row_id, None, None