Example #1
    def update(self) -> bool:
        new_lines = _find_new_lines(self._filename, start_from=self._lines_read)
        if len(new_lines) > 0:
            self._lines_read += len(new_lines)
            print(f"read {len(new_lines)} new lines")
            events_by_type = _lines_to_dict(new_lines)
            if len(self.evaluations) == 0:
                search_start = None
            else:
                search_start = self.evaluations.start.min()
            start_n = self.evaluations.n.max()
            if math.isnan(start_n):
                start_n = -1

            new_evaluations = _evaluations_to_dataframe(
                events_by_type[TOKENS.EVALUATION_RESULT],
                metric_names=self.metrics,
                search_start=search_start,
                start_n=start_n + 1,
            )
            self.evaluations = pd.concat([self.evaluations, new_evaluations])
            for metric in self.metrics:
                self.evaluations[f"{metric}_cummax"] = self.evaluations[metric].cummax()
            # `pset` (the GAMA primitive set) is assumed to be available in
            # the enclosing scope.
            new_individuals = {
                id_: Individual.from_string(pipeline, pset)
                for id_, pipeline in zip(new_evaluations.id, new_evaluations.pipeline)
            }
            self.individuals.update(new_individuals)
        return len(new_lines) > 0
Example #2
def pipeline_to_children(pipeline, automl):
    ''' Converts a pipeline string from a GAMA log file into its individual
            scikit-learn components.

    Parameters:
    -----------
    pipeline: str
        Contains the pipeline in GAMA's string format.
    automl: GamaClassifier or GamaRegressor
        Contains either a GamaClassifier object or a GamaRegressor object;
        its primitive set is used to parse the pipeline.

    Returns:
    --------
    str, str or np.nan, str or np.nan, str or np.nan
        Contains the string representation of the predictor followed by up to
        three preprocessors; unused preprocessor slots are filled with np.nan.
    '''
    ind = Individual.from_string(pipeline, automl._pset)
    # primitives[0] is the final estimator; any remaining primitives are the
    # preprocessors, returned below in order of application (innermost first).
    inds = [p.str_nonrecursive for p in ind.primitives]
    if len(inds) == 1:
        return inds[0], np.nan, np.nan, np.nan
    elif len(inds) == 2:
        return inds[0], inds[1], np.nan, np.nan
    elif len(inds) == 3:
        return inds[0], inds[2], inds[1], np.nan
    else:
        return inds[0], inds[3], inds[2], inds[1]
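A minimal usage sketch (hedged: the GamaClassifier instance `automl` and the pipeline string are illustrative assumptions; the string reuses the format of the fixtures below):

automl = GamaClassifier(scoring="accuracy")  # its _pset parses the pipeline string
pipeline_str = "BernoulliNB(StandardScaler(data), alpha=0.1, fit_prior=True)"
predictor, prep1, prep2, prep3 = pipeline_to_children(pipeline_str, automl)
# predictor is roughly "BernoulliNB(alpha=0.1, fit_prior=True)", prep1 roughly
# "StandardScaler()", and the unused slots prep2 and prep3 are np.nan.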
Example #3
def InvalidLinearSVC(pset):
    # Invalid configuration: scikit-learn's LinearSVC does not support
    # penalty='l1' together with loss='squared_hinge' and dual=True.
    individual_str = """LinearSVC(data,
            LinearSVC.C=0.001,
            LinearSVC.dual=True,
            LinearSVC.loss='squared_hinge',
            LinearSVC.penalty='l1',
            LinearSVC.tol=1e-05)"""
    # Collapse the multi-line string into GAMA's single-line pipeline format.
    individual_str = "".join(individual_str.split()).replace(",", ", ")
    return Individual.from_string(individual_str, pset, compile_individual)
Example #4
def LinearSVC(pset):
    individual_str = """LinearSVC(data,
            LinearSVC.C=0.001,
            LinearSVC.dual=True,
            LinearSVC.loss='squared_hinge',
            LinearSVC.penalty='l2',
            LinearSVC.tol=1e-05)"""
    individual_str = ''.join(individual_str.split()).replace(',', ', ')
    return Individual.from_string(individual_str, pset, None)
Example #5
    def update(self, force: bool = False) -> bool:
        if not force and not self.incomplete:
            return False

        with open(os.path.join(self._log_directory, "evaluations.log"),
                  "r") as fh:
            header = fh.readline()[:-1]
            self._last_tell = max(self._last_tell, fh.tell())
            fh.seek(self._last_tell)
            try:
                df = pd.read_csv(fh, sep=";", header=None, index_col=False)
            except pd.errors.EmptyDataError:
                return False
            self._last_tell = fh.tell()

            df.columns = header.split(";")
            df["n"] = df.index
            df = df.rename(
                columns=dict(t_start="start", t_wallclock="duration"))

            def tuple_to_metrics(tuple_str):
                return pd.Series(
                    [float(value) for value in tuple_str[1:-1].split(",")])

            df[self.metrics] = df.score.apply(tuple_to_metrics)
            df.start = pd.to_datetime(df.start)  # needed?
            df.duration = pd.to_timedelta(df.duration, unit="s")

            new_individuals = {
                id_: Individual.from_string(pipeline, pset)
                for id_, pipeline in zip(df.id, df.pipeline)
            }

            # Merge with previous records
            self.individuals.update(new_individuals)
            if self.evaluations.empty:
                self.evaluations = df
            else:
                df["n"] += self.evaluations.n.max() + 1
                self.evaluations = pd.concat([self.evaluations, df])
            df = self.evaluations

            search_start = df.start.min()
            for metric in self.metrics:
                df[f"{metric}_cummax"] = df[metric].cummax()
            if len(df.start) > 0:
                df["relative_end"] = ((df.start + df.duration) -
                                      search_start).dt.total_seconds()
            else:
                df["relative_end"] = pd.Series()
        return True
Example #6
def ForestPipeline(pset):
    individual_str = """RandomForestClassifier(
            FeatureAgglomeration(
                    data,
                    FeatureAgglomeration.affinity='l2',
                    FeatureAgglomeration.linkage='complete'
                    ),
            RandomForestClassifier.bootstrap=True,
            RandomForestClassifier.criterion='gini',
            RandomForestClassifier.max_features=0.6,
            RandomForestClassifier.min_samples_leaf=7,
            RandomForestClassifier.min_samples_split=6,
            RandomForestClassifier.n_estimators=100)"""
    individual_str = "".join(individual_str.split()).replace(",", ", ")

    return Individual.from_string(individual_str, pset, None)
Example #7
def SS_RBS_SS_BNB(pset):
    return Individual.from_string(
        "BernoulliNB(StandardScaler(RobustScaler(StandardScaler(data))), alpha=0.1, fit_prior=True)",  # noqa: E501
        pset,
        compile_individual,
    )
Example #8
def SS_BNB(pset):
    return Individual.from_string(
        "BernoulliNB(StandardScaler(data), alpha=0.1, fit_prior=True)",
        pset,
        compile_individual,
    )
Example #9
def RS_MNB(pset):
    return Individual.from_string(
        "MultinomialNB(RobustScaler(data), alpha=1.0, fit_prior=True)",
        pset,
        compile_individual,
    )
Example #10
def GNB(pset):
    return Individual.from_string("GaussianNB(data)", pset, compile_individual)
Example #11
def execute_recommendations(X, y, cat_ind, recommendations, task, n_jobs=1):
    ''' Executes the pipelines recommended by the nearest neighbor model for the
            given learning task and sets the number of jobs to n_jobs for the
            estimators and preprocessing algorithms.

    Parameters:
    -----------
    X: pd.DataFrame
        Contains the dataframe of a given dataset excluding its target column.
    y: pd.Series
        Contains the series of the target of a given dataset.
    cat_ind: list
        Contains boolean values indicating whether each column is categorical
        or not.
    recommendations: list
        Contains the recommendations made by the nearest neighbor model.
    task: str
        Contains the learning task (either "classification" or "regression").
    n_jobs: int
        Contains the number of jobs to use for the estimators and preprocessing
            algorithms in the recommended pipelines.

    Returns:
    --------
    list
        Contains the score of each recommended pipeline run on X and y.
    '''
    categorical, numeric, string = category_numeric_or_string(X, cat_ind)

    if task.lower() == "classification":
        gama = GamaClassifier(scoring='accuracy')
    elif task.lower() == "regression":
        gama = GamaRegressor(scoring='r2')
    else:
        raise ValueError(
            "{} is not implemented, please try 'classification' or 'regression'".format(task))

    scores = []

    for recommendation in recommendations:
        # Each recommendation unpacks into (pipeline, k, did); `did` is not
        # used below.
        pipeline, k, did = recommendation
        ind = Individual.from_string(pipeline, gama._pset)

        X_pipe = deepcopy(X)
        y_pipe = deepcopy(y)

        X_pipe, y_pipe = onehot_or_targ(X_pipe, y_pipe, categorical, k)

        # Instantiate each scikit-learn component from its string form and
        # reverse the list so preprocessors run before the final estimator.
        pipeline = [eval(p.str_nonrecursive) for p in ind.primitives]
        pipeline.reverse()

        try:
            for i, component in enumerate(pipeline):
                if i == len(pipeline) - 1:
                    # The last component (after the reverse above) is the
                    # estimator: score it with 10-fold cross-validation.
                    try:
                        setattr(component, 'n_jobs', n_jobs)
                    except Exception:
                        pass

                    cv_scores = cross_val_score(component,
                                                X_pipe,
                                                y_pipe,
                                                cv=10)
                    score = sum(cv_scores) / 10
                    # Alternative holdout evaluation, kept for reference:
                    # X_train, X_test, y_train, y_test = train_test_split(
                    #     X_pipe, y_pipe, test_size=0.30, random_state=42)
                    # component.fit(X_train, y_train)
                    # score = component.score(X_test, y_test)
                    scores.append(score)
                else:
                    # Preprocessing step; feature selectors also need the target.
                    if isinstance(component, (SelectPercentile, SelectFwe)):
                        X_pipe = component.fit_transform(X_pipe, y_pipe)
                    else:
                        X_pipe = component.fit_transform(X_pipe)
        except Exception:
            # A pipeline that fails to fit or transform is scored as 0.
            scores.append(0)

    return scores
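A hedged usage sketch of execute_recommendations: the recommendation tuples follow the (pipeline, k, did) unpacking inside the function, but the k and dataset-id values are placeholders, and the helpers category_numeric_or_string and onehot_or_targ (not shown here) must be importable for the call to work:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True, as_frame=True)
cat_ind = [False] * X.shape[1]  # assumption: no categorical columns
recommendations = [
    ("GaussianNB(data)", 0, 61),  # (pipeline string, k, dataset id); k and id are placeholders
    ("BernoulliNB(StandardScaler(data), alpha=0.1, fit_prior=True)", 0, 61),
]
scores = execute_recommendations(X, y, cat_ind, recommendations,
                                 task="classification", n_jobs=1)
print(scores)  # one mean 10-fold CV score per recommended pipeline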
Example #12
    def __init__(
        self,
        logfile: Optional[str] = None,
        log_lines: Optional[List[str]] = None,
        name: Optional[str] = None,
    ):
        """ Parse the logfile or log lines provided.

        Parameters
        ----------
        logfile: str, optional (default=None)
            Path to the log file. If not specified, log_lines must be provided.
        log_lines: List[str], optional (default=None)
            A list with each element one line from the log file.
            If not specified, logfile must be provided.
        name: str, optional (default=None)
            Name of the report.
            If set to None, defaults to `logfile` if it is not None else 'nameless'.
        """
        if not ((logfile is None) ^ (log_lines is None)):
            raise ValueError("Must provide exactly one of 'logfile' or 'log_lines'.")

        if log_lines is None:
            log_lines = _find_new_lines(cast(str, logfile))

        self._lines_read = len(log_lines)
        self._individuals = None
        self.name = (
            name
            if name is not None
            else (logfile if logfile is not None else "nameless")
        )

        events_by_type = _lines_to_dict(log_lines)

        if len(events_by_type[TOKENS.INIT]) == 0:
            raise ValueError("The log must contain at least an INIT string.")

        config = _find_metric_configuration(events_by_type[TOKENS.INIT])
        self.metrics, self.search_method, self.postprocessing, self._filename = config

        self.phases: List[Tuple[str, str, datetime, float]] = _find_phase_information(
            events_by_type
        )
        search_start = self.phases[1][2] if len(self.phases) > 1 else None
        self.evaluations: pd.DataFrame = _evaluations_to_dataframe(
            events_by_type[TOKENS.EVALUATION_RESULT],
            metric_names=self.metrics,
            search_start=search_start,
        )

        # This can take a while for long logs (e.g. ~1sec for 10k individuals)
        self.individuals: Dict[str, Individual] = {
            id_: Individual.from_string(pipeline, pset)
            for id_, pipeline in zip(self.evaluations.id, self.evaluations.pipeline)
        }

        parse_method_data: Dict[str, Callable[..., pd.DataFrame]] = defaultdict(
            lambda: lambda *args: None,
            AsynchronousSuccessiveHalving=_ASHA_data_to_dataframe,
        )
        # search_method is formatted like NAME(kwargs)
        # where kwargs could contain additional parentheses.
        method_name, _ = self.search_method.split("(", maxsplit=1)
        method_token = METHOD_TOKENS.get(method_name)
        self.method_data = parse_method_data[method_name](
            events_by_type[method_token], self.metrics
        )

        self.incomplete = len(self.phases) < 3
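For orientation, a hedged construction sketch: it assumes this __init__ belongs to GAMA's log report class (called GamaReport here) and that a finished GAMA log file exists at the given path:

report = GamaReport(logfile="gama.log")  # class name and path are assumptions
print(report.name, report.metrics)
print(report.evaluations.head())              # one row per evaluated pipeline
print(len(report.individuals), "individuals parsed from the log")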
Example #13
def BernoulliNBThreeScalers(pset):
    return Individual.from_string(
        "BernoulliNB(StandardScaler(RobustScaler(StandardScaler(data))), alpha=0.1, fit_prior=True)",
        pset, compile_individual)