def update(self) -> bool:
    """Read log lines added since the previous read and merge them in.

    New evaluation events are converted to a dataframe and appended to
    ``self.evaluations``, the running ``<metric>_cummax`` columns are
    recomputed over all evaluations, and the individuals described by the
    new evaluations are added to ``self.individuals``.

    Returns
    -------
    bool
        True if at least one new line was read, False otherwise.
    """
    fresh_lines = _find_new_lines(self._filename, start_from=self._lines_read)
    n_fresh = len(fresh_lines)
    if n_fresh == 0:
        return False

    self._lines_read += n_fresh
    print(f"read {n_fresh} new lines")
    events = _lines_to_dict(fresh_lines)

    # Without earlier evaluations no search start is passed on.
    if len(self.evaluations) == 0:
        search_start = None
    else:
        search_start = self.evaluations.start.min()

    # The highest known `n` may be NaN (e.g. no evaluations recorded yet);
    # numbering of the new evaluations then starts at 0.
    last_n = self.evaluations.n.max()
    if math.isnan(last_n):
        last_n = -1

    added = _evaluations_to_dataframe(
        events[TOKENS.EVALUATION_RESULT],
        metric_names=self.metrics,
        search_start=search_start,
        start_n=last_n + 1,
    )
    self.evaluations = pd.concat([self.evaluations, added])
    for metric in self.metrics:
        self.evaluations[f"{metric}_cummax"] = self.evaluations[metric].cummax()

    # `pset` is resolved from the enclosing module scope.
    self.individuals.update(
        {
            id_: Individual.from_string(pipeline, pset)
            for id_, pipeline in zip(added.id, added.pipeline)
        }
    )
    return True
def pipeline_to_children(pipeline, automl):
    '''
    Splits a pipeline from a Gama log file into its individual components.

    Parameters:
    -----------
    pipeline: pipeline set
        Contains the pipeline in Gama format; it is parsed with
        `Individual.from_string`.

    automl: GamaClassifier or GamaRegressor
        Gama object whose `_pset` is used to parse the pipeline.

    Returns:
    --------
    tuple of four elements
        The `str_nonrecursive` of the first primitive, followed by those of
        the primitives at positions 1 to 3 in reverse order. Missing
        positions are filled with np.nan at the end; primitives beyond the
        fourth are ignored.
    '''
    individual = Individual.from_string(pipeline, automl._pset)
    steps = [primitive.str_nonrecursive for primitive in individual.primitives]

    first = steps[0]
    # At most three further steps are reported, last one first.
    others = list(reversed(steps[1:4]))
    padding = [np.nan] * (3 - len(others))
    return (first, *others, *padding)
def evaluate_individual(
    individual: Individual,
    evaluate_pipeline: Callable,
    timeout: float = 1e6,
    deadline: Optional[float] = None,
    add_length_to_score: bool = True,
    **kwargs,
) -> Evaluation:
    """Evaluate the pipeline of `individual` and record the outcome.

    The outcome is returned as an `Evaluation`, and a matching `Fitness`
    is stored on `individual.fitness`.

    Parameters
    ----------
    individual: Individual
        Blueprint for the pipeline to evaluate.
    evaluate_pipeline: Callable
        Function which takes the pipeline and produces validation
        predictions, scores, estimators and errors (in that order).
    timeout: float (default=1e6)
        Maximum time in seconds that the evaluation is allowed to take.
        Don't depend on high accuracy. A shorter timeout is imposed if
        `deadline` is in less than `timeout` seconds.
    deadline: float, optional
        A time in seconds since epoch. Cut off evaluation at `deadline`
        even if `timeout` seconds have not yet elapsed.
    add_length_to_score: bool (default=True)
        Append the negated number of primitives of the individual to the
        score of the evaluation.
    **kwargs: Dict, optional (default=None)
        Passed to the `evaluate_pipeline` function.

    Returns
    -------
    Evaluation
    """
    outcome = Evaluation(individual, pid=os.getpid())
    outcome.start_time = datetime.now()

    if deadline is not None:
        # The evaluation may not run past the deadline.
        timeout = min(timeout, deadline - time.time())

    with Stopwatch() as wall_clock, Stopwatch(time.process_time) as cpu_clock:
        (
            outcome._predictions,
            outcome.score,
            outcome._estimators,
            outcome.error,
        ) = evaluate_pipeline(individual.pipeline, timeout=timeout, **kwargs)
    outcome.duration = wall_clock.elapsed_time

    if add_length_to_score:
        outcome.score = outcome.score + (-len(individual.primitives),)

    individual.fitness = Fitness(
        outcome.score,
        outcome.start_time,
        wall_clock.elapsed_time,
        cpu_clock.elapsed_time,
    )
    return outcome
def InvalidLinearSVC(pset):
    """Return a LinearSVC individual with penalty='l1', parsed with `pset`.

    `compile_individual` is passed as the third argument to
    `Individual.from_string`.
    NOTE(review): presumably "invalid" refers to the penalty/loss/dual
    combination being rejected when fitted -- confirm.
    """
    template = (
        "LinearSVC(data, LinearSVC.C=0.001, LinearSVC.dual=True, "
        "LinearSVC.loss='squared_hinge', LinearSVC.penalty='l1', "
        "LinearSVC.tol=1e-05)"
    )
    # Remove all whitespace, then separate arguments by exactly ", ".
    squeezed = "".join(template.split())
    pipeline_str = ", ".join(squeezed.split(","))
    return Individual.from_string(pipeline_str, pset, compile_individual)
def LinearSVC(pset):
    """Return a LinearSVC individual with penalty='l2', parsed with `pset`.

    None is passed as the third argument to `Individual.from_string`.
    """
    template = (
        "LinearSVC(data, LinearSVC.C=0.001, LinearSVC.dual=True, "
        "LinearSVC.loss='squared_hinge', LinearSVC.penalty='l2', "
        "LinearSVC.tol=1e-05)"
    )
    # Remove all whitespace, then separate arguments by exactly ", ".
    squeezed = ''.join(template.split())
    pipeline_str = ', '.join(squeezed.split(','))
    return Individual.from_string(pipeline_str, pset, None)
def crossover_terminals(
        individual1: Individual,
        individual2: Individual) -> Tuple[Individual, Individual]:
    """ Swap one pair of Terminals between two individuals, in-place.

    The pair is picked at random among the Terminals the two individuals
    share by output type but for which the values differ.

    Parameters
    ----------
    individual1: Individual
        The individual to crossover with individual2.
    individual2: Individual
        The individual to crossover with individual1.

    Returns
    -------
    Tuple[Individual, Individual]
        The same two (modified) individuals, in the order given.

    Raises
    ------
    IndexError
        If there is no pair of Terminals to choose from.
    """
    shared = _shared_terminals(
        individual1, individual2, with_indices=True, value_match='different')
    pos1, terminal1, pos2, terminal2 = random.choice([*shared])
    individual1.replace_terminal(pos1, terminal2)
    individual2.replace_terminal(pos2, terminal1)
    return individual1, individual2
def crossover_terminals(ind1: Individual, ind2: Individual) -> Tuple[Individual, Individual]:
    """ Swap one pair of Terminals between two individuals, in-place.

    The swapped Terminals share their output type but differ in value;
    the pair is picked at random among all such pairs.

    Parameters
    ----------
    ind1: Individual
        The individual to crossover with ind2.
    ind2: Individual
        The individual to crossover with ind1.

    Returns
    -------
    Tuple[Individual, Individual]
        The same two (modified) individuals, in the order given.

    Raises
    ------
    IndexError
        If there is no pair of Terminals to choose from.
    """
    candidates = list(
        _shared_terminals(ind1, ind2, with_indices=True, value_match="different")
    )
    idx1, term1, idx2, term2 = random.choice(candidates)
    ind1.replace_terminal(idx1, term2)
    ind2.replace_terminal(idx2, term1)
    return ind1, ind2
def update(self, force: bool = False) -> bool:
    """Read rows appended to ``evaluations.log`` and merge them into the report.

    Newly read rows are parsed, added to ``self.evaluations`` and
    ``self.individuals``, after which the ``<metric>_cummax`` and
    ``relative_end`` columns are recomputed over all evaluations.

    Parameters
    ----------
    force: bool (default=False)
        If False, nothing is read unless ``self.incomplete`` is truthy.

    Returns
    -------
    bool
        True if new rows were read and merged. False if the update was
        skipped, or if no data was found past the last read position.
    """
    if not force and not self.incomplete:
        return False

    with open(os.path.join(self._log_directory, "evaluations.log"), "r") as fh:
        # The first line holds the ';'-separated column names; [:-1] drops
        # its final character (presumably the trailing newline).
        header = fh.readline()[:-1]
        # Continue where the previous read stopped, but never before the
        # end of the header line.
        self._last_tell = max(self._last_tell, fh.tell())
        fh.seek(self._last_tell)
        try:
            df = pd.read_csv(fh, sep=";", header=None, index_col=False)
        except pd.errors.EmptyDataError:
            # No data past the last read position.
            return False
        # Remember how far the file has been consumed for the next call.
        self._last_tell = fh.tell()

    df.columns = header.split(";")
    # Number the rows of this read; shifted below if older rows exist.
    df["n"] = df.index
    df = df.rename(
        columns=dict(t_start="start", t_wallclock="duration"))

    def tuple_to_metrics(tuple_str):
        # Drop the first and last character (presumably the parentheses of
        # a tuple literal) and parse the comma-separated values as floats.
        return pd.Series(
            [float(value) for value in tuple_str[1:-1].split(",")])

    # One column per metric name, filled from the parsed `score` strings.
    df[self.metrics] = df.score.apply(tuple_to_metrics)
    df.start = pd.to_datetime(df.start)  # needed?
    df.duration = pd.to_timedelta(df.duration, unit="s")

    # `pset` is not defined in this method; presumably a module-level
    # primitive set -- TODO confirm.
    new_individuals = {
        id_: Individual.from_string(pipeline, pset)
        for id_, pipeline in zip(df.id, df.pipeline)
    }

    # Merge with previous records
    self.individuals.update(new_individuals)
    if self.evaluations.empty:
        self.evaluations = df
    else:
        # Continue numbering after the highest `n` recorded so far.
        df["n"] += self.evaluations.n.max() + 1
        self.evaluations = pd.concat([self.evaluations, df])
    # From here on `df` refers to all evaluations, not only the new ones.
    df = self.evaluations

    search_start = df.start.min()
    for metric in self.metrics:
        df[f"{metric}_cummax"] = df[metric].cummax()
    if len(df.start) > 0:
        # Seconds between the earliest recorded start and the end
        # (start + duration) of each evaluation.
        df["relative_end"] = ((df.start + df.duration) -
                              search_start).dt.total_seconds()
    else:
        df["relative_end"] = pd.Series()
    return True
def ForestPipeline(pset):
    """Return an individual of FeatureAgglomeration into RandomForestClassifier.

    The pipeline string is parsed with `pset`; None is passed as the third
    argument to `Individual.from_string`.
    """
    template = (
        "RandomForestClassifier( FeatureAgglomeration( data, "
        "FeatureAgglomeration.affinity='l2', "
        "FeatureAgglomeration.linkage='complete' ), "
        "RandomForestClassifier.bootstrap=True, "
        "RandomForestClassifier.criterion='gini', "
        "RandomForestClassifier.max_features=0.6, "
        "RandomForestClassifier.min_samples_leaf=7, "
        "RandomForestClassifier.min_samples_split=6, "
        "RandomForestClassifier.n_estimators=100)"
    )
    # Remove all whitespace, then separate arguments by exactly ", ".
    squeezed = "".join(template.split())
    pipeline_str = ", ".join(squeezed.split(","))
    return Individual.from_string(pipeline_str, pset, None)
def _test_mutation(individual: Individual, mutation, mutation_check, pset):
    """ Mutate a copy of `individual` and verify the result.

    The mutated copy must pass `mutation_check` and must compile.

    :param individual: The individual to be mutated. The mutation is applied
      to a copy obtained from `individual.copy_as_new()`.
    :param mutation: function called as `mutation(ind, pset)`.
      Should mutate the individual.
    :param mutation_check: function: (ind1, ind2)->(bool, str).
      Checks if ind2 could have been created by `mutation(ind1)`; the str is
      used as assertion message if it could not.
    :param pset: Passed on to `mutation` and `compile_individual`.
    """
    mutant = individual.copy_as_new()
    mutation(mutant, pset)

    was_applied, failure_message = mutation_check(individual, mutant)
    assert was_applied, failure_message

    # Compiling raises an Exception if the mutant is not a valid individual.
    compile_individual(mutant, pset)
def BernoulliNBThreeScalers(pset):
    """Return a BernoulliNB individual preceded by three nested scalers.

    The pipeline string is parsed with `pset`, passing `compile_individual`
    as the third argument to `Individual.from_string`.
    """
    pipeline_str = (
        "BernoulliNB(StandardScaler(RobustScaler(StandardScaler(data))), "
        "alpha=0.1, fit_prior=True)"
    )
    return Individual.from_string(pipeline_str, pset, compile_individual)
def __init__(
    self,
    logfile: Optional[str] = None,
    log_lines: Optional[List[str]] = None,
    name: Optional[str] = None,
):
    """ Parse the logfile or log lines provided.

    Parameters
    ----------
    logfile: str, optional (default=None)
        Path to the log file. If not specified, `log_lines` must be provided.
    log_lines: List[str], optional (default=None)
        A list with each element one line from the log file.
        If not specified, `logfile` must be provided.
    name: str, optional (default=None)
        Name of the report.
        If set to None, defaults to `logfile` if it is not None else 'nameless'.

    Raises
    ------
    ValueError
        If both or neither of `logfile` and `log_lines` are given,
        or if the log contains no INIT event.
    """
    # Exactly one of the two sources must be given.
    if (logfile is None) == (log_lines is None):
        raise ValueError("Must provide exactly one of 'logfile' or 'log_lines'.")

    if log_lines is None:
        log_lines = _find_new_lines(cast(str, logfile))
    # Number of lines consumed so far.
    self._lines_read = len(log_lines)
    self._individuals = None

    if name is not None:
        self.name = name
    elif logfile is not None:
        self.name = logfile
    else:
        self.name = "nameless"

    events_by_type = _lines_to_dict(log_lines)
    if len(events_by_type[TOKENS.INIT]) == 0:
        raise ValueError("The log must contain at least one INIT string.")

    config = _find_metric_configuration(events_by_type[TOKENS.INIT])
    self.metrics, self.search_method, self.postprocessing, self._filename = config

    self.phases: List[Tuple[str, str, datetime, float]] = _find_phase_information(
        events_by_type
    )
    # The third field of the second phase entry is used as the start of search.
    search_start = self.phases[1][2] if len(self.phases) > 1 else None
    self.evaluations: pd.DataFrame = _evaluations_to_dataframe(
        events_by_type[TOKENS.EVALUATION_RESULT],
        metric_names=self.metrics,
        search_start=search_start,
    )

    # This can take a while for long logs (e.g. ~1sec for 10k individuals)
    # `pset` is resolved from the enclosing module scope.
    self.individuals: Dict[str, Individual] = {
        id_: Individual.from_string(pipeline, pset)
        for id_, pipeline in zip(self.evaluations.id, self.evaluations.pipeline)
    }

    # Search methods without a dedicated parser fall back to a function
    # that ignores its arguments and returns None.
    parse_method_data: Dict[str, Callable[..., pd.DataFrame]] = defaultdict(
        lambda: lambda *args: None,
        AsynchronousSuccessiveHalving=_ASHA_data_to_dataframe,
    )
    # search_method is formatted like NAME(kwargs)
    # where kwargs could contain additional parentheses.
    method_name, _ = self.search_method.split("(", maxsplit=1)
    method_token = METHOD_TOKENS.get(method_name)
    self.method_data = parse_method_data[method_name](
        events_by_type[method_token], self.metrics
    )

    # The report counts as incomplete while fewer than three phases are logged.
    self.incomplete = len(self.phases) < 3
def SS_RBS_SS_BNB(pset):
    """Return a BernoulliNB individual preceded by three nested scalers.

    The scalers are StandardScaler, RobustScaler and StandardScaler. The
    pipeline string is parsed with `pset`, passing `compile_individual` as
    the third argument to `Individual.from_string`.
    """
    expression = "BernoulliNB(StandardScaler(RobustScaler(StandardScaler(data))), alpha=0.1, fit_prior=True)"  # noqa: E501
    return Individual.from_string(expression, pset, compile_individual)
def SS_BNB(pset):
    """Return a StandardScaler into BernoulliNB individual, parsed with `pset`.

    `compile_individual` is passed as the third argument to
    `Individual.from_string`.
    """
    expression = "BernoulliNB(StandardScaler(data), alpha=0.1, fit_prior=True)"
    return Individual.from_string(expression, pset, compile_individual)
def RS_MNB(pset):
    """Return a RobustScaler into MultinomialNB individual, parsed with `pset`.

    `compile_individual` is passed as the third argument to
    `Individual.from_string`.
    """
    expression = "MultinomialNB(RobustScaler(data), alpha=1.0, fit_prior=True)"
    return Individual.from_string(expression, pset, compile_individual)
def GNB(pset):
    """Return a GaussianNB individual without preprocessing, parsed with `pset`.

    `compile_individual` is passed as the third argument to
    `Individual.from_string`.
    """
    expression = "GaussianNB(data)"
    return Individual.from_string(expression, pset, compile_individual)
def execute_recommendations(X, y, cat_ind, recommendations, task, n_jobs=1):
    '''
    Cross-validates every pipeline recommended by the nearest neighbor model
    on the given dataset.

    Each recommended pipeline is split into its components. All components
    but the last are fitted on and applied to the data in order; the last
    component is scored with 10-fold cross-validation and the mean of the
    fold scores is recorded.

    Parameters:
    -----------
    X: pd.DataFrame
        Contains the dataframe of a given dataset excluding its target column.

    y: pd.Series
        Contains the series of the target of a given dataset.

    cat_ind: list
        Contains boolean values to determine whether a column is categorical
        or not.

    recommendations: list
        Contains the list with the recommendations made by the nearest
        neighbor model. Each entry is unpacked as (pipeline, k, did).

    task: str
        Contains the learning task (i.e. "classification" or "regression"),
        compared case-insensitively.

    n_jobs: int
        Value assigned to the `n_jobs` attribute of the last component of
        each recommended pipeline (best effort).

    Returns:
    --------
    list
        Contains one score per recommendation: the mean cross-validation
        score, or 0 if preprocessing or evaluation raised an exception.
        If `task` is not supported, a str with an error message is returned
        instead of a list.
    '''
    categorical, _numeric, _string = category_numeric_or_string(X, cat_ind)

    task_name = task.lower()
    if task_name == "classification":
        gama = GamaClassifier(scoring='accuracy')
    elif task_name == "regression":
        gama = GamaRegressor(scoring='r2')
    else:
        # Kept for backward compatibility: callers get a message back rather
        # than an exception.
        return "{} is not implemented, please try 'classification' or 'regression'".format(
            task)

    n_folds = 10
    scores = []
    for pipeline, k, _did in recommendations:
        ind = Individual.from_string(pipeline, gama._pset)
        # Work on copies so that the caller's data is left untouched.
        X_pipe, y_pipe = onehot_or_targ(deepcopy(X), deepcopy(y), categorical, k)

        # NOTE(review): eval executes the component text as Python code; this
        # is only safe if the recommendations come from a trusted source.
        components = [eval(p.str_nonrecursive) for p in ind.primitives]
        components.reverse()

        try:
            # Everything before the last component is preprocessing.
            *transformers, estimator = components
            for transformer in transformers:
                if isinstance(transformer, (SelectPercentile, SelectFwe)):
                    # These feature selectors need the target to fit.
                    X_pipe = transformer.fit_transform(X_pipe, y_pipe)
                else:
                    X_pipe = transformer.fit_transform(X_pipe)

            try:
                estimator.n_jobs = n_jobs
            except Exception:
                # Best effort only: evaluate the component as it is.
                pass

            cv_scores = cross_val_score(estimator, X_pipe, y_pipe, cv=n_folds)
            scores.append(sum(cv_scores) / n_folds)
        except Exception:
            # A pipeline that cannot be fitted or scored counts as 0.
            # KeyboardInterrupt and SystemExit are deliberately not caught.
            scores.append(0)
    return scores