Example 1
def stop_by_num_features_parallel(logs: ListLogListType,
                                  extractor: ExtractorFnType,
                                  metric_name: str,
                                  min_num_features: int = 50) -> bool:
    """
    Selects the best log out of a list to see if feature selection should stop

    Parameters
    ----------
    logs : list of list of list of dict
        A list of log-like lists of dictionary evaluations.

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string from a dict

    metric_name: str
        String with the name of the column that refers to the metric column to be extracted

    min_num_features: int (default 50)
        The minimum number of features the model can have before stopping

    Returns
    ----------
    stop: bool
        A boolean whether to stop recursion or not
    """

    best_log = get_best_performing_log(first(logs), extractor, metric_name)

    return stop_by_num_features([best_log], min_num_features)
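Here first() is presumably toolz.first, which simply returns the head of an iterable. Since the selection loops shown later build logs by prepending the newest log list, first(logs) always picks the most recent iteration. A minimal sketch with made-up log values:

# Minimal sketch (hypothetical log contents, not from the source):
# first() returns the head of the list, i.e. the log list produced by the
# most recent selection iteration.
from toolz import first

logs = [[{"validator_log": "latest"}], [{"validator_log": "previous"}]]
assert first(logs) == [{"validator_log": "latest"}]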
Example 2
    def composer(self, tokens, **kwargs):
        rekey = []
        for i, token in enumerate(tokens):
            token = [token]
            if first(token[0]) is map:
                token[0][1] = [delayed(token[0][1][0])]
                token.append([Parallel(n_jobs=self.n_jobs), [], {}])
            rekey.extend(token)
        return super().composer(rekey)
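The rewrite above swaps a plain map step for what look like joblib's Parallel and delayed. Assuming that is the library in use, the pairing works as in this minimal sketch:

# Minimal sketch (assumes Parallel/delayed come from joblib, which the
# snippet above does not show): delayed() wraps the callable and its
# arguments, Parallel() executes the resulting tasks.
from joblib import Parallel, delayed

results = Parallel(n_jobs=2)(delayed(pow)(x, 2) for x in range(4))
assert results == [0, 1, 4, 9]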
Example 3
def auto_correlation(im, **kwargs):
    windowed = weiner_khinchin_auto_correlation.copy()
    for i, token in enumerate(windowed._tokens):
        if first(token) in (
                np.fft.fftpack.fftn,
                np.fft.fftpack.ifftn,
        ):
            windowed._tokens[i][2] = kwargs
    return windowed.value(im)
Example 4
def validator(train_data: pd.DataFrame, split_fn: SplitterFnType,
              train_fn: LearnerFnType,
              eval_fn: EvalFnType) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold by calling
    ``validator_iteration``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame ->  list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and training
        logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with predictions and
        returns the evaluation logs.

    Returns
    ----------
    A list of log-like dictionary evaluations.
    """

    folds, logs = split_fn(train_data)

    def fold_iter(fold: Tuple[int, Tuple[pd.Index, pd.Index]]) -> LogType:
        (fold_num, (train_index, test_indexes)) = fold
        return validator_iteration(train_data, train_index, test_indexes,
                                   fold_num, train_fn, eval_fn)

    zipped_logs = pipe(folds, enumerate, map(fold_iter), partial(zip, logs))

    def _join_split_log(
            log_tuple: Tuple[LogType, LogType]) -> Tuple[LogType, LogType]:
        train_log = {}
        split_log, validator_log = log_tuple
        train_log["train_log"] = validator_log["train_log"]
        return train_log, assoc(dissoc(validator_log, "train_log"),
                                "split_log", split_log)

    train_logs, validator_logs = zip(*map(_join_split_log, zipped_logs))
    first_train_log = first(train_logs)
    return assoc(first_train_log, "validator_log", list(validator_logs))
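The pipe(folds, enumerate, map(fold_iter), partial(zip, logs)) line only works if map is the curried toolz variant, which takes the function first and returns a one-argument mapper. A minimal sketch of that pattern with toy data (fold_iter replaced by an identity lambda):

# Minimal sketch (toy data, not from the source): curried map(f) becomes a
# one-argument function inside pipe, and partial(zip, split_logs) pairs each
# split log with its enumerated fold.
from functools import partial
from toolz import pipe
from toolz.curried import map

folds = ["a", "b"]
split_logs = [{"fold": 0}, {"fold": 1}]
pairs = pipe(folds, enumerate, map(lambda f: f), partial(zip, split_logs))
assert list(pairs) == [({"fold": 0}, (0, "a")), ({"fold": 1}, (1, "b"))]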
Example 5
def _get_conditions(*conditions):
    return " AND ".join(
        concatv(
            [
                "gp.docstatus = 1",
                "gp.posting_date >= %(from)s",
                "gp.posting_date <= %(to)s",
            ],
            [first(x) for x in filter(lambda x: x, conditions)],
        ))
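concatv chains the fixed conditions with the optional ones, the falsy filter drops empty condition tuples, and first keeps only the SQL fragment from each remaining tuple. A minimal sketch with a hypothetical extra condition:

# Minimal sketch (hypothetical condition tuples, not from the source).
from toolz import concatv, first

base = ["gp.docstatus = 1"]
extra = [("gp.company = %(company)s", "company"), ()]  # empty tuple gets filtered out
clause = " AND ".join(concatv(base, [first(x) for x in filter(None, extra)]))
assert clause == "gp.docstatus = 1 AND gp.company = %(company)s"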
Example 6
def test_backward_subset_feature_selection(train_df, train_fn, eval_fn,
                                           split_fn, base_extractor,
                                           metric_name):
    features_sets = {
        "first": ["x1", "x2"],
        "second": ["x4", "x5"],
        "third": ["x3", "x6"]
    }

    logs = backward_subset_feature_selection(train_df,
                                             train_fn,
                                             features_sets,
                                             split_fn,
                                             eval_fn,
                                             base_extractor,
                                             metric_name,
                                             num_removed_by_step=1,
                                             threshold=-1,
                                             early_stop=10,
                                             iter_limit=50,
                                             min_remaining_features=5)
    assert len(get_used_features(
        first(logs)[0])) <= 5  # Assert stop by remaining features

    logs = backward_subset_feature_selection(train_df,
                                             train_fn,
                                             features_sets,
                                             split_fn,
                                             eval_fn,
                                             base_extractor,
                                             metric_name,
                                             num_removed_by_step=1,
                                             threshold=0,
                                             early_stop=10,
                                             iter_limit=1,
                                             min_remaining_features=3)

    assert len(logs) == 1  # Assert stop by iter limit

    logs = backward_subset_feature_selection(train_df,
                                             train_fn,
                                             features_sets,
                                             split_fn,
                                             eval_fn,
                                             base_extractor,
                                             metric_name,
                                             num_removed_by_step=1,
                                             threshold=1,
                                             early_stop=2,
                                             iter_limit=50,
                                             min_remaining_features=1)

    assert len(logs) == 2  # Assert stop by early_stop
Example 7
def load_csv():
    '''Initialize data from csv.'''
    tablename = 'measurements'
    print("Checking if table %s exists." % tablename)
    tbl = check_table(tablename)

    if tbl is not None:
        print('Table exists, skipping. Drop it first?')
        return

    print('Table not found, initializing with csv data.')
    tbl = get_table(tablename)
    filepath = os.path.abspath(os.path.dirname(__file__))
    with open(filepath + '/data/old_entries.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:

            if row['site_id'] != '3':  # compare by value, not identity
                print('Skip entry for test_site')
                continue

            # Remove empty items from dict
            cleaned_row = dict((k, v) for k, v in row.items() if v)
            # Take date part of time string gotten from postgres
            datestr = cleaned_row['date'][:10]
            st = strptime(datestr, "%Y-%m-%d")
            entry_date = date.fromtimestamp(mktime(st))

            typeval_dict = dissoc(cleaned_row, 'id', 'date', 'site_id')
            typeval = first(typeval_dict.items())

            entry = {
                'type': first(typeval),
                'value': second(typeval),
                'date': entry_date
            }
            print("Adding " + str(entry))
            tbl.insert(entry)
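In the loop above, dissoc strips the bookkeeping keys and first(...)/second(...) unpack the single remaining (type, value) pair. A minimal sketch with a made-up row:

# Minimal sketch (made-up row, not from the source): dissoc returns a copy of
# the dict without the given keys; first/second unpack the remaining item.
from toolz import dissoc, first, second

row = {"id": "1", "date": "2020-01-01", "site_id": "3", "temp": "21.5"}
typeval = first(dissoc(row, "id", "date", "site_id").items())
assert (first(typeval), second(typeval)) == ("temp", "21.5")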
Example 8
    def render_stack(self, ents):
        for group in ents:
            ent = first(ents[group])
            self.config.print_fn("{}".format(
                colored.stylize(ent.file_capture(), colored.fg("green"))))

            for i, ent in enumerate(ents[group]):
                maybe_dotdot = "" if i == 0 else "\t:\n"
                self.config.print_fn("{}{}\t{}".format(
                    colored.stylize(maybe_dotdot, colored.fg("dark_gray")),
                    colored.stylize(ent.pos, colored.fg("dark_gray")),
                    pretty(ent.line).strip(),
                ))
            self.config.print_fn("")
Example 9
def subsample(graphs, targets, subsample_size=100):
    """subsample."""
    tg = zip(targets, graphs)
    num_classes = len(set(targets))
    class_graphs = groupby(lambda x: first(x), tg)
    subgraphs = []
    subtargets = []
    for y in class_graphs:
        class_subgraphs = class_graphs[y][:subsample_size // num_classes]  # integer division for a valid slice index
        class_subgraphs = [second(x) for x in class_subgraphs]
        subgraphs += class_subgraphs
        subtargets += [y] * len(class_subgraphs)
    subgraphs, subtargets = paired_shuffle(subgraphs, subtargets)
    return list(subgraphs), list(subtargets)
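groupby with first as the key (the lambda is equivalent to passing first directly) buckets the (target, graph) pairs by class label, and second then recovers the graphs. A minimal sketch with toy pairs:

# Minimal sketch (toy data, not from the source).
from toolz import groupby, first, second

pairs = [(0, "g1"), (1, "g2"), (0, "g3")]
by_class = groupby(first, pairs)
assert [second(p) for p in by_class[0]] == ["g1", "g3"]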
Example 10
def load_csv():
    '''Initialize data from csv.'''
    tablename = 'measurements'
    print("Checking if table %s exists." % tablename)
    tbl = check_table(tablename)

    if tbl is not None:
        print('Table exists, skipping. Drop it first?')
        return

    print('Table not found, initializing with csv data.')
    tbl = get_table(tablename)
    filepath = os.path.abspath(os.path.dirname(__file__))
    with open(filepath + '/data/old_entries.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:

            if row['site_id'] != '3':  # compare by value, not identity
                print('Skip entry for test_site')
                continue

            # Remove empty items from dict
            cleaned_row = dict((k, v) for k, v in row.items() if v)
            # Take date part of time string gotten from postgres
            datestr = cleaned_row['date'][:10]
            st = strptime(datestr, "%Y-%m-%d")
            entry_date = date.fromtimestamp(mktime(st))

            typeval_dict = dissoc(cleaned_row, 'id', 'date', 'site_id')
            typeval = first(typeval_dict.items())

            entry = {'type': first(typeval),
                     'value': second(typeval),
                     'date': entry_date}
            print("Adding " + str(entry))
            tbl.insert(entry)
Example 11
def test_feature_importance_backward_selection(train_df, train_fn, eval_fn,
                                               split_fn, base_extractor,
                                               metric_name):
    features = ["x1", "x2", "x3", "x4", "x5", "x6"]
    logs = feature_importance_backward_selection(train_df,
                                                 train_fn,
                                                 features,
                                                 split_fn,
                                                 eval_fn,
                                                 base_extractor,
                                                 metric_name,
                                                 num_removed_by_step=1,
                                                 threshold=0,
                                                 early_stop=10,
                                                 iter_limit=50,
                                                 min_remaining_features=5)
    assert len(get_used_features(
        first(logs))) <= 5  # Assert stop by remaining features

    logs = feature_importance_backward_selection(train_df,
                                                 train_fn,
                                                 features,
                                                 split_fn,
                                                 eval_fn,
                                                 base_extractor,
                                                 metric_name,
                                                 num_removed_by_step=1,
                                                 threshold=0,
                                                 early_stop=10,
                                                 iter_limit=1,
                                                 min_remaining_features=3)
    assert len(logs) == 1  # Assert stop by iter limit

    logs = feature_importance_backward_selection(train_df,
                                                 train_fn,
                                                 features,
                                                 split_fn,
                                                 eval_fn,
                                                 base_extractor,
                                                 metric_name,
                                                 num_removed_by_step=1,
                                                 threshold=1,
                                                 early_stop=2,
                                                 iter_limit=50,
                                                 min_remaining_features=1)
    assert len(logs) == 2  # Assert stop by early_stop
Example 12
def test_poor_man_boruta_selection(train_df, holdout_df, train_fn, eval_fn,
                                   base_extractor, metric_name):
    features = ["x1", "x2", "x3", "x4", "x5", "x6"]
    logs = poor_man_boruta_selection(train_df,
                                     holdout_df,
                                     train_fn,
                                     features,
                                     eval_fn,
                                     base_extractor,
                                     metric_name,
                                     max_removed_by_step=1,
                                     threshold=0,
                                     early_stop=10,
                                     iter_limit=50,
                                     min_remaining_features=5)

    assert len(get_used_features(
        first(logs))) <= 6  # Assert stop by remaining features

    logs = poor_man_boruta_selection(train_df,
                                     holdout_df,
                                     train_fn,
                                     features,
                                     eval_fn,
                                     base_extractor,
                                     metric_name,
                                     max_removed_by_step=1,
                                     threshold=0,
                                     early_stop=10,
                                     iter_limit=1,
                                     min_remaining_features=3)
    assert len(logs) == 1  # Assert stop by iter limit

    logs = poor_man_boruta_selection(train_df,
                                     holdout_df,
                                     train_fn,
                                     features,
                                     eval_fn,
                                     base_extractor,
                                     metric_name,
                                     max_removed_by_step=1,
                                     threshold=1,
                                     early_stop=2,
                                     iter_limit=50,
                                     min_remaining_features=1)
    assert len(logs) == 2  # Assert stop by early_stop
Example 13
def stop_by_num_features(logs: ListLogListType,
                         min_num_features: int = 50) -> bool:
    """
    Checks for logs to see if feature selection should stop

    Parameters
    ----------
    logs : list of list of dict
        A list of log-like lists of dictionary evaluations.

    min_num_features: int (default 50)
        The minimum number of features the model can have before stopping

    Returns
    -------
    stop: bool
        A boolean whether to stop recursion or not
    """

    return len(get_used_features(first(logs))) <= min_num_features
Example 14
    def __getattr__(self, attr):
        # Try to do the dataframe things first.
        try:
            value = super().__getattr__(attr)
            if isinstance(value, pandas.DataFrame):
                value = value.pipe(self.__class__)
            return value
        except AttributeError as e:
            pass

        super().__getattribute__(first(self._get_param_names()))

        # If it ain't a dataframe thing then
        # try each of the extensions.
        if not attr.startswith('_'):
            try:
                return self.pipe(self.env.pipes, attr)
            except:
                pass

        return super().__getattr__(attr)
Example 15
def balance(graphs, targets, estimator, ratio=2):
    """balance."""
    class_counts = Counter(targets)
    majority_class = None
    max_count = 0
    minority_class = None
    min_count = 1e6
    for class_key in class_counts:
        if max_count < class_counts[class_key]:
            majority_class = class_key
            max_count = class_counts[class_key]
        if min_count > class_counts[class_key]:
            minority_class = class_key
            min_count = class_counts[class_key]

    desired_size = int(min_count * ratio)

    tg = zip(targets, graphs)
    class_graphs = groupby(lambda x: first(x), tg)
    maj_graphs = [second(x) for x in class_graphs[majority_class]]
    min_graphs = [second(x) for x in class_graphs[minority_class]]

    if estimator:
        # select only the instances in the majority class that
        # have a small margin
        preds = estimator.decision_function(maj_graphs)
    else:
        # select at random
        preds = [random.random() for i in range(len(maj_graphs))]
    preds = [abs(pred) for pred in preds]
    pred_graphs = sorted(zip(preds, maj_graphs))[:desired_size]
    maj_graphs = [g for p, g in pred_graphs]

    bal_graphs = min_graphs + maj_graphs
    bal_pos = [minority_class] * len(min_graphs)
    bal_neg = [majority_class] * len(maj_graphs)
    bal_targets = bal_pos + bal_neg

    return paired_shuffle(bal_graphs, bal_targets)
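The majority/minority scan above is a hand-rolled loop; a more compact alternative (not the author's code) reads both classes off Counter.most_common, as in this sketch:

# Minimal sketch (alternative formulation, not the author's code):
# most_common() sorts classes by count, so the majority and minority classes
# are the first and last entries.
from collections import Counter

targets = [0, 0, 0, 1, 1, 2]
counts = Counter(targets).most_common()
majority_class, max_count = counts[0]
minority_class, min_count = counts[-1]
assert (majority_class, minority_class) == (0, 2)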
Example 16
def remove_features_subsets(
        log_list: LogListType,
        extractor: ExtractorFnType,
        metric_name: str,
        num_removed_by_step: int = 1) -> List[Tuple[str, ...]]:
    """
        Performs feature selection based on the best performing model out of
        several trained models

        Parameters
        ----------
        log_list : list of dict
            A list of log-like dictionary evaluations.

        extractor: function string -> float
            An extractor that takes a string and returns the value of that string from a dict

        metric_name: str
            String with the name of the column that refers to the metric column to be extracted

        num_removed_by_step: int (default 1)
            The number of features to remove

        Returns
        ----------
        keys: list of str
            The remaining keys of feature sets after choosing the current best subset

    """

    best_log = get_best_performing_log(log_list, extractor, metric_name)
    best_subset: List[str] = first(gen_dict_extract('used_subsets', best_log))

    return list(
        combinations(best_subset,
                     len(best_subset) - num_removed_by_step))
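With num_removed_by_step=1, the combinations call at the end yields every variant of the best subset with one key dropped. A minimal sketch with hypothetical subset keys:

# Minimal sketch (hypothetical keys, not from the source).
from itertools import combinations

best_subset = ["first", "second", "third"]
assert list(combinations(best_subset, len(best_subset) - 1)) == [
    ("first", "second"), ("first", "third"), ("second", "third"),
]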
Example 17
def feature_importance_backward_selection(train_data: pd.DataFrame,
                                          param_train_fn: TuningLearnerFnType,
                                          features: List[str],
                                          split_fn: SplitterFnType,
                                          eval_fn: EvalFnType,
                                          extractor: ExtractorFnType,
                                          metric_name: str,
                                          num_removed_by_step: int = 5,
                                          threshold: float = 0.005,
                                          early_stop: int = 2,
                                          iter_limit: int = 50,
                                          min_remaining_features: int = 50,
                                          save_intermediary_fn: SaveIntermediaryFnType = None,
                                          n_jobs: int = 1) -> ListLogListType:
    """
        Performs train-evaluation iterations while subsampling the used features
        to compute statistics about feature relevance

        Parameters
        ----------
        train_data : pandas.DataFrame
            A Pandas' DataFrame with training data

        param_train_fn : function (DataFrame, List of Strings) -> prediction_function, predictions_dataset, logs
            A partially defined learning function that takes a training set and a feature list and
            returns a predict function, a dataset with training predictions and training
            logs.

        features: list of str
            Elements must be columns of the train_data

        split_fn : function pandas.DataFrame ->  list of tuple
            Partially defined split function that takes a dataset and returns
            a list of folds. Each fold is a Tuple of arrays. The first array in
            each tuple contains training indexes while the second array
            contains validation indexes.

        eval_fn : function pandas.DataFrame -> dict
            A partially defined evaluation function that takes a dataset with predictions and
            returns the evaluation logs.

        extractor: function str -> float
            An extractor that takes a string and returns the value of that string from a dict

        metric_name: str
            String with the name of the column that refers to the metric column to be extracted

        num_removed_by_step: int (default 5)
            Number of features removed at each iteration

        threshold: float (default 0.005)
            Threshold for model performance comparison

        early_stop: int (default 2)
            Number of rounds without improvement before stopping process

        iter_limit: int (default 50)
            Maximum number of iterations before stopping

        min_remaining_features: int (default 50)
            Minimum number of features that should remain in the model,
            combining num_removed_by_step and iter_limit accomplishes the same
            functionality as this parameter.

        save_intermediary_fn : function(log) -> save to file
            Partially defined saver function that receives a log result from a
            tuning step and appends it into a file
            Example: save_intermediary_result(save_path='tuning.pkl')

        n_jobs : int
            Number of parallel processes to spawn.

        Returns
        ----------
        Logs: list of list of dict
            A list of log-like lists of dictionary evaluations. Each element of the
            list is a validation step of the algorithm.

    """

    selector_fn = remove_by_feature_importance(num_removed_by_step=num_removed_by_step)

    stop_fn = aggregate_stop_funcs(
        stop_by_no_improvement(extractor=extractor, metric_name=metric_name, early_stop=early_stop,
                               threshold=threshold),
        stop_by_iter_num(iter_limit=iter_limit),
        stop_by_num_features(min_num_features=min_remaining_features))

    train_fn = lambda df: param_train_fn(df, features)
    first_logs = parallel_validator(train_data, split_fn, train_fn, eval_fn, n_jobs=n_jobs)

    logs = [first_logs]
    while not stop_fn(logs):
        curr_log = first(logs)

        new_features = selector_fn(curr_log)
        new_train_fn = lambda df: param_train_fn(df, new_features)
        next_log = parallel_validator(train_data, split_fn, new_train_fn, eval_fn, n_jobs=n_jobs)

        if save_intermediary_fn is not None:
            save_intermediary_fn(next_log)

        logs = [next_log] + logs

    return logs
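Each round prepends its validation log, so first(logs) is always the latest round and len(logs) counts rounds, which is exactly what the stop criteria and the tests above rely on. A minimal sketch of that accumulation pattern:

# Minimal sketch (placeholder log values, not from the source).
from toolz import first

logs = [["round 1 log"]]
logs = [["round 2 log"]] + logs
assert first(logs) == ["round 2 log"] and len(logs) == 2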
Example 18
def filter_first_of_type(entries, wanted_type):
    return first([entry for entry in entries if entry['type'] == wanted_type
                  ]) or None
Example 19
    def q_num(cell):
        assert cell.metadata.tags
        return first(filter(lambda t: 'q' in t, cell.metadata.tags))
Example 20
def validator(train_data: pd.DataFrame,
              split_fn: SplitterFnType,
              train_fn: LearnerFnType,
              eval_fn: EvalFnType,
              perturb_fn_train: PerturbFnType = identity,
              perturb_fn_test: PerturbFnType = identity,
              predict_oof: bool = False) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold by calling
    ``validator_iteration``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame ->  list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and training
        logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with predictions and
        returns the evaluation logs.

    perturb_fn_train : PerturbFnType
        A partially defined corruption function that takes a dataset and returns
        a corrupted dataset. Perturbation applied at train-time.

    perturb_fn_test : PerturbFnType
        A partially defined corruption function that takes a dataset and returns
        a corrupted dataset. Perturbation applied at test-time.

    predict_oof : bool
        Whether to return out of fold predictions on the logs

    Returns
    ----------
    A list of log-like dictionary evaluations.
    """

    folds, logs = split_fn(train_data)

    train_fn = compose(train_fn, perturb_fn_train)
    eval_fn = compose(eval_fn, perturb_fn_test)

    def fold_iter(fold: Tuple[int, Tuple[pd.Index, pd.Index]]) -> LogType:
        (fold_num, (train_index, test_indexes)) = fold
        return validator_iteration(train_data, train_index, test_indexes,
                                   fold_num, train_fn, eval_fn, predict_oof)

    zipped_logs = pipe(folds, enumerate, map(fold_iter), partial(zip, logs))

    def _join_split_log(
            log_tuple: Tuple[LogType, LogType]) -> Tuple[LogType, LogType]:
        train_log = {}
        split_log, validator_log = log_tuple
        train_log["train_log"] = validator_log["train_log"]
        return train_log, assoc(dissoc(validator_log, "train_log"),
                                "split_log", split_log)

    def get_perturbed_columns(perturbator: PerturbFnType) -> List[str]:
        args = inspect.getfullargspec(perturbator).kwonlydefaults
        return args['cols'] if args else []

    train_logs, validator_logs = zip(*map(_join_split_log, zipped_logs))
    first_train_log = first(train_logs)

    perturbator_log = {
        'perturbated_train': [],
        'perturbated_test': []
    }  # type: LogType
    if perturb_fn_train != identity:
        perturbator_log['perturbated_train'] = get_perturbed_columns(
            perturb_fn_train)
    if perturb_fn_test != identity:
        perturbator_log['perturbated_test'] = get_perturbed_columns(
            perturb_fn_test)
    first_train_log = assoc(first_train_log, "perturbator_log",
                            perturbator_log)

    return assoc(first_train_log, "validator_log", list(validator_logs))
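get_perturbed_columns reads the keyword-only defaults of the (partially defined) perturbation function, so a perturbation declared with "*, cols=[...]" reports which columns it touches. A minimal sketch with a hypothetical perturbation function:

# Minimal sketch (hypothetical perturbation function, not from the source).
import inspect

def shift_cols(df, *, cols=["x1", "x2"]):
    return df

assert inspect.getfullargspec(shift_cols).kwonlydefaults == {"cols": ["x1", "x2"]}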
Example 21
    def first_pane(self):
        first_row = first(self.panes.values())
        return first(first_row.values())
Example 22
def filter_first_of_type(entries, wanted_type):
    return first([entry for entry in entries if entry['type'] == wanted_type]) or None
Example 23
def test_remove_by_feature_importance(logs):
    log = first(logs)
    next_features = remove_by_feature_importance(log, num_removed_by_step=2)
    assert next_features == ["x1", "x3", "x5"]
Example 24
def get_used_features(log: Dict) -> List[str]:
    return first(gen_dict_extract('features', log))
Example 25
def order_feature_importance_avg_from_logs(log: Dict) -> List[str]:
    d = first(gen_dict_extract('feature_importance', log))
    return sorted(d, key=d.get, reverse=True)
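sorted(d, key=d.get, reverse=True) orders the feature names by their averaged importance, highest first. A minimal sketch with made-up importances:

# Minimal sketch (made-up importance values, not from the source).
importance = {"x1": 0.2, "x2": 0.7, "x3": 0.1}
assert sorted(importance, key=importance.get, reverse=True) == ["x2", "x1", "x3"]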
Example 26
def gen_key_avgs_from_iteration(key: str, log: Dict) -> Any:
    return first(gen_dict_extract(key, log))
Example 27
def modulemap(root, io):
    modules = dirs(root, io)
    return pipe(modules, map(lambda m: assoc({}, basename(m), io.yaml(join(m, RUNNER_YAML)))), # noqa
                         filter(lambda m: m[first(m)] is not None),
                         merge) # noqa yapf: disable
Example 28
def backward_subset_feature_selection(train_data: pd.DataFrame,
                                      param_train_fn: TuningLearnerFnType,
                                      features_sets: Dict[str, List[str]],
                                      split_fn: SplitterFnType,
                                      eval_fn: EvalFnType,
                                      extractor: ExtractorFnType,
                                      metric_name: str,
                                      threshold: float = 0.005,
                                      num_removed_by_step: int = 3,
                                      early_stop: int = 2,
                                      iter_limit: int = 50,
                                      min_remaining_features: int = 50,
                                      save_intermediary_fn: SaveIntermediaryFnType = None,
                                      n_jobs: int = 1) -> ListLogListType:
    """
        Performs train-evaluation iterations while testing the subsets of features
        to compute statistics about the importance of each feature category

        Parameters
        ----------
        train_data : pandas.DataFrame
            A Pandas' DataFrame with training data

        param_train_fn : function (pandas.DataFrame, list of str) -> prediction_function, predictions_dataset, logs
            A partially defined learning function that takes a training set and a feature list and
            returns a predict function, a dataset with training predictions and training
            logs.

        features_sets: dict of string -> list
            Each string key in the dict maps to a subset of columns from the dataset; the function will
            analyse the influence of each group of features on the model performance

        split_fn : function pandas.DataFrame ->  list of tuple
            Partially defined split function that takes a dataset and returns
            a list of folds. Each fold is a Tuple of arrays. The first array in
            each tuple contains training indexes while the second array
            contains validation indexes.

        eval_fn : function pandas.DataFrame -> dict
            A partially defined evaluation function that takes a dataset with predictions and
            returns the evaluation logs.

        extractor: function str -> float
            An extractor that takes a string and returns the value of that string from a dict

        metric_name: str
            String with the name of the column that refers to the metric column to be extracted

        num_removed_by_step: int (default 3)
            Number of features removed at each iteration

        threshold: float (default 0.005)
            Threshold for model performance comparison

        early_stop: int (default 2)
            Number of rounds without improvement before stopping process

        iter_limit: int (default 50)
            Maximum number of iterations before stopping

        min_remaining_features: int (default 50)
            Minimum number of features that should remain in the model,
            combining num_removed_by_step and iter_limit accomplishes the same
            functionality as this parameter.

        save_intermediary_fn : function(log) -> save to file
            Partially defined saver function that receives a log result from a
            tuning step and appends it into a file
            Example: save_intermediary_result(save_path='tuning.pkl')

        n_jobs : int
            Number of parallel processes to spawn.

        Returns
        ----------
        logs: list of list of dict
            A list of log-like lists of dictionary evaluations. Each element of the
            list is a validation step of the algorithm.

    """

    selector_fn = remove_features_subsets(extractor=extractor,
                                          metric_name=metric_name,
                                          num_removed_by_step=num_removed_by_step)

    stop_fn = aggregate_stop_funcs(
        stop_by_no_improvement_parallel(extractor=extractor, metric_name=metric_name, early_stop=early_stop,
                                        threshold=threshold),
        stop_by_iter_num(iter_limit=iter_limit),
        stop_by_num_features_parallel(extractor=extractor, metric_name=metric_name,
                                      min_num_features=min_remaining_features)
    )

    used_subsets = [features_sets.keys()]

    used_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in used_subsets]

    trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in used_features]  # bind feat per lambda to avoid late binding

    first_val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers]
    logs = [[dict(log, **{"used_subsets": list(subset)}) for log, subset in zip(first_val_logs, used_subsets)]]

    while not stop_fn(logs):
        curr_log = first(logs)

        new_subsets = selector_fn(curr_log)
        new_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in new_subsets]

        trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in new_features]  # bind feat per lambda to avoid late binding

        val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers]

        new_logs = [dict(log, **{"used_subsets": subset}) for log, subset in zip(val_logs, new_subsets)]

        if save_intermediary_fn is not None:
            save_intermediary_fn(new_logs)

        logs = [new_logs] + logs

    return logs
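One detail worth calling out in the loop above: building the trainers in a list comprehension relies on the feat=feat default-argument binding, because Python closures capture variables rather than values; without it, every trainer would end up training on the last feature set. A minimal illustration of the difference:

# Minimal sketch: without the default argument every lambda sees the loop
# variable's final value; with it, each lambda keeps its own value.
late = [lambda: feat for feat in ("a", "b")]
bound = [lambda feat=feat: feat for feat in ("a", "b")]
assert [f() for f in late] == ["b", "b"]
assert [f() for f in bound] == ["a", "b"]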