Beispiel #1
0
    def _random_forest_segmentation(self,
                                    dataset) -> List[List[Tuple[str, int]]]:
        """Build several segmentation trees and return the best-scoring one.

        For each of ``self._parameters.segment_num_trees`` rounds, a tree is
        built on ``dataset`` without its ``feedback`` column, the resulting
        segments are applied to ``dataset`` and scored with
        ``_evaluate_segmentation``. The segments with the highest score that
        is strictly greater than zero are kept.

        Note:
            ``dataset`` is modified in place: a ``norm_feedback`` column is
            written to it on every round.

        Returns:
            The segments of the best tree, or an empty list when no tree
            scored above ``0.0``.
        """
        log.debug('Dataset before running tree algorithm:\n'
                  f'{dataset.head().to_string()}\n')

        best_gain = 0.0
        best_segments: List[List[Tuple[str, int]]] = []
        log.info('Creating segmentation...')
        for _ in range(self._parameters.segment_num_trees):
            normalized_feedback = min_max_normalize(dataset['feedback'])
            dataset.loc[:, 'norm_feedback'] = normalized_feedback

            # The builder only sees the normalised feedback, not the raw one.
            tree_input = dataset.drop('feedback', axis=1)
            builder = TreeSegmentationBuilder(tree_input, self._parameters)
            candidate_segments = _get_segments(
                builder.build_tree(self._has_experian))

            candidate_gain = self._evaluate_segmentation(
                self._segment_dataframe(dataset, candidate_segments))

            if candidate_gain > best_gain:
                best_gain, best_segments = candidate_gain, candidate_segments

        log.debug(
            f'Expected gain of the best tree vs no action = {best_gain}\n')
        log.info(f'Segments: {best_segments}')

        return best_segments
Beispiel #2
0
def _log_event_stats(unprepared_data, influx_exporter):
    """Send per-date event counts to the exporter and log them per pl week.

    Args:
        unprepared_data: Frame with an ``assigned_date`` column whose values,
            once passed through ``str``, parse with the ``%Y%m%d`` format.
        influx_exporter: Object exposing ``send_event_stats``; it receives
            the per-date counts.
    """
    # ``apply`` already returns a new Series, so the input frame does not
    # need to be copied first.
    events_per_date = unprepared_data.assigned_date.apply(
        lambda x: datetime.strptime(str(x), '%Y%m%d')).value_counts()
    influx_exporter.send_event_stats(events_per_date)

    # Name the index explicitly before resetting it. The label that
    # ``reset_index`` gives the former index depends on the pandas version
    # (``value_counts`` names its index after the source column since
    # pandas 2.0), so relying on an implicit 'index' column raises KeyError
    # on newer releases.
    events_per_week = events_per_date.rename_axis('index').reset_index(
        name='count')
    # Plain column assignment replaces the datetime column outright instead
    # of trying to write differently typed week values into it in place.
    events_per_week['index'] = events_per_week['index'].apply(
        plds.datetime.datetime_to_pl_week)
    log.debug('Events per pl week:')
    log.debug(events_per_week.groupby('index').sum().sort_index().to_string())
Beispiel #3
0
    def prepare_output(self, country, model_id, prediction_date, action_desc):
        """Finalise ``self._output_df`` into the model output layout.

        Stamps the run metadata onto every row, keeps only
        ``FINAL_MODEL_OUTPUT_COLUMNS``, drops rows without a ``GIVER_ID``,
        applies ``self._set_types()`` and sorts the columns by name.

        Args:
            country: Value written to the ``COUNTRY`` column.
            model_id: Value written to the ``MODEL_ID`` column.
            prediction_date: Value written to ``ACTION_GENERATED_DATE``.
            action_desc: Value written to the ``ACTION_DESC`` column.
        """
        metadata_columns = (
            (PROJECT, project_parameters.project_name),
            (MODEL_VERSION, project_parameters.model_version),
            (COUNTRY, country),
            (MODEL_ID, model_id),
            (ACTION_GENERATED_DATE, prediction_date),
            (ACTION_DESC, action_desc),
        )
        for column_name, constant_value in metadata_columns:
            self._output_df[column_name] = constant_value

        final_columns_only = self._output_df[FINAL_MODEL_OUTPUT_COLUMNS]
        self._output_df = final_columns_only.dropna(subset=[GIVER_ID])
        self._set_types()

        # Sort columns (axis=1) by label, in place.
        self._output_df.sort_index(axis=1, inplace=True)

        log.debug(f'model_output.columns = {self._output_df.columns}')
 def fit(self, df: pd.DataFrame):
     """One-hot encode the configured categorical features of ``df``.

     Each categorical column is first passed through
     ``self._filter_categories``, then all of them are expanded with
     ``pd.get_dummies``. The resulting dummy column labels are stored in
     ``self._feature_columns``.

     Args:
         df: Frame containing a column for every feature's ``short_name``.

     Returns:
         ``df`` with the categorical columns replaced by their dummy
         columns (appended on the right).
     """
     short_names = [feat.short_name for feat in self._categorical_features]
     encoded = df[short_names].copy()
     for feat in self._categorical_features:
         log.debug(f'Transforming feature: {feat.original_name}')
         column = feat.short_name
         encoded.loc[:, column] = self._filter_categories(encoded[column])

     encoded = pd.get_dummies(encoded, columns=short_names)
     encoded = encoded.fillna(0)
     self._feature_columns = encoded.columns

     untouched = df.drop(short_names, axis=1)
     return pd.concat([untouched, encoded], axis=1)
Beispiel #5
0
    def _evaluate_segmentation(self, segmentation_df):
        """Score a segmentation against the zero-action baseline.

        The score is the sample-weighted average, over segments, of the best
        per-action aggregated feedback in each segment, minus the aggregated
        feedback of all rows whose ``action_code`` equals ``0.0``. The
        aggregation is ``self._parameters.reward_agg_func``.

        Args:
            segmentation_df: Frame with ``segment``, ``action_code`` and
                ``feedback`` columns.

        Returns:
            Expected value of the segmentation minus the expected value of
            the zero action.
        """
        reward_agg = self._parameters.reward_agg_func

        # Aggregate feedback per (segment, action), then keep the best
        # action value of every segment.
        per_action = segmentation_df[['segment', 'action_code', 'feedback']]
        per_action = per_action.groupby(['segment', 'action_code'])
        per_action = per_action.agg(reward_agg).reset_index()
        best_per_segment = per_action[['segment', 'feedback']]
        best_per_segment = best_per_segment.groupby('segment').max()

        # Number of feedback entries in every segment.
        segment_sizes = segmentation_df[['segment', 'feedback']]
        segment_sizes = segment_sizes.groupby(['segment']).count()
        segment_sizes = segment_sizes.rename(
            columns={'feedback': 'num_samples'})

        sample_total = np.sum(segment_sizes.values)
        segment_weights = segment_sizes.values / sample_total
        # (1 x n) . (n x 1) -> (1 x 1); unwrap the single scalar.
        weighted_sum = np.dot(best_per_segment.values.transpose(),
                              segment_weights)
        expected_value = weighted_sum[0][0]
        log.debug(f'Expected value of segmentation = {expected_value}')

        # Expected value of zero action
        zero_action_rows = segmentation_df[segmentation_df.action_code == 0.0]
        zero_action_value = zero_action_rows['feedback'].agg(reward_agg)

        log.debug(f'Expected value of zero action = {zero_action_value}')

        gain = expected_value - zero_action_value
        log.debug(
            f'Expected gain of the tree compared to zero action = {gain}'
        )
        return gain
    def select_split_feature(self, total_num_samples, total_num_features):
        """Randomly pick one of the confident features to split on.

        A feature is a split candidate when it is flagged both
        ``chernoff_confident`` and ``sample_confident``. Among the
        candidates, one is drawn at random with probabilities given by
        ``_softmax`` of their ``FEATURE_GAIN`` values.

        Args:
            total_num_samples: Forwarded to ``_is_chernoff_confident``.
            total_num_features: Forwarded to ``_is_chernoff_confident``.

        Returns:
            The ``FEATURE`` value of the selected candidate.

        Raises:
            NoSplitCandidatesException: If no feature is a split candidate.
        """
        num_rows = len(self._df)
        gains = self._calculate_feature_gains(num_rows)
        gains[CHERNOFF_CONFIDENT] = self._is_chernoff_confident(
            gains, num_rows, total_num_samples, total_num_features)
        gains['candidate_split'] = (
            gains.chernoff_confident & gains.sample_confident)

        candidates = gains[gains.candidate_split]

        log.debug('Split feature candidates:')
        log.debug(list(candidates[FEATURE]))

        if candidates.empty:
            raise NoSplitCandidatesException

        # A single multinomial trial yields a one-hot vector; its argmax is
        # the index of the drawn candidate.
        probabilities = _softmax(candidates[FEATURE_GAIN].values)
        one_hot_draw = np.random.multinomial(1, probabilities)
        chosen_idx = np.argmax(one_hot_draw)

        return candidates[FEATURE].values[chosen_idx]
Beispiel #7
0
    def _sample_actions(self, df: pd.DataFrame) -> pd.DataFrame:
        """Attach a sampled action, its code and its logprob to every row.

        The ``GIVER_ID`` and ``SEGMENT`` columns of ``df`` are left-joined
        with ``self._eps_greedy_distribution`` on ``SEGMENT``. Then, row by
        row, an action index is chosen with ``_select_action``, translated
        through ``self._available_actions`` and scored with
        ``self._logprob``.

        Args:
            df: Frame containing at least ``GIVER_ID`` and ``SEGMENT``.

        Returns:
            The joined frame extended with the ``ACTION_IDX``,
            ``ACTION_CODE`` and ``LOGPROB`` columns.
        """

        def _code_for(action_idx):
            # Translate a sampled index into its action code.
            return self._available_actions[action_idx]

        givers_with_segment = df[[GIVER_ID, SEGMENT]]
        sampled = pd.merge(givers_with_segment,
                           self._eps_greedy_distribution,
                           how='left',
                           on=SEGMENT)

        log.debug('Sampling actions from customer distributions...')
        sampled.loc[:, ACTION_IDX] = sampled.apply(_select_action, axis=1)

        log.debug('Getting action codes...')
        sampled.loc[:, ACTION_CODE] = sampled[ACTION_IDX].apply(_code_for)

        log.debug('Computing logprobs...')
        sampled.loc[:, LOGPROB] = sampled.apply(self._logprob, axis=1)

        return sampled
Beispiel #8
0
    def build_tree(self, has_experian=False) -> SegmentNode:
        """Grow the segmentation tree and return its root.

        Nodes waiting to be split are kept in ``self._leaves_to_explore``
        and processed first-in, first-out. A node is split on the feature
        returned by its ``select_split_feature``; nodes for which that
        raises ``NoSplitCandidatesException`` are left unsplit. The children
        of a split are queued only while the left child's depth is below
        ``tree_max_depth`` and both children report ``has_features``.

        Args:
            has_experian: When truthy, every node at depth 1 is split on
                ``HAS_EXPERIAN`` instead of a selected feature.

        Returns:
            The root ``SegmentNode`` of the finished tree.
        """
        root = SegmentNode(None, None, 0, [], self._universe_df,
                           self._parameters)
        self._leaves_to_explore.append(root)
        while self._leaves_to_explore:
            log.debug('Number of segments so far:')
            log.debug(len(_get_segments(root)))

            log.debug('Number of unfinished segments:')
            log.debug(len(self._leaves_to_explore))
            log.debug(f'Unfinished segments: {self._leaves_to_explore}')

            node: SegmentNode = self._leaves_to_explore.popleft()

            log.debug(f'Trying to split segment {node}...')

            forced_experian_split = has_experian and node.depth == 1
            if not forced_experian_split:
                try:
                    feature = node.select_split_feature(
                        self._total_num_samples, self._total_num_features)
                except NoSplitCandidatesException:
                    # Nothing to split on: this node is not split further.
                    log.debug(
                        f'No more confident _features for segment {node}'
                    )
                    continue
            else:
                feature = HAS_EXPERIAN

            log.debug(
                f'Splitting segment {node} on feature {feature}...'
            )
            left_child, right_child = node.split_segment(feature)

            below_max_depth = (
                left_child.depth < self._parameters.tree_max_depth)
            if (below_max_depth and left_child.has_features
                    and right_child.has_features):
                self._leaves_to_explore.extend((left_child, right_child))

        log.debug('No more confident features for any of the segments.')
        log.debug('Segmentation done!')

        return root