Example #1
def test_retail_binary(ftens_file='retail_binary_files/ftens.csv',
                       labels_file='retail_binary_files/labels.csv',
                       fl_file='retail_binary_files/fl.p'):
    ftens, labels, fl = construct_retail_example(ftens_file, labels_file,
                                                 fl_file)
    baseline_ftens = (ftens.reset_index(
        'customer_id',
        drop=False).drop_duplicates('customer_id',
                                    keep='last').set_index('customer_id'))
    baseline_ftens, baseline_fl = ft.encode_features(baseline_ftens, fl)
    baseline_ftens, baseline_fl = remove_low_information_features(
        baseline_ftens, baseline_fl)
    train_customers, test_customers = train_test_split(
        baseline_ftens.index.values, shuffle=True, test_size=0.1)
    train_labels = labels.loc[train_customers]
    test_labels = labels.loc[test_customers]
    train_ftens = ftens.loc[(train_customers, slice(None)), :]
    test_ftens = ftens.loc[(test_customers, slice(None)), :]
    baseline_train_fm = baseline_ftens.loc[train_customers, :]
    baseline_test_fm = baseline_ftens.loc[test_customers, :]

    dl_model = DLDB(regression=False,
                    classes=[False, True],
                    recurrent_layer_sizes=(32, ),
                    dense_layer_sizes=(32, 32),
                    categorical_max_vocab=10)
    dl_model.fit(train_ftens, train_labels, fl=fl, epochs=1, batch_size=32)
    predictions = dl_model.predict(test_ftens)
    score = roc_auc_score(test_labels, predictions)

    baseline_scores = score_baseline_pipeline(baseline_train_fm, train_labels,
                                              baseline_test_fm, test_labels)
    return score, baseline_scores
Example #2
def remove_li_features(df):
    """Remove low information features"""
    old_shape = df.shape[1]
    df = selection.remove_low_information_features(df)
    print('Removed features from df: {}'.format(old_shape - df.shape[1]))

    return df
def create_feature_set(data: pd.DataFrame, train_table: str, test_table: str):
    es = create_entity_set(data, train_table, test_table)

    print(f"\nBeginning automated feature engineering using entity set")
    print(f"  MAX_FEATURES={MAX_FEATURES}")
    print(f"  MAX_FT_DEPTH={MAX_FT_DEPTH}")

    start = time.monotonic()
    feature_matrix, feature_names = ft.dfs(entityset=es,
                                           target_entity='combined_train_test',
                                           max_depth=MAX_FT_DEPTH,
                                           max_features=MAX_FEATURES,
                                           verbose=True)
    end = time.monotonic()

    print(
        f"Automated feature engineering completed in {round(end - start)} seconds"
    )

    feature_matrix = selection.remove_low_information_features(feature_matrix)

    print(f"  Found {feature_matrix.shape[1]} features")

    train_data: pd.DataFrame = pd.DataFrame(
        feature_matrix[feature_matrix['DATA_SET'] == 0])
    test_data: pd.DataFrame = pd.DataFrame(
        feature_matrix[feature_matrix['DATA_SET'] == 1])

    return train_data, test_data
Example #4
File: T.py  Project: lokcyi/AI
def autoFeatureEngineering(es, target_entityName):
    from featuretools.selection import remove_low_information_features

    fm, features = ft.dfs(
        entityset=es,
        target_entity=target_entityName,
        #agg_primitives=['Sum', 'Mean', 'Percent_True'],
        trans_primitives=['divide_numeric',
                          'multiply_numeric'],  #'add_numeric',
        # trans_primitives=['Hour'],
        max_depth=1,
        # approximate='2m',
        #cutoff_time=cutoff_times[1000:],
        ignore_variables={'toolgkpi': ['MFG_DATE', targetColumn]},
        verbose=True)

    # ------------------------- One-hot encode the feature matrix -------------------------
    fm_enc, f_enc = ft.encode_features(fm, features)
    #print("One-hot encoded feature matrix: Number of features %s" % len(fm_enc))
    # ------------------------- Replace NaN with 0 -------------------------
    fm_enc = fm_enc.fillna(0)
    #print("fillna Number of features %s" % len(fm_enc))
    # ------------------------- Remove low-information features -------------------------
    fm_enc = remove_low_information_features(fm_enc)
    # print("Removed low-information features: Number of features %s" % len(fm_enc))
    # -------------------------

    # feature = feature_names[14]
    # ft.graph_feature(feature)
    # ft.describe_feature(feature)
    fm_enc = fm_enc.replace([np.inf, -np.inf], np.nan)  # replace np.inf / -np.inf with np.nan (replace() is not in-place)
    print(fm_enc.isnull().sum())
    # print(fm_enc,f_enc)
    print(fm_enc.columns)
    return fm_enc
Example #5
    def _fit_and_return_result(self,
                               *,
                               timeout: float = None,
                               iterations: int = None):

        if self._entityset is None:
            raise ValueError(
                'Must call .set_training_data() before calling .fit()')

        ignore_variables = {self._target_entity: [self._target]}
        time_index = self._entityset[self._target_entity].time_index
        index = self._entityset[self._target_entity].index
        cutoff_time = None
        if time_index:
            target_df = self._entityset[self._target_entity].df
            cutoff_time = target_df[[index, time_index]]
            ignore_variables = None

        features_only = (not self.hyperparams['encode']
                         and not self.hyperparams['remove_low_information'])

        agg_primitives = [
            name[12:] for name, value in self.hyperparams.items()
            if name.startswith('aggregation_') and value
        ]
        trans_primitives = [
            name[10:] for name, value in self.hyperparams.items()
            if name.startswith('transform_') and value
        ]

        res = ft.dfs(entityset=self._entityset,
                     target_entity=self._target_entity,
                     cutoff_time=cutoff_time,
                     cutoff_time_in_index=False,
                     features_only=features_only,
                     ignore_variables=ignore_variables,
                     max_depth=self.hyperparams['max_depth'],
                     agg_primitives=agg_primitives,
                     trans_primitives=trans_primitives)

        if not features_only:
            # Unpack the (feature_matrix, features) result so `fm` is defined
            # even when the 'encode' hyperparameter is disabled.
            fm, self._features = res
            if self.hyperparams['encode']:
                fm, self._features = ft.encode_features(
                    *res,
                    top_n=self.hyperparams['top_n'],
                    include_unknown=self.hyperparams['include_unknown'])

            if self.hyperparams['remove_low_information']:
                fm, self._features = remove_low_information_features(
                    fm, self._features)

            self._fitted = True

            return fm

        else:
            self._fitted = True
            self._features = res
def test_remove_low_information_features(es, feature_matrix):
    features = [Feature(v) for v in es['test'].variables]
    feature_matrix, features = remove_low_information_features(feature_matrix,
                                                               features)
    assert feature_matrix.shape == (3, 5)
    assert len(features) == 5
    for f in features:
        assert f.get_name() in feature_matrix.columns
    assert 'one_value' not in feature_matrix.columns
    assert 'all_null' not in feature_matrix.columns
Example #7
def test_remove_low_information_features(test_es, feature_matrix):
    features = [Feature(v) for v in test_es['test'].variables]
    feature_matrix, features = remove_low_information_features(
        feature_matrix, features)
    assert feature_matrix.shape == (3, 5)
    assert len(features) == 5
    for f in features:
        assert f.get_name() in feature_matrix.columns
    assert 'one_value' not in feature_matrix.columns
    assert 'all_null' not in feature_matrix.columns
Example #8
    def dfs(self,
            X=None,
            target_entity=None,
            entityset=None,
            entities=None,
            relationships=None):
        if not entities and not entityset:
            target_entity = 'X'
        else:
            target_entity = target_entity or self.target_entity

        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities,
                                            relationships)

        if self.training_window is not None:
            entityset.add_last_time_indexes()

        cutoff_time = None
        if self.time_index:
            cutoff_time = X[[self.index, self.time_index]]

        self.features = ft.dfs(
            cutoff_time=cutoff_time,
            max_depth=self.max_depth,
            entityset=entityset,
            target_entity=target_entity,
            features_only=True,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives,
            max_features=self.max_features,
            training_window=self.training_window,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )

        if self.encode or self.remove_low_information:
            X = ft.calculate_feature_matrix(
                self.features,
                entityset=entityset,
                cutoff_time=cutoff_time,
                training_window=self.training_window,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
            )

            if self.encode:
                X, self.features = ft.encode_features(X, self.features)

            if self.remove_low_information:
                X, self.features = remove_low_information_features(
                    X, self.features)
def create_features(es, label='Outcome', custom_agg=[]):
    cutoff_times = es['transactions'].df[['Transaction Id', 'End Time', label]]
    fm, features = ft.dfs(entityset=es,
                          target_entity='transactions',
                          agg_primitives=[Sum, Mean] + custom_agg,
                          trans_primitives=[Hour],
                          max_depth=3,
                          approximate='2m',
                          cutoff_time=cutoff_times,
                          verbose=True)
    fm_enc, _ = ft.encode_features(fm, features)
    fm_enc = fm_enc.fillna(0)
    fm_enc = remove_low_information_features(fm_enc)
    labels = fm.pop(label)
    return (fm_enc, labels)
Example #10
    def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relationships=None):
        if not entities and not entityset:
            target_entity = 'X'
        else:
            target_entity = target_entity or self.target_entity

        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities, relationships)

        instance_ids = None
        cutoff_time = None
        if self.time_index:
            cutoff_time = X[[self.index, self.time_index]]
        elif self.index:
            instance_ids = X[self.index]
        else:
            instance_ids = X.index.values

        self.features = ft.dfs(
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
            max_depth=self.max_depth,
            entityset=entityset,
            target_entity=target_entity,
            features_only=True,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives
        )

        X = ft.calculate_feature_matrix(
            self.features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
        )

        if self.encode:
            X, self.features = ft.encode_features(X, self.features)

        if self.remove_low_information:
            X, self.features = remove_low_information_features(X, self.features)
Example #11
    def produce(self,
                X,
                instance_ids=None,
                include_unknown=True,
                remove_low_information=True,
                **kwargs):

        if instance_ids is not None:
            feature_matrix = ft.calculate_feature_matrix(
                self.features, instance_ids=instance_ids, **kwargs)

            feature_matrix = (feature_matrix.reset_index('time').loc[
                instance_ids, :].set_index('time', append=True))

        else:
            feature_matrix = ft.calculate_feature_matrix(self.features,
                                                         cutoff_time=X,
                                                         **kwargs)

        for f in self.features:
            if issubclass(f.variable_type, vtypes.Discrete):
                feature_matrix[f.get_name()] = feature_matrix[
                    f.get_name()].astype(object)
            elif issubclass(f.variable_type, vtypes.Numeric):
                feature_matrix[f.get_name()] = pd.to_numeric(
                    feature_matrix[f.get_name()])
            elif issubclass(f.variable_type, vtypes.Datetime):
                feature_matrix[f.get_name()] = pd.to_datetime(
                    feature_matrix[f.get_name()])

        encoded_fm, encoded_fl = ft.encode_features(feature_matrix,
                                                    self.features)

        if remove_low_information:
            encoded_fm, encoded_fl = remove_low_information_features(
                encoded_fm, encoded_fl)

        encoded_fm.reset_index('time', drop=True, inplace=True)

        return encoded_fm.fillna(0)
Example #12
    def dfs(self,
            X=None,
            target_entity='X',
            entityset=None,
            entities=None,
            relationships=None):
        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities,
                                            relationships)

        target = entityset[target_entity]
        time_index = target.time_index
        index = target.index

        cutoff_time = None
        if time_index:
            cutoff_time = target.df[[index, time_index]]

        instance_ids = X[index].values.copy()

        self.features = ft.dfs(cutoff_time=cutoff_time,
                               max_depth=self.max_depth,
                               entityset=entityset,
                               target_entity=target_entity,
                               features_only=True,
                               instance_ids=instance_ids)

        X = ft.calculate_feature_matrix(self.features,
                                        entityset=entityset,
                                        instance_ids=instance_ids)

        if self.encode:
            X, self.features = ft.encode_features(X, self.features)

        if self.remove_low_information:
            X, self.features = remove_low_information_features(
                X, self.features)
Example #13
def feature_tool(df_x):
    '''
    :param df_x: df
    :return: 80,089 features: 283 + ( 283/2*283 ) * 2
    #https://danwertheimer.github.io/rapid-model-prototyping-with-deep-feature-synthesis-and-xgboost
    '''
    print(f'start featuretools')

    # Make an entityset and add the entity
    es = ft.EntitySet(id='sp500')
    es = es.entity_from_dataframe(entity_id='sp500',
                                  dataframe=df_x,
                                  make_index=True,
                                  index='index')

    # es.normalize_entity(base_entity_id='sp500',
    #                      new_entity_id='sessions',
    #                      index        ='session'
    #                       )

    primitives_aggregate = [
        Std, Count
    ]  #'std', 'min', 'count', 'max', 'mean', 'median',  'mode', 'num_true', 'num_unique', 'sum','skew', 'percent_true', 'last', 'trend', 'n_most_common', 'time_since_last','avg_time_between'] #create a single value
    primitives_where = ['std', 'min', 'max', 'mean', 'count']
    primitives_groupby = [
        'cum_sum', 'cum_count', 'cum_mean', 'cum_min', 'cum_max'
    ]  #group by id  # [1, 2, 3, 4, 5]).tolist() = [1, 3, 6, 10, 15]
    primitives_transform = [  #'add_numeric'       #Element-wise       addition of 2 lists. create 283/2*283 = 40,044 new features
        #   MultiplyNumeric
        # , ModuloNumeric
        #, 'multiply_numeric'  #Element-wise multiplication of 2 lists. create 283/2*283 = 40,044 new features
        # 'subtract_numeric'  #Element-wise subtraction    of 2 lists. create 283/2*283 = 40,044 new features
        # , 'modulo_numeric'    #Element-wise modulo         of 2 lists. create 283/2*283 = 40,044 new features
        #, 'and'               #Element-wise logical AND    of 2 lists. create 283/2*283 = 40,044 new features
        #, 'or'                #Element-wise logical OR     of 2 lists. create 283/2*283 = 40,044 new features
        'absolute',
        'percentile'  #, 'cum_count', 'cum_sum', 'cum_mean', 'cum_min', 'cum_max', 'cum_mean'
    ]
    # 'absolute','percentile', 'cum_count', 'cum_sum', 'cum_mean', 'cum_min', 'cum_max', 'cum_mean', 'subtract', 'divide','time_since_previous', 'latitude', 'longitude', isin is_null is_weekend year week log]
    # Run deep feature synthesis with transformation primitives
    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        target_entity='sp500',
        agg_primitives=primitives_aggregate,
        trans_primitives=primitives_transform,
        groupby_trans_primitives=primitives_groupby,
        where_primitives=primitives_where,
        max_features=89000,
        # drop_contains='target',
        # seed_features=['sepal length'],
        max_depth=1,
        n_jobs=1,  # -1 will use all cores
        verbose=True)

    print(f'finished featuretools. feature_matrix=\n{feature_matrix.head()}')
    #print(f'finished2 es={es}')
    #print(f'finished3 es={es["sp500"]}')
    #print(f'feature_matrix.columns.tolist()={feature_matrix.columns.tolist()}')
    #print(f'ft.list_primitives() {ft.list_primitives()}')
    #print(f'ft.list_primitives() {ft.show_info()}')

    feature_matrix = selection.remove_low_information_features(feature_matrix)
    return feature_matrix
Example #14
def test_remove_low_information_feature_names(feature_matrix):
    feature_matrix = remove_low_information_features(feature_matrix)
    assert feature_matrix.shape == (3, 5)
    assert 'one_value' not in feature_matrix.columns
    assert 'all_null' not in feature_matrix.columns
Example #16
    def _reduce_feats(self, df):
        df = remove_low_information_features(df)
        df = remove_single_value_features(df, count_nan_as_value=True)
        df.drop(duplicate_columns(df), axis=1, inplace=True)
        return df
Example #17
    def run_dfs(self,
                max_depth=1,
                features_only=True,
                ignore_variables=None,
                reduce_mem=False,
                reduce_feats=True,
                trans_primitives=None,
                agg_primitives=None,
                chunk_size=None,
                n_jobs=1,
                **kwargs):
        """Deep Feature Synthesisf
        agg_primitives (list[str or AggregationPrimitive], optional): List of Aggregation
            Feature types to apply.

                Default: ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]
                DateTime: ['time_since_last', 'time_since_first', 'trend']

        trans_primitives (list[str or TransformPrimitive], optional):
            List of Transform Feature functions to apply.

                Default: ["day", "year", "month", "weekday", "haversine", "num_words", "num_characters"]

        groupby_trans_primitives (list[str or :class:`.primitives.TransformPrimitive`], optional):
            list of Transform primitives to make GroupByTransformFeatures with

        """
        if ignore_variables is None:
            # ignore_variables = [self.target_entity_id, self.index]
            # ignore_variables = ["__id"]  # 忽略单值id 会少了一些count特征
            ignore_variables = []

        if trans_primitives is None:
            trans_primitives = [
                "year",
                "month",
                "day",
                "hour",
                "minute",
                "week",
                "weekday",
                "is_weekend",
                'time_since_previous',
                # diff # https://stackoverflow.com/questions/60324672/how-is-time-since-previous-computed-in-featuretools
                Quarter(),
            ]

        _ = ft.dfs(
            entityset=self.es,
            target_entity=self.target_entity_id,  # an entity with a unique id: the base entityset or one created by normalize_entity
            features_only=features_only,
            max_depth=max_depth,
            ignore_variables={self.entity_id: ignore_variables},
            chunk_size=chunk_size,
            n_jobs=n_jobs,
            verbose=1,
            agg_primitives=agg_primitives,
            trans_primitives=trans_primitives,
            **kwargs)

        if features_only:
            return _
        else:
            df_ = _[0].add_prefix(f'{self.entity_id}_').reset_index()

            if reduce_feats:
                cprint("remove_low_information_features")
                df_ = remove_low_information_features(df_)

                cprint("remove_single_value_features")
                df_ = remove_single_value_features(df_,
                                                   count_nan_as_value=True)

                cprint("remove_duplicate_features")
                dups = duplicate_columns(df_)
                df_ = df_.drop(dups, axis=1)

            if reduce_mem:
                df_ = reduce_mem_usage(df_)

            return df_
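
A hypothetical usage sketch for the `run_dfs` method above (the wrapper instance `fe`, built elsewhere around an EntitySet, and the primitive choices are assumptions, not taken from the source project):

fm = fe.run_dfs(
    max_depth=2,
    features_only=False,                                  # compute the feature matrix, not just the definitions
    agg_primitives=["sum", "mean", "count"],
    trans_primitives=["month", "weekday", "is_weekend"],
    reduce_feats=True,                                    # drop low-information, single-value and duplicate columns
    n_jobs=1,
)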
Example #18
    def transform(
        self,
        groups: Optional[Dict[str, Sequence[str]]] = None,
        use_forgotten: bool = False,
        trans_primitives: Optional[Sequence[str]] = None,
        max_depth: int = 1,
        entity_set_folder_name: Optional[str] = None,
        features_file_name: Optional[str] = None,
        n_jobs: int = 1,
        verbose: bool = True,
    ) -> pd.DataFrame:
        """
        Create new features.

        Wraps Featuretools Deep Feature Synthesis.
        Default Featuretools trans primitives are:
            - "add_numeric"
            - "subtract_numeric"
            - "multiply_numeric"
            - "divide_numeric"
            - "greater_than"
            - "less_than"
            - "and"
            - "or"

        Use relationship groups to relate variables. This avoids wasting
        time creating features from totally unrelated features.

        This is especially useful when working with datasets with many
        features. Be careful with bias.

        This method does not support multiple entities (and consequently
        agg_primitives) yet. Groups are not entities, but only clusters
        of related features.


        Args:
            groups: Dict of related feature groups. None to not use
                relationships. (default: None)
            use_forgotten: Create a relationship group for the features
                left out of the ``groups`` arg. (default: False)
            trans_primitives: Featuretools trans primitives to use.
                None to use default. (default: None)
            max_depth: Number of iterations in the feature creation
                process. (default: 1)
            entity_set_folder_name: Folder name to store entity set with
                created features. (default: None)
            features_file_name: File name to store created features
                names. Must be JSON. (default: None)
            n_jobs: Number of parallel workers. (default: 1)
            verbose: Verbosity. (default: True)

        Returns:
            DataFrame with new features.
        """
        # Manage groups.
        if not groups:
            groups = self._set_group(self.features)
        groups = self._fix_groups(features=self.features,
                                  groups=groups,
                                  use_forgotten=use_forgotten)

        es = self._set_entity_set(data=self._x, groups=groups)

        old_n_features = self._x.shape[1]  # For comparing later.

        if not trans_primitives:
            trans_primitives = self._TRANS_PRIMITIVES

        index_name = self._index_name(self._x)

        # Define kwargs outside the function just to improve readability.
        dfs_kwargs = {
            "entityset": es,
            "ignore_variables": {group: [index_name]
                                 for group in groups},
            "trans_primitives": trans_primitives,
            "max_depth": max_depth,
            "n_jobs": n_jobs,
            "verbose": False,
        }

        # Create features for each group.
        dfs = [
            ft.dfs(target_entity=key, **dfs_kwargs) for key in groups.keys()
        ]
        # DFS returns a tuple (df and features). Split them.
        features = [features for _, features in dfs for features in features]
        dfs = [matrix for matrix, _ in dfs]

        # Concat all params from all groups to form the new dataset.
        self._x = pd.concat(dfs, axis=1)
        # Do a little cleaning just to remove useless features.
        self._x = selection.remove_low_information_features(self._x)

        # Keep only feature names that are still in the dataset.
        # noinspection PyProtectedMember
        features = [
            feature for feature in features if feature._name in self._x.columns
        ]
        # Update property.
        # noinspection PyProtectedMember
        self.features = [feature._name for feature in features]

        # Export params.
        if entity_set_folder_name:
            es.to_csv(entity_set_folder_name)
        if features_file_name:
            ft.save_features(features, features_file_name)

        # Compare number of features.
        n_new_features = self._x.shape[1] - old_n_features
        if verbose:
            print(f"{n_new_features} features created.")

        return self._x
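
A hypothetical usage sketch for the `transform` method above (the wrapper instance `fe` and the column names inside `groups` are assumptions):

groups = {
    "prices": ["open", "close", "high", "low"],   # assumed related columns
    "volumes": ["volume", "num_trades"],
}
new_x = fe.transform(groups=groups,
                     use_forgotten=True,          # also group the columns left out above
                     trans_primitives=["add_numeric", "subtract_numeric"],
                     max_depth=1)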
Example #19
    def pillar(name='busi', countries=['Chad']):
        url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'
        df = pd.read_csv(url + name + '_train.csv')
        df = df.drop(['Unnamed: 0'], axis=1)
        for i in df.columns:
            if i.find('year') > -1:
                df = df.drop([i], axis=1)
        y = df[name]

        df = df.drop(['rank_' + name, name], axis=1)

        df = remove_low_information_features(df)

        df = remove_highly_null_features(df)

        df = remove_single_value_features(df)

        df = remove_highly_correlated_features(df)

        X = df
        problem_type = 'regression'
        objective = 'auto'

        automl = evalml.automl.AutoMLSearch(problem_type=problem_type,
                                            objective=objective)

        best_pipeline = automl.load(name + '_best_pipeline')

        df = pd.read_csv(url + name + '_test.csv')
        df = df.drop(['Unnamed: 0'], axis=1)

        for i in df.columns:
            if i.find('year') > -1:
                df = df.drop([i], axis=1)

        df = remove_low_information_features(df)

        df = remove_highly_null_features(df)

        df = remove_single_value_features(df)

        df = remove_highly_correlated_features(df)

        predictions = best_pipeline.predict(df)

        result = pd.DataFrame()

        result[name] = predictions

        df = pd.read_csv(url + name + '_test.csv')
        temp = df[['country', 'year']]

        result = pd.merge(left=temp,
                          right=result,
                          how="left",
                          on=[temp.index, result.index])
        result = result.drop(['key_0', 'key_1'], axis=1)

        result['rank_' + name] = result.groupby("year")[name].rank(
            "dense", ascending=False)
        result['rank_' + name] = result['rank_' + name].astype('int')

        result = result[result['country'].isin(countries)]
        metric = pd.read_csv(
            'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Metrics/'
            + name + '_metrics.csv')

        return result, metric
Example #20
    def prosperity(
            countries=['Chad', 'Togo', 'Zimbabwe', 'Ivory Coast', 'Georgia']):

        url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'
        df = pd.read_csv(url + 'merged.csv')
        df = df.drop(['Unnamed: 0'], axis=1)

        metrics = [
            'educ', 'soci', 'heal', 'pers', 'busi', 'econ', 'safe', 'gove',
            'envi'
        ]
        ranks = ['rank_' + metric for metric in metrics]
        drop = metrics + ranks + ['year', 'prosperity_score']

        y = df['prosperity_score']

        df = df.drop(drop, axis=1)

        df = remove_low_information_features(df)

        df = remove_highly_null_features(df)

        df = remove_single_value_features(df)

        df = remove_highly_correlated_features(df)

        X = df

        problem_type = 'regression'
        objective = 'auto'

        automl = evalml.automl.AutoMLSearch(problem_type=problem_type,
                                            objective=objective)

        #automl.search(X,y)
        #best_pipeline = automl.best_pipeline
        #best_pipeline.fit(X,y)
        #best_pipeline.save('prosperity_best_pipeline')

        best_pipeline = automl.load('prosperity_best_pipeline')

        test = pd.read_csv(url + 'test.csv', index_col=0)

        drop = ['year']
        df = test.copy()
        df = df.drop(drop, axis=1)

        df = remove_low_information_features(df)

        df = remove_highly_null_features(df)

        df = remove_single_value_features(df)

        df = remove_highly_correlated_features(df)

        X = df

        predictions = best_pipeline.predict(X)

        result = pd.DataFrame()

        result['prosperity'] = predictions

        df = pd.read_csv(url + 'test.csv')
        temp = df[['country', 'year']]

        result = pd.merge(left=temp,
                          right=result,
                          how="left",
                          on=[temp.index, result.index])
        result = result.drop(['key_0', 'key_1'], axis=1)

        result['rank_prosperity'] = result.groupby("year")["prosperity"].rank(
            "dense", ascending=False)
        result['rank_prosperity'] = result['rank_prosperity'].astype('int')

        result = result[result['country'].isin(countries)]

        metric = pd.read_csv(
            'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Metrics/prosperity_metrics.csv'
        )

        return result, metric
# Feature importances can be used for dimensionality reduction. They can also help us better understand a problem. For example, we could use the most important features to concentrate on those aspects of a client when evaluating a potential loan. Let's look at the number of features with 0 importance, which can almost certainly be removed from the feature set.

# In[32]:

print('There are %d features with 0 importance' % sum(fi['importance'] == 0.0))

# ## Remove Low Importance Features
#
# Feature selection is an entire topic by itself, but one thing we can do is remove any features that have only a single unique value or are all null. Featuretools has a default method for doing this available in the `selection` module.

# In[33]:

from featuretools import selection

# Remove features with only one unique value
feature_matrix2 = selection.remove_low_information_features(feature_matrix)

print('Removed %d features' %
      (feature_matrix.shape[1] - feature_matrix2.shape[1]))

# ## Align Train and Test Sets
#
# We also want to make sure the train and test sets have the same exact features. We can first one-hot encode the data (we'll have to do this anyway for our model) and then align the dataframes on the columns.

# In[34]:

# Separate out the train and test sets
train = feature_matrix2[feature_matrix2['set'] == 'train']
test = feature_matrix2[feature_matrix2['set'] == 'test']

# One hot encoding
Example #22
    # Adjust Entity Set
    cutoff_times = es['transactions'].df[['Transaction Id', 'End Time', 'Outcome']]

    pd.options.display.max_columns = 500

    fm, features = ft.dfs(entityset=es,
                          target_entity='transactions',
                          agg_primitives=aggPrimitives,
                          trans_primitives=transPrimitives,
                          max_depth=maxDepth,
                          cutoff_time=cutoff_times[1000:],
                          verbose=True)

    if encodeOutput == "1":
        # Encode the feature matrix using One-Hot encoding
        fm_enc, f_enc = ft.encode_features(fm, features)
        fm_enc = fm_enc.fillna(0)
        fm_enc = remove_low_information_features(fm_enc)

        # Write Output to CSV
        fm_enc.to_csv("output.csv")
    else:
        # Write Output to CSV
        fm.to_csv("output.csv")

    # Remove Pickle Directory
    shutil.rmtree(dir_name)

    # Close the input file
    inFile.close()