def invalue_to_similarity(invalue_df, orientation_df):
    """
    invalue_df: converted DataFrame of user inputs
    orientation_df: DataFrame of all people of that orientation
    """

    # concat input values to orientation df to prep for cosine similarity
    df = pd.concat([orientation_df, invalue_df])

    # ohe
    df_encoded = OneHotEncoder(use_cat_names=True).fit_transform(df)

    # make cosine_similarity input (input X)
    cosine_input = pd.DataFrame(df_encoded.iloc[-1]).T

    # drop last encoded row (input Y -- data for input X to reference)
    df_encoded.drop(df_encoded.tail(1).index, inplace=True)

    # cosine_similarity(X, y)
    similarity = cosine_similarity(cosine_input, df_encoded)

    # return top 5 matches
    top5 = pd.DataFrame(similarity.tolist()[0],
                        columns=['similarity'],
                        index=df_encoded.index).sort_values(
                            by='similarity', ascending=False).iloc[:5]

    # return the top 5 matches as rows of the module-level `cupid` DataFrame
    # (DataFrame.append is deprecated/removed in recent pandas, so select the rows directly)
    results = cupid.loc[top5.index]

    return results
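A minimal usage sketch, assuming a hypothetical one-row user DataFrame and a module-level `cupid` DataFrame (which the function references directly) sharing the same categorical columns:

# Hypothetical inputs -- `cupid` and these column names are illustrative assumptions.
user_df = pd.DataFrame([{'diet': 'vegetarian', 'pets': 'likes dogs'}])
orientation_df = cupid[['diet', 'pets']]  # everyone of the chosen orientation
top_matches = invalue_to_similarity(user_df, orientation_df)
print(top_matches)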
Example #2
    def __init__(self,
                 sparksess=None,
                 logdir='/encoder',
                 handle_unknown='-99999',
                 save_encoder=False):
        self.spark = sparksess
        self.logdir = logdir
        self.save_encoder = save_encoder

        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []
        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
Example #3
def process(
    naive_file,
    treated_file,
    metadata_file,
    resistance_files,
    outfile,
    subtype="All",
    truncate=[41, 235],
):
    print("reading sequences and metadata")
    raw_sequences, consensus = reader(naive_file, treated_file, truncate)

    metadata = read_metadata(metadata_file)

    print(f"choosing {subtype} subtype(s)")
    chosen_sequences, dataset_subtypes = choose_subtype(
        raw_sequences, metadata, subtype)

    print("Filling with consensus AAs")
    AA_sequences = fill_consensus_AAs(chosen_sequences, consensus)
    freqs = get_single_AA_freqs(AA_sequences.drop("label", axis=1))
    single_AA_sequences = get_single_AAs(AA_sequences, freqs)

    print("OneHot encoding")
    columns_to_encode = single_AA_sequences.columns.drop("label")
    encoder = OneHotEncoder(use_cat_names=True,
                            handle_unknown="ignore",
                            cols=columns_to_encode.tolist())
    encoded_sequences = encoder.fit_transform(single_AA_sequences)

    print("removing consensus features")
    features_to_remove = get_features_to_remove(dataset_subtypes)
    total_sequences = encoded_sequences.drop(columns=features_to_remove,
                                             errors="ignore")

    total_sequences["encoded_label"] = total_sequences["label"].apply({
        "treated":
        1,
        "naive":
        0
    }.get)

    drms = get_all_DRMs()
    total_sequences["hasDRM"] = (total_sequences.filter(
        drms, axis=1).any(axis=1).astype(int))

    total_sequences["is_resistant"] = (total_sequences[[
        "encoded_label", "hasDRM"
    ]].any(axis=1).astype(int))

    print("getting resistance scores")
    resistance_scores = get_resistance_scores(resistance_files)

    print("saving dataset to disk")
    joined = total_sequences.join(resistance_scores)
    joined.to_csv(outfile, sep="\t", index=True, header=True)
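A hedged invocation sketch for the pipeline above; the file paths and subtype are placeholders, not values from the original project:

# Placeholder paths -- substitute the real sequence, metadata and resistance files.
process(
    naive_file="data/naive.fasta",
    treated_file="data/treated.fasta",
    metadata_file="data/metadata.csv",
    resistance_files=["data/resistance_scores.tsv"],
    outfile="data/encoded_dataset.tsv",
    subtype="B",
)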
Example #4
 def assign_cat_scaler(self):
     self.cat_method = self.cat_info.get("method", None)
     self.cat_cols = self.cat_info.get("cols", [])
     if self.cat_method is None:
         self.cat_encoder = Empty()
     elif self.cat_method == "OrdinalEncoder":
         self.cat_encoder = OrdinalEncoder(cols=self.cat_cols)
     elif self.cat_method == "OneHotEncoder":
         self.cat_encoder = OneHotEncoder(cols=self.cat_cols)
     else:
         raise NotImplementedError("Other encoding methods are not implemented yet")
 def fit(self, data):
     '''
     Fits the categorical encoder, coerces the input to a pandas DataFrame, and saves the input and feature names.
     :param data: a pandas data frame, or list
     :return: nothing, the fitted encoder is saved as self.encoder
     '''
     from category_encoders import OneHotEncoder
     ohe = OneHotEncoder(return_df=self.return_df, handle_unknown=self.handle_unknown)
     x = self.replace_infrequent_df(data)
     self.input_names = x.columns
     ohe.fit(x)
     self.encoder = ohe
     self.feature_names_from_cat_encoder()
Example #6
def one_hot_encoded_result(df_orig):
    df = df_orig.copy(deep=True)
    one_hot_enc = OneHotEncoder(cols=['ordinal_result'], use_cat_names=True)
    one_hot_cols = one_hot_enc.fit_transform(df[['ordinal_result']])
    new_one_hot_col_names = [col[:-2] for col in one_hot_cols.columns]
    mapping_dict = {
        old: new
        for old, new in zip(one_hot_cols.columns, new_one_hot_col_names)
    }
    one_hot_cols.rename(columns=mapping_dict, inplace=True)
    one_hot_cols = one_hot_cols[sorted(one_hot_cols.columns)]
    df_with_new_cols = pd.concat([df, one_hot_cols], axis=1)
    return df_with_new_cols
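For illustration, a small self-contained call; the numeric categories are made up so that the `[:-2]` rename above strips a trailing ".0" suffix:

import pandas as pd

toy = pd.DataFrame({'ordinal_result': [1.0, 2.0, 1.0, 3.0]})
encoded = one_hot_encoded_result(toy)
# The result keeps the original column and appends the renamed one-hot columns.
print(encoded.columns.tolist())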
Example #7
    def one_hot_encode(self, data):
        """
        Vectorizes multiple categorical variables and stores each conversion rule.
        Returns the vectorized dataset.
        The conversion rules are stored in enc_dict.

        :param data: dataset used for training (the `data` attribute of a Dataset object)
        """
        #self.enc_dict={}
        oe = OneHotEncoder(cols=self.columns, handle_unknown="impute")  # original had the typo "inpute"
        oe_data = oe.fit_transform(data)
        self.model = oe
        #oe_data=oe_data.ix[:,org_order]
        return oe_data
    def _encode_categories(self):
        """
        This private method encodes the categorical variables. Label encoding is used for ordinal categories and
        one-hot encoding for nominal categories.
        """

        logging.info(f'#{self._index()} - Encoding categorical columns...')
        # get column names for categorical and numerical features
        categorical_vars = self.X.select_dtypes(include='object').columns
        numerical_vars = self.X.columns.difference(categorical_vars)

        ordinal = pd.Index([
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
        ])
        nominal = categorical_vars.difference(ordinal)

        standard_mapping = {
            'NA': 0,
            'Po': 1,
            'Fa': 2,
            'TA': 3,
            'Gd': 4,
            'Ex': 5
        }
        mapping_for_ordinals = [{
            'col': column,
            'mapping': standard_mapping
        } for column in ordinal]

        x_num = self.X[numerical_vars]
        x_test_num = self.X_test[numerical_vars]

        # one hot encode categorical columns
        one_hot_encoder = OneHotEncoder(use_cat_names=True)
        label_encoder = OrdinalEncoder(drop_invariant=True,
                                       mapping=mapping_for_ordinals,
                                       handle_unknown='error')

        x_cat_nom = one_hot_encoder.fit_transform(self.X[nominal])
        x_cat_ord = label_encoder.fit_transform(self.X[ordinal])
        x_test_cat_nom = one_hot_encoder.transform(self.X_test[nominal])
        x_test_cat_ord = label_encoder.transform(self.X_test[ordinal])

        self.X = x_num.join(x_cat_ord).join(x_cat_nom)
        self.X_test = x_test_num.join(x_test_cat_ord).join(x_test_cat_nom)
        logging.info(f'#{self._step_index} - DONE!')
Example #9
def predict(user_input: Dict):
    """Trains a logistic-regression pipeline on the Kickstarter data and predicts whether the user's project will succeed."""
    user_input1 = create_df(user_input)
    train, test = train_test_split(df, train_size=0.80, test_size=0.20,
                                   stratify=df['project_success'], random_state=42)
    # select our target 
    target = 'project_success'

    # make train without our target or id
    train_features = train.drop(columns=[target])

    # make numeric features
    numeric_features = train_features.select_dtypes(include='number').columns.tolist()

    # make a cardinality feature to help filter
    cardinality = train_features.select_dtypes(exclude='number').nunique()

    # get a list of relevant categorical data
    categorical_features = cardinality[cardinality <=50].index.tolist()

    # Combine the lists 
    features = numeric_features + categorical_features

    X_train = train[features]
    y_train = train[target]
    X_test = test[features]
    y_test = test[target]
    # print(features)
    # print(X_train.shape, X_test.shape)

    lrmodel = Pipeline([
                    ('ohe', OneHotEncoder(use_cat_names=True)),
                    ('scaler', StandardScaler()),  
                    ('impute', SimpleImputer()),
                    ('classifier', LogisticRegressionCV())
                    ])
    lrmodel.fit(X_train, y_train)

    row = X_test.iloc[[4]]
    # print(X_train)
    # print('training accuracy:', lrmodel.score(X_train, y_train))
    # print('test accuracy:', lrmodel.score(X_test, y_test))
    # if lrmodel.predict(row) == 1:
    #   return 'Your Kickstarter project is likely to succeed!'
    # else:
    #   return 'Your Kickstarter project is likely to fail.'
    # print(X_test.head())
    # print(user_input)
    # print(y_test.head())
    # print(y_test.iloc[[0]])

    if lrmodel.predict(user_input1) == 1:
        predict = {'predict': 'Your Kickstarter project is likely to succeed!'}
        user_input.update(predict)
        return user_input
    else:
        predict = {'predict':'Your Kickstarter project is likely to fail.'}
        user_input.update(predict)
        return user_input
    def fit(self, X, y=None):
        self._dim = X.shape[1]

        if self.cols is None:
            self.cols = get_obj_cols(X)

        self.dummy_encoder = OneHotEncoder(cols=self.cols,
                                           handle_unknown='value',
                                           handle_missing='value')

        self.dummy_encoder = self.dummy_encoder.fit(X)
        self.mapping = self.generate_mapping(X, y)

        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = list(X_temp.columns)

        return self
Example #11
def preproc_data(data, features):
    '''
    Simple data preprocessing: * label-encode the target
                               * one-hot encode categorical features
                               * fill NaNs with the column median (plus *isNAN indicator columns)
                               * split data into X, y

    data: pd.DataFrame()
    features: dict # maps column names to their role, e.g. 'target' or 'categorical'
    '''
    # LabelEncoded Target
    for i in features.items():
        if 'target' in i:
            target_col = i[0]
            data[target_col] = data[target_col].astype('category').cat.codes

    y = data[target_col]
    X = data.drop([target_col], axis=1)

    cat_features = []
    for feature in features.items():
        if ('categorical'
                in feature) and (X[feature[0]].nunique(dropna=False) > 2):
            cat_features.append(feature[0])

    # LabelEncoded Binary Features
    for feature in X.columns:
        if (X[feature].nunique(dropna=False) < 3):
            X[feature] = X[feature].astype('category').cat.codes
            if len(cat_features) > 0:
                if feature in cat_features:
                    cat_features.remove(feature)

    # One Hot Encoding
    if len(cat_features) > 0:
        encoder = OneHotEncoder(cols=cat_features, drop_invariant=True)
        X = encoder.fit_transform(X)

    # Nans
    nan_columns = list(X.columns[X.isnull().sum() > 0])
    if nan_columns:
        for nan_column in nan_columns:
            X[nan_column + 'isNAN'] = pd.isna(X[nan_column]).astype('uint8')
        # .fillna(inplace=True) on a column slice operates on a copy,
        # so assign the filled values back instead
        X[nan_columns] = X[nan_columns].fillna(X[nan_columns].median())

    return (X, y)
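A hedged illustration of the expected `features` mapping (column name to role string), using made-up data; only the 'target' and 'categorical' roles matter to the function:

import pandas as pd

toy = pd.DataFrame({
    'city': ['NY', 'LA', 'SF', 'NY', 'LA'],
    'clicks': [3.0, 5.0, None, 2.0, 7.0],
    'converted': ['yes', 'no', 'no', 'yes', 'no'],
})
roles = {'city': 'categorical', 'clicks': 'numeric', 'converted': 'target'}
X, y = preproc_data(toy, roles)
# X now holds one-hot 'city' columns plus a 'clicksisNAN' indicator; y is the encoded target.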
Example #12
 def vartypes(self, df, cat_cols):
     """ Checks which variables in df are categorical and 
         fits One Hot Encoders for each
     """
     for x in df.columns:
         if is_categorical(df[x]) or x in cat_cols:
             self.categorical_var_list.append(x)
             self.ohencoders_dict[x] = OneHotEncoder().fit(df[x])
def train_test_split(dataset, categorical_cols, train_fraction):
    """
    Splits the dataset into a train and a test set
    :param dataset: data to be split
    :param categorical_cols: list of the column names of the categorical columns (previously identified automatically)
    :param train_fraction: portion of dataset to be used as train set
    :return: a list [train set, one-hot-encoded train set, test set, one-hot-encoded test set]
    """
    dataset_encoded = OneHotEncoder(cols=categorical_cols,
                                    use_cat_names=True).fit_transform(dataset)
    train_len = int(len(dataset.index) * train_fraction)
    train_set = dataset.sample(n=train_len, random_state=1)
    train_set_encoded = dataset_encoded.loc[train_set.index].reset_index(
        drop=True)
    test_set = dataset.drop(train_set.index).reset_index(drop=True)
    test_set_encoded = dataset_encoded.drop(
        train_set.index).reset_index(drop=True)
    return train_set.reset_index(
        drop=True), train_set_encoded, test_set, test_set_encoded
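A quick hedged example on a made-up frame, passing 'color' as the categorical column:

import pandas as pd

df_toy = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'],
                       'value': [1.0, 2.0, 3.0, 4.0]})
train_set, train_enc, test_set, test_enc = train_test_split(df_toy, ['color'], 0.75)
# train_set/test_set keep the raw 'color' column; the *_enc frames hold its one-hot columns.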
Example #14
 def OneHot_Encoding(self,
                     handle_missing='indicator',
                     handle_unknown='indicator'):
     """
     one-hot编码,其可以将具有n_categories个可能值的一个分类特征转换为n_categories个二进制特征,其中一个为1,所有其他为0
     :param handle_missing: 默认value,缺失值用全0替代;indicator,增加缺失值一列
     :param handle_unknown: 默认value,未知值用全0替代;indicator,增加未知值一列
     :return:
     """
     self.encoder = OneHotEncoder(cols=self.cols,
                                  handle_missing=handle_missing,
                                  handle_unknown=handle_unknown)
Example #15
def train_test_split(dataset, train_fraction):
    """
    Splits the dataset into a train and a test set
    :param dataset: data to be split
    :param train_fraction: portion of dataset to be used as train set
    :return: a list [train set, one-hot-encoded train set, test set, one-hot-encoded test set]
    """

    #### Default - treat string-valued columns as categorical, except the target column ####
    ##########################################################################
    categorical_cols = []
    for column_name, column_data in dataset.items():  # DataFrame.iteritems() was removed in pandas 2.0
        if any(isinstance(value, str) for value in column_data.values):
            categorical_cols.append(column_name)

    if categorical_cols and categorical_cols[-1] == "CLASS":
        categorical_cols = categorical_cols[:-1]

    ## AR: improve categorical selection

    dataset_encoded = OneHotEncoder(cols=categorical_cols,
                                    use_cat_names=True).fit_transform(dataset)
    if (train_fraction == 1):
        return dataset_encoded, dataset_encoded, dataset_encoded, dataset_encoded

    train_len = int(len(dataset.index) * train_fraction)
    train_set = dataset.sample(n=train_len, random_state=1)
    train_set_encoded = dataset_encoded.loc[train_set.index].reset_index(
        drop=True)
    test_set = dataset.drop(train_set.index).reset_index(drop=True)
    test_set_encoded = dataset_encoded.drop(
        train_set.index).reset_index(drop=True)

    return train_set.reset_index(
        drop=True), train_set_encoded, test_set, test_set_encoded
Example #16
def fit_onehot(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Creates the one-hot encoder by fitting it on the given DataFrame.
    NaN values and the special value specified via `na_value` are encoded as an unseen value.
    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Default null value for DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model : encoder model to be passed to `transform_onehot` method
    """
    df = input_df.copy()

    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})

    drop_cols = ["{}_nan".format(col) for col in cols]

    encoder = OneHotEncoder(cols=cols, use_cat_names=True)
    encoder = encoder.fit(df)

    result_df = encoder.transform(df)

    for drop_col in drop_cols:
        if drop_col in result_df.columns:
            result_df = result_df.drop(columns=[drop_col])

    model = {
        "encoder": encoder,
        "cols": cols,
        "na_value": na_value,
        "drop_cols": drop_cols,
    }
    return result_df, model
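A hedged toy example; the -1 sentinel and the column names are assumptions made for illustration:

import pandas as pd

df_toy = pd.DataFrame({'channel': ['web', 'app', -1, 'web'],
                       'amount': [10, 20, 30, 40]})
encoded_df, model = fit_onehot(df_toy, cols=['channel'], na_value=-1)
# New data can later be encoded consistently via model['encoder'].transform(...).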
def encode_low_cardinality_categorical_df(dataframe, fit=False):
   """
    Encode low cardinality categorical features using OneHot Encoding and dropping invariant features
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), low card. categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder    
    if fit:
        encoder = OneHotEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        
        pickle_obj(encoder, 'low_card_categorical_encoder')
    else:
        encoder = unpickle_obj('low_card_categorical_encoder')

    # transform data
    return encoder.transform(dataframe)
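A hypothetical usage sketch, assuming `pickle_obj`/`unpickle_obj` are the project's persistence helpers and that the frames contain only low-cardinality categorical columns:

# Fit and persist the encoder on training data, then reuse it at inference time.
train_encoded = encode_low_cardinality_categorical_df(train_cat_df, fit=True)
test_encoded = encode_low_cardinality_categorical_df(test_cat_df, fit=False)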
Example #18
def CatEncoder(X, cat_cols, tags, estimator_name, objective_type, trial, n_classes, random_state):
    if tags["handles categorical"] == False:
        large_threshold = 6
        #TODO: handle numpy arrays with categorical?
        #TODO: handle multiclass / Regression
        if isinstance(X, pd.DataFrame) and isinstance(cat_cols[0], str):
            large_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() > large_threshold]
            small_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() <= large_threshold]
        elif isinstance(X, pd.DataFrame):
            large_cardinal_cats = [col for col in cat_cols if len(np.unique(X.iloc[:,col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols if len(np.unique(X.iloc[:,col])) <= large_threshold]
        else:
            large_cardinal_cats = [col for col in cat_cols if len(np.unique(X[:,col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols if len(np.unique(X[:,col])) <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["target", "binary", "catboost"]

        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe", OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

        if len(large_cardinal_cats) > 0:
            if (objective_type == "classification" and n_classes == 1):
                cat_enc_types.append("woe")

            cat_enc_type = trial.suggest_categorical(estimator_name + " cat_enc_type", cat_enc_types)

            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    # mapping=mapping
                                    )

            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)

            elif cat_enc_type == "target":
                min_samples_leaf = 6  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats)

            else: # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute to the dataset beforehand

            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
Example #19
def one_hot_encode(columns: Union[List[str], str]) -> CategoryEncoder:
    """Performs simple one-hot encoding

    An alias to stl.category_encode(OneHotEncoder(), columns).

    Args:
        columns: list of columns to be encoded. Treats string as a list of length 1

    Returns:
        A feature constructor returning a concatenation of one-hot encoding of each column.

    Examples:
        >>> stl.one_hot_encode(['Sex', 'Embarked'])
        >>> stl.one_hot_encode('Embarked')
    """
    enc = OneHotEncoder()
    return category_encode(enc, columns=columns, targets=None)
Example #20
def CatEncoder(X, cat_cols, tags, objective_type, trial, n_classes, random_state):
    if tags["handles categorical"] == False:
        large_threshold = 6
        #TODO: handle numpy arrays with categorical?
        large_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() > large_threshold]
        small_cardinal_cats = [col for col in X[cat_cols].columns if X[col].nunique() <= large_threshold]

        enc_pipe = None
        cat_enc_types = ["binary", "catboost", "woe", "target"]

        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(enc_pipe, "ohe", OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))

        if len(large_cardinal_cats) > 0:
            if (objective_type == "classification" and n_classes > 2): #multiclass
                cat_enc_types = ["binary"]

            cat_enc_type = trial.suggest_categorical("cat_enc_type", cat_enc_types)

            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    drop_invariant=True,
                                    # mapping=mapping
                                    )

            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)

            elif cat_enc_type == "target":
                min_samples_leaf = 10  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats,
                                    drop_invariant=True)

            else: # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      drop_invariant=True,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute to the dataset beforehand

            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
def fit_model(model, df, target, numeric_columns, categorical_columns, param_grid):
    x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.10, random_state=42)

    imputer = IterativeImputer(max_iter=30, random_state=42)
    scaler = MinMaxScaler()

    frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    onehot = OneHotEncoder()
    
    pca = PCA(n_components=round(x_train.shape[1]*0.8))

    preprocess = make_column_transformer(
        (make_pipeline(imputer, scaler), numeric_columns),
        (make_pipeline(frequent, onehot), categorical_columns)
    )

    pipe = make_pipeline(preprocess,
                         pca,
                       GridSearchCV(model, param_grid=param_grid, verbose=10))
    
    return pipe.fit(x_train, y_train)
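A hypothetical call sketch; `df`, `target`, and the column lists are assumed to exist elsewhere, and the grid keys are parameters of the chosen estimator:

from sklearn.ensemble import RandomForestClassifier

param_grid = {'n_estimators': [100, 300], 'max_depth': [None, 10]}
fitted_pipe = fit_model(RandomForestClassifier(random_state=42), df, target,
                        numeric_columns, categorical_columns, param_grid)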
Example #22
def generate_model(X, y, prefix, param):
    ''' Runs MLP with softmax output activation method, which allows multiclass
    classification. We will need to test All VS One to check performance.'''

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)
    print(X_train.shape); print(X_test.shape)
    
    ppl_model1 = Pipeline(steps=[
            ('one-hot encoder', OneHotEncoder()),
            ('mlp', MLPClassifier(hidden_layer_sizes=(5,), activation='relu',
                                  solver='lbfgs', alpha=1e-5, max_iter=500,
                                  random_state=1, verbose=True)) ])
    # Note: this sets out_activation_ on the Pipeline object, not on the MLP step;
    # MLPClassifier already uses a softmax output for multiclass targets when fit() is called.
    ppl_model1.out_activation_ = 'softmax'
    
    ppl_model1.fit(X_train, y_train)
    
    predict_train = ppl_model1.predict(X_train)
    predict_test = ppl_model1.predict(X_test)
    
    print(confusion_matrix(y_train, predict_train))
    print(classification_report(y_train, predict_train))
    
    print(confusion_matrix(y_test, predict_test))
    print(classification_report(y_test, predict_test))
    
#    print("weights between input and first hidden layer:")
#    print(ppl_model1[1].coefs_[0])
#    print("\nweights between first hidden and second hidden layer:")
#    print(ppl_model1[1].coefs_[1])
#    
#    print("Bias values for first hidden layer:")
#    print(ppl_model1[1].intercepts_[0])
#    print("\nBias values for second hidden layer:")
#    print(ppl_model1[1].intercepts_[1])
    
    #sklite_file = "draft/mlp_sweep_model.json"
    #lazy = LazyExport(ppl_model1)
    nameout = 'output/'+prefix+'_model'+param.curr_datetime+'.joblib'
    dump(ppl_model1[1], nameout)
Example #23
    def categoricals(self,
                     model_name='onehot_model.pkl',
                     cols=None,
                     owr=False,
                     model_bin=None):
        """Onehot encoder on categoricals."""

        self.log('Apply onehot encoder on categorical')
        model_path = os.path.join(self.model_path, model_name)
        if cols is None:
            cols = self.data.cat_cols

        if ((not os.path.isfile(model_path)) or owr) and (model_bin is None):
            self.log('\nTrain model\n')
            model_bin = OneHotEncoder(
                cols=cols,
                use_cat_names=True,
                handle_unknown='error',
                drop_invariant=False,
                impute_missing=False)
            model_bin.fit(self.data._X)
            self.data._X = model_bin.transform(self.data._X)
            setattr(model_bin, 'data_schema', self.data._X.columns.values)

            # Save model
            if self.auto_save:
                joblib.dump(model_bin, model_path)

        elif os.path.isfile(model_path):
            # File exists/prediction:
            model_bin = joblib.load(model_path)
            self.data._X = model_bin.transform(self.data._X)
            self.data.check_schema(model_bin, '_X')

        else:
            # Prediction in pipeline
            self.data._X = model_bin.transform(self.data._X)
            self.data.check_schema(model_bin, '_X')

        return model_bin
Example #24
def fit_model(model, df, target, numeric_columns, categorical_columns, param_grid):
    x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.10, random_state=42)

    imputer = IterativeImputer(max_iter=30, random_state=42)
    scaler = MinMaxScaler()

    frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    onehot = OneHotEncoder()
    
    #smt = SMOTETomek('auto')
    
#     over_samp = SMOTE(sampling_strategy={0: count_class_0})
#     under_samp = NearMiss(sampling_strategy={1: count_class_1})

    preprocess = make_column_transformer(
        (make_pipeline(imputer, scaler), numeric_columns),
        (make_pipeline(frequent, onehot), categorical_columns)
    )

    pipe = make_pipeline(preprocess,
                       GridSearchCV(model, param_grid=param_grid, verbose=10))
    
    return pipe.fit(x_train, y_train)
Example #25
def defineBestModelPipeline(df, target, categorical_columns, numeric_columns):

    # Splitting original data into Train and Test
    x_train, x_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.1,
                                                        random_state=42)
    y_train = y_train.to_numpy()  # Transforming training targets into numpy arrays
    y_test = y_test.to_numpy()  # Transforming test targets into numpy arrays

    # # If desired, we can balance training classes using one of the functions below
    # # Obtaining balanced data for modeling using Random Under Sampling
    x_train, y_train = balancingClassesRus(x_train, y_train)

    # # Obtaining balanced data for modeling using SMOTEENN
    #x_train, y_train = balancingClassesSmoteenn(x_train, y_train)

    # # Obtaining balanced data for modeling using SMOTE
    #x_train, y_train = balancingClassesSmote(x_train, y_train)

    # 1st -> Numeric Transformers
    # Here, we are creating several different data transformation pipelines
    # to be applied in our numeric features
    numeric_transformer_1 = Pipeline(
        steps=[('imp', IterativeImputer(max_iter=30, random_state=42)
                ), ('scaler', MinMaxScaler())])

    numeric_transformer_2 = Pipeline(
        steps=[('imp', IterativeImputer(max_iter=20, random_state=42)
                ), ('scaler', StandardScaler())])

    numeric_transformer_3 = Pipeline(
        steps=[('imp',
                SimpleImputer(strategy='mean')), ('scaler', MinMaxScaler())])

    numeric_transformer_4 = Pipeline(
        steps=[('imp',
                SimpleImputer(strategy='median')), ('scaler',
                                                    StandardScaler())])

    # 2nd -> Categorical Transformer
    # Despite my option of not doing it, you can also choose to create different
    # data transformation pipelines for your categorical features.
    categorical_transformer = Pipeline(
        steps=[('frequent', SimpleImputer(strategy='most_frequent')
                ), ('onehot', OneHotEncoder(use_cat_names=True))])
    # 3rd -> Combining both numerical and categorical pipelines
    # Here, we are creating different ColumnTransformers, each one with a different numerical transformation
    data_transformations_1 = ColumnTransformer(transformers=[(
        'num', numeric_transformer_1,
        numeric_columns), ('cat', categorical_transformer,
                           categorical_columns)])

    data_transformations_2 = ColumnTransformer(transformers=[(
        'num', numeric_transformer_2,
        numeric_columns), ('cat', categorical_transformer,
                           categorical_columns)])

    data_transformations_3 = ColumnTransformer(transformers=[(
        'num', numeric_transformer_3,
        numeric_columns), ('cat', categorical_transformer,
                           categorical_columns)])

    data_transformations_4 = ColumnTransformer(transformers=[(
        'num', numeric_transformer_4,
        numeric_columns), ('cat', categorical_transformer,
                           categorical_columns)])

    # And finally, we are going to apply these different data transformations to RandomSearchCV,
    # trying to find the best imputing strategy, the best feature engineering strategy
    # and the best model with its respective parameters.
    # Below, we just need to initialize a Pipeline object with any transformations we want, on each of the steps.
    pipe = Pipeline(steps=[
        (
            'data_transformations', data_transformations_1
        ),  # Initializing data transformation step by choosing any of the above
        (
            'feature_eng', PCA()
        ),  # Initializing feature engineering step by choosing any desired method
        ('clf', SVC())
    ])  # Initializing modeling step of the pipeline with any model object
    #memory='cache_folder') -> Used to optimize memory when needed

    # Now, we define the grid of parameters that RandomizedSearchCV will use. It will randomly choose
    # options for each step inside the dictionaries ('data transformations', 'feature_eng', 'clf'
    # and 'clf parameters'). At the end of its iterations, RandomizedSearchCV will return the best options.
    params_grid = [{
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [KNeighborsClassifier()],
        'clf__n_neighbors':
        stats.randint(1, 30),
        'clf__metric': ['minkowski', 'euclidean']
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [LogisticRegression()],
        'clf__penalty': ['l1', 'l2'],
        'clf__C':
        stats.uniform(0.01, 10)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [SVC()],
        'clf__C':
        stats.uniform(0.01, 1),
        'clf__gamma':
        stats.uniform(0.01, 1)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [DecisionTreeClassifier()],
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, stats.randint(1, 5)]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [RandomForestClassifier()],
        'clf__n_estimators':
        stats.randint(10, 175),
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, stats.randint(1, 5)],
        'clf__random_state':
        stats.randint(1, 49)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [ExtraTreesClassifier()],
        'clf__n_estimators':
        stats.randint(10, 150),
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, stats.randint(1, 6)]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [GradientBoostingClassifier()],
        'clf__n_estimators':
        stats.randint(10, 100),
        'clf__learning_rate':
        stats.uniform(0.01, 0.7),
        'clf__max_depth': [None, stats.randint(1, 6)]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [LGBMClassifier()],
        'clf__n_estimators':
        stats.randint(1, 100),
        'clf__learning_rate':
        stats.uniform(0.01, 0.7),
        'clf__max_depth': [None, stats.randint(1, 6)]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [XGBClassifier()],
        'clf__n_estimators':
        stats.randint(5, 125),
        'clf__eta':
        stats.uniform(0.01, 1),
        'clf__max_depth': [None, stats.randint(1, 6)],
        'clf__gamma':
        stats.uniform(0.01, 1)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [
            StackingClassifier(estimators=[
                ('svc', SVC(C=1, gamma=1)),
                ('rf',
                 RandomForestClassifier(max_depth=7,
                                        max_features=None,
                                        n_estimators=60,
                                        n_jobs=-1,
                                        random_state=42)),
                ('xgb',
                 XGBClassifier(eta=0.6,
                               gamma=0.7,
                               max_depth=None,
                               n_estimators=30))
            ],
                               final_estimator=LogisticRegression(C=1))
        ]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [
            VotingClassifier(estimators=[
                ('gbt',
                 GradientBoostingClassifier(learning_rate=0.8,
                                            max_depth=None,
                                            n_estimators=30)),
                ('lgbm',
                 LGBMClassifier(n_estimators=30,
                                learning_rate=0.6,
                                max_depth=None)),
                ('xgb',
                 XGBClassifier(eta=0.8,
                               gamma=0.8,
                               max_depth=None,
                               n_estimators=40))
            ],
                             voting='soft')
        ]
    }]
    # Now, we fit a RandomizedSearchCV to search over the grid of parameters defined above
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    best_model_pipeline = RandomizedSearchCV(pipe,
                                             params_grid,
                                             n_iter=500,
                                             scoring=metrics,
                                             refit='accuracy',
                                             n_jobs=-1,
                                             cv=5,
                                             random_state=42)

    best_model_pipeline.fit(x_train, y_train)

    # At last, we check the final results
    print(
        "\n\n#---------------- Best Data Pipeline found in RandomSearchCV  ----------------#\n\n",
        best_model_pipeline.best_estimator_[0])
    print(
        "\n\n#---------------- Best Feature Engineering technique found in RandomSearchCV  ----------------#\n\n",
        best_model_pipeline.best_estimator_[1])
    print(
        "\n\n#---------------- Best Classifier found in RandomSearchCV  ----------------#\n\n",
        best_model_pipeline.best_estimator_[2])
    print(
        "\n\n#---------------- Best Estimator's average Accuracy Score on CV (validation set) ----------------#\n\n",
        best_model_pipeline.best_score_)

    return x_train, x_test, y_train, y_test, best_model_pipeline
Example #26
                     'GuaranteeGroup', 'FamilySizeBin','IsBoy', 'IsFemale', 'FareLow', 'DataPartition',
                     'PassengerId', 'Survived']]
###########################################################################
# Split data into train and test                                          #
###########################################################################
trainData = fullData.loc[fullData.DataPartition == 'train']
testData = fullData.loc[fullData.DataPartition == 'test']
###########################################################################
# One hot encode                                                          #
###########################################################################
# https://github.com/scikit-learn-contrib/categorical-encoding
# http://contrib.scikit-learn.org/categorical-encoding/onehot.html
from category_encoders import OneHotEncoder
categories = list(set(trainData.select_dtypes(['category']).columns))
target = trainData.Survived
enc = OneHotEncoder(cols=categories,return_df = 1, handle_unknown = 'ignore').fit(trainData, target)
trainData = enc.transform(trainData)
testData = enc.transform(testData)
###########################################################################
# Drop multi collinear levels and no longer required                      #
###########################################################################
dropColumns = ['DataPartition']
trainData = trainData.drop(columns=dropColumns)
testData = testData.drop(columns=dropColumns)
testData = testData.drop(columns='Survived')
###########################################################################
# Start h2o cloud                                                         #
###########################################################################
import h2o
h2o.init()
h2o.remove_all()  # clean slate, in case the cluster was already running
def train_pipeline(X, y):
    """
    Builds and trains a machine learning pipeline
    """

    numerical_col = [
        'Num nights', 'Adults', 'Children', 'Session duration', 'Sessions',
        'Avg. session length (sec)', 'Avg. pageviews per session', 'Pageviews',
        'Hits', 'Created to arrival'
    ]
    categorical_col = [
        'Language', 'Website', 'Enquiry type', 'Enquiry status',
        'Client budget', 'Country code', 'GA source', 'GA medium', 'Device',
        'Created month'
    ]

    binary_col = [
        'Flights booked', 'User agent', 'User repeat', 'User referral'
    ]
    text_col = ['Click path', 'GA keyword']
    target = ['is booking']

    # Numerical pipeline

    numerical_pipeline = make_pipeline(ColumnSelector(cols=numerical_col),
                                       SimpleImputer(strategy="median"),
                                       StandardScaler())

    # Categorical pipeline

    categorical_pipeline = make_pipeline(
        ColumnSelector(cols=categorical_col),
        SimpleImputer(strategy="constant", fill_value='None'), OneHotEncoder())

    # Binary pipeline

    binary_pipeline = make_pipeline(ColumnSelector(cols=binary_col),
                                    SimpleImputer(strategy="most_frequent"),
                                    BinaryEncoder())

    # Text pipelines

    text_pipeline_1 = make_pipeline(
        ColumnSelector(cols=['Click path']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), HashingVectorizer(n_features=2**11),
        DenseTransformer())

    text_pipeline_2 = make_pipeline(
        ColumnSelector(cols=['GA keyword']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), TfidfVectorizer(), DenseTransformer())

    # Pipeline union

    processing_pipeline = make_union(numerical_pipeline, categorical_pipeline,
                                     binary_pipeline, text_pipeline_1,
                                     text_pipeline_2)

    estimator = BalancedRandomForestClassifier(bootstrap=False,
                                               class_weight=None,
                                               criterion='gini',
                                               max_depth=60,
                                               max_features='sqrt',
                                               max_leaf_nodes=None,
                                               min_impurity_decrease=0.0,
                                               min_samples_leaf=1,
                                               min_samples_split=5,
                                               min_weight_fraction_leaf=0.0,
                                               n_estimators=472,
                                               n_jobs=1,
                                               oob_score=False,
                                               random_state=None,
                                               replacement=False,
                                               sampling_strategy='auto',
                                               verbose=0,
                                               warm_start=False)

    predictive_pipeline = make_pipeline(processing_pipeline, estimator)

    predictive_pipeline.fit(X, y)

    return predictive_pipeline
    st.subheader(f"Breaking Down {y_axis} by: {x_axis}")
    
    if chart_type == 'line':
        st.line_chart(df.groupby(x_axis)[y_axis].mean())
    elif chart_type == 'bar':
        st.bar_chart(df.groupby(x_axis)[y_axis].mean())
    elif chart_type == 'box':
        chart = sns.catplot(x=x_axis, y=y_axis, kind='box', aspect=2, data=df)
        if df[x_axis].nunique() > 8:
            chart.set_xticklabels(rotation=90)
        st.pyplot(chart)
   
if page in ['Model Explorer', 'Causal Impact']:

    st.cache()
    pipe = make_pipeline(OneHotEncoder(use_cat_names=True), xgb.XGBRegressor())
    
    st.cache()
    X_train, X_val, y_train, y_val = train_test_split(df.drop('SalePrice', axis=1), df['SalePrice'], test_size=0.2, random_state=1985)    
    
if page == 'Model Explorer':
    num_rounds      = st.sidebar.number_input('Number of Boosting Rounds',
                                 min_value=100, max_value=5000, step=100)
    
    tree_depth      = st.sidebar.number_input('Tree Depth',
                                 min_value=2, max_value=8, step=1, value=3)
    
    learning_rate   = st.sidebar.number_input('Learning Rate',
                                    min_value=.001, max_value=1.0, step=.05, value=0.1)
    
    validation_size = st.sidebar.number_input('Validation Size',
def aggregate_per_time_interval(date_interval):

    ### Importing
    customer_data = pd.read_csv('Data/olist_customers_dataset.csv')
    geolocation_data = pd.read_csv('Data/olist_geolocation_dataset.csv')
    order_items_data = pd.read_csv('Data/olist_order_items_dataset.csv')
    order_payments_data = pd.read_csv('Data/olist_order_payments_dataset.csv')
    order_reviews_data = pd.read_csv('Data/olist_order_reviews_dataset.csv')
    olist_order_data = pd.read_csv('Data/olist_orders_dataset.csv')
    olist_products_data = pd.read_csv('Data/olist_products_dataset.csv')
    olist_sellers_data = pd.read_csv('Data/olist_sellers_dataset.csv')
    olist_product_category_data = pd.read_csv(
        'Data/product_category_name_translation.csv')

    ### Converts column of interest to datetime format

    olist_order_data['order_purchase_timestamp'] = pd.to_datetime(
        olist_order_data['order_purchase_timestamp'])

    ### Keeps dates that are between the given date limits

    mask = (olist_order_data['order_purchase_timestamp'] >=
            date_interval[0]) & (olist_order_data['order_purchase_timestamp'] <
                                 date_interval[1])
    olist_order_data = olist_order_data[mask]

    ### Rest of function is the same as in first notebook of the project

    ### Olist_products_dataset merge to get product category name in english
    olist_products_data = olist_products_data.merge(
        olist_product_category_data, how='left', on='product_category_name')

    ### Merge order items dataset with products dataset
    order_items_data = order_items_data.merge(olist_products_data,
                                              how='left',
                                              on='product_id')

    ### Count number of occurrences for each order ID
    count = order_items_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_items per order')

    ### Numeric data will be aggregated by mean
    num_order_items_data = pd.concat([
        order_items_data['order_id'],
        order_items_data.select_dtypes('float64')
    ],
                                     axis=1)

    num_order_items_data = num_order_items_data.groupby('order_id').mean()

    ### Aggregate each order's products category names by its most frequent value
    cat_order_items_data = order_items_data[[
        'order_id', 'product_category_name_english'
    ]].groupby('order_id').agg(lambda g: g.value_counts().index[0]
                               if np.any(g.notnull()) else np.nan)

    order_items_data = pd.concat(
        [count, num_order_items_data, cat_order_items_data], axis=1)

    olist_order_data = olist_order_data.merge(order_items_data,
                                              how='left',
                                              on='order_id')

    ### Number of payments
    ###1. Count the number

    ### Count number of payments per order

    count = order_payments_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_payments per order')

    ### One hot encode payment type feature

    enc = OneHotEncoder(cols=['payment_type'], use_cat_names=True)
    order_payments_data = enc.fit_transform(order_payments_data)

    order_payments_data = order_payments_data.drop('payment_type_not_defined',
                                                   axis=1)

    order_payments_data = order_payments_data.groupby('order_id').mean()

    order_payments_data = pd.concat([order_payments_data, count], axis=1)

    olist_order_data = olist_order_data.merge(order_payments_data,
                                              how='left',
                                              on='order_id')

    ### Number of reviews per order

    count = order_reviews_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_reviews per order').astype('float64')

    order_reviews_data = order_reviews_data[['order_id', 'review_score'
                                             ]].groupby('order_id').mean()

    order_reviews_data = pd.concat([count, order_reviews_data], axis=1)

    olist_order_data = olist_order_data.merge(order_reviews_data,
                                              how='left',
                                              on='order_id')

    ### Merging customer table with order tables

    customer_data = customer_data.merge(olist_order_data,
                                        how='inner',
                                        on='customer_id')

    ### Customer data aggregation
    count = customer_data.groupby(
        'customer_unique_id').count().iloc[:,
                                           0].rename('n_orders per customer')

    ### Numeric features aggregated by mean
    numeric_customer_data = pd.concat([
        customer_data.select_dtypes('float64'),
        customer_data['customer_unique_id']
    ],
                                      axis=1)

    numeric_customer_data = numeric_customer_data.groupby(
        'customer_unique_id').mean()

    ### Categorical features aggregated by most frequent value
    cat_customer_data = customer_data[[
        'customer_unique_id', 'product_category_name_english'
    ]].groupby('customer_unique_id').agg(lambda g: g.value_counts().index[0]
                                         if np.any(g.notnull()) else np.nan)

    customer_data = pd.concat(
        [count, numeric_customer_data, cat_customer_data], axis=1)

    return customer_data
Example #30
def main():
    # Preprocess the data
    # start your code here

    # Load data
    data = pd.read_csv("bank.csv")

    # Fix typo in column name
    data.rename(columns={"subcribed": "subscribed"}, inplace=True)

    # Encoding features
    data = data.replace({"yes": 1, "no": 0})
    ohe = OneHotEncoder(
        cols=["job", "marital", "education", "contact", "month", "poutcome"],
        use_cat_names=True,
        return_df=True,
    )
    data = ohe.fit_transform(data)

    # print(data.head())

    # Get features and target
    X = data.drop(columns=["subscribed"])
    y = data["subscribed"]

    # Split training and testing data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=100)

    # end your code here

    # print(
    #     "\n\nDecision Tree: -------------------------------------------------------------------------\n\n"
    # )
    # # start your code here

    # tree_classifier = DecisionTreeClassifier(
    #     max_depth=4,
    #     max_leaf_nodes=4,
    #     random_state=100,
    # )

    # tree_classifier.fit(X_train, y_train)
    # y_pred_tree = tree_classifier.predict(X_test)
    # evaluate(y_test, y_pred_tree)

    # # feature_imp_tree = pd.Series(
    # #     tree_classifier.feature_importances_, index=X_train.columns
    # # ).sort_values(ascending=False)[:10]
    # # print(feature_imp_tree)

    # # plt.figure(figsize=(20, 10))

    # # plot_tree(
    # #     tree_classifier,
    # #     feature_names=X_train.columns,
    # #     class_names=["no", "yes"],
    # #     rounded=True,
    # # )
    # # plt.savefig("decision_tree.svg", bbox_inches="tight")
    # # plt.show()

    # # end your code here

    # print(
    #     "\n\nRandom Forest: -------------------------------------------------------------------------\n\n"
    # )
    # # start your code here
    # rf_classifier = RandomForestClassifier(
    #     # bootstrap=False,
    #     criterion="entropy",
    #     max_depth=9,
    #     max_leaf_nodes=21,
    #     min_samples_leaf=5,
    #     random_state=100,
    # )

    # rf_classifier.fit(X_train, y_train)
    # y_pred_rf = rf_classifier.predict(X_test)
    # evaluate(y_test, y_pred_rf)

    # feature_imp_rf = pd.Series(
    #     rf_classifier.feature_importances_, index=X_train.columns
    # ).sort_values(ascending=False)[:10]
    # print(feature_imp_rf)
    # # end your code here

    print(
        "\n\nXGBoost: -------------------------------------------------------------------------\n\n"
    )
    # start your code here
    xgb_classifier = xgb.XGBClassifier(
        objective="binary:logistic",
        learning_rate=0.1,
        max_depth=3,
        min_child_weight=5,
        use_label_encoder=False,
        colsample_bytree=0.3,
    )

    xgb_classifier.fit(X_train, y_train)
    y_pred_xgb = xgb_classifier.predict(X_test)
    evaluate(y_test, y_pred_xgb)
import os
print(os.listdir("../input"))
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train.head()
train.info()
test_id = test['id'] # save for submission
del train['id']
del test['id']
train['type'].unique(), train['color'].unique()
sns.violinplot(x='bone_length', y='type', data=train)
sns.boxplot(x='hair_length', y='type', data=train)
sns.pairplot(train)
from category_encoders import OneHotEncoder

encoder = OneHotEncoder(cols=['color'], use_cat_names=True)

train = encoder.fit_transform(train)
test = encoder.transform(test)  # use transform, not fit_transform, so the test set gets the same encoding as train
train.head()
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

encoder.fit(train['type'])

print(encoder.classes_)

train['type_no'] = encoder.transform(train['type'])
train.head()
sns.heatmap(train.corr(), xticklabels=list(train), yticklabels=list(train))