Example #1
    def xgb_class(X_train, y_train, X_test, y_test):
        """
        Baseline XGB Classifier that prints out ROC score for Train and Test 
        sets provided.
        """
        class_index = 1
        # processor = make_pipeline(
        #    ce.ordinal.OrdinalEncoder(),
        #    SimpleImputer(strategy='median')
        # )

        # X_train_processed = processor.fit_transform(X_train)
        # X_test_processed = processor.transform(X_test)

        encoder = OrdinalEncoder()
        imputer = SimpleImputer()

        # Fit the encoder and imputer on the training set only, then apply
        # the fitted transformers to the test set to avoid leakage
        X_train_encoded = encoder.fit_transform(X_train)
        X_train_imputed = imputer.fit_transform(X_train_encoded)

        X_test_encoded = encoder.transform(X_test)
        X_test_imputed = imputer.transform(X_test_encoded)

        model = XGBClassifier(n_estimators=100, n_jobs=-1, max_depth=10)

        model.fit(X_train_imputed, y_train, eval_metric='auc')

        # Getting the predicted probabilities
        y_pred = model.predict(X_test_imputed)
        y_pred_proba_train = model.predict_proba(X_train_imputed)[:,
                                                                  class_index]
        y_pred_proba_test = model.predict_proba(X_test_imputed)[:, class_index]

        train_roc = roc_auc_score(y_train, y_pred_proba_train)
        test_roc = roc_auc_score(y_test, y_pred_proba_test)

        # Making a new Series for the mean-baseline printout
        s1 = pd.Series(y_train)
        s2 = pd.Series(y_test)
        s3 = pd.concat([s1, s2])  # Series.append was removed in pandas 2.0

        print('Mean Baseline of Target')
        print(s3.value_counts(normalize=True))
        print()
        print(f'Train ROC AUC for class: {train_roc} \n')
        print(f'Test ROC AUC for class: {test_roc}')

        return
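A minimal usage sketch for the function above. The imports and the make_classification toy data are assumptions added for illustration, not part of the original snippet:

import pandas as pd
from category_encoders import OrdinalEncoder
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Hypothetical toy data just to exercise xgb_class
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
xgb_class(X_train, y_train, X_test, y_test)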
Example #2
    def encode_cat_col(self):
        enc = OrdinalEncoder(return_df=False).fit(self.categ_col)
        self.categ_col = enc.transform(self.categ_col)

        # DEBUG
        print(self.DS)
        print(self.categ_col)
Example #3
def fit_label(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Creates the label encoder by fitting it through the given DataFrame
    NaN values and Special value specified under `na_value` in the DataFrame will be encoded as unseen value.
    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Default null value for DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model : encoder model to be passed to `transform_label` method
    """
    df = input_df.copy()

    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})

    encoder = OrdinalEncoder(cols=cols)
    encoder = encoder.fit(df)
    # Map NaN (and hence `na_value`) to the sentinel -2; category_encoders
    # already maps unknowns to -1, and both collapse to 0 below
    for idx in range(len(encoder.mapping)):
        encoder.mapping[idx]["mapping"].loc[np.nan] = -2

    result_df = encoder.transform(df)

    for col in cols:
        result_df[col] = result_df[col].replace({-1: 0, -2: 0})
        result_df[col] = result_df[col].astype(int)

    model = {"encoder": encoder, "cols": cols, "na_value": na_value}
    return result_df, model
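The docstring above refers to a transform_label counterpart that is not shown in this example. A minimal sketch of what it might look like, reusing the model dict returned by fit_label (an assumption, not the original implementation):

from typing import Any

import numpy as np
import pandas as pd

def transform_label(input_df: pd.DataFrame, model: dict) -> pd.DataFrame:
    """Hypothetical counterpart to fit_label: applies the fitted encoder."""
    df = input_df.copy()
    if model["na_value"] is not None:
        for col in model["cols"]:
            df[col] = df[col].replace({model["na_value"]: np.nan})
    result_df = model["encoder"].transform(df)
    for col in model["cols"]:
        # Unseen (-1) and NaN (-2) codes collapse to 0, as in fit_label
        result_df[col] = result_df[col].replace({-1: 0, -2: 0}).astype(int)
    return result_df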
Example #4
    def test_display_dataset_analysis_3(self, mock_correlation_matrix):
        """
        Test we don't have a problem when only categorical features
        """
        df = self.df.copy()
        df['x1'] = 'a'
        df['x2'] = df['x2'].astype(str)
        encoder = OrdinalEncoder(
            cols=['x1', 'x2'],
            handle_unknown='ignore',
            return_df=True).fit(df)

        df = encoder.transform(df)

        clf = cb.CatBoostClassifier(n_estimators=1).fit(df[['x1', 'x2']], df['y'])
        xpl = SmartExplainer()
        xpl.compile(model=clf, x=df[['x1', 'x2']])
        report = ProjectReport(
            explainer=xpl,
            project_info_file=os.path.join(current_path, '../../data/metadata.yaml'),
            x_train=df[['x1', 'x2']],
        )

        report.display_dataset_analysis()

        self.assertEqual(mock_correlation_matrix.call_count, 0)
Example #5
    def encode_cat_col(self):  # TODO pandas to numpy
        from category_encoders import OrdinalEncoder
        enc = OrdinalEncoder(return_df=False).fit(self.categ_col)
        self.categ_col = enc.transform(self.categ_col)

        # DEBUG
        print(self.DS)
        print(self.categ_col)
Example #6
    def __init__(self, df_train: pd.DataFrame, df_valid: pd.DataFrame, df_test: pd.DataFrame, use_columns, label_column):
        encoder = OrdinalEncoder(cols=use_columns, handle_unknown='impute').fit(df_train)
        df_train_X = encoder.transform(df_train).astype('int64')
        df_valid_X = encoder.transform(df_valid).astype('int64')
        df_test_X  = encoder.transform(df_test).astype('int64')

        self.train_X = torch.from_numpy(df_train_X[use_columns].values).long()
        self.train_y = df_train[label_column].values
        self.valid_X = torch.from_numpy(df_valid_X[use_columns].values).long()
        self.valid_y = df_valid[label_column].values
        self.test_X = torch.from_numpy(df_test_X[use_columns].values).long()
        self.test_y = df_test[label_column].values

        # Embedding sizes: ordinal codes start at 1, so each field needs
        # max(code) + 1 rows in its embedding table
        field_dims = list(df_train_X[use_columns].max())
        self.field_dims = list(map(add_one, field_dims))

        self.data_num = self.train_X.size()[0]
Example #7
def main():
    print('started experiment')
    with neptune.create_experiment(
            name='feature engineering',
            tags=['feature-extraction', FEATURE_NAME],
            upload_source_files=get_filepaths(),
            properties={'feature_version': FEATURE_NAME}):
        print('loading data')
        train = load_and_merge(RAW_DATA_PATH, 'train',
                               NROWS)[ID_COLS + V1_COLS + ['isFraud']]
        test = load_and_merge(RAW_DATA_PATH, 'test', NROWS)[ID_COLS + V1_COLS]

        categorical_cols = set(V1_CAT_COLS)
        print('cleaning data')
        email_cols = ['P_emaildomain', 'R_emaildomain']
        train, new_email_cols = clean_email(train, email_cols)
        test, _ = clean_email(test, email_cols)

        categorical_cols.update(new_email_cols)
        for col in email_cols:
            categorical_cols.remove(col)
        categorical_cols = list(categorical_cols)
        neptune.set_property('categorical_columns', str(categorical_cols))

        print('encoding categoricals')
        encoder = OrdinalEncoder(cols=categorical_cols).fit(
            train[ID_COLS + categorical_cols])
        train[ID_COLS + categorical_cols] = encoder.transform(
            train[ID_COLS + categorical_cols])
        test[ID_COLS + categorical_cols] = encoder.transform(
            test[ID_COLS + categorical_cols])

        train_features_path = os.path.join(
            FEATURES_DATA_PATH, 'train_features_{}.csv'.format(FEATURE_NAME))
        print('saving train to {}'.format(train_features_path))
        train.to_csv(train_features_path, index=None)
        log_data_version(train_features_path, prefix='train_features_')

        test_features_path = os.path.join(
            FEATURES_DATA_PATH, 'test_features_{}.csv'.format(FEATURE_NAME))
        print('saving test to {}'.format(test_features_path))
        test.to_csv(test_features_path, index=None)
        log_data_version(test_features_path, prefix='test_features_')
Example #8
class df_OrdinalEncoder(TransformerMixin):
    def __init__(self, handle_unknown='ignore'):
        self.handle_unknown = handle_unknown
        
    def fit(self, X, y=None):
        self.enc = OrdinalEncoder(handle_unknown=self.handle_unknown)
        self.enc.fit(X)
        return self
    
    def transform(self, X):
        X_encoded = self.enc.transform(X)
        X_encoded_df = pd.DataFrame(data=X_encoded, index=X.index, columns=X.columns)
        return X_encoded_df
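The point of this thin wrapper is to keep the pandas index and column names through a scikit-learn pipeline. A hedged usage sketch (the toy frame is an assumption, and exact DataFrame passthrough depends on the installed category_encoders version):

import pandas as pd
from sklearn.pipeline import Pipeline

df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': ['S', 'M', 'L']})
pipe = Pipeline([('ordinal', df_OrdinalEncoder())])
encoded = pipe.fit_transform(df)
print(encoded.columns.tolist())  # ['color', 'size'] -- names preserved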
Example #9
    def _encode_categories(self):
        """
        This private method stands for encoding categorical variables. Label encoding used for ordinal categories and
        one-hot encoding used for nominal categories.
        """

        logging.info(f'#{self._index()} - Encoding categorical columns...')
        # get column names for categorical and numerical features
        categorical_vars = self.X.select_dtypes(include='object').columns
        numerical_vars = self.X.columns.difference(categorical_vars)

        ordinal = pd.Index([
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
        ])
        nominal = categorical_vars.difference(ordinal)

        standard_mapping = {
            'NA': 0,
            'Po': 1,
            'Fa': 2,
            'TA': 3,
            'Gd': 4,
            'Ex': 5
        }
        mapping_for_ordinals = [{
            'col': column,
            'mapping': standard_mapping
        } for column in ordinal]

        x_num = self.X[numerical_vars]
        x_test_num = self.X_test[numerical_vars]

        # one hot encode categorical columns
        one_hot_encoder = OneHotEncoder(use_cat_names=True)
        label_encoder = OrdinalEncoder(drop_invariant=True,
                                       mapping=mapping_for_ordinals,
                                       handle_unknown='error')

        x_cat_nom = one_hot_encoder.fit_transform(self.X[nominal])
        x_cat_ord = label_encoder.fit_transform(self.X[ordinal])
        x_test_cat_nom = one_hot_encoder.transform(self.X_test[nominal])
        x_test_cat_ord = label_encoder.transform(self.X_test[ordinal])

        self.X = x_num.join(x_cat_ord).join(x_cat_nom)
        self.X_test = x_test_num.join(x_test_cat_ord).join(x_test_cat_nom)
        logging.info(f'#{self._step_index} - DONE!')
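A small worked example of the ordinal mapping used above (toy column, added for illustration; it just shows what standard_mapping produces):

import pandas as pd
from category_encoders import OrdinalEncoder

quality = pd.DataFrame({'ExterQual': ['TA', 'Gd', 'Ex', 'Fa', 'Po']})
enc = OrdinalEncoder(mapping=[{'col': 'ExterQual',
                               'mapping': {'NA': 0, 'Po': 1, 'Fa': 2,
                                           'TA': 3, 'Gd': 4, 'Ex': 5}}])
print(enc.fit_transform(quality)['ExterQual'].tolist())  # [3, 4, 5, 2, 1]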
Example #10
def out_of_folds_predict(X, y):
    callbacks = [
        EarlyStopping(
            # Stop training when loss is no longer improving
            monitor="loss",
            # "no longer improving" being defined as "no better than 1e-2 less"
            min_delta=1e-5,
            # "no longer improving" being further defined as "for at least 2 epochs"
            patience=2,
            verbose=0,
        )
    ]

    preds = np.zeros(X.shape[0])

    n_splits = 4

    if y.sum() < 2:
        kfold = KFold(n_splits=n_splits)
    else:
        kfold = StratifiedKFold(n_splits=n_splits)

    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        print(f'Split {i+1} of {n_splits}...')
        pipe = build_pipe()

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        encoder = OrdinalEncoder()
        # np.float was removed in NumPy 1.20; use the builtin float instead
        X_train = encoder.fit_transform(X_train, y_train).astype(float)

        pipe.fit(X_train, y_train, epochs=20, callbacks=callbacks, verbose=0)

        X_test = encoder.transform(X_test).astype(float)
        pipe.evaluate(X_test, y_test, verbose=1)

        preds[test_index] = pipe.predict(X_test).flatten()

    return preds
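build_pipe is referenced but not shown; given the fit(epochs=..., callbacks=...), evaluate, and predict calls, it evidently returns a compiled Keras model. A hypothetical stand-in, purely an assumption for running the function:

from tensorflow import keras

def build_pipe():
    """Hypothetical stand-in: a small compiled Keras binary classifier."""
    model = keras.Sequential([
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model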
Example #11
def create_label(df, test_df, topic):
    result_dict = {}
    feature_df = df[['title']].copy()
    label_df = df.drop(columns=['itemid', 'title', 'image_path']).copy()

    feature_df['title'] = feature_df['title'].apply(lambda x: text_process(x))
    feature_array = feature_df['title'].values.tolist()
    feature_encoder = TfidfVectorizer()
    feature_encoder.fit(feature_array)
    feature_attr = feature_encoder.transform(feature_array)
    feature_decomposer = TruncatedSVD(500)
    feature_decomposer.fit(feature_attr)
    feature_attr = feature_decomposer.transform(feature_attr)

    test_df['title'] = test_df['title'].apply(lambda x: text_process(x))
    test_array = test_df['title'].values.tolist()
    test_attr = feature_encoder.transform(test_array)
    test_attr = feature_decomposer.transform(test_attr)

    train_itemid = df['itemid']
    test_itemid = test_df['itemid']

    result_dict['itemid_train_{}'.format(topic)] = train_itemid
    result_dict['itemid_test_{}'.format(topic)] = test_itemid
    result_dict['X_train_{}'.format(topic)] = feature_attr
    result_dict['X_encoder_{}'.format(topic)] = feature_encoder
    result_dict['X_decomposer_{}'.format(topic)] = feature_decomposer
    result_dict['X_test_{}'.format(topic)] = test_attr

    for column in label_df.columns:
        label_encoder = OrdinalEncoder(cols=[column], handle_unknown='impute')
        label_encoder.fit(label_df[[column]])
        label_attr = label_encoder.transform(label_df[[column]])

        result_dict['Y_train_{}_{}'.format(topic, column)] = label_attr
        result_dict['Y_encoder_{}_{}'.format(topic, column)] = label_encoder
        result_dict['Y_colname_{}_{}'.format(topic,
                                             column)] = label_attr.columns

    return result_dict
Example #12
def encode_ordinal_df(dataframe, fit=False):
   """
    Encode ordinal features, preserving the notion of order and dropping invariant features
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), ordinal features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder    
    if fit:
        encoder = OrdinalEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        
        pickle_obj(encoder, 'ordinal_encoder')
    else:
        encoder = unpickle_obj('ordinal_encoder')

    # transform data
    return encoder.transform(dataframe)
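pickle_obj and unpickle_obj are project helpers that are not shown here. A minimal sketch of what they plausibly do (the models/ directory and the .pkl suffix are assumptions):

import os
import pickle

def pickle_obj(obj, name, directory='models'):
    """Hypothetical helper: serialize an object to <directory>/<name>.pkl."""
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, f'{name}.pkl'), 'wb') as f:
        pickle.dump(obj, f)

def unpickle_obj(name, directory='models'):
    """Hypothetical helper: load an object from <directory>/<name>.pkl."""
    with open(os.path.join(directory, f'{name}.pkl'), 'rb') as f:
        return pickle.load(f)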
Example #13
application_train = pd.read_feather('../data/input/application_train.ftr')
application_test = pd.read_feather('../data/input/application_test.ftr')

train = application_train
test = application_test

categorical_columns = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'
]
enc = OrdinalEncoder(cols=categorical_columns, verbose=1)
train[categorical_columns] = enc.fit_transform(train[categorical_columns])
test[categorical_columns] = enc.transform(test[categorical_columns])

X_train = train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
y_train = train.TARGET.values
X_test = test.drop(['SK_ID_CURR'], axis=1)

params = {
    'metric': ['auc'],
    'learning_rate': [0.1],
    'num_leaves': [i * 10 for i in range(2, 6)],
    'min_data_in_leaf': [5, 10, 15, 20],
    'random_state': [SEED],
    'verbose': [1]
}

cv = StratifiedKFold(3, shuffle=True, random_state=SEED)
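The excerpt stops after defining the splitter. A hedged continuation, assuming LightGBM's scikit-learn wrapper and GridSearchCV (neither appears in the original snippet):

from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(LGBMClassifier(), params, scoring='roc_auc', cv=cv)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)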
Example #14
                           X,
                           op_version=opv,
                           output_names=operator.outputs[:1])
    cat.add_to(scope, container)


update_registered_converter(OrdinalEncoder, "CategoricalEncoderOrdinalEncoder",
                            ordinal_encoder_shape_calculator,
                            ordinal_encoder_converter)

###################################
# Let's compute the output on a short example.

enc = OrdinalEncoder(cols=[0, 1])
enc.fit(X)
print(enc.transform(X[:5]))

###################################
# Let's check that the ONNX conversion produces the same results.

ord_onx = to_onnx(enc, X[:1], target_opset=14)
sess = InferenceSession(ord_onx.SerializeToString())
print(sess.run(None, {'X': X[:5]})[0])

######################################
# That works.
#
# Custom converter for WOEEncoder
# +++++++++++++++++++++++++++++++
#
# We start from example :ref:`l-plot-custom-converter`
Example #15
import pandas as pd
from time import time

from category_encoders import OrdinalEncoder
from skimpute import MissForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit

df = pd.read_csv("../exmaple/train_classification.csv")
start = time()
df.pop("Name")
df.pop("Ticket")
df.pop("PassengerId")
y = df.pop("Survived").values
cv = ShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_ix, test_ix = next(cv.split(df, y))
train_X = (df.iloc[train_ix, :])
train_y = y[train_ix]
test_X = (df.iloc[test_ix, :])
test_y = y[test_ix]
imputer = MissForest()
imputer.fit(df)
train_X = imputer.transform(train_X)
test_X = imputer.transform(test_X)
print(train_X)
print(train_X.dtypes)
print(time() - start)
encoder = OrdinalEncoder()
encoder.fit(df)
train_X = encoder.transform(train_X)
test_X = encoder.transform(test_X)
rf = RandomForestClassifier(random_state=42)
rf.fit(train_X, train_y)
score = rf.score(test_X, test_y)
print(score)  # 0.8295964125560538
Example #16
#     LGBMRegressor()
# )

# pipe.fit(X_train, y_train)
# print('Train R^2: ', pipe.score(X_train, y_train))
# print('Validation R^2: ', pipe.score(X_val, y_val))
# print('TEST R^2: ', pipe.score(X_test, y_test))

# print('\nTrain MAE: ', mean_absolute_error(pipe.predict(X_train), y_train))
# print('Validation MAE: ', mean_absolute_error(pipe.predict(X_val), y_val))
# print('TEST MAE: ', mean_absolute_error(pipe.predict(X_test), y_test))

## Encoding
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.transform(X_val)
X_test_encoded = encoder.transform(X_test)

# ## Parameter tuning
# lightGB = LGBMRegressor(learning_rate=0.01, max_depth=15, n_estimators=300000, num_leaves=250,
#                       random_state=1, reg_alpha=1, reg_lambda=1, subsample=0.7)

# eval_set = [(X_train_encoded, y_train),
#             (X_val_encoded,y_val),
#             (X_test_encoded, y_test)]

# lightGB.fit(X_train_encoded, y_train,
#           eval_set=eval_set,
#           early_stopping_rounds=1000,
#           eval_metric='mae',
#           verbose=100
Example #17
from shapash.data.data_loader import data_loading
from shapash.explainer.smart_explainer import SmartExplainer

house_df, house_dict = data_loading('house_prices')
y_df = house_df['SalePrice'].to_frame()
X_df = house_df[house_df.columns.difference(['SalePrice'])]
house_df.head()

categorical_features = [
    col for col in X_df.columns if X_df[col].dtype == 'object'
]
encoder = OrdinalEncoder(cols=categorical_features,
                         handle_unknown='ignore',
                         return_df=True).fit(X_df)
X_df = encoder.transform(X_df)

Xtrain, Xtest, ytrain, ytest = train_test_split(X_df,
                                                y_df,
                                                train_size=0.75,
                                                random_state=1)

regressor = LGBMRegressor(n_estimators=200).fit(Xtrain, ytrain)

y_pred = pd.DataFrame(regressor.predict(Xtest),
                      columns=['pred'],
                      index=Xtest.index)

xpl = SmartExplainer(features_dict=house_dict)

xpl.compile(x=Xtest, model=regressor, preprocessing=encoder, y_pred=y_pred)
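Once compiled, the explainer exposes plotting helpers; for instance (this call uses shapash's plotting API and is an addition to the original snippet):

xpl.plot.features_importance()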
Example #18
    evals=[(dt, "training"), (dv, "valid")],
    num_boost_round=MAX_ROUNDS,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    verbose_eval=REPORT_ROUNDS,
)
score = model.best_score
print(f"{METRIC}: {score:.4f}")

xgb.plot_importance(
    model,
    grid=False,
    max_num_features=20,
    importance_type="gain",
)
figure_path = Path("figures")
figure_path.mkdir(exist_ok=True)
plt.savefig(figure_path / "xgboost_importance.png")

# test score
TEST = True
if TEST:
    df_test = pd.read_csv("data/test.csv")
    df_test[obj_cols] = df_test[obj_cols].astype("category")
    df_test = enc.transform(df_test)
    X_test = df_test.drop(DROP_FEATURES, axis=1)
    dtest = xgb.DMatrix(X_test)
    y_pred = model.predict(dtest)
    submission = pd.Series(y_pred, index=df_test["id"])
    submission.name = "target"
    submission.to_csv("submission/xgboost.csv")
Example #19
class OrdinalEncoder():
    """Maps each categorical value to one column using ordinal encoding.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'ordinal'

    def __init__(self, cols=None):
        self.encoder = Ordinal(cols=cols)

    def fit(self, X, features, y=None):
        """Fits encoder to data table.
        returns self
        """
        self.encoder.fit(X, y=None)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.
        returns encoded matrix (dataframe)
        """
        X_new = self.encoder.transform(X)
        feature_names = []
        for feature in self.features:
            for fname in feature.get_feature_names():
                feature_names.append(fname)
        X_new.columns = feature_names
        return X_new

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.
        returns encoded matrix (dataframe)
        """
        return self.fit(X, features, y).transform(X)

    def get_mapping(self, category):
        """Gets the mapping the ordinal encoder.
        returns mapping (dict)
        """
        if isinstance(category, str):
            for map in self.encoder.mapping:
                if map['col'] == category:
                    return map['mapping']
        return self.encoder.mapping[category]['mapping']

    def encode_features_list(self, X, features):
        feature_list = []
        index = 0
        for f in features:
            if f.get_name() in self.encoder.cols:
                f = ft.Feature([f], primitive=OrdinalEnc(self, index))
                index += 1
            feature_list.append(f)
        return feature_list

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
Example #20
def load_dataset(refresh_days=1,
                 dataset='general',
                 thresh=0.7,
                 simfin_api_key='free',
                 simfin_directory='simfin_data/',
                 data_directory=DATA_DIR,
                 shareprices_df=''):

    # Set Simfin Settings
    sf.set_api_key(simfin_api_key)
    sf.set_data_dir(simfin_directory)

    derived_shareprice_df = sf.load_derived_shareprices(variant='latest',
                                                        market='us')
    derived_shareprice_df.to_csv(data_directory / 'stock_derived.csv')

    company_df = sf.load_companies(market='us', refresh_days=1)
    company_df.to_csv(data_directory / 'company.csv')

    industry_df = sf.load_industries(refresh_days=1)
    industry_df.to_csv(data_directory / 'industry.csv')

    if dataset == 'general':

        # Load Data from Simfin
        income_df = sf.load_income(variant='ttm',
                                   market='us',
                                   refresh_days=refresh_days)
        income_df = income_df.sort_index(level=['Ticker', 'Report Date'],
                                         ascending=[1, 1])
        income_quarterly_df = sf.load_income(variant='quarterly',
                                             market='us',
                                             refresh_days=refresh_days)
        income_quarterly_df = income_quarterly_df.sort_index(
            level=['Ticker', 'Report Date'], ascending=[1, 1])
        income_df.groupby('Ticker').last().to_csv(data_directory /
                                                  'general_income.csv')

        balance_df = sf.load_balance(variant='ttm',
                                     market='us',
                                     refresh_days=refresh_days)
        balance_df = balance_df.sort_index(level=['Ticker', 'Report Date'],
                                           ascending=[1, 1])
        balance_quarterly_df = sf.load_balance(variant='quarterly',
                                               market='us',
                                               refresh_days=refresh_days)
        balance_quarterly_df = balance_quarterly_df.sort_index(
            level=['Ticker', 'Report Date'], ascending=[1, 1])
        balance_df.groupby('Ticker').last().to_csv(data_directory /
                                                   'general_balance.csv')

        cashflow_df = sf.load_cashflow(variant='ttm',
                                       market='us',
                                       refresh_days=refresh_days)
        cashflow_df = cashflow_df.sort_index(level=['Ticker', 'Report Date'],
                                             ascending=[1, 1])
        cashflow_quarterly_df = sf.load_cashflow(variant='quarterly',
                                                 market='us',
                                                 refresh_days=refresh_days)
        cashflow_quarterly_df = cashflow_quarterly_df.sort_index(
            level=['Ticker', 'Report Date'], ascending=[1, 1])
        cashflow_df.groupby('Ticker').last().to_csv(data_directory /
                                                    'general_cashflow.csv')

        derived_df = sf.load_derived(variant='ttm',
                                     market='us',
                                     refresh_days=refresh_days)
        derived_df = derived_df.sort_index(level=['Ticker', 'Report Date'],
                                           ascending=[1, 1])
        derived_df.groupby('Ticker').last().to_csv(
            data_directory / 'general_fundamental_derived.csv')

        cache_args = {
            'cache_name': 'financial_signals',
            'cache_refresh': refresh_days
        }

        fin_signal_df = sf.fin_signals(df_income_ttm=income_df,
                                       df_balance_ttm=balance_df,
                                       df_cashflow_ttm=cashflow_df,
                                       **cache_args)

        growth_signal_df = sf.growth_signals(
            df_income_ttm=income_df,
            df_income_qrt=income_quarterly_df,
            df_balance_ttm=balance_df,
            df_balance_qrt=balance_quarterly_df,
            df_cashflow_ttm=cashflow_df,
            df_cashflow_qrt=cashflow_quarterly_df,
            **cache_args)

        # Remove Columns that exist in other Fundamental DataFrames
        balance_columns = balance_df.columns[~balance_df.columns.isin(set(
        ).union(income_df.columns))]
        cashflow_columns = cashflow_df.columns[~cashflow_df.columns.isin(set(
        ).union(income_df.columns))]
        derived_df_columns = derived_df.columns[~derived_df.columns.isin(set(
        ).union(income_df.columns, growth_signal_df.columns, fin_signal_df.
                columns))]

        # Merge the fundamental data into a single dataframe
        fundamental_df = income_df.join(balance_df[balance_columns]).join(
            cashflow_df[cashflow_columns]).join(fin_signal_df).join(
                growth_signal_df).join(derived_df[derived_df_columns])

        fundamental_df['Dataset'] = 'general'

    elif dataset == 'banks':

        # Load Data from Simfin
        income_df = sf.load_income_banks(variant='ttm',
                                         market='us',
                                         refresh_days=refresh_days)
        income_df = income_df.sort_index(level=['Ticker', 'Report Date'],
                                         ascending=[1, 1])
        income_df.groupby('Ticker').last().to_csv(data_directory /
                                                  'banks_income.csv')

        balance_df = sf.load_balance_banks(variant='ttm',
                                           market='us',
                                           refresh_days=refresh_days)
        balance_df = balance_df.sort_index(level=['Ticker', 'Report Date'],
                                           ascending=[1, 1])
        balance_df.groupby('Ticker').last().to_csv(data_directory /
                                                   'banks_balance.csv')

        cashflow_df = sf.load_cashflow_banks(variant='ttm',
                                             market='us',
                                             refresh_days=refresh_days)
        cashflow_df = cashflow_df.sort_index(level=['Ticker', 'Report Date'],
                                             ascending=[1, 1])
        cashflow_df.groupby('Ticker').last().to_csv(data_directory /
                                                    'banks_cashflow.csv')

        derived_df = sf.load_derived_banks(variant='ttm',
                                           market='us',
                                           refresh_days=refresh_days)
        derived_df = derived_df.sort_index(level=['Ticker', 'Report Date'],
                                           ascending=[1, 1])
        derived_df.groupby('Ticker').last().to_csv(
            data_directory / 'banks_fundamental_derived.csv')

        # Remove Columns that exist in other Fundamental DataFrames
        balance_columns = balance_df.columns[~balance_df.columns.isin(set(
        ).union(income_df.columns))]
        cashflow_columns = cashflow_df.columns[~cashflow_df.columns.isin(set(
        ).union(income_df.columns))]
        derived_df_columns = derived_df.columns[~derived_df.columns.isin(set(
        ).union(income_df.columns))]

        # Merge the fundamental data into a single dataframe
        fundamental_df = income_df.join(balance_df[balance_columns]).join(
            cashflow_df[cashflow_columns]).join(derived_df[derived_df_columns])

        fundamental_df['Dataset'] = 'banks'

    elif dataset == 'insurance':

        # Load Data from Simfin
        income_df = sf.load_income_insurance(variant='ttm',
                                             market='us',
                                             refresh_days=refresh_days)
        income_df = income_df.sort_index(level=['Ticker', 'Report Date'],
                                         ascending=[1, 1])
        income_df.groupby('Ticker').last().to_csv(data_directory /
                                                  'insurance_income.csv')

        balance_df = sf.load_balance_insurance(variant='ttm',
                                               market='us',
                                               refresh_days=refresh_days)
        balance_df = balance_df.sort_index(level=['Ticker', 'Report Date'],
                                           ascending=[1, 1])
        balance_df.groupby('Ticker').last().to_csv(data_directory /
                                                   'insurance_balance.csv')

        cashflow_df = sf.load_cashflow_insurance(variant='ttm',
                                                 market='us',
                                                 refresh_days=refresh_days)
        cashflow_df = cashflow_df.sort_index(level=['Ticker', 'Report Date'],
                                             ascending=[1, 1])
        cashflow_df.groupby('Ticker').last().to_csv(data_directory /
                                                    'insurance_cashflow.csv')

        derived_df = sf.load_derived_insurance(variant='ttm',
                                               market='us',
                                               refresh_days=refresh_days)
        derived_df = derived_df.sort_index(level=['Ticker', 'Report Date'],
                                           ascending=[1, 1])
        derived_df.groupby('Ticker').last().to_csv(
            data_directory / 'insurance_fundamental_derived.csv')

        # Remove Columns that exist in other Fundamental DataFrames
        balance_columns = balance_df.columns[~balance_df.columns.isin(set(
        ).union(income_df.columns))]
        cashflow_columns = cashflow_df.columns[~cashflow_df.columns.isin(set(
        ).union(income_df.columns))]
        derived_df_columns = derived_df.columns[~derived_df.columns.isin(set(
        ).union(income_df.columns))]

        # Merge the fundamental data into a single dataframe
        fundamental_df = income_df.join(balance_df[balance_columns]).join(
            cashflow_df[cashflow_columns]).join(derived_df[derived_df_columns])

        fundamental_df['Dataset'] = 'insurance'

    # Drop columns whose share of NaN values exceeds (1 - thresh)
    fundamental_df = fundamental_df.dropna(thresh=int(thresh *
                                                      len(fundamental_df)),
                                           axis=1)

    # Drop Duplicate Index
    fundamental_df = fundamental_df[~fundamental_df.index.duplicated(
        keep='first')]

    # Replace Report Date with Publish Date, since the Publish Date is when
    # the fundamentals become known to the public
    fundamental_df['Published Date'] = fundamental_df['Publish Date']
    fundamental_df = fundamental_df.reset_index().set_index(
        ['Ticker', 'Publish Date'])

    df = sf.reindex(df_src=fundamental_df,
                    df_target=shareprices_df,
                    group_index=TICKER,
                    method='ffill').dropna(how='all').join(shareprices_df)

    # General
    # Clean Up
    df = df.drop([
        'SimFinId', 'Currency', 'Fiscal Year', 'Report Date', 'Restated Date',
        'Fiscal Period', 'Published Date'
    ],
                 axis=1)

    if dataset == 'general':
        # Remove Share Prices Over Amazon Share Price
        df = df[df['Close'] <= df.loc['AMZN']['Close'].max()]

        df = df.dropna(subset=[
            'Shares (Basic)', 'Shares (Diluted)', 'Revenue', 'Earnings Growth'
        ])

        non_per_share_cols = [
            'Currency', 'Fiscal Year', 'Fiscal Period', 'Published Date',
            'Restated Date', 'Shares (Basic)', 'Shares (Diluted)', 'Close',
            'Dataset'
        ] + fin_signal_df.columns.tolist() + growth_signal_df.columns.tolist(
        ) + derived_df_columns.difference(
            ['EBITDA', 'Total Debt', 'Free Cash Flow']).tolist()

    else:
        df = df.dropna(
            subset=['Shares (Basic)', 'Shares (Diluted)', 'Revenue'])

        non_per_share_cols = [
            'Currency', 'Fiscal Year', 'Fiscal Period', 'Published Date',
            'Restated Date', 'Shares (Basic)', 'Shares (Diluted)', 'Close',
            'Dataset'
        ] + derived_df_columns.difference(
            ['EBITDA', 'Total Debt', 'Free Cash Flow']).tolist()

    df = df.replace([np.inf, -np.inf], 0)
    df = df.fillna(0)

    per_share_cols = df.columns[~df.columns.isin(non_per_share_cols)]

    df[per_share_cols] = df[per_share_cols].div(df['Shares (Diluted)'], axis=0)

    # Add Company and Industry Information and Categorize
    df = df.join(company_df).merge(
        industry_df, left_on='IndustryId', right_index=True).drop(
            columns=['IndustryId', 'Company Name', 'SimFinId'])

    categorical_features = [
        col for col in df.columns if df[col].dtype == 'object'
    ]

    encoder = OrdinalEncoder(cols=categorical_features,
                             handle_unknown='ignore',
                             return_df=True).fit(df)

    df = encoder.transform(df)

    # Sort
    df = df.sort_index(level=['Ticker', 'Date'], ascending=[1, 1])

    return df
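A hedged usage sketch for load_dataset (the share-price loading step and the settings shown are assumptions based on the simfin API):

import simfin as sf

sf.set_api_key('free')
sf.set_data_dir('simfin_data/')

# Daily share prices, reindexed against the fundamentals inside load_dataset
shareprices_df = sf.load_shareprices(variant='daily', market='us')
df = load_dataset(dataset='general', shareprices_df=shareprices_df)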
Example #21
class Encoder():
    encode_methods = {
        'OrdinalEncoder': OrdinalEncoder,
        'OneHotEncoder': OneHotEncoder,
        'CountEncoder': CountEncoder,
        'TargetEncoder': TargetEncoder,
    }

    # spark_encode_methods = {
    #     'mean_encoder':,
    #     'target_encoder':,
    #     'label_encoder':,
    #     'onehot_encoder'
    # }
    # target_encoder and mean_encoder must not be fit on the train and
    # validation sets concatenated together (that would leak the target);
    # label_encoder and onehot_encoder can be.

    def __init__(self,
                 sparksess=None,
                 logdir='/encoder',
                 handle_unknown='-99999',
                 save_encoder=False):
        self.spark = sparksess
        self.logdir = logdir
        # Store the flag under a distinct name so it does not shadow the
        # save_encoder() method below
        self.save_encoder_flag = save_encoder

        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []
        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)

    def fit(self,
            x_train,
            x_val=None,
            y_train=None,
            y_val=None,
            method_mapper=None):
        """
        Parameters
        ----------

        x_train: pd.DataFrame

        x_val: pd.DataFrame

        y_train: pd.DataFrame

        y_val: pd.DataFrame

        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping: 
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }
        """
        for feat in method_mapper:
            if method_mapper[feat] == 'OrdinalEncoder':
                self.ordinal_encoder_features.append(feat)
            elif method_mapper[feat] == 'OneHotEncoder':
                self.onehot_encoder_features.append(feat)
            elif method_mapper[feat] == 'CountEncoder':
                self.count_encoder_features.append(feat)
            elif method_mapper[feat] == 'TargetEncoder':
                self.target_encoder_features.append(feat)
            else:
                raise ValueError(
                    'Only [OrdinalEncoder, OneHotEncoder, CountEncoder, '
                    'TargetEncoder] are supported, got %s' % method_mapper[feat])

        if self.spark is None:
            if len(self.ordinal_encoder_features) != 0 or len(
                    self.onehot_encoder_features) != 0:
                # DataFrame.append was removed in pandas 2.0; use pd.concat
                x_whole = pd.concat([x_train, x_val])
                y_whole = None
                if y_train is not None and y_val is not None:
                    y_whole = pd.concat([y_train, y_val])

                x_whole = self.ordinal_encoder.fit_transform(x_whole, y_whole)
                x_whole = self.onehot_encoder.fit_transform(x_whole, y_whole)
                x_train = x_whole[:len(x_train)]
                x_val = x_whole[len(x_train):]

            x_train = self.count_encoder.fit_transform(x_train, y_train)
            x_val = self.count_encoder.transform(x_val, y_val)
            x_train = self.target_encoder.fit_transform(x_train, y_train)
            x_val = self.target_encoder.transform(x_val, y_val)

            if self.save_encoder_flag:
                self.save_encoder()
        return x_train, y_train, x_val, y_val

    def transform(self, x, y=None):
        x = self.ordinal_encoder.transform(x, y)
        x = self.onehot_encoder.transform(x, y)
        x = self.count_encoder.transform(x, y)
        x = self.target_encoder.transform(x, y)
        return x, y

    def fit_transform(self,
                      x_train,
                      x_val=None,
                      y_train=None,
                      y_val=None,
                      method_mapper=None):
        """
        Parameters
        ----------

        x_train: pd.DataFrame

        x_val: pd.DataFrame

        y_train: pd.DataFrame

        y_val: pd.DataFrame
        
        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping: 
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }
        """
        self.fit(x_train, x_val, y_train, y_val, method_mapper)
        x_train, y_train = self.transform(x_train, y_train)
        if x_val is not None:
            x_val, y_val = self.transform(x_val, y_val)
        return x_train, y_train, x_val, y_val

    def save_encoder(self):
        now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
        os.makedirs(os.path.join(self.logdir, now))

        with open(os.path.join(self.logdir, now, 'OrdinalEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.ordinal_encoder, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.onehot_encoder, f)
        with open(os.path.join(self.logdir, now, 'CountEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.count_encoder, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.target_encoder, f)

        with open(
                os.path.join(self.logdir, now, 'OrdinalEncoderFeatures.json'),
                'w') as f:
            json.dump(self.ordinal_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.onehot_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'CountEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.count_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.target_encoder_features, f)

    def load_encoder(self, logdir=None):
        # Open the saved encoders and feature lists for reading and restore
        # the fitted state
        logdir = logdir or self.logdir
        with open(os.path.join(logdir, 'OrdinalEncoder.pkl'), 'rb') as f:
            self.ordinal_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OneHotEncoder.pkl'), 'rb') as f:
            self.onehot_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'CountEncoder.pkl'), 'rb') as f:
            self.count_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'TargetEncoder.pkl'), 'rb') as f:
            self.target_encoder = pickle.load(f)

        with open(os.path.join(logdir, 'OrdinalEncoderFeatures.json')) as f:
            self.ordinal_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'OneHotEncoderFeatures.json')) as f:
            self.onehot_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'CountEncoderFeatures.json')) as f:
            self.count_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'TargetEncoderFeatures.json')) as f:
            self.target_encoder_features = json.load(f)
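A minimal usage sketch for the Encoder class above. The toy frames and the method mapping are assumptions, and exact behavior depends on the installed category_encoders version (the class defaults to a non-standard handle_unknown value):

import pandas as pd

train = pd.DataFrame({'city': ['a', 'b', 'a'], 'color': ['r', 'g', 'b']})
val = pd.DataFrame({'city': ['b', 'a'], 'color': ['g', 'r']})
y_tr, y_val = pd.Series([1, 0, 1]), pd.Series([0, 1])

enc = Encoder()
x_tr, y_tr, x_val, y_val = enc.fit_transform(
    train, val, y_tr, y_val,
    method_mapper={'city': 'OrdinalEncoder', 'color': 'CountEncoder'})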