Example No. 1
import pandas as pd


def trafficPercentByState(df):
    '''
    Returns a DataFrame with total aircraft movements per state and each
    state's percentage of the total.
    Note: this only works when every state appears as both an origin and a
    destination, so it needs a sufficiently large sample.
    '''
    df['dest_state'] = df.dest_city_name.str[-2:]
    df['origin_state'] = df.origin_city_name.str[-2:]
    movements = (df.groupby('origin_state')['fl_date'].count() +
                 df.groupby('dest_state')['fl_date'].count())
    movements = pd.DataFrame(movements)
    movements['percentage_of_total'] = (movements.fl_date /
                                        movements.fl_date.sum() * 100)
    return movements
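
# Usage sketch (illustrative, not from the source): a minimal flights frame
# with the BTS-style columns the function assumes; every state must appear as
# both an origin and a destination.
flights = pd.DataFrame({
    'fl_date': ['2020-01-01', '2020-01-01', '2020-01-02'],
    'origin_city_name': ['Denver, CO', 'Austin, TX', 'Dallas, TX'],
    'dest_city_name': ['Austin, TX', 'Denver, CO', 'Denver, CO'],
})
print(trafficPercentByState(flights))  # movements and percentage per state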
Example No. 2
def unpack_multitext(self, df):
    """
    Unpack a pd.Series of list-like strings (e.g. '["Wifi","Microwave"]')
    into distinct values and one-hot encode them.
    Returns the encoded columns.
    """
    rows = list(df.amenities)
    rows = [[word.strip('[" ]') for word in row[1:-1].split(',')]
            for row in rows]  # strip brackets, quotes and spaces
    cols = set(word for row in rows for word in row)  # distinct amenity names
    cols.discard('')  # drop the empty string left by empty lists
    # One-hot encode (substring membership against the raw amenities string)
    new_df = pd.DataFrame(columns=list(cols))
    for col in cols:
        new_df[col] = df.amenities.apply(lambda x: int(col in x))
    return new_df
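
# Standalone sketch of the same trick on a hypothetical 'amenities' column
# (note that `int(col in x)` is a substring test against the raw string).
import pandas as pd

listings = pd.DataFrame({'amenities': ['["Wifi","Microwave"]', '["Wifi"]']})
parsed = [[w.strip('[" ]') for w in row[1:-1].split(',')]
          for row in listings.amenities]
distinct = {w for row in parsed for w in row} - {''}
encoded = pd.DataFrame({c: listings.amenities.apply(lambda x: int(c in x))
                        for c in distinct})
print(encoded)  # one 0/1 indicator column per distinct amenity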
Example No. 3
import numpy as np
import pandas as pd


def get_df_audit_metrics(df) -> dict:
    # Process float fields.
    d = df.dtypes[df.dtypes == 'float64'].index.values
    df[d] = df[d].astype('float64')
    mean = pd.DataFrame({'mean': df[d].mean()})
    std_dev = pd.DataFrame({'std_dev': df[d].std()})
    missing = pd.DataFrame({'missing': df[d].isnull().sum()})
    missing_perc = pd.DataFrame(
        {'missing_perc': df[d].isnull().sum() / df[d].shape[0]})
    minimum = pd.DataFrame({'min': df[d].min()})
    maximum = pd.DataFrame({'max': df[d].max()})
    unique = pd.DataFrame(
        {'unique': df[d].apply(lambda x: len(x.unique()), axis=0)})
    DQ1 = pd.concat(
        [mean, std_dev, missing, missing_perc, minimum, maximum, unique],
        axis=1)

    # Process integer fields.
    d = df.dtypes[df.dtypes == 'int64'].index.values
    df[d] = df[d].astype('float64')
    mean = pd.DataFrame({'mean': df[d].mean()})
    std_dev = pd.DataFrame({'std_dev': df[d].std()})
    missing = pd.DataFrame({'missing': df[d].isnull().sum()})
    missing_perc = pd.DataFrame(
        {'missing_perc': df[d].isnull().sum() / df[d].shape[0]})
    minimum = pd.DataFrame({'min': df[d].min()})
    maximum = pd.DataFrame({'max': df[d].max()})
    unique = pd.DataFrame(
        {'unique': df[d].apply(lambda x: len(x.unique()), axis=0)})
    DQ2 = pd.concat(
        [mean, std_dev, missing, missing_perc, minimum, maximum, unique],
        axis=1)

    # Process string fields
    d = df.dtypes[df.dtypes == 'object'].index.values
    mean = pd.DataFrame({'mean': np.repeat('Not Applicable', len(d))}, index=d)
    std_dev = pd.DataFrame({'std_dev': np.repeat('Not Applicable', len(d))},
                           index=d)
    missing = pd.DataFrame({'missing': df[d].isnull().sum()})
    missing_perc = pd.DataFrame(
        {'missing_perc': df[d].isnull().sum() / df[d].shape[0]})
    minimum = pd.DataFrame({'min': np.repeat('Not Applicable', len(d))}, index=d)
    maximum = pd.DataFrame({'max': np.repeat('Not Applicable', len(d))}, index=d)
    unique = pd.DataFrame(
        {'unique': df[d].apply(lambda x: len(x.unique()), axis=0)})
    DQ3 = pd.concat(
        [mean, std_dev, missing, missing_perc, minimum, maximum, unique],
        axis=1)

    # Process datetime fields
    d = df.dtypes[df.dtypes == 'datetime64[ns, UTC]'].index.values
    mean = pd.DataFrame({'mean': np.repeat('Not Applicable', len(d))}, index=d)
    std_dev = pd.DataFrame({'std_dev': np.repeat('Not Applicable', len(d))},
                           index=d)
    missing = pd.DataFrame({'missing': df[d].isnull().sum()})
    missing_perc = pd.DataFrame(
        {'missing_perc': df[d].isnull().sum() / df[d].shape[0]})
    minimum = pd.DataFrame({'min': np.repeat('Not Applicable', len(d))}, index=d)
    maximum = pd.DataFrame({'max': np.repeat('Not Applicable', len(d))}, index=d)
    unique = pd.DataFrame(
        {'unique': df[d].apply(lambda x: len(x.unique()), axis=0)})
    DQ4 = pd.concat(
        [mean, std_dev, missing, missing_perc, minimum, maximum, unique],
        axis=1)
    DQ = pd.concat([DQ1, DQ2, DQ3, DQ4])
    return DQ.to_dict()
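
# Hypothetical call: a tiny frame with one column of each dtype the audit
# covers (float64, int64, object, tz-aware datetime).
sample = pd.DataFrame({
    'price': [1.5, 2.0, np.nan],
    'qty': [1, 2, 3],
    'name': ['a', 'b', None],
    'ts': pd.to_datetime(['2021-01-01', '2021-01-02', '2021-01-03'], utc=True),
})
audit = get_df_audit_metrics(sample)
print(audit['missing'])  # per-column missing counts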
Example No. 4
import pandas as pd
from sklearn import preprocessing

# Ratio features; div(a, b) presumably returns a row-wise function computing
# row[a] / row[b] (it is defined earlier in the source, outside this snippet).
df_model['buy_view_ratio'] = df_model.apply(div('product_action_purchase',
                                                'product_action_detail'),
                                            axis=1)
df_model['view_dur_ratio'] = df_model.apply(div('product_action_detail',
                                                'duration'),
                                            axis=1)
df_model['buy_dur_ratio'] = df_model.apply(div('product_action_purchase',
                                               'duration'),
                                           axis=1)

#################################################################################################
# minmax scaling ################################################################################
df_model.fillna(0, inplace=True)
min_max_scaler = preprocessing.MinMaxScaler()
df_model = pd.DataFrame(min_max_scaler.fit_transform(df_model.values),
                        columns=df_model.columns,
                        index=df_model.index)
df_model.info()
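
# What MinMaxScaler does, shown on a toy column: x -> (x - min) / (max - min),
# mapping each feature into [0, 1].
toy = pd.DataFrame({'duration': [10.0, 20.0, 30.0]})
print(preprocessing.MinMaxScaler().fit_transform(toy).ravel())  # [0.  0.5 1. ]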

#################################################################################################
# to check total columns  #######################################################################
total_cnt = 0
cnt = 0
for col in colcat + coltime:
    cnt = len(df[col].unique())
    total_cnt += cnt
    print(" Total unique values of - {} is {} | Total columns = {}".format(
        col, cnt, total_cnt))

# the counts match (remember: for dummy encoding, subtract 2 from total_cnt)
df_model = df_model.reset_index()
Example No. 5
# AI-TECHGYM-2-8-A-1
# Feature engineering

# Imports
import pandas as pd
import os
import urllib.request

# Download the file if it is not already present
title = "FIFA_data.csv"
if not os.path.exists(title):
    print(title + " downloading.")
    url = "https://raw.githubusercontent.com/amanthedorkknight/fifa18-all-player-statistics/master/2019/data.csv"
    urllib.request.urlretrieve(url, title)
else:
    print(title + " already exists.")

df = pd.read_csv('./FIFA_data.csv')

# Display the head as needed
display(df.head())

# Check which columns contain missing values
FIFA_isnull_sum = df.isnull().sum()
df_nan = pd.DataFrame(FIFA_isnull_sum[FIFA_isnull_sum > 0])

# Show the columns that contain NaN
display(df_nan.index)
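
# A sketch of the same missing-value check on a tiny hypothetical frame,
# without the download step.
toy = pd.DataFrame({'Club': ['FC A', None], 'Age': [20, 21]})
toy_isnull_sum = toy.isnull().sum()
print(toy_isnull_sum[toy_isnull_sum > 0].index)  # Index(['Club'], dtype='object')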
Example No. 6
def m5_dataset():
    """
     https://www.kaggle.com/ratan123/m5-forecasting-lightgbm-with-timeseries-splits


    """
    import gc

    import lightgbm as lgb
    import numpy as np
    import pandas as pd
    from sklearn import metrics, preprocessing
    from sklearn.model_selection import TimeSeriesSplit

    # NOTE: reduce_mem_usage (the usual Kaggle helper that downcasts numeric
    # columns) is assumed to be defined outside this snippet.

    def read_df():
        print('Reading files...')
        calendar = pd.read_csv(
            '/kaggle/input/m5-forecasting-accuracy/calendar.csv')
        calendar = reduce_mem_usage(calendar)
        print('Calendar has {} rows and {} columns'.format(
            calendar.shape[0], calendar.shape[1]))
        sell_prices = pd.read_csv(
            '/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')
        sell_prices = reduce_mem_usage(sell_prices)
        print('Sell prices has {} rows and {} columns'.format(
            sell_prices.shape[0], sell_prices.shape[1]))
        sales_train_validation = pd.read_csv(
            '/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
        print('Sales train validation has {} rows and {} columns'.format(
            sales_train_validation.shape[0], sales_train_validation.shape[1]))
        submission = pd.read_csv(
            '/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')
        return calendar, sell_prices, sales_train_validation, submission

    def melt_and_merge(calendar,
                       sell_prices,
                       sales_train_validation,
                       submission,
                       nrows=55000000,
                       merge=False):

        # melt the sales data to get it ready for training
        sales_train_validation = pd.melt(sales_train_validation,
                                         id_vars=[
                                             'id', 'item_id', 'dept_id',
                                             'cat_id', 'store_id', 'state_id'
                                         ],
                                         var_name='day',
                                         value_name='demand')
        print(
            'Melted sales train validation has {} rows and {} columns'.format(
                sales_train_validation.shape[0],
                sales_train_validation.shape[1]))
        sales_train_validation = reduce_mem_usage(sales_train_validation)

        # separate the test DataFrames
        test1_rows = [row for row in submission['id'] if 'validation' in row]
        test2_rows = [row for row in submission['id'] if 'evaluation' in row]
        test1 = submission[submission['id'].isin(test1_rows)]
        test2 = submission[submission['id'].isin(test2_rows)]

        # change column names
        test1.columns = [
            'id', 'd_1914', 'd_1915', 'd_1916', 'd_1917', 'd_1918', 'd_1919',
            'd_1920', 'd_1921', 'd_1922', 'd_1923', 'd_1924', 'd_1925',
            'd_1926', 'd_1927', 'd_1928', 'd_1929', 'd_1930', 'd_1931',
            'd_1932', 'd_1933', 'd_1934', 'd_1935', 'd_1936', 'd_1937',
            'd_1938', 'd_1939', 'd_1940', 'd_1941'
        ]
        test2.columns = [
            'id', 'd_1942', 'd_1943', 'd_1944', 'd_1945', 'd_1946', 'd_1947',
            'd_1948', 'd_1949', 'd_1950', 'd_1951', 'd_1952', 'd_1953',
            'd_1954', 'd_1955', 'd_1956', 'd_1957', 'd_1958', 'd_1959',
            'd_1960', 'd_1961', 'd_1962', 'd_1963', 'd_1964', 'd_1965',
            'd_1966', 'd_1967', 'd_1968', 'd_1969'
        ]

        # get product table
        product = sales_train_validation[[
            'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'
        ]].drop_duplicates()

        # merge with product table
        test1 = test1.merge(product, how='left', on='id')
        test2 = test2.merge(product, how='left', on='id')

        #
        test1 = pd.melt(test1,
                        id_vars=[
                            'id', 'item_id', 'dept_id', 'cat_id', 'store_id',
                            'state_id'
                        ],
                        var_name='day',
                        value_name='demand')
        test2 = pd.melt(test2,
                        id_vars=[
                            'id', 'item_id', 'dept_id', 'cat_id', 'store_id',
                            'state_id'
                        ],
                        var_name='day',
                        value_name='demand')

        sales_train_validation['part'] = 'train'
        test1['part'] = 'test1'
        test2['part'] = 'test2'

        df = pd.concat([sales_train_validation, test1, test2], axis=0)

        del sales_train_validation, test1, test2

        # keep only a sample for faster training
        df = df.loc[nrows:]

        # drop some calendar features
        calendar.drop(['weekday', 'wday', 'month', 'year'],
                      inplace=True,
                      axis=1)

        # delete test2 for now
        df = df[df['part'] != 'test2']

        if merge:
            # the notebook crashes with the entire dataset (maybe use dask or pyspark)
            df = pd.merge(df,
                          calendar,
                          how='left',
                          left_on=['day'],
                          right_on=['d'])
            df.drop(['d', 'day'], inplace=True, axis=1)
            # merge in the sell price data (this feature should be very important)
            df = df.merge(sell_prices,
                          on=['store_id', 'item_id', 'wm_yr_wk'],
                          how='left')
            print('Our final dataset to train on has {} rows and {} columns'.
                  format(df.shape[0], df.shape[1]))

        gc.collect()

        return df

    calendar, sell_prices, sales_train_validation, submission = read_df()
    df = melt_and_merge(calendar,
                        sell_prices,
                        sales_train_validation,
                        submission,
                        nrows=27500000,
                        merge=True)
    gc.collect()

    def transform(df):

        nan_features = [
            'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
        ]
        for feature in nan_features:
            df[feature].fillna('unknown', inplace=True)

        encoder = preprocessing.LabelEncoder()
        df['id_encode'] = encoder.fit_transform(df['id'])

        cat = [
            'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
            'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
        ]
        for feature in cat:
            encoder = preprocessing.LabelEncoder()
            df[feature] = encoder.fit_transform(df[feature])

        return df

    df = transform(df)
    gc.collect()

    def simple_fe(df):

        # demand features
        df['lag_t28'] = df.groupby(
            ['id'])['demand'].transform(lambda x: x.shift(28))
        df['lag_t29'] = df.groupby(
            ['id'])['demand'].transform(lambda x: x.shift(29))
        df['lag_t30'] = df.groupby(
            ['id'])['demand'].transform(lambda x: x.shift(30))
        df['rolling_mean_t7'] = df.groupby([
            'id'
        ])['demand'].transform(lambda x: x.shift(28).rolling(7).mean())
        df['rolling_std_t7'] = df.groupby(
            ['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).std())
        df['rolling_mean_t30'] = df.groupby([
            'id'
        ])['demand'].transform(lambda x: x.shift(28).rolling(30).mean())
        df['rolling_mean_t90'] = df.groupby([
            'id'
        ])['demand'].transform(lambda x: x.shift(28).rolling(90).mean())
        df['rolling_mean_t180'] = df.groupby([
            'id'
        ])['demand'].transform(lambda x: x.shift(28).rolling(180).mean())
        df['rolling_std_t30'] = df.groupby([
            'id'
        ])['demand'].transform(lambda x: x.shift(28).rolling(30).std())

        # price features
        df['lag_price_t1'] = df.groupby(
            ['id'])['sell_price'].transform(lambda x: x.shift(1))
        df['price_change_t1'] = (df['lag_price_t1'] -
                                 df['sell_price']) / (df['lag_price_t1'])
        df['rolling_price_max_t365'] = df.groupby([
            'id'
        ])['sell_price'].transform(lambda x: x.shift(1).rolling(365).max())
        df['price_change_t365'] = (df['rolling_price_max_t365'] -
                                   df['sell_price']) / (
                                       df['rolling_price_max_t365'])
        df['rolling_price_std_t7'] = df.groupby(
            ['id'])['sell_price'].transform(lambda x: x.rolling(7).std())
        df['rolling_price_std_t30'] = df.groupby(
            ['id'])['sell_price'].transform(lambda x: x.rolling(30).std())
        df.drop(['rolling_price_max_t365', 'lag_price_t1'],
                inplace=True,
                axis=1)

        # time features
        df['date'] = pd.to_datetime(df['date'])
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        # .dt.week was removed in pandas 2.0; isocalendar().week is equivalent
        df['week'] = df['date'].dt.isocalendar().week.astype('int64')
        df['day'] = df['date'].dt.day
        df['dayofweek'] = df['date'].dt.dayofweek

        return df

    # build the lag/rolling/price/time features before splitting
    df = simple_fe(df)
    gc.collect()

    x = df[df['date'] <= '2016-04-24']
    y = x.sort_values('date')['demand']
    test = df[(df['date'] > '2016-04-24')]
    x = x.sort_values('date')
    test = test.sort_values('date')
    del df

    # NOTE: 'params' (the LightGBM hyperparameters) is defined outside this
    # snippet in the source kernel; a minimal assumed configuration:
    params = {'objective': 'regression', 'metric': 'rmse'}

    n_fold = 3  # 3 folds to keep the kernel runtime down
    folds = TimeSeriesSplit(n_splits=n_fold)

    columns = [
        'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year',
        'month', 'week', 'day', 'dayofweek', 'event_name_1', 'event_type_1',
        'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
        'sell_price', 'lag_t28', 'lag_t29', 'lag_t30', 'rolling_mean_t7',
        'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90',
        'rolling_mean_t180', 'rolling_std_t30', 'price_change_t1',
        'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30'
    ]
    splits = folds.split(x, y)
    y_preds = np.zeros(test.shape[0])
    y_oof = np.zeros(x.shape[0])
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = columns
    mean_score = []
    for fold_n, (train_index, valid_index) in enumerate(splits):
        print('Fold:', fold_n + 1)
        X_train, X_valid = x[columns].iloc[train_index], x[columns].iloc[
            valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        dtrain = lgb.Dataset(X_train, label=y_train)
        dvalid = lgb.Dataset(X_valid, label=y_valid)
        clf = lgb.train(params,
                        dtrain,
                        2500,
                        valid_sets=[dtrain, dvalid],
                        early_stopping_rounds=50,
                        verbose_eval=100)
        feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
        y_pred_valid = clf.predict(X_valid, num_iteration=clf.best_iteration)
        y_oof[valid_index] = y_pred_valid
        val_score = np.sqrt(metrics.mean_squared_error(y_pred_valid, y_valid))
        print(f'val rmse score is {val_score}')
        mean_score.append(val_score)
        y_preds += clf.predict(test[columns],
                               num_iteration=clf.best_iteration) / n_fold
        del X_train, X_valid, y_train, y_valid
        gc.collect()
    print('mean rmse score over folds is', np.mean(mean_score))
    test['demand'] = y_preds
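
# Toy illustration (not from the source) of the lag/rolling pattern used in
# simple_fe: shift the series 28 days so features only use information
# available at forecast time, then aggregate over a trailing window.
import pandas as pd

toy = pd.DataFrame({'id': ['a'] * 40, 'demand': range(40)})
grouped = toy.groupby('id')['demand']
toy['lag_t28'] = grouped.transform(lambda x: x.shift(28))
toy['rolling_mean_t7'] = grouped.transform(lambda x: x.shift(28).rolling(7).mean())
print(toy.tail(3))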
Example No. 7
LRAnalysis(X_test_filt2=X_test_filt2_RFECV,
           y_train=y_train, y_test=y_test, y=y, X=X,
           X_train_filt2=X_train_filt2_RFECV)
 

import feather
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font

# Persist the train/test splits to feather files
data = [X_train, y_train, X_test, y_test]
path1 = 'X_train.feather'
feather.write_dataframe(X_train, path1)
path2 = 'X_test.feather'
feather.write_dataframe(X_test, path2)
path3 = 'y_test.feather'
feather.write_dataframe(pd.DataFrame(y_test), path3)
path4 = 'y_train.feather'
feather.write_dataframe(pd.DataFrame(y_train), path4)
pd.DataFrame(y_train)
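
# Round-trip sketch for the feather export above (file name is illustrative;
# assumes the feather-format package).
out = pd.DataFrame({'a': [1, 2, 3]})
feather.write_dataframe(out, 'example.feather')
print(feather.read_dataframe('example.feather').equals(out))  # True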
  
import matplotlib.pyplot as plt
import numpy as np

# Binary matrix: which of the nine features each selection method keeps
methods = ['full', 'RFE', 'LASSO', 'Corr', 'Relief']
a = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
              [0, 0, 1, 1, 0, 0, 0, 1, 0],
              [1, 1, 0, 1, 1, 0, 0, 0, 0],
              [0, 1, 1, 0, 1, 0, 0, 1, 0],
              [0, 0, 1, 0, 0, 0, 0, 0, 0]])

fig, ax = plt.subplots()
plt.imshow(a, cmap='copper', interpolation='nearest')
plt.colorbar()
ax.set_xticks(np.arange(len(X.columns)))
ax.set_xticklabels(X.columns, rotation=90)
ax.set_yticks(np.arange(len(methods)))
ax.set_yticklabels(methods)
plt.show()